git.proxmox.com Git - mirror_ubuntu-kernels.git/commitdiff
Merge tag 'perf-tools-for-v5.16-2021-11-07-without-bpftool-fix' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux
author    Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 8 Nov 2021 17:25:26 +0000 (09:25 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Mon, 8 Nov 2021 17:25:26 +0000 (09:25 -0800)
Pull perf tools updates from Arnaldo Carvalho de Melo:
 "perf annotate:
   - Add riscv64 support.
   - Add fusion logic for AMD microarchs.

  perf record:
   - Add an option to control the synthesizing behavior:
       --synth <no|all|task|mmap|cgroup>

  core:
   - Allow controlling synthesizing PERF_RECORD_ metadata events during
     record.
   - perf.data reader prep work for multithreaded processing.
   - Fix missing exclude_{host,guest} setting in PMUs that don't support
     it and that were causing the feature detection code to disable it
     for all events, even the ones in PMUs that support it.
   - Fix the default use of precise events on AMD, that were always
     falling back to non-precise because perf_event_attr.exclude_guest=1
     was set and IBS does not have filtering capability, refusing
     precise + exclude_guest.
   - Add bitfield_swap() to handle branch_stack endian issue.

  perf script:
   - Show binary offsets for userspace addresses in callchains.
   - Support instruction latency via new "ins_lat" selectable field.
   - Add dlfilter-show-cycles

  perf inject:
   - Add vmlinux and ignore-vmlinux arguments, similar to other tools.

  perf list:
   - Display PMU prefix for partially supported hybrid cache events.
   - Display hybrid PMU events with cpu type.

  perf stat:
   - Improve metrics documentation of data structures.
   - Fix memory leaks in the metric code.
   - Use NAN for missing event IDs.
   - Don't compute unused events.
   - Fix memory leak on error path.
   - Encode and use metric-id as a metric qualifier.
   - Allow metrics with no events.
   - Avoid events for an 'if' constant result.
   - Only add a referenced metric once.
   - Simplify metric_refs calculation.
   - Allow modifiers on metrics.

  perf test:
   - Add workload test of metric and metric groups.
   - Workload test of all PMUs.
   - vmlinux-kallsyms: Ignore hidden symbols.
   - Add pmu-event test for event described as "config=".
   - Verify more event members in pmu-events test.
   - Add endian test for struct branch_flags on the sample-parsing test.
   - Improve temp file cleanup in several tests.

  perf daemon:
   - Address MSAN warnings on send_cmd().

  perf kmem:
   - Improve man page for record options

  perf srcline:
   - Use long-running addr2line per DSO, greatly speeding up the
     'srcline' sort order.

  perf symbols:
   - Ignore $a/$d symbols for ARM modules.
   - Fix /proc/kcore access on 32 bit systems.

  Kernel UAPI copies:
   - Update copy of linux/socket.h with the kernel sources, no change in
     tooling output.

  libbpf:
   - Pull in bpf_program__get_prog_info_linear() from libbpf, as it is
     too specific to perf.
   - Deprecate bpf_map__resize() in favor of bpf_map__set_max_entries().
   - Install libbpf headers locally when building.
   - Bump minimum LLVM C++ std to GNU++14.

  libperf:
   - Use binary search in perf_cpu_map__idx() as the arrays are sorted.

  libtracefs:
   - Enable libtracefs dynamic linking.

  libtraceevent:
   - Increase logging when verbose.

  Arch specific:

   * PowerPC:
      - Add support to expose instruction and data address registers as
        part of extended regs.

  Vendor events:

   * JSON parser:
      - Support ConfigCode to set the config= in PMUs
      - Make the JSON parser more conformant when in strict mode.

   * All JSON files:
      - Fix all remaining invalid JSON files.

   * ARM:
      - Syntax corrections in Neoverse N1 json.
      - Categorise the Neoverse V1 counters.
      - Add new armv8 PMU events.
      - Revise hip08 uncore events.

  Hardware tracing:

   * auxtrace:
      - Add missing Z option to ITRACE_HELP.
      - Add itrace A option to approximate IPC.
      - Add itrace d+o option to direct debug log to stdout.

   * Intel PT:
      - Add support for PERF_RECORD_AUX_OUTPUT_HW_ID
      - Support itrace A option to approximate IPC
      - Support itrace d+o option to direct debug log to stdout"

* tag 'perf-tools-for-v5.16-2021-11-07-without-bpftool-fix' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (120 commits)
  perf build: Install libbpf headers locally when building
  perf MANIFEST: Add bpftool files to allow building with BUILD_BPF_SKEL=1
  perf metric: Fix memory leaks
  perf parse-event: Add init and exit to parse_event_error
  perf parse-events: Rename parse_events_error functions
  perf stat: Fix memory leak on error path
  perf tools: Use __BYTE_ORDER__
  perf inject: Add vmlinux and ignore-vmlinux arguments
  perf tools: Check vmlinux/kallsyms arguments in all tools
  perf tools: Refactor out kernel symbol argument sanity checking
  perf symbols: Ignore $a/$d symbols for ARM modules
  perf evsel: Don't set exclude_guest by default
  perf evsel: Fix missing exclude_{host,guest} setting
  perf bpf: Add missing free to bpf_event__print_bpf_prog_info()
  perf beauty: Update copy of linux/socket.h with the kernel sources
  perf clang: Fixes for more recent LLVM/clang
  tools: Bump minimum LLVM C++ std to GNU++14
  perf bpf: Pull in bpf_program__get_prog_info_linear()
  Revert "perf bench futex: Add support for 32-bit systems with 64-bit time_t"
  perf test sample-parsing: Add endian test for struct branch_flags
  ...
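
For illustration, a hedged example of the new perf record ``--synth`` option
described in the message above (the event and workload are placeholders; see
perf-record(1) from v5.16 for exact semantics)::

    # perf record --synth=no -e cycles -a -- sleep 1

Skipping synthesis of the PERF_RECORD_ metadata events for pre-existing tasks
shortens record startup when that data isn't needed for later analysis.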

598 files changed:
Documentation/ABI/testing/sysfs-bus-pci
Documentation/admin-guide/blockdev/zram.rst
Documentation/admin-guide/cgroup-v1/memory.rst
Documentation/admin-guide/filesystem-monitoring.rst [new file with mode: 0644]
Documentation/admin-guide/index.rst
Documentation/admin-guide/kernel-parameters.txt
Documentation/admin-guide/mm/damon/index.rst
Documentation/admin-guide/mm/damon/reclaim.rst [new file with mode: 0644]
Documentation/admin-guide/mm/damon/start.rst
Documentation/admin-guide/mm/damon/usage.rst
Documentation/admin-guide/mm/hugetlbpage.rst
Documentation/admin-guide/mm/index.rst
Documentation/admin-guide/mm/memory-hotplug.rst
Documentation/admin-guide/mm/pagemap.rst
Documentation/admin-guide/mm/swap_numa.rst [new file with mode: 0644]
Documentation/admin-guide/mm/zswap.rst [new file with mode: 0644]
Documentation/core-api/memory-hotplug.rst
Documentation/dev-tools/kfence.rst
Documentation/devicetree/bindings/auxdisplay/holtek,ht16k33.yaml
Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/pci/qcom,pcie.txt
Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml [new file with mode: 0644]
Documentation/kbuild/makefiles.rst
Documentation/translations/zh_CN/core-api/memory-hotplug.rst
Documentation/vm/damon/design.rst
Documentation/vm/damon/faq.rst
Documentation/vm/damon/index.rst
Documentation/vm/index.rst
Documentation/vm/page_owner.rst
Documentation/vm/swap_numa.rst [deleted file]
Documentation/vm/zswap.rst [deleted file]
MAINTAINERS
Makefile
arch/alpha/Kbuild
arch/alpha/Makefile
arch/alpha/kernel/core_irongate.c
arch/arc/Kbuild
arch/arc/Makefile
arch/arc/mm/init.c
arch/arm/Kbuild
arch/arm/Makefile
arch/arm/mach-hisi/platmcpm.c
arch/arm/mm/init.c
arch/arm64/Kbuild
arch/arm64/Kconfig
arch/arm64/Makefile
arch/arm64/kernel/Makefile
arch/arm64/lib/Makefile
arch/arm64/mm/kasan_init.c
arch/arm64/mm/mmu.c
arch/csky/Kbuild
arch/csky/Makefile
arch/h8300/Kbuild
arch/h8300/Makefile
arch/ia64/Makefile
arch/ia64/mm/contig.c
arch/ia64/mm/init.c
arch/m68k/Makefile
arch/m68k/mm/mcfmmu.c
arch/m68k/mm/motorola.c
arch/microblaze/Kbuild
arch/microblaze/Makefile
arch/microblaze/pci/pci-common.c
arch/mips/Kbuild
arch/mips/Makefile
arch/mips/boot/Makefile
arch/mips/loongson64/init.c
arch/mips/mm/init.c
arch/mips/ralink/Kconfig
arch/mips/sgi-ip27/ip27-memory.c
arch/mips/sgi-ip30/ip30-setup.c
arch/nds32/Kbuild
arch/nds32/Makefile
arch/nios2/Kbuild
arch/nios2/Makefile
arch/nios2/boot/Makefile
arch/openrisc/Kbuild
arch/openrisc/Makefile
arch/parisc/Kbuild
arch/parisc/Makefile
arch/powerpc/Kbuild
arch/powerpc/Makefile
arch/powerpc/configs/skiroot_defconfig
arch/powerpc/include/asm/machdep.h
arch/powerpc/include/asm/ppc-pci.h
arch/powerpc/include/asm/sections.h
arch/powerpc/kernel/dt_cpu_ftrs.c
arch/powerpc/kernel/eeh.c
arch/powerpc/kernel/eeh_driver.c
arch/powerpc/kernel/paca.c
arch/powerpc/kernel/pci-common.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/lib/Makefile
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci-sriov.c
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/platforms/pseries/svm.c
arch/riscv/Kbuild
arch/riscv/Makefile
arch/riscv/kernel/setup.c
arch/s390/Kbuild
arch/s390/Kconfig
arch/s390/Makefile
arch/s390/boot/compressed/decompressor.h
arch/s390/boot/head.S
arch/s390/boot/ipl_parm.c
arch/s390/boot/pgm_check_info.c
arch/s390/boot/startup.c
arch/s390/configs/debug_defconfig
arch/s390/configs/defconfig
arch/s390/include/asm/barrier.h
arch/s390/include/asm/bitops.h
arch/s390/include/asm/cpu.h
arch/s390/include/asm/debug.h
arch/s390/include/asm/ftrace.h
arch/s390/include/asm/jump_label.h
arch/s390/include/asm/livepatch.h
arch/s390/include/asm/lowcore.h
arch/s390/include/asm/nospec-branch.h
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/ptrace.h
arch/s390/include/asm/sclp.h
arch/s390/include/asm/sections.h
arch/s390/include/asm/setup.h
arch/s390/include/asm/string.h
arch/s390/include/asm/text-patching.h [new file with mode: 0644]
arch/s390/include/uapi/asm/setup.h
arch/s390/kernel/alternative.c
arch/s390/kernel/asm-offsets.c
arch/s390/kernel/cpcmd.c
arch/s390/kernel/dumpstack.c
arch/s390/kernel/early.c
arch/s390/kernel/entry.S
arch/s390/kernel/entry.h
arch/s390/kernel/ftrace.c
arch/s390/kernel/head64.S
arch/s390/kernel/irq.c
arch/s390/kernel/jump_label.c
arch/s390/kernel/kprobes.c
arch/s390/kernel/machine_kexec_file.c
arch/s390/kernel/mcount.S
arch/s390/kernel/nospec-branch.c
arch/s390/kernel/nospec-sysfs.c
arch/s390/kernel/perf_cpum_cf.c
arch/s390/kernel/process.c
arch/s390/kernel/setup.c
arch/s390/kernel/smp.c
arch/s390/kernel/syscall.c
arch/s390/kernel/traps.c
arch/s390/kernel/uv.c
arch/s390/kernel/vmlinux.lds.S
arch/s390/kvm/interrupt.c
arch/s390/lib/Makefile
arch/s390/lib/spinlock.c
arch/s390/lib/string.c
arch/s390/lib/test_kprobes.c [new file with mode: 0644]
arch/s390/lib/test_kprobes.h [new file with mode: 0644]
arch/s390/lib/test_kprobes_asm.S [new file with mode: 0644]
arch/s390/lib/test_unwind.c
arch/s390/mm/cmm.c
arch/s390/mm/dump_pagetables.c
arch/s390/mm/init.c
arch/s390/mm/kasan_init.c
arch/s390/mm/pageattr.c
arch/s390/mm/vmem.c
arch/s390/net/bpf_jit_comp.c
arch/s390/pci/pci.c
arch/s390/pci/pci_dma.c
arch/s390/pci/pci_event.c
arch/s390/pci/pci_sysfs.c
arch/sh/Kbuild
arch/sh/Makefile
arch/sh/boards/mach-ap325rxa/setup.c
arch/sh/boards/mach-ecovec24/setup.c
arch/sh/boards/mach-kfr2r09/setup.c
arch/sh/boards/mach-migor/setup.c
arch/sh/boards/mach-se/7724/setup.c
arch/sparc/Kbuild
arch/sparc/Makefile
arch/sparc/boot/Makefile
arch/sparc/kernel/pci.c
arch/sparc/kernel/smp_64.c
arch/um/kernel/mem.c
arch/x86/Kbuild
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/events/intel/uncore.c
arch/x86/kernel/probe_roms.c
arch/x86/kernel/setup.c
arch/x86/kernel/setup_percpu.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/numa.c
arch/x86/mm/numa_emulation.c
arch/x86/pci/common.c
arch/x86/xen/mmu_pv.c
arch/x86/xen/p2m.c
arch/x86/xen/setup.c
arch/xtensa/Makefile
arch/xtensa/boot/boot-elf/bootstrap.S
arch/xtensa/boot/boot-redboot/bootstrap.S
arch/xtensa/include/asm/asmmacro.h
arch/xtensa/include/asm/atomic.h
arch/xtensa/include/asm/cmpxchg.h
arch/xtensa/include/asm/core.h
arch/xtensa/include/asm/processor.h
arch/xtensa/include/asm/sections.h [new file with mode: 0644]
arch/xtensa/include/asm/traps.h
arch/xtensa/kernel/align.S
arch/xtensa/kernel/entry.S
arch/xtensa/kernel/head.S
arch/xtensa/kernel/mcount.S
arch/xtensa/kernel/process.c
arch/xtensa/kernel/setup.c
arch/xtensa/kernel/signal.c
arch/xtensa/kernel/traps.c
arch/xtensa/kernel/vectors.S
arch/xtensa/kernel/vmlinux.lds.S
arch/xtensa/lib/strncpy_user.S
arch/xtensa/lib/usercopy.S
crypto/Makefile
drivers/acpi/pci_root.c
drivers/auxdisplay/Kconfig
drivers/auxdisplay/Makefile
drivers/auxdisplay/cfag12864bfb.c
drivers/auxdisplay/ht16k33.c
drivers/auxdisplay/img-ascii-lcd.c
drivers/auxdisplay/ks0108.c
drivers/auxdisplay/line-display.c [new file with mode: 0644]
drivers/auxdisplay/line-display.h [new file with mode: 0644]
drivers/base/Makefile
drivers/base/arch_numa.c
drivers/base/node.c
drivers/bcma/host_pci.c
drivers/block/zram/zram_drv.c
drivers/crypto/hisilicon/qm.c
drivers/crypto/qat/qat_4xxx/adf_drv.c
drivers/crypto/qat/qat_c3xxx/adf_drv.c
drivers/crypto/qat/qat_c62x/adf_drv.c
drivers/crypto/qat/qat_common/adf_aer.c
drivers/crypto/qat/qat_common/adf_common_drv.h
drivers/crypto/qat/qat_dh895xcc/adf_drv.c
drivers/firmware/efi/memmap.c
drivers/hwmon/occ/p9_sbe.c
drivers/i2c/busses/i2c-xgene-slimpro.c
drivers/iommu/apple-dart.c
drivers/macintosh/smu.c
drivers/message/fusion/mptbase.c
drivers/message/fusion/mptbase.h
drivers/message/fusion/mptctl.c
drivers/message/fusion/mptlan.c
drivers/misc/cxl/guest.c
drivers/misc/cxl/pci.c
drivers/mmc/core/mmc_test.c
drivers/mtd/mtdcore.c
drivers/net/ethernet/chelsio/cxgb3/common.h
drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
drivers/net/ethernet/chelsio/cxgb3/t3_hw.c
drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
drivers/net/ethernet/marvell/prestera/prestera_pci.c
drivers/net/ethernet/mellanox/mlxsw/pci.c
drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
drivers/of/irq.c
drivers/of/kexec.c
drivers/of/of_reserved_mem.c
drivers/pci/controller/Kconfig
drivers/pci/controller/Makefile
drivers/pci/controller/cadence/pci-j721e.c
drivers/pci/controller/cadence/pcie-cadence-plat.c
drivers/pci/controller/dwc/Kconfig
drivers/pci/controller/dwc/Makefile
drivers/pci/controller/dwc/pci-dra7xx.c
drivers/pci/controller/dwc/pci-imx6.c
drivers/pci/controller/dwc/pcie-designware-ep.c
drivers/pci/controller/dwc/pcie-designware-host.c
drivers/pci/controller/dwc/pcie-designware.c
drivers/pci/controller/dwc/pcie-kirin.c
drivers/pci/controller/dwc/pcie-qcom-ep.c [new file with mode: 0644]
drivers/pci/controller/dwc/pcie-qcom.c
drivers/pci/controller/dwc/pcie-uniphier.c
drivers/pci/controller/dwc/pcie-visconti.c
drivers/pci/controller/pci-aardvark.c
drivers/pci/controller/pci-hyperv.c
drivers/pci/controller/pci-thunder-ecam.c
drivers/pci/controller/pci-xgene-msi.c
drivers/pci/controller/pci-xgene.c
drivers/pci/controller/pcie-apple.c [new file with mode: 0644]
drivers/pci/controller/pcie-brcmstb.c
drivers/pci/controller/pcie-iproc.c
drivers/pci/controller/pcie-mt7621.c [new file with mode: 0644]
drivers/pci/controller/pcie-rcar-ep.c
drivers/pci/controller/pcie-rcar-host.c
drivers/pci/controller/vmd.c
drivers/pci/endpoint/functions/pci-epf-ntb.c
drivers/pci/endpoint/pci-ep-cfs.c
drivers/pci/endpoint/pci-epc-core.c
drivers/pci/endpoint/pci-epf-core.c
drivers/pci/hotplug/acpiphp_glue.c
drivers/pci/hotplug/cpqphp.h
drivers/pci/hotplug/cpqphp_ctrl.c
drivers/pci/hotplug/cpqphp_pci.c
drivers/pci/hotplug/ibmphp.h
drivers/pci/hotplug/pciehp.h
drivers/pci/hotplug/pciehp_core.c
drivers/pci/hotplug/pciehp_hpc.c
drivers/pci/hotplug/shpchp_hpc.c
drivers/pci/iov.c
drivers/pci/msi.c
drivers/pci/of.c
drivers/pci/p2pdma.c
drivers/pci/pci-bridge-emul.c
drivers/pci/pci-driver.c
drivers/pci/pci-sysfs.c
drivers/pci/pci.c
drivers/pci/pci.h
drivers/pci/pcie/Makefile
drivers/pci/pcie/aer.c
drivers/pci/pcie/aspm.c
drivers/pci/pcie/err.c
drivers/pci/pcie/portdrv.h
drivers/pci/pcie/portdrv_core.c
drivers/pci/pcie/portdrv_pci.c
drivers/pci/probe.c
drivers/pci/quirks.c
drivers/pci/rom.c
drivers/pci/setup-bus.c
drivers/pci/setup-irq.c
drivers/pci/switch/switchtec.c
drivers/pci/vpd.c
drivers/pci/xen-pcifront.c
drivers/rapidio/devices/rio_mport_cdev.c
drivers/s390/block/dasd_genhd.c
drivers/s390/block/dcssblk.c
drivers/s390/block/scm_blk.c
drivers/s390/char/sclp.c
drivers/s390/char/sclp.h
drivers/s390/char/sclp_early.c
drivers/s390/char/sclp_ftp.c
drivers/s390/char/sclp_sd.c
drivers/s390/char/sclp_vt220.c
drivers/s390/cio/css.c
drivers/s390/cio/device.c
drivers/s390/cio/device_ops.c
drivers/s390/crypto/ap_bus.c
drivers/s390/crypto/ap_debug.h
drivers/s390/crypto/ap_queue.c
drivers/s390/crypto/vfio_ap_drv.c
drivers/s390/crypto/vfio_ap_ops.c
drivers/s390/crypto/vfio_ap_private.h
drivers/s390/crypto/zcrypt_api.c
drivers/s390/crypto/zcrypt_card.c
drivers/s390/crypto/zcrypt_debug.h
drivers/s390/crypto/zcrypt_error.h
drivers/s390/crypto/zcrypt_msgtype50.c
drivers/s390/crypto/zcrypt_msgtype6.c
drivers/s390/crypto/zcrypt_queue.c
drivers/ssb/pcihost_wrapper.c
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/staging/mt7621-pci/Kconfig [deleted file]
drivers/staging/mt7621-pci/Makefile [deleted file]
drivers/staging/mt7621-pci/TODO [deleted file]
drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt [deleted file]
drivers/staging/mt7621-pci/pci-mt7621.c [deleted file]
drivers/usb/early/xhci-dbc.c
drivers/usb/host/xhci-pci.c
drivers/virtio/Kconfig
drivers/xen/swiotlb-xen.c
fs/cifs/cifsfs.c
fs/cifs/cifsglob.h
fs/cifs/connect.c
fs/cifs/fs_context.c
fs/cifs/fs_context.h
fs/cifs/misc.c
fs/cifs/smb2maperror.c
fs/cifs/smb2misc.c
fs/cifs/smb2ops.c
fs/cifs/smb2pdu.c
fs/cifs/smb2pdu.h
fs/cifs/smb2proto.h
fs/cifs/smb2transport.c
fs/cifs/trace.h
fs/d_path.c
fs/ext4/super.c
fs/isofs/inode.c
fs/nfsd/filecache.c
fs/notify/fanotify/fanotify.c
fs/notify/fanotify/fanotify.h
fs/notify/fanotify/fanotify_user.c
fs/notify/fsnotify.c
fs/notify/group.c
fs/notify/inotify/inotify_fsnotify.c
fs/notify/inotify/inotify_user.c
fs/notify/notification.c
fs/ocfs2/alloc.c
fs/ocfs2/dlm/dlmrecovery.c
fs/ocfs2/file.c
fs/ocfs2/inode.c
fs/ocfs2/journal.c
fs/ocfs2/journal.h
fs/ocfs2/super.c
fs/open.c
fs/posix_acl.c
fs/proc/task_mmu.c
fs/quota/quota_tree.c
fs/reiserfs/super.c
fs/smbfs_common/smb2pdu.h [new file with mode: 0644]
fs/super.c
include/asm-generic/sections.h
include/linux/acpi.h
include/linux/backing-dev-defs.h
include/linux/backing-dev.h
include/linux/cma.h
include/linux/compiler-gcc.h
include/linux/compiler_attributes.h
include/linux/compiler_types.h
include/linux/cpuset.h
include/linux/damon.h
include/linux/fanotify.h
include/linux/fs.h
include/linux/fsnotify.h
include/linux/fsnotify_backend.h
include/linux/gfp.h
include/linux/highmem.h
include/linux/hugetlb.h
include/linux/io-mapping.h
include/linux/irqdomain.h
include/linux/kasan.h
include/linux/kernel.h
include/linux/kfence.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memory.h
include/linux/memory_hotplug.h
include/linux/mempolicy.h
include/linux/migrate.h
include/linux/migrate_mode.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmzone.h
include/linux/node.h
include/linux/page-flags.h
include/linux/pci.h
include/linux/percpu.h
include/linux/slab.h
include/linux/slub_def.h
include/linux/stackdepot.h
include/linux/stacktrace.h
include/linux/swap.h
include/linux/switchtec.h
include/linux/vmalloc.h
include/trace/events/mmap_lock.h
include/trace/events/vmscan.h
include/trace/events/writeback.h
include/uapi/linux/fanotify.h
include/uapi/linux/map_to_14segment.h [new file with mode: 0644]
include/uapi/linux/pci_regs.h
init/Kconfig
init/initramfs.c
init/main.c
kernel/audit_fsnotify.c
kernel/audit_watch.c
kernel/cgroup/cpuset.c
kernel/dma/swiotlb.c
kernel/extable.c
kernel/irq/irqdomain.c
kernel/kexec_file.c
kernel/locking/lockdep.c
kernel/module.c
kernel/printk/printk.c
kernel/sched/topology.c
kernel/stacktrace.c
kernel/tsacct.c
kernel/workqueue.c
lib/Kconfig.debug
lib/Kconfig.kfence
lib/bootconfig.c
lib/cpumask.c
lib/raid6/Makefile
lib/stackdepot.c
lib/test_kasan.c
lib/test_kasan_module.c
lib/test_vmalloc.c
mm/Kconfig
mm/backing-dev.c
mm/cma.c
mm/compaction.c
mm/damon/Kconfig
mm/damon/Makefile
mm/damon/core.c
mm/damon/dbgfs-test.h
mm/damon/dbgfs.c
mm/damon/paddr.c [new file with mode: 0644]
mm/damon/prmtv-common.c [new file with mode: 0644]
mm/damon/prmtv-common.h [new file with mode: 0644]
mm/damon/reclaim.c [new file with mode: 0644]
mm/damon/vaddr-test.h
mm/damon/vaddr.c
mm/debug.c
mm/debug_vm_pgtable.c
mm/filemap.c
mm/gup.c
mm/highmem.c
mm/hugetlb.c
mm/hugetlb_cgroup.c
mm/internal.h
mm/kasan/common.c
mm/kasan/generic.c
mm/kasan/kasan.h
mm/kasan/shadow.c
mm/kfence/core.c
mm/kfence/kfence.h
mm/kfence/kfence_test.c
mm/khugepaged.c
mm/list_lru.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_ext.c
mm/page_isolation.c
mm/percpu.c
mm/readahead.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slab_common.c
mm/slub.c
mm/sparse-vmemmap.c
mm/sparse.c
mm/swap.c
mm/swapfile.c
mm/userfaultfd.c
mm/vmalloc.c
mm/vmpressure.c
mm/vmscan.c
mm/vmstat.c
mm/zsmalloc.c
net/ipv4/tcp.c
net/ipv4/udp.c
net/netfilter/ipvs/ip_vs_ctl.c
net/openvswitch/meter.c
net/sctp/protocol.c
samples/Kconfig
samples/Makefile
samples/fanotify/.gitignore [new file with mode: 0644]
samples/fanotify/Makefile [new file with mode: 0644]
samples/fanotify/fs-monitor.c [new file with mode: 0644]
samples/ftrace/Makefile
samples/ftrace/ftrace-direct-modify.c
samples/ftrace/ftrace-direct-too.c
samples/ftrace/ftrace-direct.c
scripts/Makefile.build
scripts/Makefile.debug [new file with mode: 0644]
scripts/Makefile.lib
scripts/Makefile.package
scripts/checkpatch.pl
scripts/decodecode
scripts/kconfig/conf.c
scripts/kconfig/confdata.c
scripts/kconfig/lexer.l
scripts/kconfig/lkc_proto.h
scripts/kconfig/menu.c
scripts/kconfig/symbol.c
scripts/link-vmlinux.sh
scripts/package/buildtar
scripts/spelling.txt
security/Kconfig
tools/testing/selftests/damon/debugfs_attrs.sh
tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc
tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc
tools/testing/selftests/memory-hotplug/config
tools/testing/selftests/vm/.gitignore
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/hugepage-mremap.c [new file with mode: 0644]
tools/testing/selftests/vm/ksm_tests.c
tools/testing/selftests/vm/madv_populate.c
tools/testing/selftests/vm/run_vmtests.sh
tools/testing/selftests/vm/transhuge-stress.c
tools/testing/selftests/vm/userfaultfd.c
tools/vm/page-types.c
tools/vm/page_owner_sort.c
usr/gen_init_cpio.c

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 16afe3f59cbd832f76d831e9358a85b7c1e25ff5..6fc2c2efe8ab2655c9ce697865a45a0aaf1ec798 100644 (file)
@@ -100,6 +100,17 @@ Description:
                This attribute indicates the mode that the irq vector named by
                the file is in (msi vs. msix)
 
+What:          /sys/bus/pci/devices/.../irq
+Date:          August 2021
+Contact:       Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+               If a driver has enabled MSI (not MSI-X), "irq" contains the
+               IRQ of the first MSI vector. Otherwise "irq" contains the
+               IRQ of the legacy INTx interrupt.
+
+               "irq" being set to 0 indicates that the device isn't
+               capable of generating legacy INTx interrupts.
+
 What:          /sys/bus/pci/devices/.../remove
 Date:          January 2009
 Contact:       Linux PCI developers <linux-pci@vger.kernel.org>
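
The new ``irq`` attribute can be read like any other sysfs file.  A minimal C
sketch (the device address ``0000:00:1f.0`` is a placeholder; any entry under
``/sys/bus/pci/devices/`` works)::

    /* Hedged example: read a PCI device's "irq" sysfs attribute.  Per the
     * ABI text above, the value is the first MSI vector if MSI is enabled,
     * otherwise the legacy INTx IRQ; 0 means no INTx capability. */
    #include <stdio.h>

    int main(void)
    {
            const char *path = "/sys/bus/pci/devices/0000:00:1f.0/irq";
            FILE *f = fopen(path, "r");
            int irq;

            if (!f || fscanf(f, "%d", &irq) != 1) {
                    perror(path);
                    return 1;
            }
            fclose(f);
            if (irq == 0)
                    printf("no legacy INTx capability\n");
            else
                    printf("irq: %d\n", irq);
            return 0;
    }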
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index 700329d25f5799dbd69c7724bac55b0f290804d8..3e11926a4df95dc08eeb319329d7db8568743da5 100644 (file)
@@ -328,6 +328,14 @@ as idle::
 From now on, any pages on zram are idle pages. The idle mark
 will be removed until someone requests access of the block.
 IOW, unless there is access request, those pages are still idle pages.
+Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled, pages can be
+marked as idle based on how long (in seconds) it's been since they were
+last accessed::
+
+        echo 86400 > /sys/block/zramX/idle
+
+In this example, all pages which haven't been accessed in more than 86400
+seconds (one day) will be marked idle.
 
 Admin can request writeback of those idle pages at right timing via::
 
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 41191b5fb69d9cc0663d20789aad230d3a82b24d..faac50149a222f4893855e1851d6a4c54d9613fc 100644 (file)
@@ -87,10 +87,8 @@ Brief summary of control files.
  memory.oom_control                 set/show oom controls.
  memory.numa_stat                   show the number of memory usage per numa
                                     node
- memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
-                                     This knob is deprecated and shouldn't be
-                                     used. It is planned that this be removed in
-                                     the foreseeable future.
+ memory.kmem.limit_in_bytes          This knob is deprecated and writing to
+                                     it will return -ENOTSUPP.
  memory.kmem.usage_in_bytes          show current kernel memory allocation
  memory.kmem.failcnt                 show the number of kernel memory usage
                                     hits limits
@@ -518,11 +516,6 @@ will be charged as a new owner of it.
   charged file caches. Some out-of-use page caches may keep charged until
   memory pressure happens. If you want to avoid that, force_empty will be useful.
 
-  Also, note that when memory.kmem.limit_in_bytes is set the charges due to
-  kernel pages will still be seen. This is not considered a failure and the
-  write will still return success. In this case, it is expected that
-  memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
 5.2 stat file
 -------------
 
diff --git a/Documentation/admin-guide/filesystem-monitoring.rst b/Documentation/admin-guide/filesystem-monitoring.rst
new file mode 100644 (file)
index 0000000..ab8dba7
--- /dev/null
@@ -0,0 +1,78 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================================
+File system Monitoring with fanotify
+====================================
+
+File system Error Reporting
+===========================
+
+Fanotify supports the FAN_FS_ERROR event type for file system-wide error
+reporting.  It is meant to be used by file system health monitoring
+daemons, which listen for these events and take actions (notify
+sysadmin, start recovery) when a file system problem is detected.
+
+By design, a FAN_FS_ERROR notification exposes sufficient information
+for a monitoring tool to know a problem in the file system has happened.
+It doesn't necessarily provide a user space application with semantics
+to verify an IO operation was successfully executed.  That is out of
+scope for this feature.  Instead, it is only meant as a framework for
+early file system problem detection and reporting recovery tools.
+
+When a file system operation fails, it is common for dozens of kernel
+errors to cascade after the initial failure, hiding the original failure
+log, which is usually the most useful debug data to troubleshoot the
+problem.  For this reason, FAN_FS_ERROR tries to report only the first
+error that occurred for a file system since the last notification, and
+it simply counts additional errors.  This ensures that the most
+important pieces of information are never lost.
+
+FAN_FS_ERROR requires the fanotify group to be set up with the
+FAN_REPORT_FID flag.
+
+At the time of this writing, the only file system that emits FAN_FS_ERROR
+notifications is Ext4.
+
+A FAN_FS_ERROR Notification has the following format::
+
+     [ Notification Metadata (Mandatory) ]
+     [ Generic Error Record  (Mandatory) ]
+     [ FID record            (Mandatory) ]
+
+The order of records is not guaranteed, and new records might be added
+in the future.  Therefore, applications must not rely on the order and
+must be prepared to skip over unknown records. Please refer to
+``samples/fanotify/fs-monitor.c`` for an example parser.
+
+Generic error record
+--------------------
+
+The generic error record provides enough information for a file system
+agnostic tool to learn about a problem in the file system, without
+providing any additional details about the problem.  This record is
+identified by ``struct fanotify_event_info_header.info_type`` being set
+to FAN_EVENT_INFO_TYPE_ERROR.
+
+  ::
+
+     struct fanotify_event_info_error {
+         struct fanotify_event_info_header hdr;
+         __s32 error;
+         __u32 error_count;
+     };
+
+The `error` field identifies the type of error using errno values.
+`error_count` tracks the number of errors that occurred since the last
+notification and were suppressed to preserve the original error
+information.
+
+FID record
+----------
+
+The FID record can be used to uniquely identify the inode that triggered
+the error through the combination of fsid and file handle.  A file system
+specific application can use that information to attempt a recovery
+procedure.  Errors that are not related to an inode are reported with an
+empty file handle of type FILEID_INVALID.
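
To make the record walk concrete, below is a minimal FAN_FS_ERROR listener
sketch.  It assumes a v5.16+ kernel and headers, root privileges, and a
monitored mount at ``/mnt`` (a placeholder); ``samples/fanotify/fs-monitor.c``
in the tree is the authoritative example::

    /* Hedged sketch: listen for FAN_FS_ERROR events on a filesystem.
     * FAN_FS_ERROR requires FAN_REPORT_FID and a filesystem-wide mark. */
    #include <err.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/fanotify.h>

    int main(void)
    {
            char buf[4096];
            ssize_t len;
            int fd;

            fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);
            if (fd < 0)
                    err(1, "fanotify_init");
            if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
                              FAN_FS_ERROR, AT_FDCWD, "/mnt"))
                    err(1, "fanotify_mark");

            while ((len = read(fd, buf, sizeof(buf))) > 0) {
                    struct fanotify_event_metadata *ev = (void *)buf;

                    for (; FAN_EVENT_OK(ev, len); ev = FAN_EVENT_NEXT(ev, len)) {
                            char *rec = (char *)ev + ev->metadata_len;

                            /* Walk the info records, skipping types we don't
                             * know (e.g. the mandatory FID record). */
                            while (rec < (char *)ev + ev->event_len) {
                                    struct fanotify_event_info_header *hdr =
                                            (void *)rec;

                                    if (hdr->info_type ==
                                        FAN_EVENT_INFO_TYPE_ERROR) {
                                            struct fanotify_event_info_error *e =
                                                    (void *)hdr;

                                            /* 'error' holds an errno value;
                                             * 'error_count' counts errors since
                                             * the last notification. */
                                            printf("fs error %d, count %u\n",
                                                   e->error, e->error_count);
                                    }
                                    rec += hdr->len;
                            }
                    }
            }
            return 0;
    }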
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index dc00afcabb95f90da2a7562d9f443c0b4ba6eae6..1bedab498104af95a2f52257fe0ee728276fc092 100644 (file)
@@ -82,6 +82,7 @@ configure specific aspects of kernel behavior to your liking.
    edid
    efi-stub
    ext4
+   filesystem-monitoring
    nfs/index
    gpio/index
    highuid
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 756bfb7d5235e6485f067c9801ff4d04a73c4d9d..0905d2cdb2d5b08724c0bb332b1d8ca347b5ef58 100644 (file)
                        registers.  Default set by CONFIG_HPET_MMAP_DEFAULT.
 
        hugetlb_cma=    [HW,CMA] The size of a CMA area used for allocation
-                       of gigantic hugepages.
-                       Format: nn[KMGTPE]
+                       of gigantic hugepages. Alternatively, using the node
+                       format, the size of a CMA area per node can be
+                       specified.
+                       Format: nn[KMGTPE] or (node format)
+                               <node>:nn[KMGTPE][,<node>:nn[KMGTPE]]
 
                        Reserve a CMA area of given size and allocate gigantic
                        hugepages using the CMA allocator. If enabled, the
                        the number of pages of hugepagesz to be allocated.
                        If this is the first HugeTLB parameter on the command
                        line, it specifies the number of pages to allocate for
-                       the default huge page size.  See also
-                       Documentation/admin-guide/mm/hugetlbpage.rst.
-                       Format: <integer>
+                       the default huge page size. If using node format, the
+                       number of pages to allocate per-node can be specified.
+                       See also Documentation/admin-guide/mm/hugetlbpage.rst.
+                       Format: <integer> or (node format)
+                               <node>:<integer>[,<node>:<integer>]
 
        hugepagesz=
                        [HW] The size of the HugeTLB pages.  This is used in
                        an IOTLB flush. Default is lazy flushing before reuse,
                        which is faster.
 
+       s390_iommu_aperture=    [KNL,S390]
+                       Specifies the size of the per device DMA address space
+                       accessible through the DMA and IOMMU APIs as a decimal
+                       factor of the size of main memory.
+                       The default is 1, meaning that one can concurrently use
+                       as many DMA addresses as there is physical memory
+                       installed, if supported by hardware, and thus map all
+                       of memory once. With a value of 2 one can map all of
+                       memory twice, and so on. As a special case, a factor of
+                       0 imposes no restrictions other than those given by the
+                       hardware, at the cost of significant additional memory
+                       use for tables.
+
        sa1100ir        [NET]
                        See drivers/net/irda/sa1100_ir.c.
 
diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst
index 8c5dde3a575447bf7e0ac0590b21c2eaef10f8a5..61aff88347f3c00c96edfcac3a14a4d0e3c4d42b 100644 (file)
@@ -13,3 +13,4 @@ optimize those.
 
    start
    usage
+   reclaim
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst
new file mode 100644 (file)
index 0000000..fb9def3
--- /dev/null
@@ -0,0 +1,235 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+DAMON-based Reclamation
+=======================
+
+DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module aimed at
+proactive and lightweight reclamation under light memory pressure.  It doesn't
+aim to replace the LRU-list based page-granularity reclamation, but to be
+selectively used for different levels of memory pressure and requirements.
+
+Where Is Proactive Reclamation Required?
+========================================
+
+On memory over-committed systems in general, proactively reclaiming cold pages
+helps save memory and reduce the latency spikes incurred by direct reclaim or
+by the CPU consumption of kswapd, while causing only minimal performance
+degradation [1]_ [2]_ .
+
+Memory over-commit virtualization systems based on Free Pages Reporting [3]_
+are a good example of such cases.  In these systems, the guest VMs report
+their free memory to the host, and the host reallocates the reported memory
+to other guests.  As a result, the memory of the systems is fully utilized.
+However, the guests might not be so memory-frugal, mainly because some kernel
+subsystems and user-space applications are designed to use as much memory as
+is available.  Then, the guests could report only a small amount of memory as
+free to the host, resulting in a drop in memory utilization across the
+systems.  Running proactive reclamation in the guests could mitigate this
+problem.
+
+How Does It Work?
+=================
+
+DAMON_RECLAIM finds memory regions that haven't been accessed for a specific
+time duration and pages them out.  To avoid consuming too much CPU for the
+paging out operation, a speed limit can be configured.  Under the speed
+limit, it pages out the memory regions that haven't been accessed for the
+longest time first.  Using three memory pressure watermarks, system
+administrators can also configure under what conditions this scheme should
+automatically be activated and deactivated.
+
+Interface: Module Parameters
+============================
+
+To use this feature, you should first ensure your system is running on a kernel
+that is built with ``CONFIG_DAMON_RECLAIM=y``.
+
+To let sysadmins enable or disable it and tune for the given system,
+DAMON_RECLAIM utilizes module parameters.  That is, you can put
+``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
+proper values to ``/sys/module/damon_reclaim/parameters/<parameter>`` files.
+
+Note that the parameter values, except for ``enabled``, are applied only when
+DAMON_RECLAIM starts.  Therefore, if you want to apply new parameter values at
+runtime while DAMON_RECLAIM is already enabled, you should disable and
+re-enable it via the ``enabled`` parameter file.  Write the new values to the
+parameter files before the re-enablement.
+
+Below are the descriptions of the parameters.
+
+enabled
+-------
+
+Enable or disable DAMON_RECLAIM.
+
+You can enable DAMON_RECLAIM by setting the value of this parameter to ``Y``.
+Setting it to ``N`` disables DAMON_RECLAIM.  Note that DAMON_RECLAIM might do
+no real monitoring or reclamation due to its watermarks-based activation
+condition.  Refer to the descriptions of the watermark parameters below.
+
+min_age
+-------
+
+Time threshold for cold memory region identification, in microseconds.
+
+If a memory region is not accessed for this or longer time, DAMON_RECLAIM
+identifies the region as cold, and reclaims it.
+
+120 seconds by default.
+
+quota_ms
+--------
+
+Limit of time for the reclamation in milliseconds.
+
+DAMON_RECLAIM tries to use only up to this time within a time window
+(quota_reset_interval_ms) for trying reclamation of cold pages.  This can be
+used for limiting CPU consumption of DAMON_RECLAIM.  If the value is zero, the
+limit is disabled.
+
+10 ms by default.
+
+quota_sz
+--------
+
+Limit of size of memory for the reclamation in bytes.
+
+DAMON_RECLAIM charges the amount of memory it tried to reclaim within a time
+window (quota_reset_interval_ms) and ensures no more than this limit is tried.
+This can be used for limiting consumption of CPU and IO.  If this value is
+zero, the limit is disabled.
+
+128 MiB by default.
+
+quota_reset_interval_ms
+-----------------------
+
+The time/size quota charge reset interval in milliseconds.
+
+The charge reset interval for the quota of time (quota_ms) and size
+(quota_sz).  That is, DAMON_RECLAIM does not try reclamation for more than
+quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
+milliseconds.
+
+1 second by default.
+
+wmarks_interval
+---------------
+
+Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
+enabled but inactive due to its watermarks rule.
+
+wmarks_high
+-----------
+
+Free memory rate (per thousand) for the high watermark.
+
+If the free memory of the system, in thousandths of total memory, is higher
+than this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+check the watermarks.
+
+wmarks_mid
+----------
+
+Free memory rate (per thousand) for the middle watermark.
+
+If the free memory of the system, in thousandths of total memory, is between
+this and the low watermark, DAMON_RECLAIM becomes active and starts the
+monitoring and the reclaiming.
+
+wmarks_low
+----------
+
+Free memory rate (per thousand) for the low watermark.
+
+If the free memory of the system, in thousandths of total memory, is lower
+than this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+check the watermarks.  In this case, the system falls back to the LRU-list
+based page granularity reclamation logic.
+
+sample_interval
+---------------
+
+Sampling interval for the monitoring in microseconds.
+
+The sampling interval of DAMON for the cold memory monitoring.  Please refer to
+the DAMON documentation (:doc:`usage`) for more detail.
+
+aggr_interval
+-------------
+
+Aggregation interval for the monitoring in microseconds.
+
+The aggregation interval of DAMON for the cold memory monitoring.  Please
+refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+min_nr_regions
+--------------
+
+Minimum number of monitoring regions.
+
+The minimal number of monitoring regions of DAMON for the cold memory
+monitoring.  This can be used to set a lower bound on the monitoring quality.
+But setting this too high could result in increased monitoring overhead.
+Please refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+max_nr_regions
+--------------
+
+Maximum number of monitoring regions.
+
+The maximum number of monitoring regions of DAMON for the cold memory
+monitoring.  This can be used to set an upper bound on the monitoring overhead.
+However, setting this too low could result in bad monitoring quality.  Please
+refer to the DAMON documentation (:doc:`usage`) for more detail.
+
+monitor_region_start
+--------------------
+
+Start of target memory region in physical address.
+
+The start physical address of the memory region that DAMON_RECLAIM will work
+against.  That is, DAMON_RECLAIM will find cold memory regions in this region
+and reclaim them.  By default, the biggest System RAM region is used.
+
+monitor_region_end
+------------------
+
+End of target memory region in physical address.
+
+The end physical address of the memory region that DAMON_RECLAIM will work
+against.  That is, DAMON_RECLAIM will find cold memory regions in this region
+and reclaim them.  By default, the biggest System RAM region is used.
+
+kdamond_pid
+-----------
+
+PID of the DAMON thread.
+
+If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.  Else,
+-1.
+
+Example
+=======
+
+The example runtime commands below make DAMON_RECLAIM find memory regions
+that have not been accessed for 30 seconds or more and page them out.  The
+reclamation is limited to at most 1 GiB per second to avoid DAMON_RECLAIM
+consuming too much CPU time for the paging out operation.  They also ask
+DAMON_RECLAIM to do nothing if the system's free memory rate is above 50%,
+but to start the real work if it falls below 40%.  If DAMON_RECLAIM doesn't
+make progress and the free memory rate therefore falls below 20%, they ask
+DAMON_RECLAIM to do nothing again, so that the system can fall back to the
+LRU-list based page granularity reclamation. ::
+
+    # cd /sys/module/damon_reclaim/parameters
+    # echo 30000000 > min_age
+    # echo $((1 * 1024 * 1024 * 1024)) > quota_sz
+    # echo 1000 > quota_reset_interval_ms
+    # echo 500 > wmarks_high
+    # echo 400 > wmarks_mid
+    # echo 200 > wmarks_low
+    # echo Y > enabled
+
+.. [1] https://research.google/pubs/pub48551/
+.. [2] https://lwn.net/Articles/787611/
+.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst
index d5eb89a8fc386189cb52aebf39b1986365881df6..4d5ca2c46288a13c587f6feccc40ee843044656c 100644 (file)
@@ -6,39 +6,9 @@ Getting Started
 
 This document briefly describes how you can use DAMON by demonstrating its
 default user space tool.  Please note that this document describes only a part
-of its features for brevity.  Please refer to :doc:`usage` for more details.
-
-
-TL; DR
-======
-
-Follow the commands below to monitor and visualize the memory access pattern of
-your workload. ::
-
-    # # build the kernel with CONFIG_DAMON_*=y, install it, and reboot
-    # mount -t debugfs none /sys/kernel/debug/
-    # git clone https://github.com/awslabs/damo
-    # ./damo/damo record $(pidof <your workload>)
-    # ./damo/damo report heat --plot_ascii
-
-The final command draws the access heatmap of ``<your workload>``.  The heatmap
-shows which memory region (x-axis) is accessed when (y-axis) and how frequently
-(number; the higher the more accesses have been observed). ::
-
-    111111111111111111111111111111111111111111111111111111110000
-    111121111111111111111111111111211111111111111111111111110000
-    000000000000000000000000000000000000000000000000001555552000
-    000000000000000000000000000000000000000000000222223555552000
-    000000000000000000000000000000000000000011111677775000000000
-    000000000000000000000000000000000000000488888000000000000000
-    000000000000000000000000000000000177888400000000000000000000
-    000000000000000000000000000046666522222100000000000000000000
-    000000000000000000000014444344444300000000000000000000000000
-    000000000000000002222245555510000000000000000000000000000000
-    # access_frequency:  0  1  2  3  4  5  6  7  8  9
-    # x-axis: space (140286319947776-140286426374096: 101.496 MiB)
-    # y-axis: time (605442256436361-605479951866441: 37.695430s)
-    # resolution: 60x10 (1.692 MiB and 3.770s for each character)
+of its features for brevity.  Please refer to the usage `doc
+<https://github.com/awslabs/damo/blob/next/USAGE.md>`_ of the tool for more
+details.
 
 
 Prerequisites
@@ -91,24 +61,74 @@ pattern in the ``damon.data`` file.
 Visualizing Recorded Patterns
 =============================
 
-The following three commands visualize the recorded access patterns and save
-the results as separate image files. ::
-
-    $ damo report heats --heatmap access_pattern_heatmap.png
-    $ damo report wss --range 0 101 1 --plot wss_dist.png
-    $ damo report wss --range 0 101 1 --sortby time --plot wss_chron_change.png
-
-- ``access_pattern_heatmap.png`` will visualize the data access pattern in a
-  heatmap, showing which memory region (y-axis) got accessed when (x-axis)
-  and how frequently (color).
-- ``wss_dist.png`` will show the distribution of the working set size.
-- ``wss_chron_change.png`` will show how the working set size has
-  chronologically changed.
-
-You can view the visualizations of this example workload at [1]_.
-Visualizations of other realistic workloads are available at [2]_ [3]_ [4]_.
-
-.. [1] https://damonitor.github.io/doc/html/v17/admin-guide/mm/damon/start.html#visualizing-recorded-patterns
-.. [2] https://damonitor.github.io/test/result/visual/latest/rec.heatmap.1.png.html
-.. [3] https://damonitor.github.io/test/result/visual/latest/rec.wss_sz.png.html
-.. [4] https://damonitor.github.io/test/result/visual/latest/rec.wss_time.png.html
+You can visualize the pattern in a heatmap, showing which memory region
+(x-axis) got accessed when (y-axis) and how frequently (number).::
+
+    $ sudo damo report heats --heatmap stdout
+    22222222222222222222222222222222222222211111111111111111111111111111111111111100
+    44444444444444444444444444444444444444434444444444444444444444444444444444443200
+    44444444444444444444444444444444444444433444444444444444444444444444444444444200
+    33333333333333333333333333333333333333344555555555555555555555555555555555555200
+    33333333333333333333333333333333333344444444444444444444444444444444444444444200
+    22222222222222222222222222222222222223355555555555555555555555555555555555555200
+    00000000000000000000000000000000000000288888888888888888888888888888888888888400
+    00000000000000000000000000000000000000288888888888888888888888888888888888888400
+    33333333333333333333333333333333333333355555555555555555555555555555555555555200
+    88888888888888888888888888888888888888600000000000000000000000000000000000000000
+    88888888888888888888888888888888888888600000000000000000000000000000000000000000
+    33333333333333333333333333333333333333444444444444444444444444444444444444443200
+    00000000000000000000000000000000000000288888888888888888888888888888888888888400
+    [...]
+    # access_frequency:  0  1  2  3  4  5  6  7  8  9
+    # x-axis: space (139728247021568-139728453431248: 196.848 MiB)
+    # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s)
+    # resolution: 80x40 (2.461 MiB and 1.758 s for each character)
+
+You can also visualize the distribution of the working set size, sorted by the
+size.::
+
+    $ sudo damo report wss --range 0 101 10
+    # <percentile> <wss>
+    # target_id     18446632103789443072
+    # avr:  107.708 MiB
+      0             0 B |                                                           |
+     10      95.328 MiB |****************************                               |
+     20      95.332 MiB |****************************                               |
+     30      95.340 MiB |****************************                               |
+     40      95.387 MiB |****************************                               |
+     50      95.387 MiB |****************************                               |
+     60      95.398 MiB |****************************                               |
+     70      95.398 MiB |****************************                               |
+     80      95.504 MiB |****************************                               |
+     90     190.703 MiB |*********************************************************  |
+    100     196.875 MiB |***********************************************************|
+
+Using ``--sortby`` option with the above command, you can show how the working
+set size has chronologically changed.::
+
+    $ sudo damo report wss --range 0 101 10 --sortby time
+    # <percentile> <wss>
+    # target_id     18446632103789443072
+    # avr:  107.708 MiB
+      0       3.051 MiB |                                                           |
+     10     190.703 MiB |***********************************************************|
+     20      95.336 MiB |*****************************                              |
+     30      95.328 MiB |*****************************                              |
+     40      95.387 MiB |*****************************                              |
+     50      95.332 MiB |*****************************                              |
+     60      95.320 MiB |*****************************                              |
+     70      95.398 MiB |*****************************                              |
+     80      95.398 MiB |*****************************                              |
+     90      95.340 MiB |*****************************                              |
+    100      95.398 MiB |*****************************                              |
+
+
+Data Access Pattern Aware Memory Management
+===========================================
+
+The three commands below make every memory region of size >=4K in your
+workload that hasn't been accessed for >=60 seconds be swapped out. ::
+
+    $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme
+    $ echo "4K        max      0       0       60s     max     pageout" >> test_scheme
+    $ damo schemes -c test_scheme <pid of your workload>
diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index a72cda374abac64c43be5cb2b94f519f5091e335..ed96bbf0daffc9d504440f11b03020202f92cbc8 100644 (file)
@@ -10,15 +10,16 @@ DAMON provides below three interfaces for different users.
   This is for privileged people such as system administrators who want a
   just-working human-friendly interface.  Using this, users can use the DAMON’s
   major features in a human-friendly way.  It may not be highly tuned for
-  special cases, though.  It supports only virtual address spaces monitoring.
+  special cases, though.  It supports both virtual and physical address spaces
+  monitoring.
 - *debugfs interface.*
   This is for privileged user space programmers who want more optimized use of
   DAMON.  Using this, users can use DAMON’s major features by reading
   from and writing to special debugfs files.  Therefore, you can write and use
   your personalized DAMON debugfs wrapper programs that reads/writes the
   debugfs files instead of you.  The DAMON user space tool is also a reference
-  implementation of such programs.  It supports only virtual address spaces
-  monitoring.
+  implementation of such programs.  It supports both virtual and physical
+  address spaces monitoring.
 - *Kernel Space Programming Interface.*
   This is for kernel space programmers.  Using this, users can utilize every
   feature of DAMON most flexibly and efficiently by writing kernel space
@@ -34,8 +35,9 @@ the reason, this document describes only the debugfs interface
 debugfs Interface
 =================
 
-DAMON exports three files, ``attrs``, ``target_ids``, and ``monitor_on`` under
-its debugfs directory, ``<debugfs>/damon/``.
+DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``,
+``schemes`` and ``monitor_on`` under its debugfs directory,
+``<debugfs>/damon/``.
 
 
 Attributes
@@ -71,9 +73,106 @@ check it again::
     # cat target_ids
     42 4242
 
+Users can also monitor the physical memory address space of the system by
+writing a special keyword, "``paddr\n``" to the file.  Because physical address
+space monitoring doesn't support multiple targets, reading the file will show a
+fake value, ``42``, as below::
+
+    # cd <debugfs>/damon
+    # echo paddr > target_ids
+    # cat target_ids
+    42
+
 Note that setting the target ids doesn't start the monitoring.
 
 
+Initial Monitoring Target Regions
+---------------------------------
+
+In case of the virtual address space monitoring, DAMON automatically sets and
+updates the monitoring target regions so that entire memory mappings of target
+processes can be covered.  However, users may want to limit the monitoring
+region to specific address ranges, such as the heap, the stack, or specific
+file-mapped areas.  Or, some users may know the initial access pattern of their
+workloads and therefore want to set optimal initial regions for the 'adaptive
+regions adjustment'.
+
+In contrast, DAMON does not automatically set and update the monitoring
+target regions in the case of physical memory monitoring.  Therefore, users
+should set the monitoring target regions by themselves.
+
+In such cases, users can explicitly set the initial monitoring target regions
+as they want, by writing proper values to the ``init_regions`` file.  Each line
+of the input should represent one region in the form below.::
+
+    <target id> <start address> <end address>
+
+The ``target id`` should already be in the ``target_ids`` file, and the
+regions should be passed in address order.  For example, the commands below
+will set a couple of address ranges, ``1-100`` and ``100-200``, as the initial
+monitoring target regions of process 42, and another couple of address
+ranges, ``20-40`` and ``50-100``, as those of process 4242.::
+
+    # cd <debugfs>/damon
+    # echo "42   1       100
+            42   100     200
+            4242 20      40
+            4242 50      100" > init_regions
+
+Note that this sets the initial monitoring target regions only.  In the case
+of virtual memory monitoring, DAMON will automatically update the boundaries
+of the regions after one ``regions update interval``.  Therefore, if they
+don't want the update, users should set the ``regions update interval``
+large enough.
+
+
+Schemes
+-------
+
+For usual DAMON-based data access aware memory management optimizations, users
+would simply want the system to apply a memory management action to a memory
+region of a specific size having a specific access frequency for a specific
+time.  DAMON receives such formalized operation schemes from the user and
+applies those to the target processes.  It also counts the total number and
+size of regions that each scheme is applied.  This statistics can be used for
+online analysis or tuning of the schemes.
+
+Users can get and set the schemes by reading from and writing to ``schemes``
+debugfs file.  Reading the file also shows the statistics of each scheme.  To
+the file, each of the schemes should be represented in each line in below form:
+
+    min-size max-size min-acc max-acc min-age max-age action
+
+Note that the ranges are closed intervals.  Bytes for the size of regions
+(``min-size`` and ``max-size``), number of monitored accesses per aggregate
+interval for access frequency (``min-acc`` and ``max-acc``), number of
+aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a
+predefined integer for memory management actions should be used.  The supported
+numbers and their meanings are as below.
+
+ - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
+ - 1: Call ``madvise()`` for the region with ``MADV_COLD``
+ - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
+ - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
+ - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
+ - 5: Do nothing but count the statistics
+
+You can disable schemes by simply writing an empty string to the file.  For
+example, the commands below apply a scheme saying "If a memory region of size
+in [4KiB, 8KiB] shows [0, 5] accesses per aggregate interval for [10, 20]
+aggregate intervals, page out the region", check the entered scheme again, and
+finally remove the scheme. ::
+
+    # cd <debugfs>/damon
+    # echo "4096 8192    0 5    10 20    2" > schemes
+    # cat schemes
+    4096 8192 0 5 10 20 2 0 0
+    # echo > schemes
+
+The last two integers in the 4th line of the above example are the total
+number and the total size of the regions to which the scheme was applied.
+
+
 Turning On/Off
 --------------
 
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 8abaeb144e44dd1a232115cbef0ef69d6df5c0e7..0166f9de34281da23adbb69dacf0516c047a5cab 100644 (file)
@@ -128,7 +128,9 @@ hugepages
        implicitly specifies the number of huge pages of default size to
        allocate.  If the number of huge pages of default size is implicitly
        specified, it can not be overwritten by a hugepagesz,hugepages
-       parameter pair for the default size.
+       parameter pair for the default size.  This parameter also has a
+       node format.  The node format specifies the number of huge pages
+       to allocate on specific nodes.
 
        For example, on an architecture with 2M default huge page size::
 
@@ -138,6 +140,14 @@ hugepages
        indicating that the hugepages=512 parameter is ignored.  If a hugepages
        parameter is preceded by an invalid hugepagesz parameter, it will
        be ignored.
+
+       Node format example::
+
+               hugepagesz=2M hugepages=0:1,1:2
+
+       This will allocate one 2M hugepage on node 0 and two 2M hugepages on
+       node 1.  If the node number is invalid, the parameter will be ignored.
+
 default_hugepagesz
        Specify the default huge page size.  This parameter can
        only be specified once on the command line.  default_hugepagesz can
@@ -234,8 +244,12 @@ will exist, of the form::
 
        hugepages-${size}kB
 
-Inside each of these directories, the same set of files will exist::
+Inside each of these directories, the set of files contained in ``/proc``
+will exist.  In addition, two interfaces for demoting huge pages may
+exist::
 
+        demote
+        demote_size
        nr_hugepages
        nr_hugepages_mempolicy
        nr_overcommit_hugepages
@@ -243,7 +257,29 @@ Inside each of these directories, the same set of files will exist::
        resv_hugepages
        surplus_hugepages
 
-which function as described above for the default huge page-sized case.
+The demote interfaces provide the ability to split a huge page into
+smaller huge pages.  For example, the x86 architecture supports both
+1GB and 2MB huge page sizes.  A 1GB huge page can be split into 512
+2MB huge pages.  Demote interfaces are not available for the smallest
+huge page size.  The demote interfaces are:
+
+demote_size
+        is the size of demoted pages.  When a page is demoted, a corresponding
+        number of huge pages of demote_size will be created.  By default,
+        demote_size is set to the next smaller huge page size.  If there are
+        multiple smaller huge page sizes, demote_size can be set to any of
+        these smaller sizes.  Only huge page sizes less than the current huge
+        page size are allowed.
+
+demote
+        is used to demote a number of huge pages.  A user with root privileges
+        can write to this file.  It may not be possible to demote the
+        requested number of huge pages.  To determine how many pages were
+        actually demoted, compare the value of nr_hugepages before and after
+        writing to the demote interface.  demote is a write-only interface.
+
+The interfaces which are the same as in ``/proc`` (all except demote and
+demote_size) function as described above for the default huge page-sized case.
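+
+For example, assuming an x86 system with a 1GB huge page pool, a root user
+could demote one 1GB huge page into 2MB huge pages as follows (an
+illustrative sketch; the sysfs paths follow the per-size directory layout
+described above)::
+
+        # cd /sys/kernel/mm/hugepages/hugepages-1048576kB
+        # cat demote_size
+        2048kB
+        # echo 1 > demote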
 
 .. _mem_policy_and_hp_alloc:
 
index cbd19d5e625f389e9beb91e67de4c0a7ed8d9a92..c21b5823f12619d344577cfe6ac895810e39f982 100644 (file)
@@ -37,5 +37,7 @@ the Linux memory management.
    numaperf
    pagemap
    soft-dirty
+   swap_numa
    transhuge
    userfaultfd
+   zswap
index 03dfbc9252529d4b1c98e9ba861023cf6544388a..0f56ecd8ac054380bbea36dd6789cbe8eed127d9 100644 (file)
@@ -165,9 +165,8 @@ Or alternatively::
 
        % echo 1 > /sys/devices/system/memory/memoryXXX/online
 
-The kernel will select the target zone automatically, usually defaulting to
-``ZONE_NORMAL`` unless ``movablecore=1`` has been specified on the kernel
-command line or if the memory block would intersect the ZONE_MOVABLE already.
+The kernel will select the target zone automatically, depending on the
+configured ``online_policy``.
 
 One can explicitly request to associate an offline memory block with
 ZONE_MOVABLE by::
@@ -198,6 +197,9 @@ Auto-onlining can be enabled by writing ``online``, ``online_kernel`` or
 
        % echo online > /sys/devices/system/memory/auto_online_blocks
 
+Similarly to manual onlining, with ``online`` the kernel will select the
+target zone automatically, depending on the configured ``online_policy``.
+
 Modifying the auto-online behavior will only affect subsequently added
 memory blocks.
 
@@ -393,11 +395,16 @@ command line parameters are relevant:
 ======================== =======================================================
 ``memhp_default_state``         configure auto-onlining by essentially setting
                          ``/sys/devices/system/memory/auto_online_blocks``.
-``movablecore``                 configure automatic zone selection of the kernel. When
-                        set, the kernel will default to ZONE_MOVABLE, unless
-                        other zones can be kept contiguous.
+``movable_node``        configure automatic zone selection in the kernel when
+                        using the ``contig-zones`` online policy. When
+                        set, the kernel will default to ZONE_MOVABLE when
+                        onlining a memory block, unless other zones can be kept
+                        contiguous.
 ======================== =======================================================
 
+See Documentation/admin-guide/kernel-parameters.txt for a more generic
+description of these command line parameters.
+
 Module Parameters
 ------------------
 
@@ -410,24 +417,118 @@ them with ``memory_hotplug.`` such as::
 
 and they can be observed (and some even modified at runtime) via::
 
-       /sys/modules/memory_hotplug/parameters/
+       /sys/module/memory_hotplug/parameters/
 
 The following module parameters are currently defined:
 
-======================== =======================================================
-``memmap_on_memory``    read-write: Allocate memory for the memmap from the
-                        added memory block itself. Even if enabled, actual
-                        support depends on various other system properties and
-                        should only be regarded as a hint whether the behavior
-                        would be desired.
-
-                        While allocating the memmap from the memory block
-                        itself makes memory hotplug less likely to fail and
-                        keeps the memmap on the same NUMA node in any case, it
-                        can fragment physical memory in a way that huge pages
-                        in bigger granularity cannot be formed on hotplugged
-                        memory.
-======================== =======================================================
+================================ ===============================================
+``memmap_on_memory``            read-write: Allocate memory for the memmap from
+                                the added memory block itself. Even if enabled,
+                                actual support depends on various other system
+                                properties and should only be regarded as a
+                                hint whether the behavior would be desired.
+
+                                While allocating the memmap from the memory
+                                block itself makes memory hotplug less likely
+                                to fail and keeps the memmap on the same NUMA
+                                node in any case, it can fragment physical
+                                memory in a way that huge pages in bigger
+                                granularity cannot be formed on hotplugged
+                                memory.
+``online_policy``               read-write: Set the basic policy used for
+                                automatic zone selection when onlining memory
+                                blocks without specifying a target zone.
+                                ``contig-zones`` has been the kernel default
+                                before this parameter was added. After an
+                                online policy was configured and memory was
+                                online, the policy should not be changed
+                                anymore.
+
+                                When set to ``contig-zones``, the kernel will
+                                try keeping zones contiguous. If a memory block
+                                intersects multiple zones or no zone, the
+                                behavior depends on the ``movable_node`` kernel
+                                command line parameter: default to ZONE_MOVABLE
+                                if set, default to the applicable kernel zone
+                                (usually ZONE_NORMAL) if not set.
+
+                                When set to ``auto-movable``, the kernel will
+                                try onlining memory blocks to ZONE_MOVABLE if
+                                possible according to the configuration and
+                                memory device details. With this policy, one
+                                can avoid zone imbalances when eventually
+                                hotplugging a lot of memory later and still
+                                wanting to be able to hotunplug as much as
+                                possible reliably, very desirable in
+                                virtualized environments. This policy ignores
+                                the ``movable_node`` kernel command line
+                                parameter and isn't really applicable in
+                                environments that require it (e.g., bare metal
+                                with hotunpluggable nodes) where hotplugged
+                                memory might be exposed via the
+                                firmware-provided memory map early during boot
+                                to the system instead of getting detected,
+                                added and onlined later during boot (such as
+                                done by virtio-mem or by some hypervisors
+                                implementing emulated DIMMs). As one example, a
+                                hotplugged DIMM will be onlined either
+                                completely to ZONE_MOVABLE or completely to
+                                ZONE_NORMAL, not a mixture.
+                                As another example, as many memory blocks
+                                belonging to a virtio-mem device will be
+                                onlined to ZONE_MOVABLE as possible,
+                                special-casing units of memory blocks that can
+                                only get hotunplugged together. *This policy
+                                does not protect from setups that are
+                                problematic with ZONE_MOVABLE and does not
+                                change the zone of memory blocks dynamically
+                                after they were onlined.*
+``auto_movable_ratio``          read-write: Set the maximum MOVABLE:KERNEL
+                                memory ratio in % for the ``auto-movable``
+                                online policy. Whether the ratio applies only
+                                for the system across all NUMA nodes or also
+                                per NUMA nodes depends on the
+                                ``auto_movable_numa_aware`` configuration.
+
+                                All accounting is based on present memory pages
+                                in the zones combined with accounting per
+                                memory device. Memory dedicated to the CMA
+                                allocator is accounted as MOVABLE, although
+                                residing on one of the kernel zones. The
+                                possible ratio depends on the actual workload.
+                                The kernel default is "301" %, for example,
+                                allowing for hotplugging 24 GiB to an 8 GiB VM
+                                and automatically onlining all hotplugged
+                                memory to ZONE_MOVABLE in many setups. The
+                                additional 1% deals with some pages being not
+                                present, for example, because of some firmware
+                                allocations.
+
+                                Note that ZONE_NORMAL memory provided by one
+                                memory device does not allow for more
+                                ZONE_MOVABLE memory for a different memory
+                                device. As one example, onlining memory of a
+                                hotplugged DIMM to ZONE_NORMAL will not allow
+                                for another hotplugged DIMM to get onlined to
+                                ZONE_MOVABLE automatically. In contrast, memory
+                                hotplugged by a virtio-mem device that got
+                                onlined to ZONE_NORMAL will allow for more
+                                ZONE_MOVABLE memory within *the same*
+                                virtio-mem device.
+``auto_movable_numa_aware``     read-write: Configure whether the
+                                ``auto_movable_ratio`` in the ``auto-movable``
+                                online policy also applies per NUMA
+                                node in addition to the whole system across all
+                                NUMA nodes. The kernel default is "Y".
+
+                                Disabling NUMA awareness can be helpful when
+                                dealing with NUMA nodes that should be
+                                completely hotunpluggable, onlining the memory
+                                completely to ZONE_MOVABLE automatically if
+                                possible.
+
+                                Parameter availability depends on CONFIG_NUMA.
+================================ ===============================================
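+
+For example, to have all hotplugged memory onlined to ZONE_MOVABLE where
+possible (an illustrative sketch using the parameters described above; the
+same values can be set on the kernel command line with a ``memory_hotplug.``
+prefix)::
+
+       % echo auto-movable > /sys/module/memory_hotplug/parameters/online_policy
+       % echo 301 > /sys/module/memory_hotplug/parameters/auto_movable_ratio
+       % echo Y > /sys/module/memory_hotplug/parameters/auto_movable_numa_aware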
 
 ZONE_MOVABLE
 ============
index 4581527c07ae14515c485e40363d0e0741f1b2d1..bfc28704856c6fa375b29fcd623ab567e891eeb5 100644 (file)
@@ -90,13 +90,14 @@ Short descriptions to the page flags
 ====================================
 
 0 - LOCKED
-   page is being locked for exclusive access, e.g. by undergoing read/write IO
+   The page is being locked for exclusive access, e.g. by undergoing read/write
+   IO.
 7 - SLAB
-   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
+   The page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator.
    When compound page is used, SLUB/SLQB will only set this flag on the head
    page; SLOB will not flag it at all.
 10 - BUDDY
-    a free memory block managed by the buddy system allocator
+    A free memory block managed by the buddy system allocator.
     The buddy system organizes free memory in blocks of various orders.
     An order N block has 2^N physically contiguous pages, with the BUDDY flag
     set for and _only_ for the first page.
@@ -112,65 +113,65 @@ Short descriptions to the page flags
 16 - COMPOUND_TAIL
     A compound page tail (see description above).
 17 - HUGE
-    this is an integral part of a HugeTLB page
+    This is an integral part of a HugeTLB page.
 19 - HWPOISON
-    hardware detected memory corruption on this page: don't touch the data!
+    Hardware detected memory corruption on this page: don't touch the data!
 20 - NOPAGE
-    no page frame exists at the requested address
+    No page frame exists at the requested address.
 21 - KSM
-    identical memory pages dynamically shared between one or more processes
+    Identical memory pages dynamically shared between one or more processes.
 22 - THP
-    contiguous pages which construct transparent hugepages
+    Contiguous pages which construct transparent hugepages.
 23 - OFFLINE
-    page is logically offline
+    The page is logically offline.
 24 - ZERO_PAGE
-    zero page for pfn_zero or huge_zero page
+    Zero page for pfn_zero or huge_zero page.
 25 - IDLE
-    page has not been accessed since it was marked idle (see
+    The page has not been accessed since it was marked idle (see
     :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
     Note that this flag may be stale in case the page was accessed via
     a PTE. To make sure the flag is up-to-date one has to read
     ``/sys/kernel/mm/page_idle/bitmap`` first.
 26 - PGTABLE
-    page is in use as a page table
+    The page is in use as a page table.
 
 IO related page flags
 ---------------------
 
 1 - ERROR
-   IO error occurred
+   IO error occurred.
 3 - UPTODATE
-   page has up-to-date data
+   The page has up-to-date data.
   i.e. for file backed page: (in-memory data revision >= on-disk one)
 4 - DIRTY
-   page has been written to, hence contains new data
+   The page has been written to, hence contains new data.
    i.e. for file backed page: (in-memory data revision >  on-disk one)
 8 - WRITEBACK
-   page is being synced to disk
+   The page is being synced to disk.
 
 LRU related page flags
 ----------------------
 
 5 - LRU
-   page is in one of the LRU lists
+   The page is in one of the LRU lists.
 6 - ACTIVE
-   page is in the active LRU list
+   The page is in the active LRU list.
 18 - UNEVICTABLE
-   page is in the unevictable (non-)LRU list It is somehow pinned and
+   The page is in the unevictable (non-)LRU list.  It is somehow pinned and
    not a candidate for LRU page reclaims, e.g. ramfs pages,
-   shmctl(SHM_LOCK) and mlock() memory segments
+   shmctl(SHM_LOCK) and mlock() memory segments.
 2 - REFERENCED
-   page has been referenced since last LRU list enqueue/requeue
+   The page has been referenced since last LRU list enqueue/requeue.
 9 - RECLAIM
-   page will be reclaimed soon after its pageout IO completed
+   The page will be reclaimed soon after its pageout IO completed.
 11 - MMAP
-   a memory mapped page
+   A memory mapped page.
 12 - ANON
-   a memory mapped page that is not part of a file
+   A memory mapped page that is not part of a file.
 13 - SWAPCACHE
-   page is mapped to swap space, i.e. has an associated swap entry
+   The page is mapped to swap space, i.e. has an associated swap entry.
 14 - SWAPBACKED
-   page is backed by swap/RAM
+   The page is backed by swap/RAM.
 
 The page-types tool in the tools/vm directory can be used to query the
 above flags.
diff --git a/Documentation/admin-guide/mm/swap_numa.rst b/Documentation/admin-guide/mm/swap_numa.rst
new file mode 100644 (file)
index 0000000..e0466f2
--- /dev/null
@@ -0,0 +1,80 @@
+.. _swap_numa:
+
+===========================================
+Automatically bind swap device to numa node
+===========================================
+
+If the system has more than one swap device and the swap devices have node
+information, we can make use of this information to decide which swap device
+to use in get_swap_pages() to get better performance.
+
+
+How to use this feature
+=======================
+
+Each swap device has a priority, which decides the order in which devices are
+used.  To make use of automatic binding, there is no need to manipulate the
+priority settings for swap devices.  For example, on a 2 node machine, assume
+two swap devices swapA and swapB, with swapA attached to node 0 and swapB
+attached to node 1, are going to be swapped on.  Simply swap them on::
+
+       # swapon /dev/swapA
+       # swapon /dev/swapB
+
+Then node 0 will use the two swap devices in the order of swapA then swapB and
+node 1 will use the two swap devices in the order of swapB then swapA. Note
+that the order of them being swapped on doesn't matter.
+
+Consider a more complex example on a 4 node machine.  Assume 6 swap devices
+are going to be swapped on: swapA and swapB are attached to node 0, swapC is
+attached to node 1, swapD and swapE are attached to node 2 and swapF is
+attached to node 3.
+The way to swap them on is the same as above::
+
+       # swapon /dev/swapA
+       # swapon /dev/swapB
+       # swapon /dev/swapC
+       # swapon /dev/swapD
+       # swapon /dev/swapE
+       # swapon /dev/swapF
+
+Then node 0 will use them in the order of::
+
+       swapA/swapB -> swapC -> swapD -> swapE -> swapF
+
+swapA and swapB will be used in a round robin mode before any other swap device.
+
+node 1 will use them in the order of::
+
+       swapC -> swapA -> swapB -> swapD -> swapE -> swapF
+
+node 2 will use them in the order of::
+
+       swapD/swapE -> swapA -> swapB -> swapC -> swapF
+
+Similarly, swapD and swapE will be used in a round robin mode before any
+other swap devices.
+
+node 3 will use them in the order of::
+
+       swapF -> swapA -> swapB -> swapC -> swapD -> swapE
+
+
+Implementation details
+======================
+
+The code uses a priority based list, swap_avail_list, to decide which swap
+device to use; if multiple swap devices share the same priority, they are
+used round robin.  This feature replaces the single global swap_avail_list
+with a per-numa-node list, i.e. each numa node sees its own priority based
+list of available swap devices.  A swap device's priority can be promoted on
+its matching node's swap_avail_list.
+
+A swap device's priority is set as follows: the user can set a value >= 0,
+or the system will pick one starting from -1 and counting downwards.  The
+priority value in the swap_avail_list is the negated value of the swap
+device's priority, because plists are sorted from low to high.  The new
+policy doesn't change the semantics for the priority >= 0 cases; what
+previously started from -1 and counted downwards now starts from -2, with -1
+reserved as the promoted value.  So if multiple swap devices are attached to
+the same node, they will all be promoted to priority -1 on that node's plist
+and will be used round robin before any other swap devices.
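+
+As an illustration (``swapon -p`` is the util-linux option for assigning an
+explicit priority; automatic node binding only affects the system-picked,
+negative priorities)::
+
+       # swapon -p 5 /dev/swapA
+       # swapon /dev/swapB
+
+Here swapA keeps priority 5 on every node's list, while swapB gets a
+system-picked priority and is promoted to -1 on node 1's list.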
diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst
new file mode 100644 (file)
index 0000000..8edb8d5
--- /dev/null
@@ -0,0 +1,152 @@
+.. _zswap:
+
+=====
+zswap
+=====
+
+Overview
+========
+
+Zswap is a lightweight compressed cache for swap pages. It takes pages that are
+in the process of being swapped out and attempts to compress them into a
+dynamically allocated RAM-based memory pool.  zswap basically trades CPU cycles
+for potentially reduced swap I/O.  This trade-off can also result in a
+significant performance improvement if reads from the compressed cache are
+faster than reads from a swap device.
+
+.. note::
+   Zswap is a new feature as of v3.11 and interacts heavily with memory
+   reclaim.  This interaction has not been fully explored on the large set of
+   potential configurations and workloads that exist.  For this reason, zswap
+   is a work in progress and should be considered experimental.
+
+Some potential benefits:
+
+* Desktop/laptop users with limited RAM capacities can mitigate the
+  performance impact of swapping.
+* Overcommitted guests that share a common I/O resource can
+  dramatically reduce their swap I/O pressure, avoiding heavy-handed I/O
+  throttling by the hypervisor. This allows more work to get done with less
+  impact to the guest workload and guests sharing the I/O subsystem.
+* Users with SSDs as swap devices can extend the life of the device by
+  drastically reducing life-shortening writes.
+
+Zswap evicts pages from compressed cache on an LRU basis to the backing swap
+device when the compressed pool reaches its size limit.  This requirement had
+been identified in prior community discussions.
+
+Whether Zswap is enabled at boot time depends on whether
+the ``CONFIG_ZSWAP_DEFAULT_ON`` Kconfig option is enabled or not.
+This setting can then be overridden by providing the kernel command line
+``zswap.enabled=`` option, for example ``zswap.enabled=0``.
+Zswap can also be enabled and disabled at runtime using the sysfs interface.
+An example command to enable zswap at runtime, assuming sysfs is mounted
+at ``/sys``, is::
+
+       echo 1 > /sys/module/zswap/parameters/enabled
+
+When zswap is disabled at runtime it will stop storing pages that are
+being swapped out.  However, it will _not_ immediately write out or fault
+back into memory all of the pages stored in the compressed pool.  The
+pages stored in zswap will remain in the compressed pool until they are
+either invalidated or faulted back into memory.  In order to force all
+pages out of the compressed pool, a swapoff on the swap device(s) will
+fault back into memory all swapped out pages, including those in the
+compressed pool.
+
+Design
+======
+
+Zswap receives pages for compression through the Frontswap API and is able to
+evict pages from its own compressed pool on an LRU basis and write them back to
+the backing swap device in the case that the compressed pool is full.
+
+Zswap makes use of zpool for managing the compressed memory pool.  Each
+allocation in zpool is not directly accessible by address.  Rather, a handle is
+returned by the allocation routine and that handle must be mapped before being
+accessed.  The compressed memory pool grows on demand and shrinks as compressed
+pages are freed.  The pool is not preallocated.  By default, a zpool
+of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created,
+but it can be overridden at boot time by setting the ``zpool`` attribute,
+e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using the sysfs
+``zpool`` attribute, e.g.::
+
+       echo zbud > /sys/module/zswap/parameters/zpool
+
+The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
+means the compression ratio will always be 2:1 or worse (because of half-full
+zbud pages).  The zsmalloc type zpool has a more complex compressed page
+storage method, and it can achieve greater storage densities.  However,
+zsmalloc does not implement compressed page eviction, so once zswap fills, it
+cannot evict the oldest page; it can only reject new pages.
+
+When a swap page is passed from frontswap to zswap, zswap maintains a mapping
+of the swap entry, a combination of the swap type and swap offset, to the zpool
+handle that references that compressed swap page.  This mapping is achieved
+with a red-black tree per swap type.  The swap offset is the search key for the
+tree nodes.
+
+During a page fault on a PTE that is a swap entry, frontswap calls the zswap
+load function to decompress the page into the page allocated by the page fault
+handler.
+
+Once there are no PTEs referencing a swap page stored in zswap (i.e. the count
+in the swap_map goes to 0) the swap code calls the zswap invalidate function,
+via frontswap, to free the compressed entry.
+
+Zswap seeks to be simple in its policies.  Sysfs attributes allow for one user
+controlled policy:
+
+* max_pool_percent - The maximum percentage of memory that the compressed
+  pool can occupy.
+
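+For example, to cap the compressed pool at 20% of memory (an illustrative
+value; the attribute lives alongside the other zswap parameters shown
+above)::
+
+       echo 20 > /sys/module/zswap/parameters/max_pool_percent
+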
+The default compressor is selected in ``CONFIG_ZSWAP_COMPRESSOR_DEFAULT``
+Kconfig option, but it can be overridden at boot time by setting the
+``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
+It can also be changed at runtime using the sysfs "compressor"
+attribute, e.g.::
+
+       echo lzo > /sys/module/zswap/parameters/compressor
+
+When the zpool and/or compressor parameter is changed at runtime, any existing
+compressed pages are not modified; they are left in their own zpool.  When a
+request is made for a page in an old zpool, it is uncompressed using its
+original compressor.  Once all pages are removed from an old zpool, the zpool
+and its compressor are freed.
+
+Some of the pages in zswap are same-value filled pages (i.e. the contents of
+the page are a single value or a repetitive pattern).  These pages include
+zero-filled pages and they are handled differently.  During a store operation,
+a page is checked for being same-value filled before it is compressed.  If it
+is, the compressed length of the page is set to zero and the pattern or
+same-filled value is stored.
+
+The same-value filled page identification feature is enabled by default and
+can be disabled at boot time by setting the ``same_filled_pages_enabled``
+attribute to 0, e.g. ``zswap.same_filled_pages_enabled=0``. It can also be
+enabled and disabled at runtime using the sysfs
+``same_filled_pages_enabled`` attribute, e.g.::
+
+       echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
+
+When zswap same-filled page identification is disabled at runtime, it will stop
+checking for the same-value filled pages during store operation. However, the
+existing pages which are marked as same-value filled pages remain stored
+unchanged in zswap until they are either loaded or invalidated.
+
+When zswap is full and there is high pressure on swap, pages can be flipped
+in and out of the zswap pool without any real benefit but with a performance
+drop for the system.  To prevent this, a parameter implements a form of
+hysteresis: once the limit has been hit, zswap refuses to take pages into
+the pool until it again has sufficient space.  To set the threshold at which
+zswap starts accepting pages again after it became full, use the sysfs
+``accept_threshold_percent`` attribute, e.g.::
+
+       echo 80 > /sys/module/zswap/parameters/accept_threshold_percent
+
+Setting this parameter to 100 will disable the hysteresis.
+
+A debugfs interface is provided for various statistics about pool size, the
+number of pages stored, same-value filled pages and various counters for the
+reasons pages are rejected.
index de7467e480672a0afc57cdf094860e3db5b25ab3..682259ee633ac22dabb08381c9357272b9a6e0b0 100644 (file)
@@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify::
                unsigned long start_pfn;
                unsigned long nr_pages;
                int status_change_nid_normal;
-               int status_change_nid_high;
                int status_change_nid;
        }
 
@@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify::
 - nr_pages is # of pages of online/offline memory.
 - status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
   is (will be) set/clear, if this is -1, then nodemask status is not changed.
-- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
-  is (will be) set/clear, if this is -1, then nodemask status is not changed.
 - status_change_nid is set node id when N_MEMORY of nodemask is (will be)
   set/clear. It means a new(memoryless) node gets new memory by online and a
   node loses all memory. If this is -1, then nodemask status is not changed.
index 0fbe3308bf37ff129f26dc5848d290df707111fa..ac6b89d1a8c324e5faf175e70ad8e1cf338a3c76 100644 (file)
@@ -231,10 +231,14 @@ Guarded allocations are set up based on the sample interval. After expiration
 of the sample interval, the next allocation through the main allocator (SLAB or
 SLUB) returns a guarded allocation from the KFENCE object pool (allocation
 sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and
-the next allocation is set up after the expiration of the interval. To "gate" a
-KFENCE allocation through the main allocator's fast-path without overhead,
-KFENCE relies on static branches via the static keys infrastructure. The static
-branch is toggled to redirect the allocation to KFENCE.
+the next allocation is set up after the expiration of the interval.
+
+When using ``CONFIG_KFENCE_STATIC_KEYS=y``, KFENCE allocations are "gated"
+through the main allocator's fast-path by relying on static branches via the
+static keys infrastructure. The static branch is toggled to redirect the
+allocation to KFENCE. Depending on sample interval, target workloads, and
+system architecture, this may perform better than the simple dynamic branch.
+Careful benchmarking is recommended.
 
 KFENCE objects each reside on a dedicated page, at either the left or right
 page boundaries selected at random. The pages to the left and right of the
@@ -269,6 +273,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
 first, and the chances of detecting use-after-frees of recently freed objects
 are increased.
 
+If pool utilization reaches 75% (default) or above, to reduce the risk of the
+pool eventually being fully occupied by allocated objects yet ensure diverse
+coverage of allocations, KFENCE limits currently covered allocations of the
+same source from further filling up the pool. The "source" of an allocation is
+based on its partial allocation stack trace. A side-effect is that this also
+limits frequent long-lived allocations (e.g. pagecache) of the same source
+from filling up the pool permanently, which is the most common risk for the
+pool becoming full and the sampled allocation rate dropping to zero. The
+threshold at which to start limiting currently covered allocations can be
+configured via the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
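+
+For example, to start limiting covered allocations only once pool usage
+reaches 90% (a hypothetical tuning; the default is 75%), boot with::
+
+       kfence.skip_covered_thresh=90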
+
 Interface
 ---------
 
index 64ffff460026040f45f6e8164c539ca26168712b..fc4873deb76f3190af0e9156f252ac34d98215d2 100644 (file)
@@ -14,14 +14,21 @@ allOf:
 
 properties:
   compatible:
-    const: holtek,ht16k33
+    oneOf:
+      - items:
+          - enum:
+              - adafruit,3108  # 0.56" 4-Digit 7-Segment FeatherWing Display (Red)
+              - adafruit,3130  # 0.54" Quad Alphanumeric FeatherWing Display (Red)
+          - const: holtek,ht16k33
+
+      - const: holtek,ht16k33     # Generic 16*8 LED controller with dot-matrix display
 
   reg:
     maxItems: 1
 
   refresh-rate-hz:
     maxItems: 1
-    description: Display update interval in Hertz
+    description: Display update interval in Hertz for dot-matrix displays
 
   interrupts:
     maxItems: 1
@@ -41,10 +48,22 @@ properties:
     default: 16
     description: Initial brightness level
 
+  led:
+    type: object
+    $ref: /schemas/leds/common.yaml#
+    unevaluatedProperties: false
+
 required:
   - compatible
   - reg
-  - refresh-rate-hz
+
+if:
+  properties:
+    compatible:
+      const: holtek,ht16k33
+then:
+  required:
+    - refresh-rate-hz
 
 additionalProperties: false
 
@@ -52,6 +71,7 @@ examples:
   - |
     #include <dt-bindings/interrupt-controller/irq.h>
     #include <dt-bindings/input/input.h>
+    #include <dt-bindings/leds/common.h>
     i2c1 {
             #address-cells = <1>;
             #size-cells = <0>;
@@ -73,5 +93,11 @@ examples:
                                    <MATRIX_KEY(4, 1, KEY_F9)>,
                                    <MATRIX_KEY(5, 1, KEY_F3)>,
                                    <MATRIX_KEY(6, 1, KEY_F1)>;
+
+                    led {
+                            color = <LED_COLOR_ID_RED>;
+                            function = LED_FUNCTION_BACKLIGHT;
+                            linux,default-trigger = "backlight";
+                    };
             };
       };
diff --git a/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml b/Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml
new file mode 100644 (file)
index 0000000..044fa96
--- /dev/null
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/mediatek,mt7621-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek MT7621 PCIe controller
+
+maintainers:
+  - Sergio Paracuellos <sergio.paracuellos@gmail.com>
+
+description: |+
+  The MediaTek MT7621 PCIe subsystem supports a single Root Complex (RC)
+  with 3 Root Ports.  Each Root Port supports a Gen1 1-lane Link.
+
+allOf:
+  - $ref: /schemas/pci/pci-bus.yaml#
+
+properties:
+  compatible:
+    const: mediatek,mt7621-pci
+
+  reg:
+    items:
+      - description: host-pci bridge registers
+      - description: pcie port 0 RC control registers
+      - description: pcie port 1 RC control registers
+      - description: pcie port 2 RC control registers
+
+  ranges:
+    maxItems: 2
+
+patternProperties:
+  'pcie@[0-2],0':
+    type: object
+    $ref: /schemas/pci/pci-bus.yaml#
+
+    properties:
+      resets:
+        maxItems: 1
+
+      clocks:
+        maxItems: 1
+
+      phys:
+        maxItems: 1
+
+    required:
+      - "#interrupt-cells"
+      - interrupt-map-mask
+      - interrupt-map
+      - resets
+      - clocks
+      - phys
+      - phy-names
+      - ranges
+
+    unevaluatedProperties: false
+
+required:
+  - compatible
+  - reg
+  - ranges
+  - "#interrupt-cells"
+  - interrupt-map-mask
+  - interrupt-map
+  - reset-gpios
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/mips-gic.h>
+
+    pcie: pcie@1e140000 {
+        compatible = "mediatek,mt7621-pci";
+        reg = <0x1e140000 0x100>,
+              <0x1e142000 0x100>,
+              <0x1e143000 0x100>,
+              <0x1e144000 0x100>;
+
+        #address-cells = <3>;
+        #size-cells = <2>;
+        pinctrl-names = "default";
+        pinctrl-0 = <&pcie_pins>;
+        device_type = "pci";
+        ranges = <0x02000000 0 0x60000000 0x60000000 0 0x10000000>,  /* pci memory */
+                 <0x01000000 0 0x1e160000 0x1e160000 0 0x00010000>;  /* io space */
+        #interrupt-cells = <1>;
+        interrupt-map-mask = <0xF800 0 0 0>;
+        interrupt-map = <0x0000 0 0 0 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>,
+                        <0x0800 0 0 0 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>,
+                        <0x1000 0 0 0 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>;
+        reset-gpios = <&gpio 19 GPIO_ACTIVE_LOW>;
+
+        pcie@0,0 {
+            reg = <0x0000 0 0 0 0>;
+            #address-cells = <3>;
+            #size-cells = <2>;
+            device_type = "pci";
+            #interrupt-cells = <1>;
+            interrupt-map-mask = <0 0 0 0>;
+            interrupt-map = <0 0 0 0 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>;
+            resets = <&rstctrl 24>;
+            clocks = <&clkctrl 24>;
+            phys = <&pcie0_phy 1>;
+            phy-names = "pcie-phy0";
+            ranges;
+        };
+
+        pcie@1,0 {
+            reg = <0x0800 0 0 0 0>;
+            #address-cells = <3>;
+            #size-cells = <2>;
+            device_type = "pci";
+            #interrupt-cells = <1>;
+            interrupt-map-mask = <0 0 0 0>;
+            interrupt-map = <0 0 0 0 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>;
+            resets = <&rstctrl 25>;
+            clocks = <&clkctrl 25>;
+            phys = <&pcie0_phy 1>;
+            phy-names = "pcie-phy1";
+            ranges;
+        };
+
+        pcie@2,0 {
+            reg = <0x1000 0 0 0 0>;
+            #address-cells = <3>;
+            #size-cells = <2>;
+            device_type = "pci";
+            #interrupt-cells = <1>;
+            interrupt-map-mask = <0 0 0 0>;
+            interrupt-map = <0 0 0 0 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>;
+            resets = <&rstctrl 26>;
+            clocks = <&clkctrl 26>;
+            phys = <&pcie2_phy 0>;
+            phy-names = "pcie-phy2";
+            ranges;
+        };
+    };
+...
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml b/Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml
new file mode 100644 (file)
index 0000000..3d23599
--- /dev/null
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/qcom,pcie-ep.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm PCIe Endpoint Controller binding
+
+maintainers:
+  - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+
+allOf:
+  - $ref: "pci-ep.yaml#"
+
+properties:
+  compatible:
+    const: qcom,sdx55-pcie-ep
+
+  reg:
+    items:
+      - description: Qualcomm-specific PARF configuration registers
+      - description: DesignWare PCIe registers
+      - description: External local bus interface registers
+      - description: Address Translation Unit (ATU) registers
+      - description: Memory region used to map remote RC address space
+      - description: BAR memory region
+
+  reg-names:
+    items:
+      - const: parf
+      - const: dbi
+      - const: elbi
+      - const: atu
+      - const: addr_space
+      - const: mmio
+
+  clocks:
+    items:
+      - description: PCIe Auxiliary clock
+      - description: PCIe CFG AHB clock
+      - description: PCIe Master AXI clock
+      - description: PCIe Slave AXI clock
+      - description: PCIe Slave Q2A AXI clock
+      - description: PCIe Sleep clock
+      - description: PCIe Reference clock
+
+  clock-names:
+    items:
+      - const: aux
+      - const: cfg
+      - const: bus_master
+      - const: bus_slave
+      - const: slave_q2a
+      - const: sleep
+      - const: ref
+
+  qcom,perst-regs:
+    description: Reference to a syscon representing TCSR followed by the two
+                 offsets within syscon for Perst enable and Perst separation
+                 enable registers
+    $ref: "/schemas/types.yaml#/definitions/phandle-array"
+    items:
+      minItems: 3
+      maxItems: 3
+
+  interrupts:
+    items:
+      - description: PCIe Global interrupt
+      - description: PCIe Doorbell interrupt
+
+  interrupt-names:
+    items:
+      - const: global
+      - const: doorbell
+
+  reset-gpios:
+    description: GPIO used as PERST# input signal
+    maxItems: 1
+
+  wake-gpios:
+    description: GPIO used as WAKE# output signal
+    maxItems: 1
+
+  resets:
+    maxItems: 1
+
+  reset-names:
+    const: core
+
+  power-domains:
+    maxItems: 1
+
+  phys:
+    maxItems: 1
+
+  phy-names:
+    const: pciephy
+
+  num-lanes:
+    default: 2
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - clocks
+  - clock-names
+  - qcom,perst-regs
+  - interrupts
+  - interrupt-names
+  - reset-gpios
+  - resets
+  - reset-names
+  - power-domains
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sdx55.h>
+    #include <dt-bindings/gpio/gpio.h>
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    pcie_ep: pcie-ep@40000000 {
+        compatible = "qcom,sdx55-pcie-ep";
+        reg = <0x01c00000 0x3000>,
+              <0x40000000 0xf1d>,
+              <0x40000f20 0xc8>,
+              <0x40001000 0x1000>,
+              <0x40002000 0x1000>,
+              <0x01c03000 0x3000>;
+        reg-names = "parf", "dbi", "elbi", "atu", "addr_space",
+                    "mmio";
+
+        clocks = <&gcc GCC_PCIE_AUX_CLK>,
+             <&gcc GCC_PCIE_CFG_AHB_CLK>,
+             <&gcc GCC_PCIE_MSTR_AXI_CLK>,
+             <&gcc GCC_PCIE_SLV_AXI_CLK>,
+             <&gcc GCC_PCIE_SLV_Q2A_AXI_CLK>,
+             <&gcc GCC_PCIE_SLEEP_CLK>,
+             <&gcc GCC_PCIE_0_CLKREF_CLK>;
+        clock-names = "aux", "cfg", "bus_master", "bus_slave",
+                      "slave_q2a", "sleep", "ref";
+
+        qcom,perst-regs = <&tcsr 0xb258 0xb270>;
+
+        interrupts = <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 145 IRQ_TYPE_LEVEL_HIGH>;
+        interrupt-names = "global", "doorbell";
+        reset-gpios = <&tlmm 57 GPIO_ACTIVE_LOW>;
+        wake-gpios = <&tlmm 53 GPIO_ACTIVE_LOW>;
+        resets = <&gcc GCC_PCIE_BCR>;
+        reset-names = "core";
+        power-domains = <&gcc PCIE_GDSC>;
+        phys = <&pcie0_lane>;
+        phy-names = "pciephy";
+        max-link-speed = <3>;
+        num-lanes = <2>;
+    };
index 3f646875f8c296e26babe273e5f2a9fe611a10e7..a0ae024c2d0ce7e15e375fdc110c35947956c0d1 100644 (file)
@@ -12,6 +12,7 @@
                        - "qcom,pcie-ipq4019" for ipq4019
                        - "qcom,pcie-ipq8074" for ipq8074
                        - "qcom,pcie-qcs404" for qcs404
+                       - "qcom,pcie-sc8180x" for sc8180x
                        - "qcom,pcie-sdm845" for sdm845
                        - "qcom,pcie-sm8250" for sm8250
                        - "qcom,pcie-ipq6018" for ipq6018
                        - "pipe"        PIPE clock
 
 - clock-names:
-       Usage: required for sm8250
+       Usage: required for sc8180x and sm8250
        Value type: <stringlist>
        Definition: Should contain the following entries
                        - "aux"         Auxiliary clock
                        - "ahb"                 AHB reset
 
 - reset-names:
-       Usage: required for sdm845 and sm8250
+       Usage: required for sc8180x, sdm845 and sm8250
        Value type: <stringlist>
        Definition: Should contain the following entries
                        - "pci"                 PCIe core reset
diff --git a/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml b/Documentation/devicetree/bindings/pci/rockchip-dw-pcie.yaml
new file mode 100644 (file)
index 0000000..142bbe5
--- /dev/null
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pci/rockchip-dw-pcie.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: DesignWare based PCIe controller on Rockchip SoCs
+
+maintainers:
+  - Shawn Lin <shawn.lin@rock-chips.com>
+  - Simon Xue <xxm@rock-chips.com>
+  - Heiko Stuebner <heiko@sntech.de>
+
+description: |+
+  RK3568 SoC PCIe host controller is based on the Synopsys DesignWare
+  PCIe IP and thus inherits all the common properties defined in
+  designware-pcie.txt.
+
+allOf:
+  - $ref: /schemas/pci/pci-bus.yaml#
+
+# We need a select here so we don't match all nodes with 'snps,dw-pcie'
+select:
+  properties:
+    compatible:
+      contains:
+        const: rockchip,rk3568-pcie
+  required:
+    - compatible
+
+properties:
+  compatible:
+    items:
+      - const: rockchip,rk3568-pcie
+      - const: snps,dw-pcie
+
+  reg:
+    items:
+      - description: Data Bus Interface (DBI) registers
+      - description: Rockchip designed configuration registers
+      - description: Config registers
+
+  reg-names:
+    items:
+      - const: dbi
+      - const: apb
+      - const: config
+
+  clocks:
+    items:
+      - description: AHB clock for PCIe master
+      - description: AHB clock for PCIe slave
+      - description: AHB clock for PCIe dbi
+      - description: APB clock for PCIe
+      - description: Auxiliary clock for PCIe
+
+  clock-names:
+    items:
+      - const: aclk_mst
+      - const: aclk_slv
+      - const: aclk_dbi
+      - const: pclk
+      - const: aux
+
+  msi-map: true
+
+  num-lanes: true
+
+  phys:
+    maxItems: 1
+
+  phy-names:
+    const: pcie-phy
+
+  power-domains:
+    maxItems: 1
+
+  ranges:
+    maxItems: 2
+
+  resets:
+    maxItems: 1
+
+  reset-names:
+    const: pipe
+
+  vpcie3v3-supply: true
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - clocks
+  - clock-names
+  - msi-map
+  - num-lanes
+  - phys
+  - phy-names
+  - power-domains
+  - resets
+  - reset-names
+
+unevaluatedProperties: false
+
+examples:
+  - |
+
+    bus {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        pcie3x2: pcie@fe280000 {
+            compatible = "rockchip,rk3568-pcie", "snps,dw-pcie";
+            reg = <0x3 0xc0800000 0x0 0x390000>,
+                  <0x0 0xfe280000 0x0 0x10000>,
+                  <0x3 0x80000000 0x0 0x100000>;
+            reg-names = "dbi", "apb", "config";
+            bus-range = <0x20 0x2f>;
+            clocks = <&cru 143>, <&cru 144>,
+                     <&cru 145>, <&cru 146>,
+                     <&cru 147>;
+            clock-names = "aclk_mst", "aclk_slv",
+                          "aclk_dbi", "pclk",
+                          "aux";
+            device_type = "pci";
+            linux,pci-domain = <2>;
+            max-link-speed = <2>;
+            msi-map = <0x2000 &its 0x2000 0x1000>;
+            num-lanes = <2>;
+            phys = <&pcie30phy>;
+            phy-names = "pcie-phy";
+            power-domains = <&power 15>;
+            ranges = <0x81000000 0x0 0x80800000 0x3 0x80800000 0x0 0x100000>,
+                     <0x83000000 0x0 0x80900000 0x3 0x80900000 0x0 0x3f700000>;
+            resets = <&cru 193>;
+            reset-names = "pipe";
+            #address-cells = <3>;
+            #size-cells = <2>;
+        };
+    };
+...
index db3af0b45bafa7872f54a3b5fd91753396ea89a6..b008b90b92c9fcb279a4e08ad2e3763ab67091a3 100644 (file)
@@ -1050,22 +1050,9 @@ is not sufficient this sometimes needs to be explicit.
 The above assignment instructs kbuild to descend down in the
 directory compressed/ when "make clean" is executed.
 
-To support the clean infrastructure in the Makefiles that build the
-final bootimage there is an optional target named archclean:
-
-       Example::
-
-               #arch/x86/Makefile
-               archclean:
-                       $(Q)$(MAKE) $(clean)=arch/x86/boot
-
-When "make clean" is executed, make will descend down in arch/x86/boot,
-and clean as usual. The Makefile located in arch/x86/boot/ may use
-the subdir- trick to descend further down.
-
 Note 1: arch/$(SRCARCH)/Makefile cannot use "subdir-", because that file is
-included in the top level makefile, and the kbuild infrastructure
-is not operational at that point.
+included in the top level makefile. Instead, arch/$(SRCARCH)/Kbuild can use
+"subdir-".
 
 Note 2: All directories listed in core-y, libs-y, drivers-y and net-y will
 be visited during "make clean".
index 0750d94424779e27e84a6b3213a799f172feb9a8..9b2841fb9a5fe7cafe280c57e8b49857b616a3a3 100644 (file)
@@ -63,7 +63,6 @@ pointer to struct memory_notify::
                unsigned long start_pfn;
                unsigned long nr_pages;
                int status_change_nid_normal;
-               int status_change_nid_high;
                int status_change_nid;
        }
 
@@ -74,9 +73,6 @@ pointer to struct memory_notify::
 - status_change_nid_normal is the node id set when N_NORMAL_MEMORY of the
   nodemask is (will be) set/cleared; if this is -1, the nodemask status is
   not changed.
 
-- status_change_nid_high is the node id set when N_HIGH_MEMORY of the
-  nodemask is (will be) set/cleared; if this is -1, the nodemask status is
-  not changed.
-
 - status_change_nid is the node id set when N_MEMORY of the nodemask is
   (will be) set/cleared. It means a new (memoryless) node gets new memory by
   onlining while a node loses all of its memory. If this is -1, the nodemask
   status is not changed.
index b05159c295f4d31d96c305864103f74a03497595..210f0f50efd81eb595133e0d9affef250fcbcbf9 100644 (file)
@@ -35,13 +35,17 @@ two parts:
 1. Identification of the monitoring target address range for the address space.
 2. Access check of specific address range in the target space.
 
-DAMON currently provides the implementation of the primitives for only the
-virtual address spaces. Below two subsections describe how it works.
+DAMON currently provides the implementations of the primitives for the physical
+and virtual address spaces. Below two subsections describe how those work.
 
 
 VMA-based Target Address Range Construction
 -------------------------------------------
 
+This is only for the virtual address space primitives implementation.  The
+implementation for the physical address space simply asks users to manually
+set the monitoring target address ranges.
+
 Only small parts in the super-huge virtual address space of the processes are
 mapped to the physical memory and accessed.  Thus, tracking the unmapped
 address regions is just wasteful.  However, because DAMON can deal with some
@@ -71,15 +75,18 @@ to make a reasonable trade-off.  Below shows this in detail::
 PTE Accessed-bit Based Access Check
 -----------------------------------
 
-The implementation for the virtual address space uses PTE Accessed-bit for
-basic access checks.  It finds the relevant PTE Accessed bit from the address
-by walking the page table for the target task of the address.  In this way, the
-implementation finds and clears the bit for next sampling target address and
-checks whether the bit set again after one sampling period.  This could disturb
-other kernel subsystems using the Accessed bits, namely Idle page tracking and
-the reclaim logic.  To avoid such disturbances, DAMON makes it mutually
-exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page
-flags to solve the conflict with the reclaim logic, as Idle page tracking does.
+Both of the implementations, for the physical and the virtual address spaces,
+use the PTE Accessed-bit for basic access checks.  The only difference is how
+the relevant PTE Accessed bit(s) are found from the address.  While the
+implementation for the virtual address space walks the page table for the
+target task of the address, the implementation for the physical address space
+walks every page table having a mapping to the address.  In this way, the
+implementations find and clear the bit(s) for the next sampling target address
+and check whether the bit(s) are set again after one sampling period.  This
+could disturb other kernel subsystems using the Accessed bits, namely Idle
+page tracking and the reclaim logic.  To avoid such disturbances, DAMON makes
+it mutually exclusive with Idle page tracking and uses ``PG_idle`` and
+``PG_young`` page flags to solve the conflict with the reclaim logic, as Idle
+page tracking does.
 
 
 Address Space Independent Core Mechanisms
index cb3d8b585a8b339447333402d8d876e5ec56c1ce..11aea40eb328c30901aa6df912d90c627c14b914 100644 (file)
@@ -36,10 +36,9 @@ constructions and actual access checks can be implemented and configured on the
 DAMON core by the users.  In this way, DAMON users can monitor any address
 space with any access check technique.
 
-Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based
+Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
 implementations of the address space dependent functions for the virtual memory
-by default, for a reference and convenient use.  In near future, we will
-provide those for physical memory address space.
+and the physical memory by default, for reference and convenient use.
 
 
 Can I simply monitor page granularity?
index a2858baf3bf1df807662e36961858dc8bef3376f..48c0bbff98b2fcd39b746f5765b339ae477bb631 100644 (file)
@@ -27,4 +27,3 @@ workloads and systems.
    faq
    design
    api
-   plans
index b51f0d8992f8fda0113fe6b126f22e8e1b9bf40e..6f5ffef4b716a9e246d40d49ee793891b3a942df 100644 (file)
@@ -3,27 +3,11 @@ Linux Memory Management Documentation
 =====================================
 
 This is a collection of documents about the Linux memory management (mm)
-subsystem.  If you are looking for advice on simply allocating memory,
-see the :ref:`memory_allocation`.
-
-User guides for MM features
-===========================
-
-The following documents provide guides for controlling and tuning
-various features of the Linux memory management
-
-.. toctree::
-   :maxdepth: 1
-
-   swap_numa
-   zswap
-
-Kernel developers MM documentation
-==================================
-
-The below documents describe MM internals with different level of
-details ranging from notes and mailing list responses to elaborate
-descriptions of data structures and algorithms.
+subsystem internals, with different levels of detail ranging from notes and
+mailing list responses to elaborate descriptions of data structures and
+algorithms.  If you are looking for advice on simply allocating memory, see
+the :ref:`memory_allocation`.  For controlling and tuning guides, see the
+:doc:`admin guide <../admin-guide/mm/index>`.
 
 .. toctree::
    :maxdepth: 1
index 2175465c9bf2a6e804a1b2b66d63de0e5cdffe8e..9837fc8147dd6b68f72b3db1ffef5c71896ec999 100644 (file)
@@ -85,5 +85,26 @@ Usage
        cat /sys/kernel/debug/page_owner > page_owner_full.txt
        ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
 
+   The general output of ``page_owner_full.txt`` is as follows::
+
+       Page allocated via order XXX, ...
+       PFN XXX ...
+        // Detailed stack
+
+       Page allocated via order XXX, ...
+       PFN XXX ...
+        // Detailed stack
+
+   The ``page_owner_sort`` tool ignores the ``PFN`` rows, groups the remaining
+   rows by allocation stack, uses a regexp to extract the page order value,
+   counts the occurrences and total pages of each stack, and finally sorts the
+   stacks by occurrence count.
+
    See the result about who allocated each page
-   in the ``sorted_page_owner.txt``.
+   in ``sorted_page_owner.txt``. The general output is::
+
+       XXX times, XXX pages:
+       Page allocated via order XXX, ...
+        // Detailed stack
+
+   By default, ``page_owner_sort`` sorts by the number of times each stack
+   occurs.  To sort by the number of pages instead, use the ``-m`` parameter.
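+
+   For example (an illustrative invocation of the same tool)::
+
+       ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt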
diff --git a/Documentation/vm/swap_numa.rst b/Documentation/vm/swap_numa.rst
deleted file mode 100644 (file)
index e0466f2..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-.. _swap_numa:
-
-===========================================
-Automatically bind swap device to numa node
-===========================================
-
-If the system has more than one swap device and swap device has the node
-information, we can make use of this information to decide which swap
-device to use in get_swap_pages() to get better performance.
-
-
-How to use this feature
-=======================
-
-Swap device has priority and that decides the order of it to be used. To make
-use of automatically binding, there is no need to manipulate priority settings
-for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
-swapB, with swapA attached to node 0 and swapB attached to node 1, are going
-to be swapped on. Simply swapping them on by doing::
-
-       # swapon /dev/swapA
-       # swapon /dev/swapB
-
-Then node 0 will use the two swap devices in the order of swapA then swapB and
-node 1 will use the two swap devices in the order of swapB then swapA. Note
-that the order of them being swapped on doesn't matter.
-
-A more complex example on a 4 node machine. Assume 6 swap devices are going to
-be swapped on: swapA and swapB are attached to node 0, swapC is attached to
-node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
-The way to swap them on is the same as above::
-
-       # swapon /dev/swapA
-       # swapon /dev/swapB
-       # swapon /dev/swapC
-       # swapon /dev/swapD
-       # swapon /dev/swapE
-       # swapon /dev/swapF
-
-Then node 0 will use them in the order of::
-
-       swapA/swapB -> swapC -> swapD -> swapE -> swapF
-
-swapA and swapB will be used in a round robin mode before any other swap device.
-
-node 1 will use them in the order of::
-
-       swapC -> swapA -> swapB -> swapD -> swapE -> swapF
-
-node 2 will use them in the order of::
-
-       swapD/swapE -> swapA -> swapB -> swapC -> swapF
-
-Similarly, swapD and swapE will be used in a round robin mode before any
-other swap devices.
-
-node 3 will use them in the order of::
-
-       swapF -> swapA -> swapB -> swapC -> swapD -> swapE
-
-
-Implementation details
-======================
-
-The current code uses a priority based list, swap_avail_list, to decide
-which swap device to use and if multiple swap devices share the same
-priority, they are used round robin. This change replaces the single
-global swap_avail_list with a per-numa-node list, i.e. each numa node
-sees its own priority-based list of available swap devices. A swap
-device's priority can be promoted on its matching node's swap_avail_list.
-
-The current swap device's priority is set as follows: the user can set a >=0
-value, or the system will pick one starting from -1 then downwards. The
-priority value in the swap_avail_list is the negated value of the swap
-device's priority, due to plist being sorted from low to high. The new
-policy doesn't change the semantics for priority >=0 cases; the previous
-"starting from -1 then downwards" now becomes "starting from -2 then
-downwards", and -1 is reserved
-as the promoted value. So if multiple swap devices are attached to the same
-node, they will all be promoted to priority -1 on that node's plist and will
-be used round robin before any other swap devices.
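
A minimal standalone sketch of the promotion rule described above
(illustrative only; the kernel uses a per-node plist, and the helper below
and its simplified model are hypothetical)::

	#include <stdio.h>

	/*
	 * Effective plist value of a swap device as seen from a NUMA node.
	 * plist is sorted low to high, so lower values are used first.
	 */
	static int plist_prio(int dev_prio, int node, int dev_node)
	{
		/* Auto-assigned (negative) priorities are promoted to -1 on
		 * the device's own node, so local devices come first and are
		 * used round robin among themselves. */
		if (dev_prio < 0 && node == dev_node)
			return -1;
		return -dev_prio;	/* negated for the low-to-high plist */
	}

	int main(void)
	{
		/* swapA: auto priority -2 on node 0; swapB: -3 on node 1 */
		printf("node 0: swapA=%d swapB=%d\n",
		       plist_prio(-2, 0, 0), plist_prio(-3, 0, 1)); /* -1, 3 */
		printf("node 1: swapA=%d swapB=%d\n",
		       plist_prio(-2, 1, 0), plist_prio(-3, 1, 1)); /* 2, -1 */
		return 0;
	}

This reproduces the 2 node example above: node 0 tries swapA first, while
node 1 tries swapB first.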
diff --git a/Documentation/vm/zswap.rst b/Documentation/vm/zswap.rst
deleted file mode 100644 (file)
index 8edb8d5..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-.. _zswap:
-
-=====
-zswap
-=====
-
-Overview
-========
-
-Zswap is a lightweight compressed cache for swap pages. It takes pages that are
-in the process of being swapped out and attempts to compress them into a
-dynamically allocated RAM-based memory pool.  zswap basically trades CPU cycles
-for potentially reduced swap I/O.  This trade-off can also result in a
-significant performance improvement if reads from the compressed cache are
-faster than reads from a swap device.
-
-.. note::
-   Zswap is a new feature as of v3.11 and interacts heavily with memory
-   reclaim.  This interaction has not been fully explored on the large set of
-   potential configurations and workloads that exist.  For this reason, zswap
-   is a work in progress and should be considered experimental.
-
-Some potential benefits:
-
-* Desktop/laptop users with limited RAM capacities can mitigate the
-  performance impact of swapping.
-* Overcommitted guests that share a common I/O resource can
-  dramatically reduce their swap I/O pressure, avoiding heavy-handed I/O
-  throttling by the hypervisor. This allows more work to get done with less
-  impact to the guest workload and guests sharing the I/O subsystem.
-* Users with SSDs as swap devices can extend the life of the device by
-  drastically reducing life-shortening writes.
-
-Zswap evicts pages from compressed cache on an LRU basis to the backing swap
-device when the compressed pool reaches its size limit.  This requirement had
-been identified in prior community discussions.
-
-Whether Zswap is enabled at boot time depends on whether
-the ``CONFIG_ZSWAP_DEFAULT_ON`` Kconfig option is enabled or not.
-This setting can then be overridden by providing the kernel command line
-``zswap.enabled=`` option, for example ``zswap.enabled=0``.
-Zswap can also be enabled and disabled at runtime using the sysfs interface.
-An example command to enable zswap at runtime, assuming sysfs is mounted
-at ``/sys``, is::
-
-       echo 1 > /sys/module/zswap/parameters/enabled
-
-When zswap is disabled at runtime it will stop storing pages that are
-being swapped out.  However, it will _not_ immediately write out or fault
-back into memory all of the pages stored in the compressed pool.  The
-pages stored in zswap will remain in the compressed pool until they are
-either invalidated or faulted back into memory.  To force all pages out
-of the compressed pool, do a swapoff on the swap device(s); this will
-fault back into memory all swapped-out pages, including those in the
-compressed pool.
-
-Design
-======
-
-Zswap receives pages for compression through the Frontswap API and is able to
-evict pages from its own compressed pool on an LRU basis and write them back to
-the backing swap device when the compressed pool is full.
-
-Zswap makes use of zpool for managing the compressed memory pool.  Each
-allocation in zpool is not directly accessible by address.  Rather, a handle is
-returned by the allocation routine and that handle must be mapped before being
-accessed.  The compressed memory pool grows on demand and shrinks as compressed
-pages are freed.  The pool is not preallocated.  By default, a zpool
-of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created,
-but it can be overridden at boot time by setting the ``zpool`` attribute,
-e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using the sysfs
-``zpool`` attribute, e.g.::
-
-       echo zbud > /sys/module/zswap/parameters/zpool
-
-The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
-means the compression ratio will always be 2:1 or worse (because of half-full
-zbud pages).  The zsmalloc type zpool has a more complex compressed page
-storage method, and it can achieve greater storage densities.  However,
-zsmalloc does not implement compressed page eviction, so once zswap fills, it
-cannot evict the oldest page; it can only reject new pages.
-
-When a swap page is passed from frontswap to zswap, zswap maintains a mapping
-of the swap entry, a combination of the swap type and swap offset, to the zpool
-handle that references that compressed swap page.  This mapping is achieved
-with a red-black tree per swap type.  The swap offset is the search key for the
-tree nodes.
-
-During a page fault on a PTE that is a swap entry, frontswap calls the zswap
-load function to decompress the page into the page allocated by the page fault
-handler.
-
-Once there are no PTEs referencing a swap page stored in zswap (i.e. the count
-in the swap_map goes to 0) the swap code calls the zswap invalidate function,
-via frontswap, to free the compressed entry.
-
-Zswap seeks to be simple in its policies.  Sysfs attributes allow for one user
-controlled policy:
-
-* max_pool_percent - The maximum percentage of memory that the compressed
-  pool can occupy.
-
-The default compressor is selected in ``CONFIG_ZSWAP_COMPRESSOR_DEFAULT``
-Kconfig option, but it can be overridden at boot time by setting the
-``compressor`` attribute, e.g. ``zswap.compressor=lzo``.
-It can also be changed at runtime using the sysfs "compressor"
-attribute, e.g.::
-
-       echo lzo > /sys/module/zswap/parameters/compressor
-
-When the zpool and/or compressor parameter is changed at runtime, any existing
-compressed pages are not modified; they are left in their own zpool.  When a
-request is made for a page in an old zpool, it is uncompressed using its
-original compressor.  Once all pages are removed from an old zpool, the zpool
-and its compressor are freed.
-
-Some of the pages in zswap are same-value filled pages (i.e. the contents of
-the page have the same value or a repetitive pattern). These pages include
-zero-filled pages, and they are handled differently. During a store operation,
-a page is checked for being a same-value filled page before it is compressed.
-If true, the compressed length of the page is set to zero and the pattern or
-same-filled value is stored.
-
-The same-value filled page identification feature is enabled by default; it can be
-disabled at boot time by setting the ``same_filled_pages_enabled`` attribute
-to 0, e.g. ``zswap.same_filled_pages_enabled=0``. It can also be enabled and
-disabled at runtime using the sysfs ``same_filled_pages_enabled``
-attribute, e.g.::
-
-       echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
-
-When zswap same-filled page identification is disabled at runtime, it will stop
-checking for same-value filled pages during store operations. However, the
-existing pages which are marked as same-value filled pages remain stored
-unchanged in zswap until they are either loaded or invalidated.
-
-To prevent zswap from shrinking its pool when zswap is full and there is high
-pressure on swap (which would flip pages in and out of the zswap pool without
-any real benefit, but with a performance drop for the system), a special
-parameter implements a form of hysteresis: once the limit has been hit, zswap
-refuses to take pages into the pool until it again has sufficient space. To
-set the threshold at which zswap would start accepting pages again after it
-became full, use the sysfs ``accept_threshold_percent``
-attribute, e.g.::
-
-       echo 80 > /sys/module/zswap/parameters/accept_threshold_percent
-
-Setting this parameter to 100 will disable the hysteresis.
-
-A debugfs interface is provided for various statistics about pool size, the number
-of pages stored, same-value filled pages and various counters for the reasons
-pages are rejected.
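
To make the Design section above concrete, the per-swap-type lookup keyed by
swap offset could look roughly like this kernel-style sketch (the field and
function names are assumptions, not the exact zswap internals; it relies on
<linux/rbtree.h>)::

	struct zswap_entry {
		struct rb_node rbnode;
		pgoff_t offset;		/* swap offset: the tree's search key */
		unsigned long handle;	/* zpool handle of the compressed page */
	};

	static struct zswap_entry *zswap_search(struct rb_root *root,
						pgoff_t offset)
	{
		struct rb_node *node = root->rb_node;
		struct zswap_entry *entry;

		while (node) {
			entry = rb_entry(node, struct zswap_entry, rbnode);
			if (entry->offset > offset)
				node = node->rb_left;
			else if (entry->offset < offset)
				node = node->rb_right;
			else
				return entry;
		}
		return NULL;
	}

The debugfs statistics mentioned above live under ``/sys/kernel/debug/zswap/``
when debugfs is mounted at ``/sys/kernel/debug``.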
index 74158b271cb74d8a6dbb493279dceb9900940b83..9096c64d8d099b3e738a664587673186efcfc07b 100644 (file)
@@ -1297,6 +1297,13 @@ S:       Maintained
 F:     Documentation/devicetree/bindings/iommu/apple,dart.yaml
 F:     drivers/iommu/apple-dart.c
 
+APPLE PCIE CONTROLLER DRIVER
+M:     Alyssa Rosenzweig <alyssa@rosenzweig.io>
+M:     Marc Zyngier <maz@kernel.org>
+L:     linux-pci@vger.kernel.org
+S:     Maintained
+F:     drivers/pci/controller/pcie-apple.c
+
 APPLE SMC DRIVER
 M:     Henrik Rydberg <rydberg@bitmath.org>
 L:     linux-hwmon@vger.kernel.org
@@ -3163,6 +3170,7 @@ F:        lib/*audit.c
 AUXILIARY DISPLAY DRIVERS
 M:     Miguel Ojeda <ojeda@kernel.org>
 S:     Maintained
+F:     Documentation/devicetree/bindings/auxdisplay/
 F:     drivers/auxdisplay/
 F:     include/linux/cfag12864b.h
 
@@ -5220,7 +5228,7 @@ F:        net/ax25/ax25_timer.c
 F:     net/ax25/sysctl_net_ax25.c
 
 DATA ACCESS MONITOR
-M:     SeongJae Park <sjpark@amazon.de>
+M:     SeongJae Park <sj@kernel.org>
 L:     linux-mm@kvack.org
 S:     Maintained
 F:     Documentation/admin-guide/mm/damon/
@@ -12005,6 +12013,12 @@ S:     Maintained
 F:     Documentation/devicetree/bindings/i2c/i2c-mt7621.txt
 F:     drivers/i2c/busses/i2c-mt7621.c
 
+MEDIATEK MT7621 PCIE CONTROLLER DRIVER
+M:     Sergio Paracuellos <sergio.paracuellos@gmail.com>
+S:     Maintained
+F:     Documentation/devicetree/bindings/pci/mediatek,mt7621-pcie.yaml
+F:     drivers/pci/controller/pcie-mt7621.c
+
 MEDIATEK MT7621 PHY PCI DRIVER
 M:     Sergio Paracuellos <sergio.paracuellos@gmail.com>
 S:     Maintained
@@ -14647,9 +14661,12 @@ M:     Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
 R:     Krzysztof Wilczyński <kw@linux.com>
 L:     linux-pci@vger.kernel.org
 S:     Supported
+Q:     https://patchwork.kernel.org/project/linux-pci/list/
+B:     https://bugzilla.kernel.org
+C:     irc://irc.oftc.net/linux-pci
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git
 F:     Documentation/PCI/endpoint/*
 F:     Documentation/misc-devices/pci-endpoint-test.rst
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kishon/pci-endpoint.git
 F:     drivers/misc/pci_endpoint_test.c
 F:     drivers/pci/endpoint/
 F:     tools/pci/
@@ -14695,15 +14712,21 @@ R:    Rob Herring <robh@kernel.org>
 R:     Krzysztof Wilczyński <kw@linux.com>
 L:     linux-pci@vger.kernel.org
 S:     Supported
-Q:     http://patchwork.ozlabs.org/project/linux-pci/list/
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/
+Q:     https://patchwork.kernel.org/project/linux-pci/list/
+B:     https://bugzilla.kernel.org
+C:     irc://irc.oftc.net/linux-pci
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git
 F:     drivers/pci/controller/
+F:     drivers/pci/pci-bridge-emul.c
+F:     drivers/pci/pci-bridge-emul.h
 
 PCI SUBSYSTEM
 M:     Bjorn Helgaas <bhelgaas@google.com>
 L:     linux-pci@vger.kernel.org
 S:     Supported
-Q:     http://patchwork.ozlabs.org/project/linux-pci/list/
+Q:     https://patchwork.kernel.org/project/linux-pci/list/
+B:     https://bugzilla.kernel.org
+C:     irc://irc.oftc.net/linux-pci
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git
 F:     Documentation/PCI/
 F:     Documentation/devicetree/bindings/pci/
@@ -14803,7 +14826,15 @@ M:     Stanimir Varbanov <svarbanov@mm-sol.com>
 L:     linux-pci@vger.kernel.org
 L:     linux-arm-msm@vger.kernel.org
 S:     Maintained
-F:     drivers/pci/controller/dwc/*qcom*
+F:     drivers/pci/controller/dwc/pcie-qcom.c
+
+PCIE ENDPOINT DRIVER FOR QUALCOMM
+M:     Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+L:     linux-pci@vger.kernel.org
+L:     linux-arm-msm@vger.kernel.org
+S:     Maintained
+F:     Documentation/devicetree/bindings/pci/qcom,pcie-ep.yaml
+F:     drivers/pci/controller/dwc/pcie-qcom-ep.c
 
 PCIE DRIVER FOR ROCKCHIP
 M:     Shawn Lin <shawn.lin@rock-chips.com>
index 6f2e233c7bc0d7bc5cb4ba0f27fe0a7f29f162bd..d83d72c26aaa95241e8cc2a99bed0a9b7d52795f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -850,44 +850,6 @@ ifdef CONFIG_ZERO_CALL_USED_REGS
 KBUILD_CFLAGS  += -fzero-call-used-regs=used-gpr
 endif
 
-DEBUG_CFLAGS   :=
-
-ifdef CONFIG_DEBUG_INFO
-
-ifdef CONFIG_DEBUG_INFO_SPLIT
-DEBUG_CFLAGS   += -gsplit-dwarf
-else
-DEBUG_CFLAGS   += -g
-endif
-
-ifndef CONFIG_AS_IS_LLVM
-KBUILD_AFLAGS  += -Wa,-gdwarf-2
-endif
-
-ifndef CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT
-dwarf-version-$(CONFIG_DEBUG_INFO_DWARF4) := 4
-dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5
-DEBUG_CFLAGS   += -gdwarf-$(dwarf-version-y)
-endif
-
-ifdef CONFIG_DEBUG_INFO_REDUCED
-DEBUG_CFLAGS   += -fno-var-tracking
-ifdef CONFIG_CC_IS_GCC
-DEBUG_CFLAGS   += -femit-struct-debug-baseonly
-endif
-endif
-
-ifdef CONFIG_DEBUG_INFO_COMPRESSED
-DEBUG_CFLAGS   += -gz=zlib
-KBUILD_AFLAGS  += -gz=zlib
-KBUILD_LDFLAGS += --compress-debug-sections=zlib
-endif
-
-endif # CONFIG_DEBUG_INFO
-
-KBUILD_CFLAGS += $(DEBUG_CFLAGS)
-export DEBUG_CFLAGS
-
 ifdef CONFIG_FUNCTION_TRACER
 ifdef CONFIG_FTRACE_MCOUNT_USE_CC
   CC_FLAGS_FTRACE      += -mrecord-mcount
@@ -984,7 +946,7 @@ KBUILD_CFLAGS += -falign-functions=64
 endif
 
 # arch Makefile may override CC so keep this after arch Makefile is included
-NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
+NOSTDINC_FLAGS += -nostdinc
 
 # warn about C99 declaration after statement
 KBUILD_CFLAGS += -Wdeclaration-after-statement
@@ -1011,6 +973,21 @@ ifdef CONFIG_CC_IS_GCC
 KBUILD_CFLAGS += -Wno-maybe-uninitialized
 endif
 
+ifdef CONFIG_CC_IS_GCC
+# The allocators already balk at large sizes, so silence the compiler
+# warnings for bounds checks involving those possible values. While
+# -Wno-alloc-size-larger-than would normally be used here, earlier versions
+# of gcc (<9.1) weirdly don't handle the option correctly when _other_
+# warnings are produced (?!). Using -Walloc-size-larger-than=SIZE_MAX
+# doesn't work (as it is documented to), silently resolving to "0" prior to
+# version 9.1 (and producing an error more recently). Numeric values larger
+# than PTRDIFF_MAX also don't work prior to version 9.1; they are silently
+# ignored, continuing to default to PTRDIFF_MAX. So, left with no other
+# choice, we must perform a versioned check to disable this warning.
+# https://lore.kernel.org/lkml/20210824115859.187f272f@canb.auug.org.au
+KBUILD_CFLAGS += $(call cc-ifversion, -ge, 0901, -Wno-alloc-size-larger-than)
+endif
+
 # disable invalid "can't wrap" optimizations for signed / pointers
 KBUILD_CFLAGS  += -fno-strict-overflow
 
@@ -1036,6 +1013,7 @@ KBUILD_CPPFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
 
 # include additional Makefiles when needed
 include-y                      := scripts/Makefile.extrawarn
+include-$(CONFIG_DEBUG_INFO)   += scripts/Makefile.debug
 include-$(CONFIG_KASAN)                += scripts/Makefile.kasan
 include-$(CONFIG_KCSAN)                += scripts/Makefile.kcsan
 include-$(CONFIG_UBSAN)                += scripts/Makefile.ubsan
index c2302017403a9de90195ad1281809eef9c210c77..345d79df24bb99856e27eda600c053efcc912bca 100644 (file)
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y                  += kernel/ mm/
 obj-$(CONFIG_MATHEMU)  += math-emu/
+
+# for cleaning
+subdir- += boot
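The ``# for cleaning`` hunks repeated across architectures in this merge all
rely on the same Kbuild convention, sketched here (an annotated copy of the
fragment, not new code)::

	# arch/<arch>/Kbuild: boot/ contributes nothing to vmlinux (no obj-y
	# entry), but listing it in subdir- makes the generic "make clean"
	# descend into it, which is what lets the per-arch archclean targets
	# be removed below.
	subdir- += boot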
index 52529ee42dac9d5689e4caf2a2b219be71b685eb..881cb913e23abbc416ffdee4e744334d0d989955 100644 (file)
@@ -55,9 +55,6 @@ $(boot)/vmlinux.gz: vmlinux
 bootimage bootpfile bootpzfile: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/alpha/kernel/syscalls all
 
index 72af1e72d833193a678bb1a1122c810d67dcfd31..6b8ed12936b6f131c4276d2d1283d876d42846a6 100644 (file)
@@ -233,7 +233,7 @@ albacore_init_arch(void)
                        unsigned long size;
 
                        size = initrd_end - initrd_start;
-                       memblock_free(__pa(initrd_start), PAGE_ALIGN(size));
+                       memblock_free((void *)initrd_start, PAGE_ALIGN(size));
                        if (!move_initrd(pci_mem))
                                printk("irongate_init_arch: initrd too big "
                                       "(%ldK)\ndisabling initrd\n",
index 699d8cae9b1fcd4876b9580efd81d37c624f135b..b94102fff68b454ceadfa7b87b2c49d512fdcf79 100644 (file)
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-y += kernel/
 obj-y += mm/
+
+# for cleaning
+subdir- += boot
index 8782a03f24a8e6f974e0d681d5ebc9c4c594781e..f252e7b924e96229a419cb84a010701880257320 100644 (file)
@@ -112,6 +112,3 @@ uImage: $(uimage-default-y)
        @$(kecho) '  Image $(boot)/uImage is ready'
 
 CLEAN_FILES += $(boot)/uImage
-
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
index 699ecf1196414b657da35107d87f62de10529f84..ce4e939a7f077c1d9cd15574e313b7f6b3243616 100644 (file)
@@ -59,13 +59,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 
                low_mem_sz = size;
                in_use = 1;
-               memblock_add_node(base, size, 0);
+               memblock_add_node(base, size, 0, MEMBLOCK_NONE);
        } else {
 #ifdef CONFIG_HIGHMEM
                high_mem_start = base;
                high_mem_sz = size;
                in_use = 1;
-               memblock_add_node(base, size, 1);
+               memblock_add_node(base, size, 1, MEMBLOCK_NONE);
                memblock_reserve(base, size);
 #endif
        }
@@ -173,7 +173,7 @@ static void __init highmem_init(void)
 #ifdef CONFIG_HIGHMEM
        unsigned long tmp;
 
-       memblock_free(high_mem_start, high_mem_sz);
+       memblock_phys_free(high_mem_start, high_mem_sz);
        for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++)
                free_highmem_page(pfn_to_page(tmp));
 #endif
index 5208f7061524bf957e666bd68cc9134eeccd492c..b506622e7e23a50311fe3a41dc01861025163236 100644 (file)
@@ -9,3 +9,6 @@ obj-y                           += kernel/ mm/ common/
 obj-y                          += probes/
 obj-y                          += net/
 obj-y                          += crypto/
+
+# for cleaning
+subdir- += boot
index 1c540157e2831afde542c85005f1f503963dd437..a522716565c6fc733355526e91673da337d97db1 100644 (file)
@@ -318,10 +318,6 @@ ifeq ($(CONFIG_VDSO),y)
        $(Q)$(MAKE) $(build)=arch/arm/vdso $@
 endif
 
-# We use MRPROPER_FILES and CLEAN_FILES now
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 # My testing targets (bypasses dependencies)
 bp:;   $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/bootpImage
 
index 96a484095194db4e24a68eabdaf1bd928a1f9c37..258586e31333e56c17c029e30cf7ca42a95c3ea4 100644 (file)
@@ -339,7 +339,7 @@ err_fabric:
 err_sysctrl:
        iounmap(relocation);
 err_reloc:
-       memblock_free(hip04_boot_method[0], hip04_boot_method[1]);
+       memblock_phys_free(hip04_boot_method[0], hip04_boot_method[1]);
 err:
        return ret;
 }
index 6162a070a4104a2650ff2ab356f187db66d0654f..6d0cb0f7bc54bd1c66b7215ae8e2711de3105efb 100644 (file)
@@ -158,7 +158,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
                panic("Failed to steal %pa bytes at %pS\n",
                      &size, (void *)_RET_IP_);
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
 
        return phys;
index ea7ab4ca81f92dea6dc94948d6a3c187b65d826d..5bfbf7d79c99bec11d19b3551600318af2c07023 100644 (file)
@@ -4,3 +4,6 @@ obj-$(CONFIG_KVM)       += kvm/
 obj-$(CONFIG_XEN)      += xen/
 obj-$(subst m,y,$(CONFIG_HYPERV))      += hyperv/
 obj-$(CONFIG_CRYPTO)   += crypto/
+
+# for cleaning
+subdir- += boot
index 176d6fddc4f263b3fea5c90a5e8b02736e63796b..c4207cf9bb17ffb28147c51ce6a38ed54c01c4ff 100644 (file)
@@ -1163,6 +1163,10 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
        def_bool y
        depends on NUMA
 
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
+       def_bool y
+       depends on NUMA
+
 source "kernel/Kconfig.hz"
 
 config ARCH_SPARSEMEM_ENABLE
index c744b1e7b3569773af6d4a0a074d5883725a9922..e8cfc5868aa8eefb5d5ae91e0f79a75044085733 100644 (file)
@@ -182,13 +182,6 @@ ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS),y)
   endif
 endif
 
-
-# We use MRPROPER_FILES and CLEAN_FILES now
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-       $(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso
-       $(Q)$(MAKE) $(clean)=arch/arm64/kernel/vdso32
-
 ifeq ($(KBUILD_EXTMOD),)
 # We need to generate vdso-offsets.h before compiling certain files in kernel/.
 # In order to do that, we should use the archprepare target, but we can't since
index 3f1490bfb938a0c064b72710c5e85db0084e7e19..88b3e2a214084522f079a7928f44abdac2c2ba40 100644 (file)
@@ -81,3 +81,6 @@ extra-y                                       += $(head-y) vmlinux.lds
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
 endif
+
+# for cleaning
+subdir- += vdso vdso32
index 0941180a86d34cb74399ddcac3713fe526379e6c..29490be2546bfce91862735e2b66a83c8d682afe 100644 (file)
@@ -9,6 +9,8 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)       += xor-neon.o
 CFLAGS_REMOVE_xor-neon.o       += -mgeneral-regs-only
 CFLAGS_xor-neon.o              += -ffreestanding
+# Enable <arm_neon.h>
+CFLAGS_xor-neon.o              += -isystem $(shell $(CC) -print-file-name=include)
 endif
 
 lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
index 61b52a92b8b68b0d6a9d72bd816f6483bcb7c175..5b996ca4d99609a3b1588acca9d602fb1d0f8523 100644 (file)
@@ -287,6 +287,22 @@ static void __init kasan_init_depth(void)
        init_task.kasan_depth = 0;
 }
 
+#ifdef CONFIG_KASAN_VMALLOC
+void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size)
+{
+       unsigned long shadow_start, shadow_end;
+
+       if (!is_vmalloc_or_module_addr(start))
+               return;
+
+       shadow_start = (unsigned long)kasan_mem_to_shadow(start);
+       shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+       shadow_end = (unsigned long)kasan_mem_to_shadow(start + size);
+       shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+       kasan_map_populate(shadow_start, shadow_end, NUMA_NO_NODE);
+}
+#endif
+
 void __init kasan_init(void)
 {
        kasan_init_shadow();
index fd85b51b9d50fc6dcf001fd6b96c79c92d95469c..d77bf06d6a6d95378e5bae16f5c06298d41da936 100644 (file)
@@ -738,8 +738,8 @@ void __init paging_init(void)
        cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
        init_mm.pgd = swapper_pg_dir;
 
-       memblock_free(__pa_symbol(init_pg_dir),
-                     __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
+       memblock_phys_free(__pa_symbol(init_pg_dir),
+                          __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
 
        memblock_allow_resize();
 }
index a4e40e534e6a84db241abfe5076962a90f8a71bd..4e39f7abdeb6dc90d4018a927f11f139515a4ce8 100644 (file)
@@ -1 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+
+# for cleaning
+subdir- += boot
index 37f593a4bf53612dd8ff5492bde0e24bec13f574..86680507763647faafe6f436533f7c15ef90d0d9 100644 (file)
@@ -76,9 +76,6 @@ all: zImage
 zImage Image uImage: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 define archhelp
   echo  '* zImage       - Compressed kernel image (arch/$(ARCH)/boot/zImage)'
   echo  '  Image        - Uncompressed kernel image (arch/$(ARCH)/boot/Image)'
index b2583e7efbd1d91bc58be2d0fbf0faac088367fc..e4703f3534ccafc6e3bd11d789d5d475a0b43398 100644 (file)
@@ -1,2 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y  += kernel/ mm/ boot/dts/
+
+# for cleaning
+subdir- += boot
index eb4cb8f6830c572e3cf609f227f986a8e2ea23b2..807f41e60ee4a5b06f9494f3d4f11ae6aad22151 100644 (file)
@@ -34,9 +34,6 @@ libs-y        += arch/$(ARCH)/lib/
 
 boot := arch/h8300/boot
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 vmlinux.srec vmlinux.bin zImage uImage.bin: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
index 7e548c654a290f286cdce7740bbc4eef7d92f47d..3b3ac3e1f2728145746c436c0d8ed930aa09b18f 100644 (file)
@@ -67,8 +67,6 @@ vmlinux.bin: vmlinux FORCE
 unwcheck: vmlinux
        -$(Q)READELF=$(READELF) $(PYTHON3) $(srctree)/arch/ia64/scripts/unwcheck.py $<
 
-archclean:
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/ia64/kernel/syscalls all
 
index 42e025cfbd088ccb2e1b88ab5d9b46a70fd1005d..24901d809301541863e0c1c4251d0ccc892c6843 100644 (file)
@@ -153,7 +153,7 @@ find_memory (void)
        efi_memmap_walk(find_max_min_low_pfn, NULL);
        max_pfn = max_low_pfn;
 
-       memblock_add_node(0, PFN_PHYS(max_low_pfn), 0);
+       memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE);
 
        find_initrd();
 
index 5c6da8d83c1ade4dc6f7e9a311c95768c216ae32..5d165607bf35471629f4800a0f873a4c055259c2 100644 (file)
@@ -378,7 +378,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid)
 #endif
 
        if (start < end)
-               memblock_add_node(__pa(start), end - start, nid);
+               memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE);
        return 0;
 }
 
index dd0c0ec67f67064d82b22e2636f4d3b1b7c41959..740fc97b9c0f00f0d9c9f56b1ed09c16fa855cf7 100644 (file)
@@ -2,9 +2,7 @@
 # m68k/Makefile
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # This file is subject to the terms and conditions of the GNU General Public
 # License.  See the file "COPYING" in the main directory of this archive
index eac9dde65193443efe6298d170b21dcfb8905682..6f1f251252944b15bceaeb033ad3a323a822bf52 100644 (file)
@@ -174,7 +174,8 @@ void __init cf_bootmem_alloc(void)
        m68k_memory[0].addr = _rambase;
        m68k_memory[0].size = _ramend - _rambase;
 
-       memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
+       memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0,
+                         MEMBLOCK_NONE);
 
        /* compute total pages in system */
        num_pages = PFN_DOWN(_ramend - _rambase);
index 9f3f77785aa78a013ba26b31da01d12a2d5d645f..2b05bb2bac00d8dee7cc893522579df33500e9ac 100644 (file)
@@ -410,7 +410,8 @@ void __init paging_init(void)
 
        min_addr = m68k_memory[0].addr;
        max_addr = min_addr + m68k_memory[0].size;
-       memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
+       memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0,
+                         MEMBLOCK_NONE);
        for (i = 1; i < m68k_num_memory;) {
                if (m68k_memory[i].addr < min_addr) {
                        printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n",
@@ -421,7 +422,8 @@ void __init paging_init(void)
                                (m68k_num_memory - i) * sizeof(struct m68k_mem_info));
                        continue;
                }
-               memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i);
+               memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i,
+                                 MEMBLOCK_NONE);
                addr = m68k_memory[i].addr + m68k_memory[i].size;
                if (addr > max_addr)
                        max_addr = addr;
index a1c5978893198b2a2ef5ee83c1bd4d0a0d1d00fd..077a0b8e961571585988c30435d5ff2568674617 100644 (file)
@@ -3,3 +3,6 @@ obj-y                   += kernel/
 obj-y                  += mm/
 obj-$(CONFIG_PCI)      += pci/
 obj-y                  += boot/dts/
+
+# for cleaning
+subdir- += boot
index 9adc6b6434dfeba62346959413acf6f7c392573c..e775a696aa6fc37dfab79287920b9ca0cfa4697e 100644 (file)
@@ -60,9 +60,6 @@ export DTB
 
 all: linux.bin
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/microblaze/kernel/syscalls all
 
index 557585f1be4180323007c7547ff0c99af3cc9464..622a4867f9e9da0c30fe605ddf6a1eff747385f2 100644 (file)
@@ -587,13 +587,12 @@ static void pcibios_fixup_resources(struct pci_dev *dev)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_fixup_resources);
 
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_device_add(struct pci_dev *dev)
 {
        dev->irq = of_irq_parse_and_map_pci(dev, 0, 0);
 
        return 0;
 }
-EXPORT_SYMBOL(pcibios_add_device);
 
 /*
  * Reparent resource children of pr that conflict with res
index d5d6ef9bb9867835fe221aa5501187212928918a..9e8071f0e58ff1271c0bb118e12704838082aaba 100644 (file)
@@ -25,3 +25,6 @@ obj-y += vdso/
 ifdef CONFIG_KVM
 obj-y += kvm/
 endif
+
+# for cleaning
+subdir- += boot
index ea3cd080a1c7dc328b49aa3aa06a84b6c0c19af4..e036fc025cccb28085c92fb26e260f100d758171 100644 (file)
@@ -8,8 +8,7 @@
 # Copyright (C) 2002, 2003, 2004  Maciej W. Rozycki
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" cleaning up for this architecture.
+# architecture-specific flags and dependencies.
 #
 
 archscripts: scripts_basic
@@ -426,11 +425,6 @@ endif
        $(Q)install -D -m 644 .config $(INSTALL_PATH)/config-$(KERNELRELEASE)
        $(Q)install -D -m 644 System.map $(INSTALL_PATH)/System.map-$(KERNELRELEASE)
 
-archclean:
-       $(Q)$(MAKE) $(clean)=arch/mips/boot
-       $(Q)$(MAKE) $(clean)=arch/mips/boot/compressed
-       $(Q)$(MAKE) $(clean)=arch/mips/boot/tools
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/mips/kernel/syscalls all
 
index a3da2c5d63c2159a8d81d6e00fe7b50addbf6a58..196c44fa72d90c4fb1eac7f92e4bd7e8221c991f 100644 (file)
@@ -171,3 +171,6 @@ $(obj)/vmlinux.itb: $(obj)/vmlinux.its $(obj)/vmlinux.bin FORCE
 
 $(obj)/vmlinux.%.itb: $(obj)/vmlinux.%.its $(obj)/vmlinux.bin.% FORCE
        $(call if_changed,itb-image,$<)
+
+# for cleaning
+subdir- += compressed tools
index f03fc52ed1d68f4bdb68e36c96955aa3de056b75..ee8de1735b7c04095fc345e3bbc330b62612ac26 100644 (file)
@@ -77,7 +77,9 @@ void __init szmem(unsigned int node)
                                (u32)node_id, mem_type, mem_start, mem_size);
                        pr_info("       start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
                                start_pfn, end_pfn, num_physpages);
-                       memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(node_psize), node);
+                       memblock_add_node(PFN_PHYS(start_pfn),
+                                         PFN_PHYS(node_psize), node,
+                                         MEMBLOCK_NONE);
                        break;
                case SYSTEM_RAM_RESERVED:
                        pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n",
index 19347dc6bbf88330eb7329fd2953e78a93b98297..325e1552cbeada70d870ff2179a9ae70bf26c120 100644 (file)
@@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
 {
-       memblock_free_early(__pa(ptr), size);
+       memblock_free(ptr, size);
 }
 
 void __init setup_per_cpu_areas(void)
index c800bf5559b519f96277aa98c78057bc9367df7d..120adad51d6a40aec649e96f9a2af14e442acacf 100644 (file)
@@ -51,7 +51,8 @@ choice
                select SYS_SUPPORTS_HIGHMEM
                select MIPS_GIC
                select CLKSRC_MIPS_GIC
-               select HAVE_PCI if PCI_MT7621
+               select HAVE_PCI
+               select PCI_DRIVERS_GENERIC
                select SOC_BUS
 endchoice
 
index 6173684b5aaa04f881b0b72fce93c39ab026b856..adc2faeecf7c01cda151e3100759e78fec2d2cdd 100644 (file)
@@ -341,7 +341,8 @@ static void __init szmem(void)
                                continue;
                        }
                        memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)),
-                                         PFN_PHYS(slot_psize), node);
+                                         PFN_PHYS(slot_psize), node,
+                                         MEMBLOCK_NONE);
                }
        }
 }
index 44b1607e964ddeb9cf135ed8f083d166dfbe145b..75a34684e7045977a89faa54b1ec740eb13af5ff 100644 (file)
@@ -69,10 +69,10 @@ static void __init ip30_mem_init(void)
                total_mem += size;
 
                if (addr >= IP30_REAL_MEMORY_START)
-                       memblock_free(addr, size);
+                       memblock_phys_free(addr, size);
                else if ((addr + size) > IP30_REAL_MEMORY_START)
-                       memblock_free(IP30_REAL_MEMORY_START,
-                                    size - IP30_MAX_PROM_MEMORY);
+                       memblock_phys_free(IP30_REAL_MEMORY_START,
+                                          size - IP30_MAX_PROM_MEMORY);
        }
        pr_info("Detected %luMB of physical memory.\n", MEM_SHIFT(total_mem));
 }
index a4e40e534e6a84db241abfe5076962a90f8a71bd..4e39f7abdeb6dc90d4018a927f11f139515a4ce8 100644 (file)
@@ -1 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+
+# for cleaning
+subdir- += boot
index ccdca714202019884ccbf794923291dce728bc46..797ad9b450af210562d526fac5f1b5a6f4d4bba9 100644 (file)
@@ -9,6 +9,8 @@ endif
 # Avoid generating FPU instructions
 arch-y  += -mno-ext-fpu-sp -mno-ext-fpu-dp -mfloat-abi=soft
 
+# Enable <nds32_intrinsic.h>
+KBUILD_CFLAGS  += -isystem $(shell $(CC) -print-file-name=include)
 KBUILD_CFLAGS  += $(call cc-option, -mno-sched-prolog-epilog)
 KBUILD_CFLAGS  += -mcmodel=large
 
@@ -62,9 +64,6 @@ prepare: vdso_prepare
 vdso_prepare: prepare0
        $(Q)$(MAKE) $(build)=arch/nds32/kernel/vdso include/generated/vdso-offsets.h
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 define archhelp
   echo  '  Image         - kernel image (arch/$(ARCH)/boot/Image)'
 endef
index a4e40e534e6a84db241abfe5076962a90f8a71bd..4e39f7abdeb6dc90d4018a927f11f139515a4ce8 100644 (file)
@@ -1 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+
+# for cleaning
+subdir- += boot
index 52c03e60b114d4ffa25dd361b001472096b5ba13..02d678559066f18c8c6b2dfd672f88b71430063d 100644 (file)
@@ -8,8 +8,7 @@
 # Written by Fredrik Markstrom
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" cleaning up for this architecture.
+# architecture-specific flags and dependencies.
 #
 # Nios2 port by Wind River Systems Inc trough:
 #   fredrik.markstrom@gmail.com and ivarholmqvist@gmail.com
@@ -53,14 +52,12 @@ core-y      += $(nios2-boot)/dts/
 
 all: vmImage
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(nios2-boot)
-
 $(BOOT_TARGETS): vmlinux
        $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@
 
 install:
-       $(Q)$(MAKE) $(build)=$(nios2-boot) BOOTIMAGE=$(KBUILD_IMAGE) install
+       sh $(srctree)/$(nios2-boot)/install.sh $(KERNELRELEASE) \
+       $(KBUILD_IMAGE) System.map "$(INSTALL_PATH)"
 
 define archhelp
   echo  '* vmImage         - Kernel-only image for U-Boot ($(KBUILD_IMAGE))'
index 37dfc7e584bce3d8080512e99e6e9062e9ebf9de..8c3ad76602f3e4a3aaa11f22cfcc38eb969c9989 100644 (file)
@@ -30,6 +30,3 @@ $(obj)/zImage: $(obj)/compressed/vmlinux FORCE
 
 $(obj)/compressed/vmlinux: $(obj)/vmlinux.gz FORCE
        $(Q)$(MAKE) $(build)=$(obj)/compressed $@
-
-install:
-       sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
index 4234b4c03e725af424ddacdd3bf742ea584cd452..b0b0f2b03f872b83b2c5b2e4ce557f000ad5dcfa 100644 (file)
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-y += lib/ kernel/ mm/
 obj-y += boot/dts/
+
+# for cleaning
+subdir- += boot
index c52de526e51899b39ae9e67ba68794efa7e9a63c..760b734fb82277b2845dc8c121c6293e90df9377 100644 (file)
@@ -1,9 +1,7 @@
 # BK Id: %F% %I% %G% %U% %#%
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # This file is subject to the terms and conditions of the GNU General Public
 # License.  See the file "COPYING" in the main directory of this archive
@@ -48,6 +46,3 @@ PHONY += vmlinux.bin
 
 vmlinux.bin: vmlinux
        $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
-
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
index 3c068b700a8103fe3d4d3fb8902b4db86612cc89..a6d3b280ba0c20466771ad697b5f480b3946e475 100644 (file)
@@ -1,2 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y  += mm/ kernel/ math-emu/
+
+# for cleaning
+subdir- += boot
index fcde3ffa02213f669dbd4549422d31d7d1f29863..8db4af4879d02f63c84a72e9304a71586bf70f7c 100644 (file)
@@ -2,9 +2,7 @@
 # parisc/Makefile
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # This file is subject to the terms and conditions of the GNU General Public
 # License.  See the file "COPYING" in the main directory of this archive
@@ -181,8 +179,5 @@ define archhelp
        @echo  '  zinstall      - Install compressed vmlinuz kernel'
 endef
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/parisc/kernel/syscalls all
index 5e2f9eaa3ee7d573c1371985efc20c2a1cbb3194..22cd0d55a8924abd9d5863d8b8c2487bc257ec19 100644 (file)
@@ -16,3 +16,6 @@ obj-$(CONFIG_KVM)  += kvm/
 obj-$(CONFIG_PERF_EVENTS) += perf/
 obj-$(CONFIG_KEXEC_CORE)  += kexec/
 obj-$(CONFIG_KEXEC_FILE)  += purgatory/
+
+# for cleaning
+subdir- += boot
index 54cad1faa5d071dd4b0b0a8281187df2382def96..e02568f1733417d53cab2a1569d59e040eb758b4 100644 (file)
@@ -1,7 +1,5 @@
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture.
+# architecture-specific flags and dependencies.
 #
 # This file is subject to the terms and conditions of the GNU General Public
 # License.  See the file "COPYING" in the main directory of this archive
@@ -411,9 +409,6 @@ install:
        sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \
        System.map "$(INSTALL_PATH)"
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 ifeq ($(KBUILD_EXTMOD),)
 # We need to generate vdso-offsets.h before compiling certain files in kernel/.
 # In order to do that, we should use the archprepare target, but we can't since
index 396d508730a1cc17a5e8cbf6a760d2ee478e230f..f491875700e8a317b7469c395a47534719d98aa3 100644 (file)
@@ -274,7 +274,6 @@ CONFIG_NLS_UTF8=y
 CONFIG_ENCRYPTED_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_HARDENED_USERCOPY=y
-# CONFIG_HARDENED_USERCOPY_FALLBACK is not set
 CONFIG_HARDENED_USERCOPY_PAGESPAN=y
 CONFIG_FORTIFY_SOURCE=y
 CONFIG_SECURITY_LOCKDOWN_LSM=y
index a68311077d320e247e64ba4725e9fd0afd4f965e..9c3c9f04129ff65c53b25f23132c26a85d0d3ded 100644 (file)
@@ -31,7 +31,7 @@ struct machdep_calls {
 #ifdef CONFIG_PM
        void            (*iommu_restore)(void);
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
        unsigned long   (*memory_block_size)(void);
 #endif
 #endif /* CONFIG_PPC64 */
index 2b9edbf6e929b86cd6859ec9c1211f2bdaccd8ac..f6cf0159024e7b49dc499e93b28bee933e35f1a6 100644 (file)
@@ -55,11 +55,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
 
-static inline const char *eeh_driver_name(struct pci_dev *pdev)
-{
-       return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
-}
-
 #endif /* CONFIG_EEH */
 
 #define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff)
index 6e4af4492a14490335d78d139872e4f469d46449..79cb7a25a5fb69cc97b78f8829b961263866974a 100644 (file)
@@ -6,21 +6,8 @@
 #include <linux/elf.h>
 #include <linux/uaccess.h>
 
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
 #include <asm-generic/sections.h>
 
-extern bool init_mem_is_free;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-       if (!init_mem_is_free)
-               return 0;
-
-       return addr >= (unsigned long)__init_begin &&
-               addr < (unsigned long)__init_end;
-}
-
 extern char __head_end[];
 
 #ifdef __powerpc64__
index 358aee7c2d79acdcfc21976bc4ac1e10d64bf04a..ba527fb529931bb700ba15a944d13ea5deeffb52 100644 (file)
@@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
 
        cpufeatures_setup_finished();
 
-       memblock_free(__pa(dt_cpu_features),
-                       sizeof(struct dt_cpu_feature)*nr_dt_cpu_features);
+       memblock_free(dt_cpu_features,
+                     sizeof(struct dt_cpu_feature) * nr_dt_cpu_features);
 
        return 0;
 }
index 91e0f4cf1db30fc8b48dd39ececb425f28f76465..28bb1e7263a6c6a3c8fffb9f285f5c0cdf881fae 100644 (file)
@@ -399,6 +399,14 @@ out:
        return ret;
 }
 
+static inline const char *eeh_driver_name(struct pci_dev *pdev)
+{
+       if (pdev)
+               return dev_driver_string(&pdev->dev);
+
+       return "<null>";
+}
+
 /**
  * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
  * @edev: eeh device
index 3eff6a4888e79aa4b0ccdb1f2add248f41cfc736..350dab18e13732bf2d598922c8bdc38b6f879e11 100644 (file)
@@ -104,13 +104,13 @@ static bool eeh_edev_actionable(struct eeh_dev *edev)
  */
 static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
 {
-       if (!pdev || !pdev->driver)
+       if (!pdev || !pdev->dev.driver)
                return NULL;
 
-       if (!try_module_get(pdev->driver->driver.owner))
+       if (!try_module_get(pdev->dev.driver->owner))
                return NULL;
 
-       return pdev->driver;
+       return to_pci_driver(pdev->dev.driver);
 }
 
 /**
@@ -122,10 +122,10 @@ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
  */
 static inline void eeh_pcid_put(struct pci_dev *pdev)
 {
-       if (!pdev || !pdev->driver)
+       if (!pdev || !pdev->dev.driver)
                return;
 
-       module_put(pdev->driver->driver.owner);
+       module_put(pdev->dev.driver->owner);
 }
 
 /**
index 9bd30cac852bfefd551763e28bb06e9031b3a32d..4208b4044d12ede912293a473d8ce05b31fa1f27 100644 (file)
@@ -322,8 +322,8 @@ void __init free_unused_pacas(void)
 
        new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
        if (new_ptrs_size < paca_ptrs_size)
-               memblock_free(__pa(paca_ptrs) + new_ptrs_size,
-                                       paca_ptrs_size - new_ptrs_size);
+               memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size,
+                                  paca_ptrs_size - new_ptrs_size);
 
        paca_nr_cpu_ids = nr_cpu_ids;
        paca_ptrs_size = new_ptrs_size;
@@ -331,8 +331,8 @@ void __init free_unused_pacas(void)
 #ifdef CONFIG_PPC_BOOK3S_64
        if (early_radix_enabled()) {
                /* Ugly fixup, see new_slb_shadow() */
-               memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
-                               sizeof(struct slb_shadow));
+               memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
+                                  sizeof(struct slb_shadow));
                paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL;
        }
 #endif
index c3573430919d275d4f00496bb5de52a99e86c5f6..6749905932f45c67b7cfe4ab0e93c5ae984464f8 100644 (file)
@@ -1059,7 +1059,7 @@ void pcibios_bus_add_device(struct pci_dev *dev)
                ppc_md.pcibios_bus_add_device(dev);
 }
 
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_device_add(struct pci_dev *dev)
 {
        struct irq_domain *d;
 
index 0b7894eed58d7480041515f8397d877ac79218fe..4f1322b657603d1c6b37f4916a55198467a3e1e3 100644 (file)
@@ -822,7 +822,7 @@ static void __init smp_setup_pacas(void)
                set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]);
        }
 
-       memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32));
+       memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32));
        cpu_to_phys_id = NULL;
 }
 #endif
index eaa79a0996d1b5307b2e2635f633c89891f3e5d0..6052f5d5ded343ae901ae27836ccd8e789096a5d 100644 (file)
@@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
 
 static void __init pcpu_free_bootmem(void *ptr, size_t size)
 {
-       memblock_free(__pa(ptr), size);
+       memblock_free(ptr, size);
 }
 
 static int pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -912,7 +912,7 @@ void __init setup_per_cpu_areas(void)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 unsigned long memory_block_size_bytes(void)
 {
        if (ppc_md.memory_block_size)
index 99a7c9132422ccdfc0958c0ab94b0f57d43ed6f5..9e5d0f413b71293e1797b12fdcaac5d9e03a50cc 100644 (file)
@@ -65,5 +65,7 @@ obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
 
 obj-$(CONFIG_ALTIVEC)  += xor_vmx.o xor_vmx_glue.o
 CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec)
+# Enable <altivec.h>
+CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include)
 
 obj-$(CONFIG_PPC64) += $(obj64-y)
index 9a75ba078e1b3741e5a7305260790a61df57c326..82d8b368ca6d4429829e40f0f2612790b89b51b9 100644 (file)
@@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
        m->hstate = hstate;
        return 1;
 }
+
+bool __init hugetlb_node_alloc_supported(void)
+{
+       return false;
+}
 #endif
 
 
-int __init alloc_bootmem_huge_page(struct hstate *h)
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
 {
 
 #ifdef CONFIG_PPC_BOOK3S_64
        if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
                return pseries_alloc_bootmem_huge_page(h);
 #endif
-       return __alloc_bootmem_huge_page(h);
+       return __alloc_bootmem_huge_page(h, nid);
 }
 
 #ifndef CONFIG_PPC_BOOK3S_64
index 3dd35c327d1c53bae493d209e2d0ab17b61af8a0..004cd6a96c8a03d85cb17d6c0cacbed104654171 100644 (file)
@@ -2981,7 +2981,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
        if (!phb->hose) {
                pr_err("  Can't allocate PCI controller for %pOF\n",
                       np);
-               memblock_free(__pa(phb), sizeof(struct pnv_phb));
+               memblock_free(phb, sizeof(struct pnv_phb));
                return;
        }
 
index deddbb233fde5a888b8f86082e84120c7bece61d..04155aaaadb1d2a51f133db8b9920359dcfeecee 100644 (file)
@@ -51,7 +51,7 @@
  * to "new_size", calculated above. Implementing this is a convoluted process
  * which requires several hooks in the PCI core:
  *
- * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov().
+ * 1. In pcibios_device_add() we call pnv_pci_ioda_fixup_iov().
  *
  *    At this point the device has been probed and the device's BARs are sized,
  *    but no resource allocations have been done. The SR-IOV BARs are sized
index a8db3f153063946fa42e035e39d9bef9bb6e80e5..ad56a54ac9c574e913a078f9c8f2ee6bee8b9688 100644 (file)
@@ -440,7 +440,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 static unsigned long pnv_memory_block_size(void)
 {
        /*
@@ -553,7 +553,7 @@ define_machine(powernv) {
 #ifdef CONFIG_KEXEC_CORE
        .kexec_cpu_down         = pnv_kexec_cpu_down,
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
        .memory_block_size      = pnv_memory_block_size,
 #endif
 };
index 2188054470c12213924c7ad4bca17f461e4a4eab..8a62af5b9c243a7468dd1e102af2566335377714 100644 (file)
@@ -1088,7 +1088,7 @@ define_machine(pseries) {
        .machine_kexec          = pSeries_machine_kexec,
        .kexec_cpu_down         = pseries_kexec_cpu_down,
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
        .memory_block_size      = pseries_memory_block_size,
 #endif
 };
index c083ecbbae4d6870da9cd084b56f17480419f42d..c5228f4969eb232471d495424a60c8d2ce2b8aa2 100644 (file)
@@ -57,8 +57,7 @@ void __init svm_swiotlb_init(void)
                return;
 
 
-       memblock_free_early(__pa(vstart),
-                           PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+       memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
        panic("SVM: Cannot allocate SWIOTLB buffer");
 }
 
index 4614c01ba5b32e9a149e752ec4f281b2db4123f1..fb3397223d5204d11a909ee11f88dfee0f967c84 100644 (file)
@@ -2,3 +2,6 @@
 
 obj-y += kernel/ mm/ net/
 obj-$(CONFIG_BUILTIN_DTB) += boot/dts/
+
+# for cleaning
+subdir- += boot
index 58c1a28e20bb4a459a4535dc518fbefc21c30c30..7f19b784e649a5d6ce6f63941b1b21e94828f7c7 100644 (file)
@@ -1,7 +1,5 @@
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # This file is subject to the terms and conditions of the GNU General Public
 # License.  See the file "COPYING" in the main directory of this archive
@@ -138,6 +136,3 @@ zinstall: install-image = Image.gz
 install zinstall:
        $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh $(KERNELRELEASE) \
        $(boot)/$(install-image) System.map "$(INSTALL_PATH)"
-
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
index b9620e5f00baf300b0286b7a5090f76313854a3a..b42bfdc674823cec93ab3342235f12c5b867b611 100644 (file)
@@ -230,13 +230,13 @@ static void __init init_resources(void)
 
        /* Clean-up any unused pre-allocated resources */
        if (res_idx >= 0)
-               memblock_free(__pa(mem_res), (res_idx + 1) * sizeof(*mem_res));
+               memblock_free(mem_res, (res_idx + 1) * sizeof(*mem_res));
        return;
 
  error:
        /* Better an empty resource tree than an inconsistent one */
        release_child_resources(&iomem_resource);
-       memblock_free(__pa(mem_res), mem_res_sz);
+       memblock_free(mem_res, mem_res_sz);
 }
 
 
index 8b98c501142df15026187aef1e1163f60bd9cbf7..76e36227717916d15873b41e4df1bd8fad244e3d 100644 (file)
@@ -8,3 +8,6 @@ obj-$(CONFIG_APPLDATA_BASE)     += appldata/
 obj-y                          += net/
 obj-$(CONFIG_PCI)              += pci/
 obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += purgatory/
+
+# for cleaning
+subdir- += boot tools
index b86de61b8caa2f2909b4c569c3f574942c07652a..8857ec3b97eb887d4bc78ecc8462b942b2b62071 100644 (file)
@@ -153,12 +153,15 @@ config S390
        select HAVE_DEBUG_KMEMLEAK
        select HAVE_DMA_CONTIGUOUS
        select HAVE_DYNAMIC_FTRACE
+       select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+       select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
        select HAVE_DYNAMIC_FTRACE_WITH_REGS
        select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select HAVE_FAST_GUP
        select HAVE_FENTRY
        select HAVE_FTRACE_MCOUNT_RECORD
+       select HAVE_FUNCTION_ARG_ACCESS_API
        select HAVE_FUNCTION_ERROR_INJECTION
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER
@@ -190,6 +193,7 @@ config S390
        select HAVE_REGS_AND_STACK_ACCESS_API
        select HAVE_RELIABLE_STACKTRACE
        select HAVE_RSEQ
+       select HAVE_SAMPLE_FTRACE_DIRECT
        select HAVE_SOFTIRQ_ON_OWN_STACK
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_VIRT_CPU_ACCOUNTING
@@ -434,6 +438,14 @@ endchoice
 config 64BIT
        def_bool y
 
+config COMMAND_LINE_SIZE
+       int "Maximum size of kernel command line"
+       default 4096
+       range 896 1048576
+       help
+         This allows you to specify the maximum length of the kernel command
+         line.
+
 config COMPAT
        def_bool y
        prompt "Kernel support for 31 bit emulation"
@@ -938,6 +950,8 @@ menu "Selftests"
 
 config S390_UNWIND_SELFTEST
        def_tristate n
+       depends on KUNIT
+       default KUNIT_ALL_TESTS
        prompt "Test unwind functions"
        help
          This option enables s390 specific stack unwinder testing kernel
@@ -946,4 +960,16 @@ config S390_UNWIND_SELFTEST
 
          Say N if you are unsure.
 
+config S390_KPROBES_SANITY_TEST
+       def_tristate n
+       prompt "Enable s390 specific kprobes tests"
+       depends on KPROBES
+       depends on KUNIT
+       help
+         This option enables an s390 specific kprobes test module. This option
+         is not useful for distributions or general kernels, but only for kernel
+         developers working on architecture code.
+
+         Say N if you are unsure.
+
 endmenu
index 450b351dfa8ef37c238443cbf5c4b84876aaeaba..69c45f600273bea6132060f84ffca10587a37889 100644 (file)
@@ -3,9 +3,7 @@
 # s390/Makefile
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 #
 # Copyright (C) 1994 by Linus Torvalds
 #
@@ -147,10 +145,6 @@ zfcpdump:
 vdso_install:
        $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-       $(Q)$(MAKE) $(clean)=$(tools)
-
 archheaders:
        $(Q)$(MAKE) $(build)=$(syscalls) uapi
 
index a59f75c5b04903828ad83165932080931fd2fdc1..f75cc31a77dd9f38ecf5fd27ee01a1733f1bb1ca 100644 (file)
@@ -24,6 +24,7 @@ struct vmlinux_info {
        unsigned long dynsym_start;
        unsigned long rela_dyn_start;
        unsigned long rela_dyn_end;
+       unsigned long amode31_size;
 };
 
 /* Symbols defined by linker scripts */
index 40f4cff538b8d830b9912c92037feef8c8acee88..3a252d140c55fb15471c241f39785fc055d8a0b0 100644 (file)
@@ -184,35 +184,23 @@ iplstart:
        bas     %r14,.Lloader           # load parameter file
        ltr     %r2,%r2                 # got anything ?
        bz      .Lnopf
-       chi     %r2,895
-       bnh     .Lnotrunc
-       la      %r2,895
+       l       %r3,MAX_COMMAND_LINE_SIZE+ARCH_OFFSET-PARMAREA(%r12)
+       ahi     %r3,-1
+       clr     %r2,%r3
+       bl      .Lnotrunc
+       lr      %r2,%r3
 .Lnotrunc:
        l       %r4,.Linitrd
        clc     0(3,%r4),.L_hdr         # if it is HDRx
        bz      .Lagain1                # skip dataset header
        clc     0(3,%r4),.L_eof         # if it is EOFx
        bz      .Lagain1                # skip dateset trailer
-       la      %r5,0(%r4,%r2)
-       lr      %r3,%r2
-       la      %r3,COMMAND_LINE-PARMAREA(%r12) # load adr. of command line
-       mvc     0(256,%r3),0(%r4)
-       mvc     256(256,%r3),256(%r4)
-       mvc     512(256,%r3),512(%r4)
-       mvc     768(122,%r3),768(%r4)
-       slr     %r0,%r0
-       b       .Lcntlp
-.Ldelspc:
-       ic      %r0,0(%r2,%r3)
-       chi     %r0,0x20                # is it a space ?
-       be      .Lcntlp
-       ahi     %r2,1
-       b       .Leolp
-.Lcntlp:
-       brct    %r2,.Ldelspc
-.Leolp:
-       slr     %r0,%r0
-       stc     %r0,0(%r2,%r3)          # terminate buffer
+
+       lr      %r5,%r2
+       la      %r6,COMMAND_LINE-PARMAREA(%r12)
+       lr      %r7,%r2
+       ahi     %r7,1
+       mvcl    %r6,%r4
 .Lnopf:
 
 #
@@ -317,6 +305,7 @@ SYM_CODE_START_LOCAL(startup_normal)
        xc      0x300(256),0x300
        xc      0xe00(256),0xe00
        xc      0xf00(256),0xf00
+       lctlg   %c0,%c15,.Lctl-.LPG0(%r13)      # load control registers
        stcke   __LC_BOOT_CLOCK
        mvc     __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1
        spt     6f-.LPG0(%r13)
@@ -335,6 +324,22 @@ SYM_CODE_END(startup_normal)
        .quad   0x0000000180000000,startup_pgm_check_handler
 .Lio_new_psw:
        .quad   0x0002000180000000,0x1f0        # disabled wait
+.Lctl: .quad   0x04040000              # cr0: AFP registers & secondary space
+       .quad   0                       # cr1: primary space segment table
+       .quad   0                       # cr2: dispatchable unit control table
+       .quad   0                       # cr3: instruction authorization
+       .quad   0xffff                  # cr4: instruction authorization
+       .quad   0                       # cr5: primary-aste origin
+       .quad   0                       # cr6:  I/O interrupts
+       .quad   0                       # cr7:  secondary space segment table
+       .quad   0x0000000000008000      # cr8:  access registers translation
+       .quad   0                       # cr9:  tracing off
+       .quad   0                       # cr10: tracing off
+       .quad   0                       # cr11: tracing off
+       .quad   0                       # cr12: tracing off
+       .quad   0                       # cr13: home space segment table
+       .quad   0xc0000000              # cr14: machine check handling off
+       .quad   0                       # cr15: linkage stack operations
 
 #include "head_kdump.S"
 
@@ -377,11 +382,10 @@ SYM_DATA_START(parmarea)
        .quad   0                       # OLDMEM_BASE
        .quad   0                       # OLDMEM_SIZE
        .quad   kernel_version          # points to kernel version string
+       .quad   COMMAND_LINE_SIZE
 
        .org    COMMAND_LINE
        .byte   "root=/dev/ram0 ro"
        .byte   0
        .org    PARMAREA+__PARMAREA_SIZE
 SYM_DATA_END(parmarea)
-
-       .org    HEAD_END
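
The rewritten iplstart sequence above drops the fixed 896-byte MVC copy and the trailing-space trimming loop: it reads the limit from parmarea.max_command_line_size, clamps the loaded length, and lets a single MVCL do the copy with zero padding. A minimal C sketch of the equivalent logic (names are illustrative, not part of the patch):

    #include <linux/string.h>

    /* Sketch: clamp the parameter-file length to the parm-area limit and
     * copy it as the command line; in the assembly, MVCL's pad byte
     * supplies the terminating zero. */
    static void copy_ipl_parmfile(char *cmdline, const char *parmfile,
                                  unsigned long len, unsigned long max_cmdline)
    {
            if (len > max_cmdline - 1)      /* keep room for the terminator */
                    len = max_cmdline - 1;
            memcpy(cmdline, parmfile, len);
            cmdline[len] = '\0';
    }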
index 0f84c072625e07462deba1705dd90ba9c1b1e403..9ed7e29c81d9a0ec9e167f9b2e9d6fcf067c6c74 100644
@@ -170,10 +170,10 @@ static inline int has_ebcdic_char(const char *str)
 
 void setup_boot_command_line(void)
 {
-       parmarea.command_line[ARCH_COMMAND_LINE_SIZE - 1] = 0;
+       parmarea.command_line[COMMAND_LINE_SIZE - 1] = 0;
        /* convert arch command line to ascii if necessary */
        if (has_ebcdic_char(parmarea.command_line))
-               EBCASC(parmarea.command_line, ARCH_COMMAND_LINE_SIZE);
+               EBCASC(parmarea.command_line, COMMAND_LINE_SIZE);
        /* copy arch command line */
        strcpy(early_command_line, strim(parmarea.command_line));
 
index 75bcbfa279418faca1f48f280b3692d544136d95..c2a1defc79daf206c0a7c5e8d96abf9d957c269f 100644
@@ -175,6 +175,6 @@ void print_pgm_check_info(void)
                            gpregs[12], gpregs[13], gpregs[14], gpregs[15]);
        print_stacktrace();
        decompressor_printk("Last Breaking-Event-Address:\n");
-       decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.breaking_event_addr,
-                           (void *)S390_lowcore.breaking_event_addr);
+       decompressor_printk(" [<%016lx>] %pS\n", (unsigned long)S390_lowcore.pgm_last_break,
+                           (void *)S390_lowcore.pgm_last_break);
 }
index 6dc8d0a53864005800f7fdac618f1f879803d689..7571dee72a0cdd7c4fec06cae66b8479a92b6667 100644
@@ -15,6 +15,7 @@
 #include "uv.h"
 
 unsigned long __bootdata_preserved(__kaslr_offset);
+unsigned long __bootdata(__amode31_base);
 unsigned long __bootdata_preserved(VMALLOC_START);
 unsigned long __bootdata_preserved(VMALLOC_END);
 struct page *__bootdata_preserved(vmemmap);
@@ -259,6 +260,12 @@ static void offset_vmlinux_info(unsigned long offset)
        vmlinux.dynsym_start += offset;
 }
 
+static unsigned long reserve_amode31(unsigned long safe_addr)
+{
+       __amode31_base = PAGE_ALIGN(safe_addr);
+       return safe_addr + vmlinux.amode31_size;
+}
+
 void startup_kernel(void)
 {
        unsigned long random_lma;
@@ -273,6 +280,7 @@ void startup_kernel(void)
        setup_lpp();
        store_ipl_parmblock();
        safe_addr = mem_safe_offset();
+       safe_addr = reserve_amode31(safe_addr);
        safe_addr = read_ipl_report(safe_addr);
        uv_query_info();
        rescue_initrd(safe_addr);
index 6aad18ee131d6634efd7188b9660d723f2ea36a6..fd825097cf048b59d8cc7486ae345e9cbec08b07 100644
@@ -61,7 +61,8 @@ CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
-CONFIG_S390_UNWIND_SELFTEST=y
+CONFIG_S390_UNWIND_SELFTEST=m
+CONFIG_S390_KPROBES_SANITY_TEST=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_STATIC_KEYS_SELFTEST=y
@@ -776,7 +777,6 @@ CONFIG_CRC8=m
 CONFIG_RANDOM32_SELFTEST=y
 CONFIG_DMA_CMA=y
 CONFIG_CMA_SIZE_MBYTES=0
-CONFIG_DMA_API_DEBUG=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_DEBUG_INFO=y
@@ -839,8 +839,13 @@ CONFIG_BPF_KPROBE_OVERRIDE=y
 CONFIG_HIST_TRIGGERS=y
 CONFIG_FTRACE_STARTUP_TEST=y
 # CONFIG_EVENT_TRACE_STARTUP_TEST is not set
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_SAMPLE_FTRACE_DIRECT=m
 CONFIG_DEBUG_ENTRY=y
 CONFIG_CIO_INJECT=y
+CONFIG_KUNIT=m
+CONFIG_KUNIT_DEBUGFS=y
 CONFIG_NOTIFIER_ERROR_INJECTION=m
 CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m
 CONFIG_FAULT_INJECTION=y
index f08b161c94463cd2b2a02d7d13c730d3dffb0ebf..c9c3cedff2d85634327af0d5c36d795da74ddb01 100644
@@ -60,6 +60,7 @@ CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
 CONFIG_S390_UNWIND_SELFTEST=m
+CONFIG_S390_KPROBES_SANITY_TEST=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 # CONFIG_GCC_PLUGINS is not set
@@ -788,6 +789,11 @@ CONFIG_FTRACE_SYSCALLS=y
 CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BPF_KPROBE_OVERRIDE=y
 CONFIG_HIST_TRIGGERS=y
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_KUNIT=m
+CONFIG_KUNIT_DEBUGFS=y
 CONFIG_LKDTM=m
 CONFIG_PERCPU_TEST=m
 CONFIG_ATOMIC64_SELFTEST=y
index f9eddbca79d2859f856454a2a0dbce68ab9eca36..2c057e1f32000a6d15e2c9d6317b0a66cbd36519 100644
 
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 /* Fast-BCR without checkpoint synchronization */
-#define __ASM_BARRIER "bcr 14,0\n"
+#define __ASM_BCR_SERIALIZE "bcr 14,0\n"
 #else
-#define __ASM_BARRIER "bcr 15,0\n"
+#define __ASM_BCR_SERIALIZE "bcr 15,0\n"
 #endif
 
-#define mb() do {  asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+static __always_inline void bcr_serialize(void)
+{
+       asm volatile(__ASM_BCR_SERIALIZE : : : "memory");
+}
 
-#define rmb()                          barrier()
-#define wmb()                          barrier()
-#define dma_rmb()                      mb()
-#define dma_wmb()                      mb()
-#define __smp_mb()                     mb()
-#define __smp_rmb()                    rmb()
-#define __smp_wmb()                    wmb()
+#define mb()           bcr_serialize()
+#define rmb()          barrier()
+#define wmb()          barrier()
+#define dma_rmb()      mb()
+#define dma_wmb()      mb()
+#define __smp_mb()     mb()
+#define __smp_rmb()    rmb()
+#define __smp_wmb()    wmb()
 
 #define __smp_store_release(p, v)                                      \
 do {                                                                   \
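
Factoring the serializing BCR into bcr_serialize() leaves the barrier macros with their usual kernel semantics; only the full barrier mb() needs the expensive instruction, and the dma_*mb() variants map to it. For context, a generic (not s390-specific) descriptor-ring producer that relies on this ordering; the structure and flag names are hypothetical:

    /* Publish a DMA descriptor: the payload must be globally visible
     * before the device can observe the ownership flag. */
    static void publish_desc(struct my_desc *d, dma_addr_t buf, u32 len)
    {
            d->addr = buf;
            d->len  = len;
            dma_wmb();                      /* order payload before flag */
            d->flags = DESC_OWNED_BY_HW;    /* device may consume it now */
    }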
index fd149480b6e2b1e6490f5056d280f3d6d4216dff..5a530c552c2383ed370291b7322a962f75035e37 100644
@@ -188,7 +188,7 @@ static inline bool arch_test_and_set_bit_lock(unsigned long nr,
                                              volatile unsigned long *ptr)
 {
        if (arch_test_bit(nr, ptr))
-               return 1;
+               return true;
        return arch_test_and_set_bit(nr, ptr);
 }
 
index 62228a884e0632b6f0bd7d83e266862bb34e78e9..26c710cd34859f5452daedbeea3b1557a5956208 100644
@@ -12,6 +12,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <linux/jump_label.h>
 
 struct cpuid
 {
@@ -21,5 +22,7 @@ struct cpuid
        unsigned int unused  : 16;
 } __attribute__ ((packed, aligned(8)));
 
+DECLARE_STATIC_KEY_FALSE(cpu_has_bear);
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_S390_CPU_H */
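
cpu_has_bear gates the breaking-event-address handling added to the interrupt and program-check paths later in this commit. The key is presumably flipped once during early setup when the BEAR-enhancement facility is installed; facility bit 193 is an assumption inferred from the ALTERNATIVE encodings in entry.S below:

    #include <asm/facility.h>
    #include <asm/cpu.h>

    DEFINE_STATIC_KEY_FALSE(cpu_has_bear);

    static void __init detect_bear(void)
    {
            /* Assumption: facility 193 denotes the BEAR enhancement. */
            if (test_facility(193))
                    static_branch_enable(&cpu_has_bear);
    }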
index 19a55e1e3a0c591994a7f8f7176e54704270daa8..77f24262c25c18764f3c87f8e6a1b706db5714a1 100644
@@ -462,7 +462,7 @@ arch_initcall(VNAME(var, reg))
  *
  * @var: Name of debug_info_t variable
  * @name: Name of debug log (e.g. used for debugfs entry)
- * @pages_per_area: Number of pages per area
+ * @pages: Number of pages per area
  * @nr_areas: Number of debug areas
  * @buf_size: Size of data area in each debug entry
  * @view: Pointer to debug view struct
index e8b460f39c588414673b6456b6c356a95c7c593c..267f70f4393f7650abb919831c4e32dd0610d1ad 100644
@@ -17,7 +17,6 @@
 
 void ftrace_caller(void);
 
-extern char ftrace_graph_caller_end;
 extern void *ftrace_func;
 
 struct dyn_arch_ftrace { };
@@ -42,6 +41,35 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
        return addr;
 }
 
+struct ftrace_regs {
+       struct pt_regs regs;
+};
+
+static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs)
+{
+       return &fregs->regs;
+}
+
+static __always_inline void ftrace_instruction_pointer_set(struct ftrace_regs *fregs,
+                                                          unsigned long ip)
+{
+       struct pt_regs *regs = arch_ftrace_get_regs(fregs);
+
+       regs->psw.addr = ip;
+}
+
+/*
+ * When an ftrace-registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this,
+ * place the direct caller in the ORIG_GPR2 part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+       regs->orig_gpr2 = addr;
+}
+
 /*
  * Even though the system call numbers are identical for s390/s390x a
  * different system call table is used for compat tasks. This may lead
@@ -68,4 +96,32 @@ static inline bool arch_syscall_match_sym_name(const char *sym,
 }
 
 #endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#define FTRACE_NOP_INSN .word 0xc004, 0x0000, 0x0000 /* brcl 0,0 */
+
+#ifndef CC_USING_HOTPATCH
+
+#define FTRACE_GEN_MCOUNT_RECORD(name)         \
+       .section __mcount_loc, "a", @progbits;  \
+       .quad name;                             \
+       .previous;
+
+#else /* !CC_USING_HOTPATCH */
+
+#define FTRACE_GEN_MCOUNT_RECORD(name)
+
+#endif /* !CC_USING_HOTPATCH */
+
+#define FTRACE_GEN_NOP_ASM(name)               \
+       FTRACE_GEN_MCOUNT_RECORD(name)          \
+       FTRACE_NOP_INSN
+
+#else /* CONFIG_FUNCTION_TRACER */
+
+#define FTRACE_GEN_NOP_ASM(name)
+
+#endif /* CONFIG_FUNCTION_TRACER */
+
 #endif /* _ASM_S390_FTRACE_H */
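
arch_ftrace_set_direct_caller() parks the direct trampoline address in orig_gpr2 so that the exit path of ftrace_caller (see mcount.S further down) can branch there instead of back to the traced function. The defconfig changes in this commit enable CONFIG_SAMPLE_FTRACE_DIRECT; a trimmed sketch in the spirit of that sample, with the register-saving assembly stub my_tramp elided and error handling omitted:

    #include <linux/ftrace.h>
    #include <linux/sched.h>

    extern void my_tramp(void);     /* asm stub that calls my_direct_func */

    static void my_direct_func(struct task_struct *p)
    {
            trace_printk("waking up %s-%d\n", p->comm, p->pid);
    }

    static int __init ftrace_direct_init(void)
    {
            /* v5.16-era API: attach a direct trampoline to one call site */
            return register_ftrace_direct((unsigned long)wake_up_process,
                                          (unsigned long)my_tramp);
    }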
index dcb1bba4f40639ee29a2ebdbd6df756d29ef23e6..916cfcb36d8ac91278a3f11b6de0624f0ce911ed 100644
@@ -2,6 +2,8 @@
 #ifndef _ASM_S390_JUMP_LABEL_H
 #define _ASM_S390_JUMP_LABEL_H
 
+#define HAVE_JUMP_LABEL_BATCH
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
index d578a8c7667654162eb7dcf6d55bbdb82b64802f..5209f223331a95ace581c5f33820c64274ed284b 100644
@@ -16,9 +16,7 @@
 
 static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip)
 {
-       struct pt_regs *regs = ftrace_get_regs(fregs);
-
-       regs->psw.addr = ip;
+       ftrace_instruction_pointer_set(fregs, ip);
 }
 
 #endif
index 11213c8bfca56e4640aa7dba443dd1f4c62a64e8..1262f5003acfff7b912d96d2350a40f5cff997cf 100644
@@ -65,7 +65,7 @@ struct lowcore {
        __u32   external_damage_code;           /* 0x00f4 */
        __u64   failing_storage_address;        /* 0x00f8 */
        __u8    pad_0x0100[0x0110-0x0100];      /* 0x0100 */
-       __u64   breaking_event_addr;            /* 0x0110 */
+       __u64   pgm_last_break;                 /* 0x0110 */
        __u8    pad_0x0118[0x0120-0x0118];      /* 0x0118 */
        psw_t   restart_old_psw;                /* 0x0120 */
        psw_t   external_old_psw;               /* 0x0130 */
@@ -93,9 +93,10 @@ struct lowcore {
        psw_t   return_psw;                     /* 0x0290 */
        psw_t   return_mcck_psw;                /* 0x02a0 */
 
+       __u64   last_break;                     /* 0x02b0 */
+
        /* CPU accounting and timing values. */
-       __u64   sys_enter_timer;                /* 0x02b0 */
-       __u8    pad_0x02b8[0x02c0-0x02b8];      /* 0x02b8 */
+       __u64   sys_enter_timer;                /* 0x02b8 */
        __u64   mcck_enter_timer;               /* 0x02c0 */
        __u64   exit_timer;                     /* 0x02c8 */
        __u64   user_timer;                     /* 0x02d0 */
@@ -188,7 +189,7 @@ struct lowcore {
        __u32   tod_progreg_save_area;          /* 0x1324 */
        __u32   cpu_timer_save_area[2];         /* 0x1328 */
        __u32   clock_comp_save_area[2];        /* 0x1330 */
-       __u8    pad_0x1338[0x1340-0x1338];      /* 0x1338 */
+       __u64   last_break_save_area;           /* 0x1338 */
        __u32   access_regs_save_area[16];      /* 0x1340 */
        __u64   cregs_save_area[16];            /* 0x1380 */
        __u8    pad_0x1400[0x1800-0x1400];      /* 0x1400 */
index b4bd8c41e9d33d4f01ef8950be79f0fd02cd59c6..82725cf783c70cfcaf511931be6b26dfc4522303 100644
@@ -12,6 +12,11 @@ void nospec_init_branches(void);
 void nospec_auto_detect(void);
 void nospec_revert(s32 *start, s32 *end);
 
+static inline bool nospec_uses_trampoline(void)
+{
+       return __is_defined(CC_USING_EXPOLINE) && !nospec_disable;
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_S390_EXPOLINE_H */
index e43416950245623efd765ba73da865a5a64fac79..008a6c856fa44e30a125982c73127c39f39c61bb 100644
@@ -583,11 +583,11 @@ static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new
 #define CRDTE_DTT_REGION1      0x1cUL
 
 static inline void crdte(unsigned long old, unsigned long new,
-                        unsigned long table, unsigned long dtt,
+                        unsigned long *table, unsigned long dtt,
                         unsigned long address, unsigned long asce)
 {
        union register_pair r1 = { .even = old, .odd = new, };
-       union register_pair r2 = { .even = table | dtt, .odd = address, };
+       union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, };
 
        asm volatile(".insn rrf,0xb98f0000,%[r1],%[r2],%[asce],0"
                     : [r1] "+&d" (r1.pair)
@@ -1001,7 +1001,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
                                        unsigned long opt, unsigned long asce,
                                        int local)
 {
-       unsigned long pto = (unsigned long) ptep;
+       unsigned long pto = __pa(ptep);
 
        if (__builtin_constant_p(opt) && opt == 0) {
                /* Invalidation + TLB flush for the pte */
@@ -1023,7 +1023,7 @@ static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
 static __always_inline void __ptep_ipte_range(unsigned long address, int nr,
                                              pte_t *ptep, int local)
 {
-       unsigned long pto = (unsigned long) ptep;
+       unsigned long pto = __pa(ptep);
 
        /* Invalidate a range of ptes + TLB flush of the ptes */
        do {
@@ -1487,7 +1487,7 @@ static __always_inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp,
 {
        unsigned long sto;
 
-       sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t);
+       sto = __pa(pmdp) - pmd_index(addr) * sizeof(pmd_t);
        if (__builtin_constant_p(opt) && opt == 0) {
                /* flush without guest asce */
                asm volatile(
@@ -1513,7 +1513,7 @@ static __always_inline void __pudp_idte(unsigned long addr, pud_t *pudp,
 {
        unsigned long r3o;
 
-       r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t);
+       r3o = __pa(pudp) - pud_index(addr) * sizeof(pud_t);
        r3o |= _ASCE_TYPE_REGION3;
        if (__builtin_constant_p(opt) && opt == 0) {
                /* flush without guest asce */
index 61b22aa990e75dd70dc04370676489c138e2c4b6..4ffa8e7f0ed3acb439cad08f5ff46e051335e561 100644
@@ -76,8 +76,7 @@ enum {
  * The pt_regs struct defines the way the registers are stored on
  * the stack during a system call.
  */
-struct pt_regs 
-{
+struct pt_regs {
        union {
                user_pt_regs user_regs;
                struct {
@@ -97,6 +96,7 @@ struct pt_regs
        };
        unsigned long flags;
        unsigned long cr1;
+       unsigned long last_break;
 };
 
 /*
@@ -197,6 +197,25 @@ const char *regs_query_register_name(unsigned int offset);
 unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
 unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n);
 
+/**
+ * regs_get_kernel_argument() - get Nth function argument in kernel
+ * @regs:      pt_regs of that context
+ * @n:         function argument number (start from 0)
+ *
+ * regs_get_kernel_argument() returns @n th argument of the function call.
+ */
+static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
+                                                    unsigned int n)
+{
+       unsigned int argoffset = STACK_FRAME_OVERHEAD / sizeof(long);
+
+#define NR_REG_ARGUMENTS 5
+       if (n < NR_REG_ARGUMENTS)
+               return regs_get_register(regs, 2 + n);
+       n -= NR_REG_ARGUMENTS;
+       return regs_get_kernel_stack_nth(regs, argoffset + n);
+}
+
 static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
 {
        return regs->gprs[15];
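
regs_get_kernel_argument() follows the s390 C ABI: arguments 0 through 4 travel in %r2-%r6 (hence regs_get_register(regs, 2 + n)), and later arguments spill onto the stack past STACK_FRAME_OVERHEAD. A sketch of a kprobe pre-handler using it; the probed symbol is a placeholder:

    #include <linux/kprobes.h>

    static int pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
            /* argument 0 is in %r2 on s390 */
            unsigned long arg0 = regs_get_kernel_argument(regs, 0);

            pr_info("%s: arg0=%lx\n", p->symbol_name, arg0);
            return 0;
    }

    static struct kprobe kp = {
            .symbol_name = "do_sys_open",   /* hypothetical target */
            .pre_handler = pre_handler,
    };

    static int __init probe_init(void)
    {
            return register_kprobe(&kp);
    }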
index e3ae937bef1c6e6bd6fcaeada43e652bdf774a5f..c68ea35de49861073ac765bd4e7a2609d0ddae31 100644
@@ -117,6 +117,7 @@ struct zpci_report_error_header {
 
 extern char *sclp_early_sccb;
 
+void sclp_early_adjust_va(void);
 void sclp_early_set_buffer(void *sccb);
 int sclp_early_read_info(void);
 int sclp_early_read_storage_info(void);
index 85881dd48022a03df5706a6f47b72a331b7e397d..3fecaa4e8b74ddaa9ba3aa6d5414ff7c262242cf 100644
@@ -2,20 +2,8 @@
 #ifndef _S390_SECTIONS_H
 #define _S390_SECTIONS_H
 
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
 #include <asm-generic/sections.h>
 
-extern bool initmem_freed;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-       if (!initmem_freed)
-               return 0;
-       return addr >= (unsigned long)__init_begin &&
-              addr < (unsigned long)__init_end;
-}
-
 /*
  * .boot.data section contains variables "shared" between the decompressor and
  * the decompressed kernel. The decompressor will store values in them, and
index b6606ffd85d898ba7ae9593b27e0d5cc9aac4df0..77e6506898f53d40bc4cad1bc830c0089e3e8f4f 100644
@@ -11,8 +11,8 @@
 #include <linux/build_bug.h>
 
 #define PARMAREA               0x10400
-#define HEAD_END               0x11000
 
+#define COMMAND_LINE_SIZE CONFIG_COMMAND_LINE_SIZE
 /*
  * Machine features detected in early.c
  */
@@ -43,6 +43,8 @@
 #define STARTUP_NORMAL_OFFSET  0x10000
 #define STARTUP_KDUMP_OFFSET   0x10010
 
+#define LEGACY_COMMAND_LINE_SIZE       896
+
 #ifndef __ASSEMBLY__
 
 #include <asm/lowcore.h>
@@ -55,8 +57,9 @@ struct parmarea {
        unsigned long oldmem_base;                      /* 0x10418 */
        unsigned long oldmem_size;                      /* 0x10420 */
        unsigned long kernel_version;                   /* 0x10428 */
-       char pad1[0x10480 - 0x10430];                   /* 0x10430 - 0x10480 */
-       char command_line[ARCH_COMMAND_LINE_SIZE];      /* 0x10480 */
+       unsigned long max_command_line_size;            /* 0x10430 */
+       char pad1[0x10480-0x10438];                     /* 0x10438 - 0x10480 */
+       char command_line[COMMAND_LINE_SIZE];           /* 0x10480 */
 };
 
 extern struct parmarea parmarea;
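
A loader can therefore probe a kernel image for its command-line capacity by reading max_command_line_size at image offset 0x10430 (PARMAREA + 0x30) and falling back to LEGACY_COMMAND_LINE_SIZE when an older image leaves the field zero; the kexec_file change below does exactly this. A sketch of the probe, with buffer handling assumed:

    #include <linux/string.h>

    /* Sketch: probe a loaded image for its command-line capacity. */
    static unsigned long cmdline_capacity(const char *image)
    {
            unsigned long max;

            /* parmarea.max_command_line_size lives at 0x10430 */
            memcpy(&max, image + 0x10430, sizeof(max));
            return max ? max : 896;         /* legacy images store zero */
    }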
index 4fd66c5e8934bfc86f75492f025cc2b1f3abec19..3fae93ddb322a9f81c7e41613bf51ef7e0807783 100644
@@ -31,22 +31,18 @@ void *memmove(void *dest, const void *src, size_t n);
 #define __HAVE_ARCH_STRCMP     /* arch function */
 #define __HAVE_ARCH_STRCPY     /* inline & arch function */
 #define __HAVE_ARCH_STRLCAT    /* arch function */
-#define __HAVE_ARCH_STRLCPY    /* arch function */
 #define __HAVE_ARCH_STRLEN     /* inline & arch function */
 #define __HAVE_ARCH_STRNCAT    /* arch function */
 #define __HAVE_ARCH_STRNCPY    /* arch function */
 #define __HAVE_ARCH_STRNLEN    /* inline & arch function */
-#define __HAVE_ARCH_STRRCHR    /* arch function */
 #define __HAVE_ARCH_STRSTR     /* arch function */
 
 /* Prototypes for non-inlined arch strings functions. */
 int memcmp(const void *s1, const void *s2, size_t n);
 int strcmp(const char *s1, const char *s2);
 size_t strlcat(char *dest, const char *src, size_t n);
-size_t strlcpy(char *dest, const char *src, size_t size);
 char *strncat(char *dest, const char *src, size_t n);
 char *strncpy(char *dest, const char *src, size_t n);
-char *strrchr(const char *s, int c);
 char *strstr(const char *s1, const char *s2);
 #endif /* !CONFIG_KASAN */
 
diff --git a/arch/s390/include/asm/text-patching.h b/arch/s390/include/asm/text-patching.h
new file mode 100644
index 0000000..b219056
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_S390_TEXT_PATCHING_H
+#define _ASM_S390_TEXT_PATCHING_H
+
+#include <asm/barrier.h>
+
+static __always_inline void sync_core(void)
+{
+       bcr_serialize();
+}
+
+void text_poke_sync(void);
+void text_poke_sync_lock(void);
+
+#endif /* _ASM_S390_TEXT_PATCHING_H */
index 1f8803a31079527f55b622c7794ff1c56328f3fa..598d769e76df0fb7aca3e9fe92051899349bf7f8 100644
@@ -1,14 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *  S390 version
- *    Copyright IBM Corp. 1999, 2010
- */
-
-#ifndef _UAPI_ASM_S390_SETUP_H
-#define _UAPI_ASM_S390_SETUP_H
-
-#define COMMAND_LINE_SIZE      4096
-
-#define ARCH_COMMAND_LINE_SIZE 896
-
-#endif /* _UAPI_ASM_S390_SETUP_H */
index c22ea1c3ef8437e74c0228ad0a3f9e7cbef05f15..cce0ddee2d02d2731ef36635a18082f9b13f76c3 100644
@@ -1,5 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/facility.h>
 #include <asm/nospec-branch.h>
@@ -110,3 +113,20 @@ void __init apply_alternative_instructions(void)
 {
        apply_alternatives(__alt_instructions, __alt_instructions_end);
 }
+
+static void do_sync_core(void *info)
+{
+       sync_core();
+}
+
+void text_poke_sync(void)
+{
+       on_each_cpu(do_sync_core, NULL, 1);
+}
+
+void text_poke_sync_lock(void)
+{
+       cpus_read_lock();
+       text_poke_sync();
+       cpus_read_unlock();
+}
index b57da933858888cb5f77bb31d68de9ab11d86beb..8e00bb22866235b80cf22e6cd675bc903f67f316 100644
@@ -35,6 +35,7 @@ int main(void)
        OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2);
        OFFSET(__PT_FLAGS, pt_regs, flags);
        OFFSET(__PT_CR1, pt_regs, cr1);
+       OFFSET(__PT_LAST_BREAK, pt_regs, last_break);
        DEFINE(__PT_SIZE, sizeof(struct pt_regs));
        BLANK();
        /* stack_frame offsets */
@@ -45,6 +46,7 @@ int main(void)
        OFFSET(__SF_SIE_SAVEAREA, stack_frame, empty1[2]);
        OFFSET(__SF_SIE_REASON, stack_frame, empty1[3]);
        OFFSET(__SF_SIE_FLAGS, stack_frame, empty1[4]);
+       DEFINE(STACK_FRAME_OVERHEAD, sizeof(struct stack_frame));
        BLANK();
        /* idle data offsets */
        OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter);
@@ -77,7 +79,7 @@ int main(void)
        OFFSET(__LC_MCCK_CODE, lowcore, mcck_interruption_code);
        OFFSET(__LC_EXT_DAMAGE_CODE, lowcore, external_damage_code);
        OFFSET(__LC_MCCK_FAIL_STOR_ADDR, lowcore, failing_storage_address);
-       OFFSET(__LC_LAST_BREAK, lowcore, breaking_event_addr);
+       OFFSET(__LC_PGM_LAST_BREAK, lowcore, pgm_last_break);
        OFFSET(__LC_RETURN_LPSWE, lowcore, return_lpswe);
        OFFSET(__LC_RETURN_MCCK_LPSWE, lowcore, return_mcck_lpswe);
        OFFSET(__LC_RST_OLD_PSW, lowcore, restart_old_psw);
@@ -126,6 +128,7 @@ int main(void)
        OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count);
        OFFSET(__LC_GMAP, lowcore, gmap);
        OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline);
+       OFFSET(__LC_LAST_BREAK, lowcore, last_break);
        /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
        OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
        /* hardware defined lowcore locations 0x1000 - 0x18ff */
@@ -139,6 +142,7 @@ int main(void)
        OFFSET(__LC_TOD_PROGREG_SAVE_AREA, lowcore, tod_progreg_save_area);
        OFFSET(__LC_CPU_TIMER_SAVE_AREA, lowcore, cpu_timer_save_area);
        OFFSET(__LC_CLOCK_COMP_SAVE_AREA, lowcore, clock_comp_save_area);
+       OFFSET(__LC_LAST_BREAK_SAVE_AREA, lowcore, last_break_save_area);
        OFFSET(__LC_AREGS_SAVE_AREA, lowcore, access_regs_save_area);
        OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area);
        OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
@@ -160,5 +164,6 @@ int main(void)
        DEFINE(OLDMEM_BASE, PARMAREA + offsetof(struct parmarea, oldmem_base));
        DEFINE(OLDMEM_SIZE, PARMAREA + offsetof(struct parmarea, oldmem_size));
        DEFINE(COMMAND_LINE, PARMAREA + offsetof(struct parmarea, command_line));
+       DEFINE(MAX_COMMAND_LINE_SIZE, PARMAREA + offsetof(struct parmarea, max_command_line_size));
        return 0;
 }
index 54efc279f54eee79370b823e8786f8fa8bd043c5..72e106cfd8c7fa3f4dc9cf5cdcd07c32ca0f9b6e 100644
@@ -29,7 +29,7 @@ static int diag8_noresponse(int cmdlen)
        asm volatile(
                "       diag    %[rx],%[ry],0x8\n"
                : [ry] "+&d" (cmdlen)
-               : [rx] "d" ((addr_t) cpcmd_buf)
+               : [rx] "d" (__pa(cpcmd_buf))
                : "cc");
        return cmdlen;
 }
@@ -39,8 +39,8 @@ static int diag8_response(int cmdlen, char *response, int *rlen)
        union register_pair rx, ry;
        int cc;
 
-       rx.even = (addr_t) cpcmd_buf;
-       rx.odd  = (addr_t) response;
+       rx.even = __pa(cpcmd_buf);
+       rx.odd  = __pa(response);
        ry.even = cmdlen | 0x40000000L;
        ry.odd  = *rlen;
        asm volatile(
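
With the buffers handed to DIAGNOSE 8 as physical addresses via __pa(), the helpers keep working even where kernel virtual and physical addresses diverge. The exported consumer of these helpers is cpcmd(); a typical z/VM call looks like:

    #include <linux/printk.h>
    #include <asm/cpcmd.h>

    static void show_vm_userid(void)
    {
            char resp[128];
            int rc;

            /* issue a CP command and capture its response text */
            rc = cpcmd("QUERY USERID", resp, sizeof(resp), NULL);
            pr_info("QUERY USERID rc=%d: %s\n", rc, resp);
    }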
index db1bc00229caf20f04d8ee8275598536c390f70b..85f326e258df875829123905180c8acb025a4524 100644
@@ -152,7 +152,7 @@ void show_stack(struct task_struct *task, unsigned long *stack,
 static void show_last_breaking_event(struct pt_regs *regs)
 {
        printk("Last Breaking-Event-Address:\n");
-       printk(" [<%016lx>] %pSR\n", regs->args[0], (void *)regs->args[0]);
+       printk(" [<%016lx>] %pSR\n", regs->last_break, (void *)regs->last_break);
 }
 
 void show_registers(struct pt_regs *regs)
index 9857cb04672680c81c4ab81b82a380b8f1e2ba58..3cdf68c536147f6deac0244437b222b45e75e7f5 100644
@@ -280,7 +280,7 @@ char __bootdata(early_command_line)[COMMAND_LINE_SIZE];
 static void __init setup_boot_command_line(void)
 {
        /* copy arch command line */
-       strlcpy(boot_command_line, early_command_line, ARCH_COMMAND_LINE_SIZE);
+       strlcpy(boot_command_line, early_command_line, COMMAND_LINE_SIZE);
 }
 
 static void __init check_image_bootable(void)
@@ -296,6 +296,7 @@ static void __init check_image_bootable(void)
 
 void __init startup_init(void)
 {
+       sclp_early_adjust_va();
        reset_tod_clock();
        check_image_bootable();
        time_early_init();
index 4c9b967290ae059ae4bb486e05946aa6792842f2..01bae1d51113b2409a78549f63266327447d0870 100644
@@ -52,6 +52,22 @@ STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE
 
 _LPP_OFFSET    = __LC_LPP
 
+       .macro STBEAR address
+       ALTERNATIVE "", ".insn  s,0xb2010000,\address", 193
+       .endm
+
+       .macro LBEAR address
+       ALTERNATIVE "", ".insn  s,0xb2000000,\address", 193
+       .endm
+
+       .macro LPSWEY address,lpswe
+       ALTERNATIVE "b \lpswe", ".insn siy,0xeb0000000071,\address,0", 193
+       .endm
+
+       .macro MBEAR reg
+       ALTERNATIVE "", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193
+       .endm
+
        .macro  CHECK_STACK savearea
 #ifdef CONFIG_CHECK_STACK
        tml     %r15,STACK_SIZE - CONFIG_STACK_GUARD
@@ -302,6 +318,7 @@ ENTRY(system_call)
        BPOFF
        lghi    %r14,0
 .Lsysc_per:
+       STBEAR  __LC_LAST_BREAK
        lctlg   %c1,%c1,__LC_KERNEL_ASCE
        lg      %r12,__LC_CURRENT
        lg      %r15,__LC_KERNEL_STACK
@@ -321,14 +338,16 @@ ENTRY(system_call)
        xgr     %r11,%r11
        la      %r2,STACK_FRAME_OVERHEAD(%r15)  # pointer to pt_regs
        mvc     __PT_R8(64,%r2),__LC_SAVE_AREA_SYNC
+       MBEAR   %r2
        lgr     %r3,%r14
        brasl   %r14,__do_syscall
        lctlg   %c1,%c1,__LC_USER_ASCE
        mvc     __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
        BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+       LBEAR   STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
        lmg     %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
        stpt    __LC_EXIT_TIMER
-       b       __LC_RETURN_LPSWE
+       LPSWEY  __LC_RETURN_PSW,__LC_RETURN_LPSWE
 ENDPROC(system_call)
 
 #
@@ -340,9 +359,10 @@ ENTRY(ret_from_fork)
        lctlg   %c1,%c1,__LC_USER_ASCE
        mvc     __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
        BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP
+       LBEAR   STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
        lmg     %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
        stpt    __LC_EXIT_TIMER
-       b       __LC_RETURN_LPSWE
+       LPSWEY  __LC_RETURN_PSW,__LC_RETURN_LPSWE
 ENDPROC(ret_from_fork)
 
 /*
@@ -382,6 +402,7 @@ ENTRY(pgm_check_handler)
        xc      __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
        stmg    %r0,%r7,__PT_R0(%r11)
        mvc     __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
+       mvc     __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK
        stmg    %r8,%r9,__PT_PSW(%r11)
 
        # clear user controlled registers to prevent speculative use
@@ -401,8 +422,9 @@ ENTRY(pgm_check_handler)
        stpt    __LC_EXIT_TIMER
 .Lpgm_exit_kernel:
        mvc     __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+       LBEAR   STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
        lmg     %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
-       b       __LC_RETURN_LPSWE
+       LPSWEY  __LC_RETURN_PSW,__LC_RETURN_LPSWE
 
 #
 # single stepped system call
@@ -412,7 +434,8 @@ ENTRY(pgm_check_handler)
        larl    %r14,.Lsysc_per
        stg     %r14,__LC_RETURN_PSW+8
        lghi    %r14,1
-       lpswe   __LC_RETURN_PSW         # branch to .Lsysc_per
+       LBEAR   __LC_PGM_LAST_BREAK
+       LPSWEY  __LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per
 ENDPROC(pgm_check_handler)
 
 /*
@@ -422,6 +445,7 @@ ENDPROC(pgm_check_handler)
 ENTRY(\name)
        STCK    __LC_INT_CLOCK
        stpt    __LC_SYS_ENTER_TIMER
+       STBEAR  __LC_LAST_BREAK
        BPOFF
        stmg    %r8,%r15,__LC_SAVE_AREA_ASYNC
        lg      %r12,__LC_CURRENT
@@ -453,6 +477,7 @@ ENTRY(\name)
        xgr     %r10,%r10
        xc      __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
        mvc     __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
+       MBEAR   %r11
        stmg    %r8,%r9,__PT_PSW(%r11)
        tm      %r8,0x0001              # coming from user space?
        jno     1f
@@ -465,8 +490,9 @@ ENTRY(\name)
        lctlg   %c1,%c1,__LC_USER_ASCE
        BPEXIT  __TI_flags(%r12),_TIF_ISOLATE_BP
        stpt    __LC_EXIT_TIMER
-2:     lmg     %r0,%r15,__PT_R0(%r11)
-       b       __LC_RETURN_LPSWE
+2:     LBEAR   __PT_LAST_BREAK(%r11)
+       lmg     %r0,%r15,__PT_R0(%r11)
+       LPSWEY  __LC_RETURN_PSW,__LC_RETURN_LPSWE
 ENDPROC(\name)
 .endm
 
@@ -505,6 +531,7 @@ ENTRY(mcck_int_handler)
        BPOFF
        la      %r1,4095                # validate r1
        spt     __LC_CPU_TIMER_SAVE_AREA-4095(%r1)      # validate cpu timer
+       LBEAR   __LC_LAST_BREAK_SAVE_AREA-4095(%r1)             # validate bear
        lmg     %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs
        lg      %r12,__LC_CURRENT
        lmg     %r8,%r9,__LC_MCK_OLD_PSW
@@ -591,8 +618,10 @@ ENTRY(mcck_int_handler)
        jno     0f
        BPEXIT  __TI_flags(%r12),_TIF_ISOLATE_BP
        stpt    __LC_EXIT_TIMER
-0:     lmg     %r11,%r15,__PT_R11(%r11)
-       b       __LC_RETURN_MCCK_LPSWE
+0:     ALTERNATIVE "", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193
+       LBEAR   0(%r12)
+       lmg     %r11,%r15,__PT_R11(%r11)
+       LPSWEY  __LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE
 
 .Lmcck_panic:
        /*
index 7f2696e8d511ed631b1b6dd0efbbfc0176dbf5fd..6083090be1f46d33ec8d5b62de1734431192eb37 100644
@@ -70,5 +70,6 @@ extern struct exception_table_entry _stop_amode31_ex_table[];
 #define __amode31_data __section(".amode31.data")
 #define __amode31_ref __section(".amode31.refs")
 extern long _start_amode31_refs[], _end_amode31_refs[];
+extern unsigned long __amode31_base;
 
 #endif /* _ENTRY_H */
index 5165bf344f95680fb589a24795d7b5a07a66a0ad..5510c7d10ddc31fed07af20d57e793bc4a3ae97a 100644
@@ -17,6 +17,7 @@
 #include <linux/kprobes.h>
 #include <trace/syscall.h>
 #include <asm/asm-offsets.h>
+#include <asm/text-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/ftrace.lds.h>
 #include <asm/nospec-branch.h>
@@ -80,17 +81,6 @@ asm(
 
 #ifdef CONFIG_MODULES
 static char *ftrace_plt;
-
-asm(
-       "       .data\n"
-       "ftrace_plt_template:\n"
-       "       basr    %r1,%r0\n"
-       "       lg      %r1,0f-.(%r1)\n"
-       "       br      %r1\n"
-       "0:     .quad   ftrace_caller\n"
-       "ftrace_plt_template_end:\n"
-       "       .previous\n"
-);
 #endif /* CONFIG_MODULES */
 
 static const char *ftrace_shared_hotpatch_trampoline(const char **end)
@@ -116,7 +106,7 @@ static const char *ftrace_shared_hotpatch_trampoline(const char **end)
 
 bool ftrace_need_init_nop(void)
 {
-       return ftrace_shared_hotpatch_trampoline(NULL);
+       return true;
 }
 
 int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
@@ -175,28 +165,6 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
        return 0;
 }
 
-static void ftrace_generate_nop_insn(struct ftrace_insn *insn)
-{
-       /* brcl 0,0 */
-       insn->opc = 0xc004;
-       insn->disp = 0;
-}
-
-static void ftrace_generate_call_insn(struct ftrace_insn *insn,
-                                     unsigned long ip)
-{
-       unsigned long target;
-
-       /* brasl r0,ftrace_caller */
-       target = FTRACE_ADDR;
-#ifdef CONFIG_MODULES
-       if (is_module_addr((void *)ip))
-               target = (unsigned long)ftrace_plt;
-#endif /* CONFIG_MODULES */
-       insn->opc = 0xc005;
-       insn->disp = (target - ip) / 2;
-}
-
 static void brcl_disable(void *brcl)
 {
        u8 op = 0x04; /* set mask field to zero */
@@ -207,23 +175,7 @@ static void brcl_disable(void *brcl)
 int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
                    unsigned long addr)
 {
-       struct ftrace_insn orig, new, old;
-
-       if (ftrace_shared_hotpatch_trampoline(NULL)) {
-               brcl_disable((void *)rec->ip);
-               return 0;
-       }
-
-       if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old)))
-               return -EFAULT;
-       /* Replace ftrace call with a nop. */
-       ftrace_generate_call_insn(&orig, rec->ip);
-       ftrace_generate_nop_insn(&new);
-
-       /* Verify that the to be replaced code matches what we expect. */
-       if (memcmp(&orig, &old, sizeof(old)))
-               return -EINVAL;
-       s390_kernel_write((void *) rec->ip, &new, sizeof(new));
+       brcl_disable((void *)rec->ip);
        return 0;
 }
 
@@ -236,23 +188,7 @@ static void brcl_enable(void *brcl)
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-       struct ftrace_insn orig, new, old;
-
-       if (ftrace_shared_hotpatch_trampoline(NULL)) {
-               brcl_enable((void *)rec->ip);
-               return 0;
-       }
-
-       if (copy_from_kernel_nofault(&old, (void *) rec->ip, sizeof(old)))
-               return -EFAULT;
-       /* Replace nop with an ftrace call. */
-       ftrace_generate_nop_insn(&orig);
-       ftrace_generate_call_insn(&new, rec->ip);
-
-       /* Verify that the to be replaced code matches what we expect. */
-       if (memcmp(&orig, &old, sizeof(old)))
-               return -EINVAL;
-       s390_kernel_write((void *) rec->ip, &new, sizeof(new));
+       brcl_enable((void *)rec->ip);
        return 0;
 }
 
@@ -264,22 +200,16 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 
 void arch_ftrace_update_code(int command)
 {
-       if (ftrace_shared_hotpatch_trampoline(NULL))
-               ftrace_modify_all_code(command);
-       else
-               ftrace_run_stop_machine(command);
-}
-
-static void __ftrace_sync(void *dummy)
-{
+       ftrace_modify_all_code(command);
 }
 
 int ftrace_arch_code_modify_post_process(void)
 {
-       if (ftrace_shared_hotpatch_trampoline(NULL)) {
-               /* Send SIGP to the other CPUs, so they see the new code. */
-               smp_call_function(__ftrace_sync, NULL, 1);
-       }
+       /*
+        * Flush any pre-fetched instructions on all
+        * CPUs to make the new code visible.
+        */
+       text_poke_sync_lock();
        return 0;
 }
 
@@ -294,10 +224,6 @@ static int __init ftrace_plt_init(void)
                panic("cannot allocate ftrace plt\n");
 
        start = ftrace_shared_hotpatch_trampoline(&end);
-       if (!start) {
-               start = ftrace_plt_template;
-               end = ftrace_plt_template_end;
-       }
        memcpy(ftrace_plt, start, end - start);
        set_memory_ro((unsigned long)ftrace_plt, 1);
        return 0;
@@ -337,12 +263,14 @@ NOKPROBE_SYMBOL(prepare_ftrace_return);
 int ftrace_enable_ftrace_graph_caller(void)
 {
        brcl_disable(ftrace_graph_caller);
+       text_poke_sync_lock();
        return 0;
 }
 
 int ftrace_disable_ftrace_graph_caller(void)
 {
        brcl_enable(ftrace_graph_caller);
+       text_poke_sync_lock();
        return 0;
 }
 
index 114b5490ad8ebace2e0d3d2fed4b447d746c2580..42f9a325a257f51c5defc3ec1f3fe068e47218ca 100644
@@ -20,8 +20,6 @@ __HEAD
 ENTRY(startup_continue)
        larl    %r1,tod_clock_base
        mvc     0(16,%r1),__LC_BOOT_CLOCK
-       larl    %r13,.LPG1              # get base
-       lctlg   %c0,%c15,.Lctl-.LPG1(%r13)      # load control registers
 #
 # Setup stack
 #
@@ -42,19 +40,3 @@ ENTRY(startup_continue)
        .align  16
 .LPG1:
 .Ldw:  .quad   0x0002000180000000,0x0000000000000000
-.Lctl: .quad   0x04040000              # cr0: AFP registers & secondary space
-       .quad   0                       # cr1: primary space segment table
-       .quad   0                       # cr2: dispatchable unit control table
-       .quad   0                       # cr3: instruction authorization
-       .quad   0xffff                  # cr4: instruction authorization
-       .quad   0                       # cr5: primary-aste origin
-       .quad   0                       # cr6: I/O interrupts
-       .quad   0                       # cr7: secondary space segment table
-       .quad   0x0000000000008000      # cr8: access registers translation
-       .quad   0                       # cr9: tracing off
-       .quad   0                       # cr10: tracing off
-       .quad   0                       # cr11: tracing off
-       .quad   0                       # cr12: tracing off
-       .quad   0                       # cr13: home space segment table
-       .quad   0xc0000000              # cr14: machine check handling off
-       .quad   0                       # cr15: linkage stack operations
index 3a3145c4a3ba4dfd612d1f8ecb1adbd701a82a4b..0df83ecaa2e0c0c6e94199fe33b4f6db64baa798 100644
@@ -140,8 +140,11 @@ void noinstr do_io_irq(struct pt_regs *regs)
 
        irq_enter();
 
-       if (user_mode(regs))
+       if (user_mode(regs)) {
                update_timer_sys();
+               if (static_branch_likely(&cpu_has_bear))
+                       current->thread.last_break = regs->last_break;
+       }
 
        from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit;
        if (from_idle)
@@ -171,8 +174,11 @@ void noinstr do_ext_irq(struct pt_regs *regs)
 
        irq_enter();
 
-       if (user_mode(regs))
+       if (user_mode(regs)) {
                update_timer_sys();
+               if (static_branch_likely(&cpu_has_bear))
+                       current->thread.last_break = regs->last_break;
+       }
 
        regs->int_code = S390_lowcore.ext_int_code_addr;
        regs->int_parm = S390_lowcore.ext_params;
index 9156653b56f69b4482351fb9a020316b72556f6d..6bec000c6c1c73015bb3e251b36793976fe44e78 100644
@@ -6,8 +6,9 @@
  * Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
  */
 #include <linux/uaccess.h>
-#include <linux/stop_machine.h>
 #include <linux/jump_label.h>
+#include <linux/module.h>
+#include <asm/text-patching.h>
 #include <asm/ipl.h>
 
 struct insn {
@@ -48,9 +49,9 @@ static struct insn orignop = {
        .offset = JUMP_LABEL_NOP_OFFSET >> 1,
 };
 
-static void __jump_label_transform(struct jump_entry *entry,
-                                  enum jump_label_type type,
-                                  int init)
+static void jump_label_transform(struct jump_entry *entry,
+                                enum jump_label_type type,
+                                int init)
 {
        void *code = (void *)jump_entry_code(entry);
        struct insn old, new;
@@ -72,19 +73,28 @@ static void __jump_label_transform(struct jump_entry *entry,
        s390_kernel_write(code, &new, sizeof(new));
 }
 
-static void __jump_label_sync(void *dummy)
+void arch_jump_label_transform(struct jump_entry *entry,
+                              enum jump_label_type type)
 {
+       jump_label_transform(entry, type, 0);
+       text_poke_sync();
 }
 
-void arch_jump_label_transform(struct jump_entry *entry,
-                              enum jump_label_type type)
+bool arch_jump_label_transform_queue(struct jump_entry *entry,
+                                    enum jump_label_type type)
+{
+       jump_label_transform(entry, type, 0);
+       return true;
+}
+
+void arch_jump_label_transform_apply(void)
 {
-       __jump_label_transform(entry, type, 0);
-       smp_call_function(__jump_label_sync, NULL, 1);
+       text_poke_sync();
 }
 
-void arch_jump_label_transform_static(struct jump_entry *entry,
-                                     enum jump_label_type type)
+void __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
+                                                      enum jump_label_type type)
 {
-       __jump_label_transform(entry, type, 1);
+       jump_label_transform(entry, type, 1);
+       text_poke_sync();
 }
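
Together with HAVE_JUMP_LABEL_BATCH from jump_label.h above, this lets the generic code queue any number of arch_jump_label_transform_queue() calls and pay for a single text_poke_sync() IPI round in arch_jump_label_transform_apply(). Consumers are unchanged; toggling a key is still the usual idiom (the key name here is illustrative):

    #include <linux/jump_label.h>

    DEFINE_STATIC_KEY_FALSE(my_feature);

    static void my_feature_set(bool on)
    {
            /* patches every branch site of the key; on s390 all sites
             * in a batch are now synchronized with one IPI round */
            if (on)
                    static_branch_enable(&my_feature);
            else
                    static_branch_disable(&my_feature);
    }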
index c505c0ee5f473e08e7d4481206ea64a5e5be3c33..e27a7d3b0364628b17eee8f3942719f121606cc7 100644
@@ -122,9 +122,55 @@ static void s390_free_insn_slot(struct kprobe *p)
 }
 NOKPROBE_SYMBOL(s390_free_insn_slot);
 
+/* Check if paddr is at an instruction boundary */
+static bool can_probe(unsigned long paddr)
+{
+       unsigned long addr, offset = 0;
+       kprobe_opcode_t insn;
+       struct kprobe *kp;
+
+       if (paddr & 0x01)
+               return false;
+
+       if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+               return false;
+
+       /* Decode instructions */
+       addr = paddr - offset;
+       while (addr < paddr) {
+               if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(insn)))
+                       return false;
+
+               if (insn >> 8 == 0) {
+                       if (insn != BREAKPOINT_INSTRUCTION) {
+                               /*
+                                * Note that QEMU inserts opcode 0x0000 to implement
+                                * software breakpoints for guests. Since the size of
+                                * the original instruction is unknown, stop following
+                                * instructions and prevent setting a kprobe.
+                                */
+                               return false;
+                       }
+                       /*
+                        * Check if the instruction has been modified by another
+                        * kprobe, in which case the original instruction is
+                        * decoded.
+                        */
+                       kp = get_kprobe((void *)addr);
+                       if (!kp) {
+                               /* not a kprobe */
+                               return false;
+                       }
+                       insn = kp->opcode;
+               }
+               addr += insn_length(insn >> 8);
+       }
+       return addr == paddr;
+}
+
 int arch_prepare_kprobe(struct kprobe *p)
 {
-       if ((unsigned long) p->addr & 0x01)
+       if (!can_probe((unsigned long)p->addr))
                return -EINVAL;
        /* Make sure the probe isn't going on a difficult instruction */
        if (probe_is_prohibited_opcode(p->addr))
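
can_probe() steps from the symbol start to paddr one instruction at a time, so it needs each instruction's size. On s390 the length is encoded in the two high bits of the first opcode byte (00 means 2 bytes, 01 and 10 mean 4, 11 means 6), which is what the kernel's insn_length() computes. Spelled out longhand:

    /* Longhand equivalent of insn_length(); illustrative. */
    static int s390_insn_length(unsigned char opcode)
    {
            switch (opcode >> 6) {
            case 0:
                    return 2;       /* e.g. BCR */
            case 1:
            case 2:
                    return 4;       /* e.g. L, ST, BRC */
            default:
                    return 6;       /* e.g. BRCL, BRASL, MVC */
            }
    }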
index f9e4baa64b675caa5ad5e1d9eccd3d01716fe721..528edff085d9ab8d3efaa56f2dbf9a7a8ac4d059 100644
@@ -216,7 +216,9 @@ void *kexec_file_add_components(struct kimage *image,
                                int (*add_kernel)(struct kimage *image,
                                                  struct s390_load_data *data))
 {
+       unsigned long max_command_line_size = LEGACY_COMMAND_LINE_SIZE;
        struct s390_load_data data = {0};
+       unsigned long minsize;
        int ret;
 
        data.report = ipl_report_init(&ipl_block);
@@ -227,10 +229,23 @@ void *kexec_file_add_components(struct kimage *image,
        if (ret)
                goto out;
 
-       if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) {
-               ret = -EINVAL;
+       ret = -EINVAL;
+       minsize = PARMAREA + offsetof(struct parmarea, command_line);
+       if (image->kernel_buf_len < minsize)
                goto out;
-       }
+
+       if (data.parm->max_command_line_size)
+               max_command_line_size = data.parm->max_command_line_size;
+
+       if (minsize + max_command_line_size < minsize)
+               goto out;
+
+       if (image->kernel_buf_len < minsize + max_command_line_size)
+               goto out;
+
+       if (image->cmdline_buf_len >= max_command_line_size)
+               goto out;
+
        memcpy(data.parm->command_line, image->cmdline_buf,
               image->cmdline_buf_len);
 
@@ -307,17 +322,3 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
        }
        return 0;
 }
-
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-                                 unsigned long buf_len)
-{
-       /* A kernel must be at least large enough to contain head.S. During
-        * load memory in head.S will be accessed, e.g. to register the next
-        * command line. If the next kernel were smaller the current kernel
-        * will panic at load.
-        */
-       if (buf_len < HEAD_END)
-               return -ENOEXEC;
-
-       return kexec_image_probe_default(image, buf, buf_len);
-}
index 6b13797143a72acc0b476cf87e566752b321f826..39bcc0e39a10d7d0e63ec3a5dd0921a9d8fd9941 100644
@@ -22,10 +22,11 @@ ENTRY(ftrace_stub)
        BR_EX   %r14
 ENDPROC(ftrace_stub)
 
-#define STACK_FRAME_SIZE  (STACK_FRAME_OVERHEAD + __PT_SIZE)
-#define STACK_PTREGS     (STACK_FRAME_OVERHEAD)
-#define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
-#define STACK_PTREGS_PSW  (STACK_PTREGS + __PT_PSW)
+#define STACK_FRAME_SIZE       (STACK_FRAME_OVERHEAD + __PT_SIZE)
+#define STACK_PTREGS           (STACK_FRAME_OVERHEAD)
+#define STACK_PTREGS_GPRS      (STACK_PTREGS + __PT_GPRS)
+#define STACK_PTREGS_PSW       (STACK_PTREGS + __PT_PSW)
+#define STACK_PTREGS_ORIG_GPR2 (STACK_PTREGS + __PT_ORIG_GPR2)
 #ifdef __PACK_STACK
 /* allocate just enough for r14, r15 and backchain */
 #define TRACED_FUNC_FRAME_SIZE 24
@@ -33,13 +34,15 @@ ENDPROC(ftrace_stub)
 #define TRACED_FUNC_FRAME_SIZE STACK_FRAME_OVERHEAD
 #endif
 
-ENTRY(ftrace_caller)
-       .globl  ftrace_regs_caller
-       .set    ftrace_regs_caller,ftrace_caller
+       .macro  ftrace_regs_entry, allregs=0
        stg     %r14,(__SF_GPRS+8*8)(%r15)      # save traced function caller
+
+       .if \allregs == 1
        lghi    %r14,0                          # save condition code
        ipm     %r14                            # don't put any instructions
        sllg    %r14,%r14,16                    # clobbering CC before this point
+       .endif
+
        lgr     %r1,%r15
        # allocate stack frame for ftrace_caller to contain traced function
        aghi    %r15,-TRACED_FUNC_FRAME_SIZE
@@ -49,13 +52,31 @@ ENTRY(ftrace_caller)
        # allocate pt_regs and stack frame for ftrace_trace_function
        aghi    %r15,-STACK_FRAME_SIZE
        stg     %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
+       xc      STACK_PTREGS_ORIG_GPR2(8,%r15),STACK_PTREGS_ORIG_GPR2(%r15)
+
+       .if \allregs == 1
        stg     %r14,(STACK_PTREGS_PSW)(%r15)
-       lg      %r14,(__SF_GPRS+8*8)(%r1)       # restore original return address
        stosm   (STACK_PTREGS_PSW)(%r15),0
+       .endif
+
+       lg      %r14,(__SF_GPRS+8*8)(%r1)       # restore original return address
        aghi    %r1,-TRACED_FUNC_FRAME_SIZE
        stg     %r1,__SF_BACKCHAIN(%r15)
        stg     %r0,(STACK_PTREGS_PSW+8)(%r15)
        stmg    %r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
+       .endm
+
+SYM_CODE_START(ftrace_regs_caller)
+       ftrace_regs_entry       1
+       j       ftrace_common
+SYM_CODE_END(ftrace_regs_caller)
+
+SYM_CODE_START(ftrace_caller)
+       ftrace_regs_entry       0
+       j       ftrace_common
+SYM_CODE_END(ftrace_caller)
+
+SYM_CODE_START(ftrace_common)
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
        aghik   %r2,%r0,-MCOUNT_INSN_SIZE
        lgrl    %r4,function_trace_op
@@ -74,24 +95,31 @@ ENTRY(ftrace_caller)
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 # The j instruction gets runtime patched to a nop instruction.
 # See ftrace_enable_ftrace_graph_caller.
-       .globl ftrace_graph_caller
-ftrace_graph_caller:
-       j       ftrace_graph_caller_end
+SYM_INNER_LABEL(ftrace_graph_caller, SYM_L_GLOBAL)
+       j       .Lftrace_graph_caller_end
        lmg     %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15)
        lg      %r4,(STACK_PTREGS_PSW+8)(%r15)
        brasl   %r14,prepare_ftrace_return
        stg     %r2,(STACK_PTREGS_GPRS+14*8)(%r15)
-ftrace_graph_caller_end:
-       .globl  ftrace_graph_caller_end
+.Lftrace_graph_caller_end:
+#endif
+       lg      %r0,(STACK_PTREGS_PSW+8)(%r15)
+#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
+       ltg     %r1,STACK_PTREGS_ORIG_GPR2(%r15)
+       locgrz  %r1,%r0
+#else
+       lg      %r1,STACK_PTREGS_ORIG_GPR2(%r15)
+       ltgr    %r1,%r1
+       jnz     0f
+       lgr     %r1,%r0
 #endif
-       lg      %r1,(STACK_PTREGS_PSW+8)(%r15)
-       lmg     %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
+0:     lmg     %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15)
        BR_EX   %r1
-ENDPROC(ftrace_caller)
+SYM_CODE_END(ftrace_common)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-ENTRY(return_to_handler)
+SYM_FUNC_START(return_to_handler)
        stmg    %r2,%r5,32(%r15)
        lgr     %r1,%r15
        aghi    %r15,-STACK_FRAME_OVERHEAD
@@ -101,6 +129,6 @@ ENTRY(return_to_handler)
        lgr     %r14,%r2
        lmg     %r2,%r5,32(%r15)
        BR_EX   %r14
-ENDPROC(return_to_handler)
+SYM_FUNC_END(return_to_handler)
 
 #endif
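
The locgrz/ltgr sequence at the end of ftrace_common implements the direct-caller dispatch prepared by arch_ftrace_set_direct_caller(): a non-zero orig_gpr2 becomes the branch target, otherwise control returns to the traced function's saved PSW address. As conceptual C, not actual kernel code:

    /* Conceptual model of ftrace_common's exit-target selection. */
    static unsigned long ftrace_return_target(const struct pt_regs *regs)
    {
            return regs->orig_gpr2 ? regs->orig_gpr2    /* direct trampoline */
                                   : regs->psw.addr;    /* traced function */
    }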
index 250e4dbf653cc03ab7f891c929c3fe48a10e0960..60e6fec27bba085b525887a74bdff7775880629c 100644
@@ -38,7 +38,7 @@ static int __init nospec_report(void)
 {
        if (test_facility(156))
                pr_info("Spectre V2 mitigation: etokens\n");
-       if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+       if (nospec_uses_trampoline())
                pr_info("Spectre V2 mitigation: execute trampolines\n");
        if (__test_facility(82, alt_stfle_fac_list))
                pr_info("Spectre V2 mitigation: limited branch prediction\n");
index b4b5c8c211663d7d406309d40f6dec17c98a1e6d..52d4353188ad8cb3328dabe3f437f2e57fa6baaa 100644
@@ -15,7 +15,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
 {
        if (test_facility(156))
                return sprintf(buf, "Mitigation: etokens\n");
-       if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable)
+       if (nospec_uses_trampoline())
                return sprintf(buf, "Mitigation: execute trampolines\n");
        if (__test_facility(82, alt_stfle_fac_list))
                return sprintf(buf, "Mitigation: limited branch prediction\n");
index 4a99154fe65145d3a8f341c91954fcfa998bbd9f..6f431fa9e4d7b88774d80a4be2ce6ace44106edb 100644
@@ -773,22 +773,46 @@ static int __init cpumf_pmu_init(void)
  * counter set via normal file operations.
  */
 
-static atomic_t cfset_opencnt = ATOMIC_INIT(0);        /* Excl. access */
+static atomic_t cfset_opencnt = ATOMIC_INIT(0);                /* Access count */
 static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */
 struct cfset_call_on_cpu_parm {                /* Parm struct for smp_call_on_cpu */
        unsigned int sets;              /* Counter set bit mask */
        atomic_t cpus_ack;              /* # CPUs successfully executed func */
 };
 
-static struct cfset_request {          /* CPUs and counter set bit mask */
+static struct cfset_session {          /* List head of /dev/hwctr requests */
+       struct list_head head;          /* Head of list of active processes */
+} cfset_session = {
+       .head = LIST_HEAD_INIT(cfset_session.head)
+};
+
+struct cfset_request {                 /* CPUs and counter set bit mask */
        unsigned long ctrset;           /* Bit mask of counter set to read */
        cpumask_t mask;                 /* CPU mask to read from */
-} cfset_request;
+       struct list_head node;          /* Chain to cfset_session.head */
+};
+
+static void cfset_session_init(void)
+{
+       INIT_LIST_HEAD(&cfset_session.head);
+}
+
+/* Remove current request from global bookkeeping. Maintain a counter set bit
+ * mask on a per CPU basis.
+ * Done in process context under mutex protection.
+ */
+static void cfset_session_del(struct cfset_request *p)
+{
+       list_del(&p->node);
+}
 
-static void cfset_ctrset_clear(void)
+/* Add current request to global bookkeeping. Maintain a counter set bit mask
+ * on a per CPU basis.
+ * Done in process context under mutex protection.
+ */
+static void cfset_session_add(struct cfset_request *p)
 {
-       cpumask_clear(&cfset_request.mask);
-       cfset_request.ctrset = 0;
+       list_add(&p->node, &cfset_session.head);
 }
 
 /* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access
@@ -827,15 +851,23 @@ static void cfset_ioctl_off(void *parm)
        struct cfset_call_on_cpu_parm *p = parm;
        int rc;
 
-       cpuhw->dev_state = 0;
+       /* Check if any counter set is used by /dev/hwctr */
        for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc)
-               if ((p->sets & cpumf_ctr_ctl[rc]))
-                       atomic_dec(&cpuhw->ctr_set[rc]);
-       rc = lcctl(cpuhw->state);       /* Keep perf_event_open counter sets */
+               if ((p->sets & cpumf_ctr_ctl[rc])) {
+                       if (!atomic_dec_return(&cpuhw->ctr_set[rc])) {
+                               ctr_set_disable(&cpuhw->dev_state,
+                                               cpumf_ctr_ctl[rc]);
+                               ctr_set_stop(&cpuhw->dev_state,
+                                            cpumf_ctr_ctl[rc]);
+                       }
+               }
+       /* Keep perf_event_open counter sets */
+       rc = lcctl(cpuhw->dev_state | cpuhw->state);
        if (rc)
                pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n",
                       cpuhw->state, S390_HWCTR_DEVICE, rc);
-       cpuhw->flags &= ~PMU_F_IN_USE;
+       if (!cpuhw->dev_state)
+               cpuhw->flags &= ~PMU_F_IN_USE;
        debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n",
                            __func__, rc, cpuhw->state, cpuhw->dev_state);
 }
@@ -870,11 +902,26 @@ static void cfset_release_cpu(void *p)
 
        debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n",
                            __func__, cpuhw->state, cpuhw->dev_state);
+       cpuhw->dev_state = 0;
        rc = lcctl(cpuhw->state);       /* Keep perf_event_open counter sets */
        if (rc)
                pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n",
                       cpuhw->state, S390_HWCTR_DEVICE, rc);
-       cpuhw->dev_state = 0;
+}
+
+/* This modifies the process CPU mask to adapt it to the currently online
+ * CPUs. Offline CPUs cannot be addressed. This call terminates the access
+ * and is usually followed by close() or a new ioctl(..., START, ...) which
+ * creates a new request structure.
+ */
+static void cfset_all_stop(struct cfset_request *req)
+{
+       struct cfset_call_on_cpu_parm p = {
+               .sets = req->ctrset,
+       };
+
+       cpumask_and(&req->mask, &req->mask, cpu_online_mask);
+       on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1);
 }
 
 /* Release function is also called when application gets terminated without
@@ -882,10 +929,19 @@ static void cfset_release_cpu(void *p)
  */
 static int cfset_release(struct inode *inode, struct file *file)
 {
-       on_each_cpu(cfset_release_cpu, NULL, 1);
+       mutex_lock(&cfset_ctrset_mutex);
+       /* Open followed by close/exit has no private_data */
+       if (file->private_data) {
+               cfset_all_stop(file->private_data);
+               cfset_session_del(file->private_data);
+               kfree(file->private_data);
+               file->private_data = NULL;
+       }
+       if (!atomic_dec_return(&cfset_opencnt))
+               on_each_cpu(cfset_release_cpu, NULL, 1);
+       mutex_unlock(&cfset_ctrset_mutex);
+
        hw_perf_event_destroy(NULL);
-       cfset_ctrset_clear();
-       atomic_set(&cfset_opencnt, 0);
        return 0;
 }
 
@@ -893,9 +949,10 @@ static int cfset_open(struct inode *inode, struct file *file)
 {
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-       /* Only one user space program can open /dev/hwctr */
-       if (atomic_xchg(&cfset_opencnt, 1))
-               return -EBUSY;
+       mutex_lock(&cfset_ctrset_mutex);
+       if (atomic_inc_return(&cfset_opencnt) == 1)
+               cfset_session_init();
+       mutex_unlock(&cfset_ctrset_mutex);
 
        cpumf_hw_inuse();
        file->private_data = NULL;
@@ -903,25 +960,10 @@ static int cfset_open(struct inode *inode, struct file *file)
        return nonseekable_open(inode, file);
 }
 
-static int cfset_all_stop(void)
+static int cfset_all_start(struct cfset_request *req)
 {
        struct cfset_call_on_cpu_parm p = {
-               .sets = cfset_request.ctrset,
-       };
-       cpumask_var_t mask;
-
-       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-               return -ENOMEM;
-       cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
-       on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
-       free_cpumask_var(mask);
-       return 0;
-}
-
-static int cfset_all_start(void)
-{
-       struct cfset_call_on_cpu_parm p = {
-               .sets = cfset_request.ctrset,
+               .sets = req->ctrset,
                .cpus_ack = ATOMIC_INIT(0),
        };
        cpumask_var_t mask;
@@ -929,7 +971,7 @@ static int cfset_all_start(void)
 
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;
-       cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+       cpumask_and(mask, &req->mask, cpu_online_mask);
        on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1);
        if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) {
                on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1);
@@ -1045,7 +1087,7 @@ static void cfset_cpu_read(void *parm)
                            cpuhw->sets, cpuhw->used);
 }
 
-static int cfset_all_read(unsigned long arg)
+static int cfset_all_read(unsigned long arg, struct cfset_request *req)
 {
        struct cfset_call_on_cpu_parm p;
        cpumask_var_t mask;
@@ -1054,46 +1096,53 @@ static int cfset_all_read(unsigned long arg)
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;
 
-       p.sets = cfset_request.ctrset;
-       cpumask_and(mask, &cfset_request.mask, cpu_online_mask);
+       p.sets = req->ctrset;
+       cpumask_and(mask, &req->mask, cpu_online_mask);
        on_each_cpu_mask(mask, cfset_cpu_read, &p, 1);
        rc = cfset_all_copy(arg, mask);
        free_cpumask_var(mask);
        return rc;
 }
 
-static long cfset_ioctl_read(unsigned long arg)
+static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req)
 {
        struct s390_ctrset_read read;
-       int ret = 0;
+       int ret = -ENODATA;
 
-       if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
-               return -EFAULT;
-       ret = cfset_all_read(arg);
+       if (req && req->ctrset) {
+               if (copy_from_user(&read, (char __user *)arg, sizeof(read)))
+                       return -EFAULT;
+               ret = cfset_all_read(arg, req);
+       }
        return ret;
 }
 
-static long cfset_ioctl_stop(void)
+static long cfset_ioctl_stop(struct file *file)
 {
-       int ret = ENXIO;
-
-       if (cfset_request.ctrset) {
-               ret = cfset_all_stop();
-               cfset_ctrset_clear();
+       struct cfset_request *req = file->private_data;
+       int ret = -ENXIO;
+
+       if (req) {
+               cfset_all_stop(req);
+               cfset_session_del(req);
+               kfree(req);
+               file->private_data = NULL;
+               ret = 0;
        }
        return ret;
 }
 
-static long cfset_ioctl_start(unsigned long arg)
+static long cfset_ioctl_start(unsigned long arg, struct file *file)
 {
        struct s390_ctrset_start __user *ustart;
        struct s390_ctrset_start start;
+       struct cfset_request *preq;
        void __user *umask;
        unsigned int len;
        int ret = 0;
        size_t need;
 
-       if (cfset_request.ctrset)
+       if (file->private_data)
                return -EBUSY;
        ustart = (struct s390_ctrset_start __user *)arg;
        if (copy_from_user(&start, ustart, sizeof(start)))
@@ -1108,25 +1157,36 @@ static long cfset_ioctl_start(unsigned long arg)
                return -EINVAL;         /* Invalid counter set */
        if (!start.counter_sets)
                return -EINVAL;         /* No counter set at all? */
-       cpumask_clear(&cfset_request.mask);
+
+       preq = kzalloc(sizeof(*preq), GFP_KERNEL);
+       if (!preq)
+               return -ENOMEM;
+       cpumask_clear(&preq->mask);
        len = min_t(u64, start.cpumask_len, cpumask_size());
        umask = (void __user *)start.cpumask;
-       if (copy_from_user(&cfset_request.mask, umask, len))
+       if (copy_from_user(&preq->mask, umask, len)) {
+               kfree(preq);
                return -EFAULT;
-       if (cpumask_empty(&cfset_request.mask))
+       }
+       if (cpumask_empty(&preq->mask)) {
+               kfree(preq);
                return -EINVAL;
+       }
        need = cfset_needspace(start.counter_sets);
-       if (put_user(need, &ustart->data_bytes))
-               ret = -EFAULT;
-       if (ret)
-               goto out;
-       cfset_request.ctrset = start.counter_sets;
-       ret = cfset_all_start();
-out:
-       if (ret)
-               cfset_ctrset_clear();
-       debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n",
-                           __func__, cfset_request.ctrset, need, ret);
+       if (put_user(need, &ustart->data_bytes)) {
+               kfree(preq);
+               return -EFAULT;
+       }
+       preq->ctrset = start.counter_sets;
+       ret = cfset_all_start(preq);
+       if (!ret) {
+               cfset_session_add(preq);
+               file->private_data = preq;
+               debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n",
+                                   __func__, preq->ctrset, need, ret);
+       } else {
+               kfree(preq);
+       }
        return ret;
 }
 
@@ -1136,7 +1196,7 @@ out:
  *    counter set keeps running until explicitly stopped. Returns the number
  *    of bytes needed to store the counter values. If another S390_HWCTR_START
  *    ioctl subcommand is called without a previous S390_HWCTR_STOP stop
- *    command, -EBUSY is returned.
+ *    command on the same file descriptor, -EBUSY is returned.
  * S390_HWCTR_READ: Read the counter set values from specified CPU list given
  *    with the S390_HWCTR_START command.
  * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the
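The START/READ/STOP protocol documented above is driven entirely through ioctl(2) on /dev/hwctr, with one request now tracked per open file descriptor. A minimal user-space sketch follows; it assumes the UAPI header asm/hwctrset.h that ships with this driver (struct s390_ctrset_start, struct s390_ctrset_read and the S390_HWCTR_* ioctl numbers), and the interface version and counter-set bit values used here are illustrative placeholders, not authoritative:

	/* Sketch only: exercise the /dev/hwctr START/READ/STOP sequence.
	 * Assumes <asm/hwctrset.h> as added with this driver; the version
	 * number and counter-set bit below are assumed values.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <asm/hwctrset.h>

	int main(void)
	{
		__u64 cpumask = 1;			/* CPU 0 only */
		struct s390_ctrset_start start = {
			.version = 1,			/* interface version (assumed) */
			.cpumask_len = sizeof(cpumask),
			.cpumask = &cpumask,
			.counter_sets = 2,		/* basic counter set bit (assumed) */
		};
		struct s390_ctrset_read *buf;
		int fd = open("/dev/" S390_HWCTR_DEVICE, O_RDWR);

		if (fd < 0)
			return EXIT_FAILURE;
		/* START validates the request and returns the READ buffer size
		 * in start.data_bytes; a second START on the same fd gets -EBUSY.
		 */
		if (ioctl(fd, S390_HWCTR_START, &start) == 0) {
			buf = malloc(start.data_bytes);
			if (buf && ioctl(fd, S390_HWCTR_READ, buf) == 0)
				printf("read data for %llu CPU(s)\n",
				       (unsigned long long)buf->no_cpus);
			free(buf);
			ioctl(fd, S390_HWCTR_STOP, 0);	/* close() would also stop */
		}
		close(fd);
		return 0;
	}
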
@@ -1150,13 +1210,13 @@ static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        mutex_lock(&cfset_ctrset_mutex);
        switch (cmd) {
        case S390_HWCTR_START:
-               ret = cfset_ioctl_start(arg);
+               ret = cfset_ioctl_start(arg, file);
                break;
        case S390_HWCTR_STOP:
-               ret = cfset_ioctl_stop();
+               ret = cfset_ioctl_stop(file);
                break;
        case S390_HWCTR_READ:
-               ret = cfset_ioctl_read(arg);
+               ret = cfset_ioctl_read(arg, file->private_data);
                break;
        default:
                ret = -ENOTTY;
@@ -1182,29 +1242,41 @@ static struct miscdevice cfset_dev = {
        .fops   = &cfset_fops,
 };
 
+/* Hotplug add of a CPU. Scan through all active processes and add
+ * that CPU to the list of CPUs supplied with ioctl(..., START, ...).
+ */
 int cfset_online_cpu(unsigned int cpu)
 {
        struct cfset_call_on_cpu_parm p;
+       struct cfset_request *rp;
 
        mutex_lock(&cfset_ctrset_mutex);
-       if (cfset_request.ctrset) {
-               p.sets = cfset_request.ctrset;
-               cfset_ioctl_on(&p);
-               cpumask_set_cpu(cpu, &cfset_request.mask);
+       if (!list_empty(&cfset_session.head)) {
+               list_for_each_entry(rp, &cfset_session.head, node) {
+                       p.sets = rp->ctrset;
+                       cfset_ioctl_on(&p);
+                       cpumask_set_cpu(cpu, &rp->mask);
+               }
        }
        mutex_unlock(&cfset_ctrset_mutex);
        return 0;
 }
 
+/* Hotplug remove of a CPU. Scan through all active processes and clear
+ * that CPU from the list of CPUs supplied with ioctl(..., START, ...).
+ */
 int cfset_offline_cpu(unsigned int cpu)
 {
        struct cfset_call_on_cpu_parm p;
+       struct cfset_request *rp;
 
        mutex_lock(&cfset_ctrset_mutex);
-       if (cfset_request.ctrset) {
-               p.sets = cfset_request.ctrset;
-               cfset_ioctl_off(&p);
-               cpumask_clear_cpu(cpu, &cfset_request.mask);
+       if (!list_empty(&cfset_session.head)) {
+               list_for_each_entry(rp, &cfset_session.head, node) {
+                       p.sets = rp->ctrset;
+                       cfset_ioctl_off(&p);
+                       cpumask_clear_cpu(cpu, &rp->mask);
+               }
        }
        mutex_unlock(&cfset_ctrset_mutex);
        return 0;
index e5dd46b1bff8cbc3bd85c64799b14a07b20b4357..e8858b2de24b7529f3303f34ac4cdd0453b464aa 100644 (file)
@@ -141,7 +141,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
                frame->childregs.gprs[10] = arg;
                frame->childregs.gprs[11] = (unsigned long)do_exit;
                frame->childregs.orig_gpr2 = -1;
-
+               frame->childregs.last_break = 1;
                return 0;
        }
        frame->childregs = *current_pt_regs();
index 67e5fff96ee0673dc01633c0dd7c62b9d6f106b9..40405f2304f1be2376f8c275aec9a3ecd83d0d4a 100644 (file)
@@ -95,10 +95,10 @@ EXPORT_SYMBOL(console_irq);
  * relocated above 2 GB, because it has to use 31 bit addresses.
  * Such code and data is part of the .amode31 section.
  */
-unsigned long __amode31_ref __samode31 = __pa(&_samode31);
-unsigned long __amode31_ref __eamode31 = __pa(&_eamode31);
-unsigned long __amode31_ref __stext_amode31 = __pa(&_stext_amode31);
-unsigned long __amode31_ref __etext_amode31 = __pa(&_etext_amode31);
+unsigned long __amode31_ref __samode31 = (unsigned long)&_samode31;
+unsigned long __amode31_ref __eamode31 = (unsigned long)&_eamode31;
+unsigned long __amode31_ref __stext_amode31 = (unsigned long)&_stext_amode31;
+unsigned long __amode31_ref __etext_amode31 = (unsigned long)&_etext_amode31;
 struct exception_table_entry __amode31_ref *__start_amode31_ex_table = _start_amode31_ex_table;
 struct exception_table_entry __amode31_ref *__stop_amode31_ex_table = _stop_amode31_ex_table;
 
@@ -149,6 +149,7 @@ struct mem_detect_info __bootdata(mem_detect);
 struct initrd_data __bootdata(initrd_data);
 
 unsigned long __bootdata_preserved(__kaslr_offset);
+unsigned long __bootdata(__amode31_base);
 unsigned int __bootdata_preserved(zlib_dfltcc_support);
 EXPORT_SYMBOL(zlib_dfltcc_support);
 u64 __bootdata_preserved(stfle_fac_list[16]);
@@ -173,6 +174,8 @@ unsigned long MODULES_END;
 struct lowcore *lowcore_ptr[NR_CPUS];
 EXPORT_SYMBOL(lowcore_ptr);
 
+DEFINE_STATIC_KEY_FALSE(cpu_has_bear);
+
 /*
  * The Write Back bit position in the physaddr is given by the SLPC PCI.
  * Leaving the mask zero always uses write through which is safe
@@ -593,7 +596,8 @@ static void __init setup_resources(void)
         * part of the System RAM resource.
         */
        if (crashk_res.end) {
-               memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
+               memblock_add_node(crashk_res.start, resource_size(&crashk_res),
+                                 0, MEMBLOCK_NONE);
                memblock_reserve(crashk_res.start, resource_size(&crashk_res));
                insert_resource(&iomem_resource, &crashk_res);
        }
@@ -693,7 +697,7 @@ static void __init reserve_crashkernel(void)
        }
 
        if (register_memory_notifier(&kdump_mem_nb)) {
-               memblock_free(crash_base, crash_size);
+               memblock_phys_free(crash_base, crash_size);
                return;
        }
 
@@ -718,7 +722,7 @@ static void __init reserve_initrd(void)
 #ifdef CONFIG_BLK_DEV_INITRD
        if (!initrd_data.start || !initrd_data.size)
                return;
-       initrd_start = initrd_data.start;
+       initrd_start = (unsigned long)__va(initrd_data.start);
        initrd_end = initrd_start + initrd_data.size;
        memblock_reserve(initrd_data.start, initrd_data.size);
 #endif
@@ -748,7 +752,7 @@ static void __init free_mem_detect_info(void)
 
        get_mem_detect_reserved(&start, &size);
        if (size)
-               memblock_free(start, size);
+               memblock_phys_free(start, size);
 }
 
 static const char * __init get_mem_info_source(void)
@@ -793,7 +797,7 @@ static void __init check_initrd(void)
        if (initrd_data.start && initrd_data.size &&
            !memblock_is_region_memory(initrd_data.start, initrd_data.size)) {
                pr_err("The initial RAM disk does not fit into the memory\n");
-               memblock_free(initrd_data.start, initrd_data.size);
+               memblock_phys_free(initrd_data.start, initrd_data.size);
                initrd_start = initrd_end = 0;
        }
 #endif
@@ -804,12 +808,10 @@ static void __init check_initrd(void)
  */
 static void __init reserve_kernel(void)
 {
-       unsigned long start_pfn = PFN_UP(__pa(_end));
-
        memblock_reserve(0, STARTUP_NORMAL_OFFSET);
-       memblock_reserve((unsigned long)sclp_early_sccb, EXT_SCCB_READ_SCP);
-       memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn)
-                        - (unsigned long)_stext);
+       memblock_reserve(__amode31_base, __eamode31 - __samode31);
+       memblock_reserve(__pa(sclp_early_sccb), EXT_SCCB_READ_SCP);
+       memblock_reserve(__pa(_stext), _end - _stext);
 }
 
 static void __init setup_memory(void)
@@ -831,20 +833,14 @@ static void __init setup_memory(void)
 
 static void __init relocate_amode31_section(void)
 {
-       unsigned long amode31_addr, amode31_size;
-       long amode31_offset;
+       unsigned long amode31_size = __eamode31 - __samode31;
+       long amode31_offset = __amode31_base - __samode31;
        long *ptr;
 
-       /* Allocate a new AMODE31 capable memory region */
-       amode31_size = __eamode31 - __samode31;
        pr_info("Relocating AMODE31 section of size 0x%08lx\n", amode31_size);
-       amode31_addr = (unsigned long)memblock_alloc_low(amode31_size, PAGE_SIZE);
-       if (!amode31_addr)
-               panic("Failed to allocate memory for AMODE31 section\n");
-       amode31_offset = amode31_addr - __samode31;
 
        /* Move original AMODE31 section to the new one */
-       memmove((void *)amode31_addr, (void *)__samode31, amode31_size);
+       memmove((void *)__amode31_base, (void *)__samode31, amode31_size);
        /* Zero out the old AMODE31 section to catch invalid accesses within it */
        memset((void *)__samode31, 0, amode31_size);
 
@@ -883,14 +879,12 @@ static void __init setup_randomness(void)
 {
        struct sysinfo_3_2_2 *vmms;
 
-       vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE,
-                                                           PAGE_SIZE);
+       vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
        if (!vmms)
                panic("Failed to allocate memory for sysinfo structure\n");
-
        if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
                add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
-       memblock_free((unsigned long) vmms, PAGE_SIZE);
+       memblock_free(vmms, PAGE_SIZE);
 }
 
 /*
@@ -1047,6 +1041,9 @@ void __init setup_arch(char **cmdline_p)
        smp_detect_cpus();
        topology_init_early();
 
+       if (test_facility(193))
+               static_branch_enable(&cpu_has_bear);
+
        /*
         * Create kernel page tables and switch to virtual addressing.
         */
index 1a04e5bdf6555de99164894db4bc4a79bc9f46f3..78a8ea6fd582a8828d079e8f6a2265ca27513afe 100644 (file)
@@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void)
                        /* Get the CPU registers */
                        smp_save_cpu_regs(sa, addr, is_boot_cpu, page);
        }
-       memblock_free(page, PAGE_SIZE);
+       memblock_phys_free(page, PAGE_SIZE);
        diag_amode31_ops.diag308_reset();
        pcpu_set_smt(0);
 }
@@ -880,7 +880,7 @@ void __init smp_detect_cpus(void)
 
        /* Add CPUs present at boot */
        __smp_rescan_cpus(info, true);
-       memblock_free_early((unsigned long)info, sizeof(*info));
+       memblock_phys_free((unsigned long)info, sizeof(*info));
 }
 
 /*
index 8fe2d23b64f439fd92400b8cd04ecfac68f95bf5..dc2355c623d6ea1edf86c0e5f137db89bf9b07b7 100644 (file)
@@ -154,6 +154,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap)
        regs->psw = S390_lowcore.svc_old_psw;
        regs->int_code = S390_lowcore.svc_int_code;
        update_timer_sys();
+       if (static_branch_likely(&cpu_has_bear))
+               current->thread.last_break = regs->last_break;
 
        local_irq_enable();
        regs->orig_gpr2 = regs->gprs[2];
index bcefc2173de45b081c5287251f390ed746ae49ef..6c6f7dcce1a510b67b772ed5798e9481767e3db0 100644 (file)
@@ -300,7 +300,6 @@ static void (*pgm_check_table[128])(struct pt_regs *regs);
 
 void noinstr __do_pgm_check(struct pt_regs *regs)
 {
-       unsigned long last_break = S390_lowcore.breaking_event_addr;
        unsigned int trapnr;
        irqentry_state_t state;
 
@@ -311,10 +310,11 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
 
        if (user_mode(regs)) {
                update_timer_sys();
-               if (last_break < 4096)
-                       last_break = 1;
-               current->thread.last_break = last_break;
-               regs->args[0] = last_break;
+               if (!static_branch_likely(&cpu_has_bear)) {
+                       if (regs->last_break < 4096)
+                               regs->last_break = 1;
+               }
+               current->thread.last_break = regs->last_break;
        }
 
        if (S390_lowcore.pgm_code & 0x0200) {
index 8b0e62507d62e845aa31c575354100d5d4cf328f..386d4e42b8d361c95525602b9ec7dae64e6facf1 100644 (file)
@@ -64,7 +64,7 @@ void __init setup_uv(void)
        }
 
        if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) {
-               memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+               memblock_phys_free(uv_stor_base, uv_info.uv_base_stor_len);
                goto fail;
        }
 
index 63bdb9e1bfc1343234bf71398ddc27e5f0de4e8b..42c43521878ff7b116dcd5ba424e556cc3bf1421 100644 (file)
@@ -212,6 +212,7 @@ SECTIONS
                QUAD(__dynsym_start)                            /* dynsym_start */
                QUAD(__rela_dyn_start)                          /* rela_dyn_start */
                QUAD(__rela_dyn_end)                            /* rela_dyn_end */
+               QUAD(_eamode31 - _samode31)                     /* amode31_size */
        } :NONE
 
        /* Debugging sections.  */
index 2245f4b8d362953f8f64c3d0103c7b2579e37c40..c3bd993fdd0cf2987c19cf57ee3e1bc009efcdc9 100644 (file)
@@ -960,7 +960,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
        /* bit 1+2 of the target are the ilc, so we can directly use ilen */
        rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
        rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
-                                (u64 *) __LC_LAST_BREAK);
+                                (u64 *) __LC_PGM_LAST_BREAK);
        rc |= put_guest_lc(vcpu, pgm_info.code,
                           (u16 *)__LC_PGM_INT_CODE);
        rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
index 678333936f78ca881a6850e98da6f2d28e2aefb6..707cd4622c132d509e71e8d55a48cd2d314858fa 100644 (file)
@@ -7,6 +7,8 @@ lib-y += delay.o string.o uaccess.o find.o spinlock.o
 obj-y += mem.o xor.o
 lib-$(CONFIG_KPROBES) += probes.o
 lib-$(CONFIG_UPROBES) += probes.o
+obj-$(CONFIG_S390_KPROBES_SANITY_TEST) += test_kprobes_s390.o
+test_kprobes_s390-objs += test_kprobes_asm.o test_kprobes.o
 
 # Instrumenting memory accesses to __user data (in different address space)
 # produce false positives
index 9b2dab5a69f995055c9067f37a5688d65f1b4642..692dc84cd19c87dade2551af2650421378954e62 100644 (file)
@@ -26,7 +26,7 @@ static int __init spin_retry_init(void)
 }
 early_initcall(spin_retry_init);
 
-/**
+/*
  * spin_retry= parameter
  */
 static int __init spin_retry_setup(char *str)
index 47080560e0d8bbe0b7a84c5ece6f9624c11c62c3..7d87418182397a2848e45ae131b86319513de687 100644 (file)
@@ -100,32 +100,6 @@ char *strcpy(char *dest, const char *src)
 EXPORT_SYMBOL(strcpy);
 #endif
 
-/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
-#ifdef __HAVE_ARCH_STRLCPY
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
-       size_t ret = __strend(src) - src;
-
-       if (size) {
-               size_t len = (ret >= size) ? size-1 : ret;
-               dest[len] = '\0';
-               memcpy(dest, src, len);
-       }
-       return ret;
-}
-EXPORT_SYMBOL(strlcpy);
-#endif
-
 /**
  * strncpy - Copy a length-limited, %NUL-terminated string
  * @dest: Where to copy the string to
@@ -254,25 +228,6 @@ int strcmp(const char *s1, const char *s2)
 EXPORT_SYMBOL(strcmp);
 #endif
 
-/**
- * strrchr - Find the last occurrence of a character in a string
- * @s: The string to be searched
- * @c: The character to search for
- */
-#ifdef __HAVE_ARCH_STRRCHR
-char *strrchr(const char *s, int c)
-{
-       ssize_t len = __strend(s) - s;
-
-       do {
-               if (s[len] == (char)c)
-                       return (char *)s + len;
-       } while (--len >= 0);
-       return NULL;
-}
-EXPORT_SYMBOL(strrchr);
-#endif
-
 static inline int clcle(const char *s1, unsigned long l1,
                        const char *s2, unsigned long l2)
 {
diff --git a/arch/s390/lib/test_kprobes.c b/arch/s390/lib/test_kprobes.c
new file mode 100644 (file)
index 0000000..9e62d62
--- /dev/null
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/random.h>
+#include <kunit/test.h>
+#include "test_kprobes.h"
+
+static struct kprobe kp;
+
+static void setup_kprobe(struct kunit *test, struct kprobe *kp,
+                        const char *symbol, int offset)
+{
+       kp->offset = offset;
+       kp->addr = NULL;
+       kp->symbol_name = symbol;
+}
+
+static void test_kprobe_offset(struct kunit *test, struct kprobe *kp,
+                              const char *target, int offset)
+{
+       int ret;
+
+       setup_kprobe(test, kp, target, 0);
+       ret = register_kprobe(kp);
+       if (!ret)
+               unregister_kprobe(kp);
+       KUNIT_EXPECT_EQ(test, 0, ret);
+       setup_kprobe(test, kp, target, offset);
+       ret = register_kprobe(kp);
+       KUNIT_EXPECT_EQ(test, -EINVAL, ret);
+       if (!ret)
+               unregister_kprobe(kp);
+}
+
+static void test_kprobe_odd(struct kunit *test)
+{
+       test_kprobe_offset(test, &kp, "kprobes_target_odd",
+                          kprobes_target_odd_offs);
+}
+
+static void test_kprobe_in_insn4(struct kunit *test)
+{
+       test_kprobe_offset(test, &kp, "kprobes_target_in_insn4",
+                          kprobes_target_in_insn4_offs);
+}
+
+static void test_kprobe_in_insn6_lo(struct kunit *test)
+{
+       test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_lo",
+                          kprobes_target_in_insn6_lo_offs);
+}
+
+static void test_kprobe_in_insn6_hi(struct kunit *test)
+{
+       test_kprobe_offset(test, &kp, "kprobes_target_in_insn6_hi",
+                          kprobes_target_in_insn6_hi_offs);
+}
+
+static struct kunit_case kprobes_testcases[] = {
+       KUNIT_CASE(test_kprobe_odd),
+       KUNIT_CASE(test_kprobe_in_insn4),
+       KUNIT_CASE(test_kprobe_in_insn6_lo),
+       KUNIT_CASE(test_kprobe_in_insn6_hi),
+       {}
+};
+
+static struct kunit_suite kprobes_test_suite = {
+       .name = "kprobes_test_s390",
+       .test_cases = kprobes_testcases,
+};
+
+kunit_test_suites(&kprobes_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/test_kprobes.h b/arch/s390/lib/test_kprobes.h
new file mode 100644 (file)
index 0000000..2b4c9bc
--- /dev/null
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_KPROBES_H
+#define TEST_KPROBES_H
+
+extern unsigned long kprobes_target_odd_offs;
+extern unsigned long kprobes_target_in_insn4_offs;
+extern unsigned long kprobes_target_in_insn6_lo_offs;
+extern unsigned long kprobes_target_in_insn6_hi_offs;
+
+#endif
diff --git a/arch/s390/lib/test_kprobes_asm.S b/arch/s390/lib/test_kprobes_asm.S
new file mode 100644 (file)
index 0000000..ade7a30
--- /dev/null
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#include <linux/linkage.h>
+#include <asm/ftrace.h>
+
+#define KPROBES_TARGET_START(name)     \
+       SYM_FUNC_START(name);           \
+       FTRACE_GEN_NOP_ASM(name)
+
+#define KPROBES_TARGET_END(name)       \
+       SYM_FUNC_END(name);             \
+       SYM_DATA(name##_offs, .quad 1b - name)
+
+KPROBES_TARGET_START(kprobes_target_in_insn4)
+       .word 0x4700 // bc 0,0
+1:     .word 0x0000
+       br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn4)
+
+KPROBES_TARGET_START(kprobes_target_in_insn6_lo)
+       .word 0xe310 // ly 1,0
+1:     .word 0x0000
+       .word 0x0058
+       br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn6_lo)
+
+KPROBES_TARGET_START(kprobes_target_in_insn6_hi)
+       .word 0xe310 // ly 1,0
+       .word 0x0000
+1:     .word 0x0058
+       br %r14
+KPROBES_TARGET_END(kprobes_target_in_insn6_hi)
+
+KPROBES_TARGET_START(kprobes_target_bp)
+       nop
+       .word 0x0000
+       nop
+1:     br %r14
+KPROBES_TARGET_END(kprobes_target_bp)
+
+KPROBES_TARGET_START(kprobes_target_odd)
+       .byte 0x07
+1:     .byte 0x07
+       br %r14
+KPROBES_TARGET_END(kprobes_target_odd)
index ecf327d743a0396d60aa12a0b62e64771d513689..cfc5f5557c06756236b935eacee5313da1f663d3 100644 (file)
@@ -3,7 +3,7 @@
  * Test module for unwind_for_each_frame
  */
 
-#define pr_fmt(fmt) "test_unwind: " fmt
+#include <kunit/test.h>
 #include <asm/unwind.h>
 #include <linux/completion.h>
 #include <linux/kallsyms.h>
@@ -16,6 +16,8 @@
 #include <linux/wait.h>
 #include <asm/irq.h>
 
+struct kunit *current_test;
+
 #define BT_BUF_SIZE (PAGE_SIZE * 4)
 
 /*
@@ -29,7 +31,7 @@ static void print_backtrace(char *bt)
                p = strsep(&bt, "\n");
                if (!p)
                        break;
-               pr_err("%s\n", p);
+               kunit_err(current_test, "%s\n", p);
        }
 }
 
@@ -49,7 +51,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
 
        bt = kmalloc(BT_BUF_SIZE, GFP_ATOMIC);
        if (!bt) {
-               pr_err("failed to allocate backtrace buffer\n");
+               kunit_err(current_test, "failed to allocate backtrace buffer\n");
                return -ENOMEM;
        }
        /* Unwind. */
@@ -63,7 +65,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
                if (frame_count++ == max_frames)
                        break;
                if (state.reliable && !addr) {
-                       pr_err("unwind state reliable but addr is 0\n");
+                       kunit_err(current_test, "unwind state reliable but addr is 0\n");
                        ret = -EINVAL;
                        break;
                }
@@ -75,7 +77,7 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
                                           stack_type_name(state.stack_info.type),
                                           (void *)state.sp, (void *)state.ip);
                        if (bt_pos >= BT_BUF_SIZE)
-                               pr_err("backtrace buffer is too small\n");
+                               kunit_err(current_test, "backtrace buffer is too small\n");
                }
                frame_count += 1;
                if (prev_is_func2 && str_has_prefix(sym, "unwindme_func1"))
@@ -85,15 +87,15 @@ static noinline int test_unwind(struct task_struct *task, struct pt_regs *regs,
 
        /* Check the results. */
        if (unwind_error(&state)) {
-               pr_err("unwind error\n");
+               kunit_err(current_test, "unwind error\n");
                ret = -EINVAL;
        }
        if (!seen_func2_func1) {
-               pr_err("unwindme_func2 and unwindme_func1 not found\n");
+               kunit_err(current_test, "unwindme_func2 and unwindme_func1 not found\n");
                ret = -EINVAL;
        }
        if (frame_count == max_frames) {
-               pr_err("Maximum number of frames exceeded\n");
+               kunit_err(current_test, "Maximum number of frames exceeded\n");
                ret = -EINVAL;
        }
        if (ret)
@@ -166,7 +168,7 @@ static noinline int unwindme_func4(struct unwindme *u)
                kp.pre_handler = pgm_pre_handler;
                ret = register_kprobe(&kp);
                if (ret < 0) {
-                       pr_err("register_kprobe failed %d\n", ret);
+                       kunit_err(current_test, "register_kprobe failed %d\n", ret);
                        return -EINVAL;
                }
 
@@ -252,7 +254,7 @@ static int test_unwind_irq(struct unwindme *u)
 }
 
 /* Spawns a task and passes it to test_unwind(). */
-static int test_unwind_task(struct unwindme *u)
+static int test_unwind_task(struct kunit *test, struct unwindme *u)
 {
        struct task_struct *task;
        int ret;
@@ -267,7 +269,7 @@ static int test_unwind_task(struct unwindme *u)
         */
        task = kthread_run(unwindme_func1, u, "%s", __func__);
        if (IS_ERR(task)) {
-               pr_err("kthread_run() failed\n");
+               kunit_err(test, "kthread_run() failed\n");
                return PTR_ERR(task);
        }
        /*
@@ -282,77 +284,98 @@ static int test_unwind_task(struct unwindme *u)
        return ret;
 }
 
-static int test_unwind_flags(int flags)
+struct test_params {
+       int flags;
+       char *name;
+};
+
+/*
+ * Create required parameter list for tests
+ */
+static const struct test_params param_list[] = {
+       {.flags = UWM_DEFAULT, .name = "UWM_DEFAULT"},
+       {.flags = UWM_SP, .name = "UWM_SP"},
+       {.flags = UWM_REGS, .name = "UWM_REGS"},
+       {.flags = UWM_SWITCH_STACK,
+               .name = "UWM_SWITCH_STACK"},
+       {.flags = UWM_SP | UWM_REGS,
+               .name = "UWM_SP | UWM_REGS"},
+       {.flags = UWM_CALLER | UWM_SP,
+               .name = "WM_CALLER | UWM_SP"},
+       {.flags = UWM_CALLER | UWM_SP | UWM_REGS,
+               .name = "UWM_CALLER | UWM_SP | UWM_REGS"},
+       {.flags = UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK,
+               .name = "UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK"},
+       {.flags = UWM_THREAD, .name = "UWM_THREAD"},
+       {.flags = UWM_THREAD | UWM_SP,
+               .name = "UWM_THREAD | UWM_SP"},
+       {.flags = UWM_THREAD | UWM_CALLER | UWM_SP,
+               .name = "UWM_THREAD | UWM_CALLER | UWM_SP"},
+       {.flags = UWM_IRQ, .name = "UWM_IRQ"},
+       {.flags = UWM_IRQ | UWM_SWITCH_STACK,
+               .name = "UWM_IRQ | UWM_SWITCH_STACK"},
+       {.flags = UWM_IRQ | UWM_SP,
+               .name = "UWM_IRQ | UWM_SP"},
+       {.flags = UWM_IRQ | UWM_REGS,
+               .name = "UWM_IRQ | UWM_REGS"},
+       {.flags = UWM_IRQ | UWM_SP | UWM_REGS,
+               .name = "UWM_IRQ | UWM_SP | UWM_REGS"},
+       {.flags = UWM_IRQ | UWM_CALLER | UWM_SP,
+               .name = "UWM_IRQ | UWM_CALLER | UWM_SP"},
+       {.flags = UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS,
+               .name = "UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS"},
+       {.flags = UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK,
+               .name = "UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK"},
+       #ifdef CONFIG_KPROBES
+       {.flags = UWM_PGM, .name = "UWM_PGM"},
+       {.flags = UWM_PGM | UWM_SP,
+               .name = "UWM_PGM | UWM_SP"},
+       {.flags = UWM_PGM | UWM_REGS,
+               .name = "UWM_PGM | UWM_REGS"},
+       {.flags = UWM_PGM | UWM_SP | UWM_REGS,
+               .name = "UWM_PGM | UWM_SP | UWM_REGS"},
+       #endif
+};
+
+/*
+ * Parameter description generator: required for KUNIT_ARRAY_PARAM()
+ */
+static void get_desc(const struct test_params *params, char *desc)
+{
+       strscpy(desc, params->name, KUNIT_PARAM_DESC_SIZE);
+}
+
+/*
+ * Create test_unwind_gen_params
+ */
+KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc);
+
+static void test_unwind_flags(struct kunit *test)
 {
        struct unwindme u;
+       const struct test_params *params;
 
-       u.flags = flags;
+       current_test = test;
+       params = (const struct test_params *)test->param_value;
+       u.flags = params->flags;
        if (u.flags & UWM_THREAD)
-               return test_unwind_task(&u);
+               KUNIT_EXPECT_EQ(test, 0, test_unwind_task(test, &u));
        else if (u.flags & UWM_IRQ)
-               return test_unwind_irq(&u);
+               KUNIT_EXPECT_EQ(test, 0, test_unwind_irq(&u));
        else
-               return unwindme_func1(&u);
+               KUNIT_EXPECT_EQ(test, 0, unwindme_func1(&u));
 }
 
-static int test_unwind_init(void)
-{
-       int failed = 0;
-       int total = 0;
-
-#define TEST(flags)                                                    \
-do {                                                                   \
-       pr_info("[ RUN      ] " #flags "\n");                           \
-       total++;                                                        \
-       if (!test_unwind_flags((flags))) {                              \
-               pr_info("[       OK ] " #flags "\n");                   \
-       } else {                                                        \
-               pr_err("[  FAILED  ] " #flags "\n");                    \
-               failed++;                                               \
-       }                                                               \
-} while (0)
-
-       pr_info("running stack unwinder tests");
-       TEST(UWM_DEFAULT);
-       TEST(UWM_SP);
-       TEST(UWM_REGS);
-       TEST(UWM_SWITCH_STACK);
-       TEST(UWM_SP | UWM_REGS);
-       TEST(UWM_CALLER | UWM_SP);
-       TEST(UWM_CALLER | UWM_SP | UWM_REGS);
-       TEST(UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK);
-       TEST(UWM_THREAD);
-       TEST(UWM_THREAD | UWM_SP);
-       TEST(UWM_THREAD | UWM_CALLER | UWM_SP);
-       TEST(UWM_IRQ);
-       TEST(UWM_IRQ | UWM_SWITCH_STACK);
-       TEST(UWM_IRQ | UWM_SP);
-       TEST(UWM_IRQ | UWM_REGS);
-       TEST(UWM_IRQ | UWM_SP | UWM_REGS);
-       TEST(UWM_IRQ | UWM_CALLER | UWM_SP);
-       TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS);
-       TEST(UWM_IRQ | UWM_CALLER | UWM_SP | UWM_REGS | UWM_SWITCH_STACK);
-#ifdef CONFIG_KPROBES
-       TEST(UWM_PGM);
-       TEST(UWM_PGM | UWM_SP);
-       TEST(UWM_PGM | UWM_REGS);
-       TEST(UWM_PGM | UWM_SP | UWM_REGS);
-#endif
-#undef TEST
-       if (failed) {
-               pr_err("%d of %d stack unwinder tests failed", failed, total);
-               WARN(1, "%d of %d stack unwinder tests failed", failed, total);
-       } else {
-               pr_info("all %d stack unwinder tests passed", total);
-       }
+static struct kunit_case unwind_test_cases[] = {
+       KUNIT_CASE_PARAM(test_unwind_flags, test_unwind_gen_params),
+       {}
+};
 
-       return failed ? -EINVAL : 0;
-}
+static struct kunit_suite test_unwind_suite = {
+       .name = "test_unwind",
+       .test_cases = unwind_test_cases,
+};
 
-static void test_unwind_exit(void)
-{
-}
+kunit_test_suites(&test_unwind_suite);
 
-module_init(test_unwind_init);
-module_exit(test_unwind_exit);
 MODULE_LICENSE("GPL");
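The conversion above moves the unwinder selftest from a hand-rolled TEST() macro with pr_info()/pr_err() bookkeeping to KUnit's parameterized-test machinery. KUNIT_ARRAY_PARAM(test_unwind, param_list, get_desc) emits a generator function named test_unwind_gen_params(), which KUNIT_CASE_PARAM() invokes once per array entry. A simplified sketch of the generated function, relying on the param_list, get_desc and struct test_params definitions above (the real macro in <kunit/test.h> is written generically with typeof(); this is not its literal expansion):

	/* Simplified sketch of what KUNIT_ARRAY_PARAM() generates here. */
	static const void *test_unwind_gen_params(const void *prev, char *desc)
	{
		const struct test_params *next = prev ?
			(const struct test_params *)prev + 1 : param_list;

		if (next - param_list >= ARRAY_SIZE(param_list))
			return NULL;		/* iteration done, suite moves on */
		get_desc(next, desc);		/* per-case description for the log */
		return next;
	}

With the suite registered via kunit_test_suites(), the tests run through the standard KUnit tooling (for example ./tools/testing/kunit/kunit.py with a .kunitconfig enabling the test) instead of requiring a manual scan of the kernel log for pass/fail lines.
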
index 1141c8d5c0d038a23ca3ecc2ced56e7b63a3c47a..2203164b39dae4043b4ba0a74d660f2885a034fc 100644 (file)
@@ -14,8 +14,8 @@
 #include <linux/moduleparam.h>
 #include <linux/gfp.h>
 #include <linux/sched.h>
+#include <linux/string_helpers.h>
 #include <linux/sysctl.h>
-#include <linux/ctype.h>
 #include <linux/swap.h>
 #include <linux/kthread.h>
 #include <linux/oom.h>
@@ -394,13 +394,10 @@ static int __init cmm_init(void)
                goto out_sysctl;
 #ifdef CONFIG_CMM_IUCV
        /* convert sender to uppercase characters */
-       if (sender) {
-               int len = strlen(sender);
-               while (len--)
-                       sender[len] = toupper(sender[len]);
-       } else {
+       if (sender)
+               string_upper(sender, sender);
+       else
                sender = cmm_default_sender;
-       }
 
        rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
        if (rc < 0)
index 0b0c8c2849530c4f0294068c8325c57db77f9dba..9f9af5298dd6e33bdd7f4f698ed8e81ac074d29f 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/kasan.h>
 #include <asm/ptdump.h>
 #include <asm/kasan.h>
+#include <asm/nospec-branch.h>
 #include <asm/sections.h>
 
 static unsigned long max_addr;
@@ -116,8 +117,13 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
                return;
        if (st->current_prot & _PAGE_NOEXEC)
                return;
-       /* The first lowcore page is currently still W+X. */
-       if (addr == PAGE_SIZE)
+       /*
+        * The first lowcore page is W+X if spectre mitigations are using
+        * trampolines or the BEAR enhancements facility is not installed,
+        * in which case we have two lpswe instructions in lowcore that need
+        * to be executable.
+        */
+       if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
                return;
        WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
                  (void *)st->start_address);
@@ -203,7 +209,9 @@ void ptdump_check_wx(void)
        if (st.wx_pages)
                pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
        else
-               pr_info("Checked W+X mappings: passed, no unexpected W+X pages found\n");
+               pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
+                       (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
+                       "unexpected " : "");
 }
 #endif /* CONFIG_DEBUG_WX */
 
index a04faf49001acf51719836221fdc7bdeeec8a1e1..8c6f258a61832a8578021e485bc6b9c02ea5f723 100644 (file)
@@ -58,8 +58,6 @@ unsigned long empty_zero_page, zero_page_mask;
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(zero_page_mask);
 
-bool initmem_freed;
-
 static void __init setup_zero_pages(void)
 {
        unsigned int order;
@@ -214,7 +212,6 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-       initmem_freed = true;
        __set_memory((unsigned long)_sinittext,
                     (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
                     SET_MEMORY_RW | SET_MEMORY_NX);
index 3e473516801939b5e8759f65a1c97128949cc5e7..483b9dbe0970a2cd0f61e6e05c56f0175ece802e 100644 (file)
@@ -399,5 +399,5 @@ void __init kasan_copy_shadow_mapping(void)
 
 void __init kasan_free_early_identity(void)
 {
-       memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
+       memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
 }
index fdc86c0e4e6ca3d259f9bbd982e47261a06122b6..654019181a3742cd56ff831b607ae453a285bc4d 100644 (file)
@@ -57,7 +57,7 @@ void arch_report_meminfo(struct seq_file *m)
 static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
                    unsigned long dtt)
 {
-       unsigned long table, mask;
+       unsigned long *table, mask;
 
        mask = 0;
        if (MACHINE_HAS_EDAT2) {
@@ -72,7 +72,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
                        mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
                        break;
                }
-               table = (unsigned long)old & mask;
+               table = (unsigned long *)((unsigned long)old & mask);
                crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
        } else if (MACHINE_HAS_IDTE) {
                cspg(old, *old, new);
index 2b1c6d916cf9c651325da83569df9830f5aed0db..7d9705eeb02f1f530dbebdf4b1d7c9351d2e4bf3 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
 #include <asm/cacheflush.h>
+#include <asm/nospec-branch.h>
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/tlbflush.h>
@@ -584,8 +585,13 @@ void __init vmem_map_init(void)
        __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
                     SET_MEMORY_RO | SET_MEMORY_X);
 
-       /* we need lowcore executable for our LPSWE instructions */
-       set_memory_x(0, 1);
+       if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) {
+               /*
+                * Lowcore must be executable for LPSWE
+                * and expoline trampoline branch instructions.
+                */
+               set_memory_x(0, 1);
+       }
 
        pr_info("Write protected kernel read-only data: %luk\n",
                (unsigned long)(__end_rodata - _stext) >> 10);
index 1a374d021e256d90ad06e77a1d945509ee902c2f..233cc9bcd6527b51b2bfa74494ed5dcae9085a41 100644 (file)
@@ -567,7 +567,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
        EMIT4(0xb9040000, REG_2, BPF_REG_0);
        /* Restore registers */
        save_restore_regs(jit, REGS_RESTORE, stack_depth);
-       if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
+       if (nospec_uses_trampoline()) {
                jit->r14_thunk_ip = jit->prg;
                /* Generate __s390_indirect_jump_r14 thunk */
                if (test_facility(35)) {
@@ -585,7 +585,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
        /* br %r14 */
        _EMIT2(0x07fe);
 
-       if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable &&
+       if ((nospec_uses_trampoline()) &&
            (is_first_pass(jit) || (jit->seen & SEEN_FUNC))) {
                jit->r1_thunk_ip = jit->prg;
                /* Generate __s390_indirect_jump_r1 thunk */
@@ -1332,7 +1332,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
                jit->seen |= SEEN_FUNC;
                /* lgrl %w1,func */
                EMIT6_PCREL_RILB(0xc4080000, REG_W1, _EMIT_CONST_U64(func));
-               if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) {
+               if (nospec_uses_trampoline()) {
                        /* brasl %r14,__s390_indirect_jump_r1 */
                        EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip);
                } else {
index b833155ce838111981b424418703c584c47c8771..872d772b73d20051f89b35829e7a0dd64a56d80b 100644 (file)
@@ -561,7 +561,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev)
        zdev->has_resources = 0;
 }
 
-int pcibios_add_device(struct pci_dev *pdev)
+int pcibios_device_add(struct pci_dev *pdev)
 {
        struct zpci_dev *zdev = to_zpci(pdev);
        struct resource *res;
index 93223bd110c388a39437543a1071bbb302baea09..1f4540d6bd2d313f4fffc60a80118f62ea1ef8f7 100644 (file)
@@ -18,6 +18,8 @@
 static struct kmem_cache *dma_region_table_cache;
 static struct kmem_cache *dma_page_table_cache;
 static int s390_iommu_strict;
+static u64 s390_iommu_aperture;
+static u32 s390_iommu_aperture_factor = 1;
 
 static int zpci_refresh_global(struct zpci_dev *zdev)
 {
@@ -565,15 +567,19 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
 
        /*
         * Restrict the iommu bitmap size to the minimum of the following:
-        * - main memory size
+        * - s390_iommu_aperture which defaults to high_memory
         * - 3-level pagetable address limit minus start_dma offset
         * - DMA address range allowed by the hardware (clp query pci fn)
         *
         * Also set zdev->end_dma to the actual end address of the usable
         * range, instead of the theoretical maximum as reported by hardware.
+        *
+        * This limits the number of concurrently usable DMA mappings, since
+        * each DMA-mapped memory address needs its own DMA address, with
+        * extra DMA addresses required when the same memory address is
+        * mapped more than once.
         */
        zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
-       zdev->iommu_size = min3((u64) high_memory,
+       zdev->iommu_size = min3(s390_iommu_aperture,
                                ZPCI_TABLE_SIZE_RT - zdev->start_dma,
                                zdev->end_dma - zdev->start_dma + 1);
        zdev->end_dma = zdev->start_dma + zdev->iommu_size - 1;
@@ -660,6 +666,12 @@ static int __init dma_alloc_cpu_table_caches(void)
 
 int __init zpci_dma_init(void)
 {
+       s390_iommu_aperture = (u64)high_memory;
+       if (!s390_iommu_aperture_factor)
+               s390_iommu_aperture = ULONG_MAX;
+       else
+               s390_iommu_aperture *= s390_iommu_aperture_factor;
+
        return dma_alloc_cpu_table_caches();
 }
 
@@ -692,3 +704,12 @@ static int __init s390_iommu_setup(char *str)
 }
 
 __setup("s390_iommu=", s390_iommu_setup);
+
+static int __init s390_iommu_aperture_setup(char *str)
+{
+       if (kstrtou32(str, 10, &s390_iommu_aperture_factor))
+               s390_iommu_aperture_factor = 1;
+       return 1;
+}
+
+__setup("s390_iommu_aperture=", s390_iommu_aperture_setup);
index 5b8d647523f969abe8fc0aa792e9390f631fe828..6a5bfa9dc1f2e74e2a706df44c3e9418e41df9ad 100644 (file)
@@ -52,6 +52,8 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
        struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
        struct pci_dev *pdev = NULL;
 
+       zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n",
+                ccdf->fid, ccdf->fh, ccdf->pec);
        zpci_err("error CCDF:\n");
        zpci_err_hex(ccdf, sizeof(*ccdf));
 
@@ -96,6 +98,8 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
        struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid);
        enum zpci_state state;
 
+       zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n",
+                ccdf->fid, ccdf->fh, ccdf->pec);
        zpci_err("avail CCDF:\n");
        zpci_err_hex(ccdf, sizeof(*ccdf));
 
index 335c281811c753592c9bd3acdee1e8614e030594..cae280e5c047d1d5eaa405c86b5e8444350961c1 100644 (file)
@@ -90,6 +90,14 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
 
                if (zdev_enabled(zdev)) {
                        ret = zpci_disable_device(zdev);
+                       /*
+                        * Due to a z/VM vs LPAR inconsistency in the error
+                        * state, the FH may indicate an enabled device while
+                        * disable reports the device as already disabled;
+                        * don't treat that as an error here.
+                        */
+                       if (ret == -EINVAL)
+                               ret = 0;
                        if (ret)
                                goto out;
                }
index 48c2a091a0720754f2f4c7ce2ad06516f24a0fc9..be171880977e5d250117b4c9810de7aa0ca7fbf6 100644 (file)
@@ -2,3 +2,6 @@
 obj-y                          += kernel/ mm/ boards/
 obj-$(CONFIG_SH_FPU_EMU)       += math-emu/
 obj-$(CONFIG_USE_BUILTIN_DTB)  += boot/dts/
+
+# for cleaning
+subdir- += boot
index 88ddb6f1c75b07f7c60de2b522e7947f4395c654..b39412bf91fb0afafee74140321d5d328be74290 100644 (file)
@@ -198,10 +198,6 @@ compressed: zImage
 archprepare:
        $(Q)$(MAKE) $(build)=arch/sh/tools include/generated/machtypes.h
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-       $(Q)$(MAKE) $(clean)=arch/sh/kernel/vsyscall
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/sh/kernel/syscalls all
 
index bac8a058ebd7cd4cad1065700ed6a3fbb52862fc..c77b5f00a66a3dce3a8e703d4b79bb198fb5f83c 100644 (file)
@@ -560,7 +560,7 @@ static void __init ap325rxa_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
 
        ceu_dma_membase = phys;
index 97c5703b181822297f24885b0d315c35354da27e..4c9522dd351f4b292815b16e0db3c97d46e531d1 100644 (file)
@@ -1502,7 +1502,7 @@ static void __init ecovec_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU0 memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
        ceu0_dma_membase = phys;
 
@@ -1510,7 +1510,7 @@ static void __init ecovec_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU1 memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
        ceu1_dma_membase = phys;
 }
index eeb5ce341efdd705cb22cc6bdbec6956ac9364a8..20f4db778ed6aee279fb3fc79668aa7076fa5db8 100644 (file)
@@ -633,7 +633,7 @@ static void __init kfr2r09_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
 
        ceu_dma_membase = phys;
index 6703a2122c0d6bc3fabee59daa1347a57870ab9a..f60061283c4827440c18c97d14a39d925e99a876 100644 (file)
@@ -633,7 +633,7 @@ static void __init migor_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
 
        ceu_dma_membase = phys;
index d9b31d4560c028cdb21487540fbefb9bd7082703..b60a2626e18b274f7965c3623b9cb96e34f7152b 100644 (file)
@@ -966,7 +966,7 @@ static void __init ms7724se_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU0 memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
        ceu0_dma_membase = phys;
 
@@ -974,7 +974,7 @@ static void __init ms7724se_mv_mem_reserve(void)
        if (!phys)
                panic("Failed to allocate CEU1 memory\n");
 
-       memblock_free(phys, size);
+       memblock_phys_free(phys, size);
        memblock_remove(phys, size);
        ceu1_dma_membase = phys;
 }
index c9e574906a9b9174c0de4bb8b189646f6347d1e0..71cb3d934bf6c36fdf8a5526cf5da969a7366ee1 100644 (file)
@@ -9,3 +9,6 @@ obj-y += math-emu/
 obj-y += net/
 obj-y += crypto/
 obj-$(CONFIG_SPARC64) += vdso/
+
+# for cleaning
+subdir- += boot
index 24fb5a99f43941b0d09b3e41405b14e11799c253..c7008bbebc4cd17b38772ae77a722baccfd405e5 100644 (file)
@@ -75,9 +75,6 @@ install:
        sh $(srctree)/$(boot)/install.sh $(KERNELRELEASE) $(KBUILD_IMAGE) \
                System.map "$(INSTALL_PATH)"
 
-archclean:
-       $(Q)$(MAKE) $(clean)=$(boot)
-
 archheaders:
        $(Q)$(MAKE) $(build)=arch/sparc/kernel/syscalls all
 
index 849236d4eca483ed5c252745d1b2cb129037564d..45e5c76d449ea105617b4a8b5a259bba7214d5db 100644 (file)
@@ -22,7 +22,7 @@ ifeq ($(CONFIG_SPARC64),y)
 
 # Actual linking
 
-$(obj)/zImage: $(obj)/image
+$(obj)/zImage: $(obj)/image FORCE
        $(call if_changed,gzip)
        @echo '  kernel: $@ is ready'
 
@@ -31,7 +31,7 @@ $(obj)/vmlinux.aout: vmlinux FORCE
        @echo '  kernel: $@ is ready'
 else
 
-$(obj)/zImage: $(obj)/image
+$(obj)/zImage: $(obj)/image FORCE
        $(call if_changed,strip)
        @echo '  kernel: $@ is ready'
 
@@ -44,7 +44,7 @@ OBJCOPYFLAGS_image.bin := -S -O binary -R .note -R .comment
 $(obj)/image.bin: $(obj)/image FORCE
        $(call if_changed,objcopy)
 
-$(obj)/image.gz: $(obj)/image.bin
+$(obj)/image.gz: $(obj)/image.bin FORCE
        $(call if_changed,gzip)
 
 UIMAGE_LOADADDR = $(CONFIG_UBOOT_LOAD_ADDR)
@@ -56,7 +56,7 @@ quiet_cmd_uimage.o = UIMAGE.O $@
                      -r -b binary $@ -o $@.o
 
 targets += uImage
-$(obj)/uImage: $(obj)/image.gz
+$(obj)/uImage: $(obj)/image.gz FORCE
        $(call if_changed,uimage)
        $(call if_changed,uimage.o)
        @echo '  Image $@ is ready'
index 9c2b720bfd20d784907171668c4db7ce7d8e9fa1..31b0c19832866f18d628bf31417e3a1027a30c50 100644 (file)
@@ -1010,7 +1010,7 @@ void pcibios_set_master(struct pci_dev *dev)
 }
 
 #ifdef CONFIG_PCI_IOV
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_device_add(struct pci_dev *dev)
 {
        struct pci_dev *pdev;
 
index 0224d8f19ed69bc0e4be3ef816f4fbd9e2863981..b98a7bbe6728a015b00f92e027c165b9003ef251 100644 (file)
@@ -1567,7 +1567,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
 
 static void __init pcpu_free_bootmem(void *ptr, size_t size)
 {
-       memblock_free(__pa(ptr), size);
+       memblock_free(ptr, size);
 }
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
index 8e636ce0294953c811d320ea0aaa35dbe6437607..0039771eb01cd40f9bb163bedf5f914b95bdea5b 100644 (file)
@@ -47,7 +47,7 @@ void __init mem_init(void)
         */
        brk_end = (unsigned long) UML_ROUND_UP(sbrk(0));
        map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
-       memblock_free(__pa(brk_end), uml_reserved - brk_end);
+       memblock_free((void *)brk_end, uml_reserved - brk_end);
        uml_reserved = brk_end;
 
        /* this will put all low memory onto the freelists */
index 30dec019756b9bac2cf8aacb0ad787659d6a8d99..f384cb1a4f7a8ddca2f73e2cdf4b80bdb3445464 100644 (file)
@@ -25,3 +25,6 @@ obj-y += platform/
 obj-y += net/
 
 obj-$(CONFIG_KEXEC_FILE) += purgatory/
+
+# for cleaning
+subdir- += boot tools
index b1d4b481fcdd6db4d84a1a30fa6e0ac2cd63c386..95dd1ee01546ac04955f9297dfec9a60371aed0d 100644 (file)
@@ -63,7 +63,7 @@ config X86
        select ARCH_CLOCKSOURCE_INIT
        select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
        select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
-       select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+       select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
        select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
        select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
        select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
@@ -192,6 +192,8 @@ config X86
        select HAVE_DYNAMIC_FTRACE_WITH_REGS
        select HAVE_DYNAMIC_FTRACE_WITH_ARGS    if X86_64
        select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+       select HAVE_SAMPLE_FTRACE_DIRECT        if X86_64
+       select HAVE_SAMPLE_FTRACE_MULTI_DIRECT  if X86_64
        select HAVE_EBPF_JIT
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        select HAVE_EISA
@@ -1627,7 +1629,7 @@ config ARCH_SELECT_MEMORY_MODEL
 
 config ARCH_MEMORY_PROBE
        bool "Enable sysfs memory/probe interface"
-       depends on X86_64 && MEMORY_HOTPLUG
+       depends on MEMORY_HOTPLUG
        help
          This option enables a sysfs memory/probe interface for testing.
          See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
@@ -2423,7 +2425,7 @@ endmenu
 
 config ARCH_HAS_ADD_PAGES
        def_bool y
-       depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+       depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
 config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
        def_bool y
index aab70413ae7ae1662d9969ac528dc090319c66cd..42243869216d0fdd04a5f52d2347650673ae1c7b 100644 (file)
@@ -283,8 +283,6 @@ endif
 archclean:
        $(Q)rm -rf $(objtree)/arch/i386
        $(Q)rm -rf $(objtree)/arch/x86_64
-       $(Q)$(MAKE) $(clean)=$(boot)
-       $(Q)$(MAKE) $(clean)=arch/x86/tools
 
 define archhelp
   echo  '* bzImage             - Compressed kernel image (arch/x86/boot/bzImage)'
index c72e368dd16413a7c4a1df87a4d3b04631c2f47d..f1ba6ab2e97ef53c54bab9e3824a6b6a925edcd9 100644 (file)
@@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
         * PCI slot and func to indicate the uncore box.
         */
        if (id->driver_data & ~0xffff) {
-               struct pci_driver *pci_drv = pdev->driver;
+               struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver);
 
                pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table);
                if (pmu == NULL)
index 9e1def3744f2202349061a48a3a6a8c42c77ea22..36e84d9042606476e0c69a5554853feff3052632 100644 (file)
@@ -80,7 +80,7 @@ static struct resource video_rom_resource = {
  */
 static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
 {
-       struct pci_driver *drv = pdev->driver;
+       struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
        const struct pci_device_id *id;
 
        if (pdev->vendor == vendor && pdev->device == device)
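
Both hunks above replace the removed pci_dev::driver back-pointer with
to_pci_driver() on pdev->dev.driver. A sketch of what that accessor is
assumed to do (a NULL-tolerant container_of(); the _sketch suffix marks it
as an illustration, not the pci.h source):

    #include <linux/pci.h>

    static inline struct pci_driver *to_pci_driver_sketch(struct device_driver *drv)
    {
            return drv ? container_of(drv, struct pci_driver, driver) : NULL;
    }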
index 40ed44ead0631288b16f98e233464994eb09e062..49b596db5631e83ad216f737d97ec2c308a08967 100644 (file)
@@ -322,7 +322,7 @@ static void __init reserve_initrd(void)
 
        relocate_initrd();
 
-       memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+       memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 
 #else
@@ -521,7 +521,7 @@ static void __init reserve_crashkernel(void)
        }
 
        if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
-               memblock_free(crash_base, crash_size);
+               memblock_phys_free(crash_base, crash_size);
                return;
        }
 
index 5afd985591939cc285f0b035cc2e2fc810cef4e2..7b65275544b2c3dc7adc6b43c4f7681288e0e75e 100644 (file)
@@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
 {
-       memblock_free_ptr(ptr, size);
+       memblock_free(ptr, size);
 }
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
index 23a14d82e7838eb27d981180e6d6f75097fe9ebe..1895986842b91e471306bb099821bbef348c5513 100644 (file)
@@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start,
         */
        addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
                                         map_end);
-       memblock_free(addr, PMD_SIZE);
+       memblock_phys_free(addr, PMD_SIZE);
        real_end = addr + PMD_SIZE;
 
        /* step_size need to be small so pgt_buf from BRK could cover it */
index bd90b8fe81e45e18c583f40467040675fe64894e..5cd7ea6d645cf0fd7e779cd7fcf9f8efacb07b6b 100644 (file)
@@ -779,37 +779,6 @@ void __init mem_init(void)
        test_wp_bit();
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
-                   struct mhp_params *params)
-{
-       unsigned long start_pfn = start >> PAGE_SHIFT;
-       unsigned long nr_pages = size >> PAGE_SHIFT;
-       int ret;
-
-       /*
-        * The page tables were already mapped at boot so if the caller
-        * requests a different mapping type then we must change all the
-        * pages with __set_memory_prot().
-        */
-       if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
-               ret = __set_memory_prot(start, nr_pages, params->pgprot);
-               if (ret)
-                       return ret;
-       }
-
-       return __add_pages(nid, start_pfn, nr_pages, params);
-}
-
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
-{
-       unsigned long start_pfn = start >> PAGE_SHIFT;
-       unsigned long nr_pages = size >> PAGE_SHIFT;
-
-       __remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
 int kernel_set_to_readonly __read_mostly;
 
 static void mark_nxdata_nx(void)
index ef885370719a605876b3375e22db72f8d025cb64..e7b9b464a82f1d0798284a34ecc943db316e7095 100644 (file)
@@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
                        p = early_alloc(PMD_SIZE, nid, false);
                        if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
                                return;
-                       memblock_free_ptr(p, PMD_SIZE);
+                       memblock_free(p, PMD_SIZE);
                }
 
                p = early_alloc(PAGE_SIZE, nid, true);
@@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
                        p = early_alloc(PUD_SIZE, nid, false);
                        if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
                                return;
-                       memblock_free_ptr(p, PUD_SIZE);
+                       memblock_free(p, PUD_SIZE);
                }
 
                p = early_alloc(PAGE_SIZE, nid, true);
index 1e9b93b088dbf0dc454d2ebb30dcab055d1f7f59..c6b1213086d6330222a5ee19f70eb7193b4b8ec9 100644 (file)
@@ -355,7 +355,7 @@ void __init numa_reset_distance(void)
 
        /* numa_distance could be 1LU marking allocation failure, test cnt */
        if (numa_distance_cnt)
-               memblock_free_ptr(numa_distance, size);
+               memblock_free(numa_distance, size);
        numa_distance_cnt = 0;
        numa_distance = NULL;   /* enable table creation */
 }
index e801e30089c436087681b8f09ad88e76d3046d56..1a02b791d273cb1e9981663d67131f01c211753e 100644 (file)
@@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
        }
 
        /* free the copied physical distance table */
-       memblock_free_ptr(phys_dist, phys_size);
+       memblock_free(phys_dist, phys_size);
        return;
 
 no_emu:
index 3507f456fcd09d601c0de1517adb5b6735a2021c..9e1e6b8d8876313e2972b3634d337e6d266af504 100644 (file)
@@ -632,7 +632,7 @@ static void set_dev_domain_options(struct pci_dev *pdev)
                pdev->hotplug_user_indicators = 1;
 }
 
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_device_add(struct pci_dev *dev)
 {
        struct pci_setup_rom *rom;
        struct irq_domain *msidom;
index 1ce436eeda15e1eed88360501a0a220d423b9d28..cdbf4822f43116da09b3e41bb7970d13701e3cf1 100644 (file)
@@ -1025,7 +1025,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
        for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
                make_lowmem_page_readwrite(vaddr);
 
-       memblock_free(paddr, size);
+       memblock_phys_free(paddr, size);
 }
 
 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
@@ -1151,7 +1151,7 @@ static void __init xen_pagetable_p2m_free(void)
                xen_cleanhighmap(addr, addr + size);
                size = PAGE_ALIGN(xen_start_info->nr_pages *
                                  sizeof(unsigned long));
-               memblock_free(__pa(addr), size);
+               memblock_free((void *)addr, size);
        } else {
                xen_cleanmfnmap(addr);
        }
@@ -1956,7 +1956,7 @@ void __init xen_relocate_p2m(void)
                pfn_end = p2m_pfn_end;
        }
 
-       memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+       memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
        while (pfn < pfn_end) {
                if (pfn == p2m_pfn) {
                        pfn = p2m_pfn_end;
index 5e6e236977c75f5f2aa3f7677419a17fd37ceba2..58db86f7b3846ef04b749154f0c9e9178654a806 100644 (file)
@@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void)
 static void __ref free_p2m_page(void *p)
 {
        if (unlikely(!slab_is_available())) {
-               memblock_free((unsigned long)p, PAGE_SIZE);
+               memblock_free(p, PAGE_SIZE);
                return;
        }
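
free_p2m_page() picks its deallocator by boot phase: before the slab
allocator is up, pages still belong to memblock. A reduced sketch of the
pattern, assuming the runtime branch returns the page through the normal
page allocator:

    static void __ref free_page_any_phase(void *p)
    {
            if (unlikely(!slab_is_available())) {
                    memblock_free(p, PAGE_SIZE);  /* early boot: memblock owns it */
                    return;
            }
            free_page((unsigned long)p);          /* normal allocator path */
    }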
 
index 8bfc1033010770fafdff3fd6238378f3322312f2..f387fc7e5250080eba432c1d0e410d9f5fab7e5e 100644 (file)
@@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn,
                        break;
                }
        }
-       memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+       memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
 /*
@@ -719,7 +719,7 @@ static void __init xen_reserve_xen_mfnlist(void)
                return;
 
        xen_relocate_p2m();
-       memblock_free(start, size);
+       memblock_phys_free(start, size);
 }
 
 /**
@@ -885,7 +885,7 @@ char * __init xen_memory_setup(void)
                xen_phys_memcpy(new_area, start, size);
                pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
                        start, start + size, new_area, new_area + size);
-               memblock_free(start, size);
+               memblock_phys_free(start, size);
                boot_params.hdr.ramdisk_image = new_area;
                boot_params.ext_ramdisk_image = new_area >> 32;
        }
index 96714ef7c89e34581c1d53fe08a82ca1fbaf2493..9778216d6e09dbfa610e969e12ac95bb52d7803d 100644 (file)
@@ -7,9 +7,7 @@
 # Copyright (C) 2014 Cadence Design Systems Inc.
 #
 # This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to do have actions
-# for "archclean" and "archdep" for cleaning up and making dependencies for
-# this architecture
+# architecture-specific flags and dependencies.
 
 # Core configuration.
 # (Use VAR=<xtensa_config> to use another default compiler.)
index 99e98c9bae419a84774aa488d4d30a75017eba11..2dd28931d6990a1d1874e7b03e9de2776ae33f8c 100644 (file)
@@ -42,12 +42,14 @@ _bootparam:
 
        .align  4
 _SetupMMU:
+#if XCHAL_HAVE_WINDOWED
        movi    a0, 0
        wsr     a0, windowbase
        rsync
        movi    a0, 1
        wsr     a0, windowstart
        rsync
+#endif
        movi    a0, 0x1F
        wsr     a0, ps
        rsync
index 48ba5a232d948792d95aa8bc3fe3aca5e4c250ec..3ed94ad3500076c52bcdd8b65806782da81bf013 100644 (file)
@@ -3,6 +3,7 @@
 #include <asm/regs.h>
 #include <asm/asmmacro.h>
 #include <asm/cacheasm.h>
+#include <asm/processor.h>
        /*
         * RB-Data: RedBoot data/bss
         * P:       Boot-Parameters
@@ -36,7 +37,7 @@
        .globl __start
        /* this must be the first byte of the loader! */
 __start:
-       entry   sp, 32          # we do not intend to return
+       abi_entry(32)           # we do not intend to return
        _call0  _start
 __start_a0:
        .align 4
@@ -55,17 +56,19 @@ _start:
        movi    a4, 1
        wsr     a4, ps
        rsync
-
+#if XCHAL_HAVE_WINDOWED
        rsr     a5, windowbase
        ssl     a5
        sll     a4, a4
        wsr     a4, windowstart
        rsync
-
-       movi    a4, 0x00040000
+#endif
+       movi    a4, KERNEL_PS_WOE_MASK
        wsr     a4, ps
        rsync
 
+KABI_C0        mov     abi_saved0, abi_arg0
+
        /* copy the loader to its address
         * Note: The loader itself is a very small piece, so we assume we
         *       don't partially overlap. We also assume (even more important)
@@ -168,52 +171,52 @@ _reloc:
 
        movi    a3, __image_load
        sub     a4, a3, a4
-       add     a8, a0, a4
+       add     abi_arg2, a0, a4
 
        # a1  Stack
        # a8(a4)  Load address of the image
 
-       movi    a6, _image_start
-       movi    a10, _image_end
-       movi    a7, 0x1000000
-       sub     a11, a10, a6
-       movi    a9, complen
-       s32i    a11, a9, 0
+       movi    abi_arg0, _image_start
+       movi    abi_arg4, _image_end
+       movi    abi_arg1, 0x1000000
+       sub     abi_tmp0, abi_arg4, abi_arg0
+       movi    abi_arg3, complen
+       s32i    abi_tmp0, abi_arg3, 0
 
        movi    a0, 0
 
-       # a6 destination
-       # a7 maximum size of destination
-       # a8 source
-       # a9 ptr to length
+       # abi_arg0 destination
+       # abi_arg1 maximum size of destination
+       # abi_arg2 source
+       # abi_arg3 ptr to length
 
        .extern gunzip
-       movi    a4, gunzip
-       beqz    a4, 1f
+       movi    abi_tmp0, gunzip
+       beqz    abi_tmp0, 1f
 
-       callx4  a4
+       abi_callx       abi_tmp0
 
        j       2f
 
 
-       # a6 destination start
-       # a7 maximum size of destination
-       # a8 source start
-       # a9 ptr to length
-       # a10 destination end
+       # abi_arg0 destination start
+       # abi_arg1 maximum size of destination
+       # abi_arg2 source start
+       # abi_arg3 ptr to length
+       # abi_arg4 destination end
 
 1:
-        l32i    a9, a8, 0
-        l32i    a11, a8, 4
-        s32i    a9, a6, 0
-        s32i    a11, a6, 4
-        l32i    a9, a8, 8
-        l32i    a11, a8, 12
-        s32i    a9, a6, 8
-        s32i    a11, a6, 12
-        addi    a6, a6, 16
-        addi    a8, a8, 16
-        blt     a6, a10, 1b
+        l32i    abi_tmp0, abi_arg2, 0
+        l32i    abi_tmp1, abi_arg2, 4
+        s32i    abi_tmp0, abi_arg0, 0
+        s32i    abi_tmp1, abi_arg0, 4
+        l32i    abi_tmp0, abi_arg2, 8
+        l32i    abi_tmp1, abi_arg2, 12
+        s32i    abi_tmp0, abi_arg0, 8
+        s32i    abi_tmp1, abi_arg0, 12
+        addi    abi_arg0, abi_arg0, 16
+        addi    abi_arg2, abi_arg2, 16
+        blt     abi_arg0, abi_arg4, 1b
 
 
        /* jump to the kernel */
@@ -230,6 +233,7 @@ _reloc:
 
        # a2  Boot parameter list
 
+KABI_C0        mov     abi_arg0, abi_saved0
        movi    a0, _image_start
        jx      a0
 
index bfc89e11f4698ee5f7dee1de89d6d89dd6e16358..809c507d18250da09cb242b5c34e883aa67d0137 100644 (file)
 #define XTENSA_STACK_ALIGNMENT         16
 
 #if defined(__XTENSA_WINDOWED_ABI__)
+
+/* Assembly instructions for windowed kernel ABI. */
+#define KABI_W
+/* Assembly instructions for call0 kernel ABI (will be ignored). */
+#define KABI_C0 #
+
 #define XTENSA_FRAME_SIZE_RESERVE      16
 #define XTENSA_SPILL_STACK_RESERVE     32
 
 #define abi_ret(frame_size) retw
 #define abi_ret_default retw
 
+       /* direct call */
+#define abi_call call4
+       /* indirect call */
+#define abi_callx callx4
+       /* outgoing call argument registers */
+#define abi_arg0 a6
+#define abi_arg1 a7
+#define abi_arg2 a8
+#define abi_arg3 a9
+#define abi_arg4 a10
+#define abi_arg5 a11
+       /* return value */
+#define abi_rv a6
+       /* registers preserved across call */
+#define abi_saved0 a2
+#define abi_saved1 a3
+
+       /* none of the above */
+#define abi_tmp0 a4
+#define abi_tmp1 a5
+
 #elif defined(__XTENSA_CALL0_ABI__)
 
+/* Assembly instructions for windowed kernel ABI (will be ignored). */
+#define KABI_W #
+/* Assembly instructions for call0 kernel ABI. */
+#define KABI_C0
+
 #define XTENSA_SPILL_STACK_RESERVE     0
 
 #define abi_entry(frame_size) __abi_entry (frame_size)
 
 #define abi_ret_default ret
 
+       /* direct call */
+#define abi_call call0
+       /* indirect call */
+#define abi_callx callx0
+       /* outgoing call argument registers */
+#define abi_arg0 a2
+#define abi_arg1 a3
+#define abi_arg2 a4
+#define abi_arg3 a5
+#define abi_arg4 a6
+#define abi_arg5 a7
+       /* return value */
+#define abi_rv a2
+       /* registers preserved across call */
+#define abi_saved0 a12
+#define abi_saved1 a13
+
+       /* none of the above */
+#define abi_tmp0 a8
+#define abi_tmp1 a9
+
 #else
 #error Unsupported Xtensa ABI
 #endif
 
+#if defined(USER_SUPPORT_WINDOWED)
+/* Assembly instructions for windowed user ABI. */
+#define UABI_W
+/* Assembly instructions for call0 user ABI (will be ignored). */
+#define UABI_C0 #
+#else
+/* Assembly instructions for windowed user ABI (will be ignored). */
+#define UABI_W #
+/* Assembly instructions for call0 user ABI. */
+#define UABI_C0
+#endif
+
 #define __XTENSA_HANDLER       .section ".exception.text", "ax"
 
 #endif /* _XTENSA_ASMMACRO_H */
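
The KABI_*/UABI_* markers work because '#' starts a comment in the Xtensa
assembler: the marker for the inactive ABI expands to '#', commenting out
the rest of that source line, while the marker for the active ABI expands
to nothing. A reduced illustration of the mechanism (not the kernel header):

    #if defined(__XTENSA_WINDOWED_ABI__)
    # define KABI_W            /* empty: the line is assembled */
    # define KABI_C0 #         /* '#': gas drops the line as a comment */
    #else
    # define KABI_W #
    # define KABI_C0
    #endif

    /* In a .S file:
     *   KABI_W  or   a3, a3, a0     <- windowed kernel ABI only
     *   KABI_C0 mov  a2, a10        <- call0 kernel ABI only
     */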
index 4361fe4247e30c255ee92f1cc6468467250b0f6b..52da614f953ce007b90173413baf7da9d91c281a 100644 (file)
  *
  * Locking interrupts looks like this:
  *
- *    rsil a15, TOPLEVEL
+ *    rsil a14, TOPLEVEL
  *    <code>
- *    wsr  a15, PS
+ *    wsr  a14, PS
  *    rsync
  *
- * Note that a15 is used here because the register allocation
+ * Note that a14 is used here because the register allocation
  * done by the compiler is not guaranteed and a window overflow
  * may not occur between the rsil and wsr instructions. By using
- * a15 in the rsil, the machine is guaranteed to be in a state
+ * a14 in the rsil, the machine is guaranteed to be in a state
  * where no register reference will cause an overflow.
  */
 
@@ -185,15 +185,15 @@ static inline void arch_atomic_##op(int i, atomic_t * v)          \
        unsigned int vval;                                              \
                                                                        \
        __asm__ __volatile__(                                           \
-                       "       rsil    a15, "__stringify(TOPLEVEL)"\n" \
+                       "       rsil    a14, "__stringify(TOPLEVEL)"\n" \
                        "       l32i    %[result], %[mem]\n"            \
                        "       " #op " %[result], %[result], %[i]\n"   \
                        "       s32i    %[result], %[mem]\n"            \
-                       "       wsr     a15, ps\n"                      \
+                       "       wsr     a14, ps\n"                      \
                        "       rsync\n"                                \
                        : [result] "=&a" (vval), [mem] "+m" (*v)        \
                        : [i] "a" (i)                                   \
-                       : "a15", "memory"                               \
+                       : "a14", "memory"                               \
                        );                                              \
 }                                                                      \
 
@@ -203,15 +203,15 @@ static inline int arch_atomic_##op##_return(int i, atomic_t * v)  \
        unsigned int vval;                                              \
                                                                        \
        __asm__ __volatile__(                                           \
-                       "       rsil    a15,"__stringify(TOPLEVEL)"\n"  \
+                       "       rsil    a14,"__stringify(TOPLEVEL)"\n"  \
                        "       l32i    %[result], %[mem]\n"            \
                        "       " #op " %[result], %[result], %[i]\n"   \
                        "       s32i    %[result], %[mem]\n"            \
-                       "       wsr     a15, ps\n"                      \
+                       "       wsr     a14, ps\n"                      \
                        "       rsync\n"                                \
                        : [result] "=&a" (vval), [mem] "+m" (*v)        \
                        : [i] "a" (i)                                   \
-                       : "a15", "memory"                               \
+                       : "a14", "memory"                               \
                        );                                              \
                                                                        \
        return vval;                                                    \
@@ -223,16 +223,16 @@ static inline int arch_atomic_fetch_##op(int i, atomic_t * v)             \
        unsigned int tmp, vval;                                         \
                                                                        \
        __asm__ __volatile__(                                           \
-                       "       rsil    a15,"__stringify(TOPLEVEL)"\n"  \
+                       "       rsil    a14,"__stringify(TOPLEVEL)"\n"  \
                        "       l32i    %[result], %[mem]\n"            \
                        "       " #op " %[tmp], %[result], %[i]\n"      \
                        "       s32i    %[tmp], %[mem]\n"               \
-                       "       wsr     a15, ps\n"                      \
+                       "       wsr     a14, ps\n"                      \
                        "       rsync\n"                                \
                        : [result] "=&a" (vval), [tmp] "=&a" (tmp),     \
                          [mem] "+m" (*v)                               \
                        : [i] "a" (i)                                   \
-                       : "a15", "memory"                               \
+                       : "a14", "memory"                               \
                        );                                              \
                                                                        \
        return vval;                                                    \
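
Moving the scratch register from a15 to a14 keeps these critical sections
valid under both kernel ABIs; the assumption behind the change is that the
call0 ABI may use a15 as a frame pointer, so inline assembly must not
clobber it. The fixed-register interrupt-lock pattern, reduced to a sketch:

    /* Sketch only; level 15 stands in for TOPLEVEL. */
    static inline int locked_read(volatile int *p)
    {
            int val;

            __asm__ __volatile__(
                    "       rsil    a14, 15\n"  /* mask IRQs, old PS into a14 */
                    "       l32i    %[val], %[mem]\n"
                    "       wsr     a14, ps\n"  /* restore PS */
                    "       rsync\n"
                    : [val] "=&a" (val), [mem] "+m" (*p)
                    :
                    : "a14", "memory");
            return val;
    }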
index 3699e2818efb79360baa902ca628eeeadacfd953..eb87810357ad88ea1797ce3ccff1b5ea4a4b1404 100644 (file)
@@ -52,16 +52,16 @@ __cmpxchg_u32(volatile int *p, int old, int new)
        return new;
 #else
        __asm__ __volatile__(
-                       "       rsil    a15, "__stringify(TOPLEVEL)"\n"
+                       "       rsil    a14, "__stringify(TOPLEVEL)"\n"
                        "       l32i    %[old], %[mem]\n"
                        "       bne     %[old], %[cmp], 1f\n"
                        "       s32i    %[new], %[mem]\n"
                        "1:\n"
-                       "       wsr     a15, ps\n"
+                       "       wsr     a14, ps\n"
                        "       rsync\n"
                        : [old] "=&a" (old), [mem] "+m" (*p)
                        : [cmp] "a" (old), [new] "r" (new)
-                       : "a15", "memory");
+                       : "a14", "memory");
        return old;
 #endif
 }
@@ -116,10 +116,10 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 /*
  * xchg_u32
  *
- * Note that a15 is used here because the register allocation
+ * Note that a14 is used here because the register allocation
  * done by the compiler is not guaranteed and a window overflow
  * may not occur between the rsil and wsr instructions. By using
- * a15 in the rsil, the machine is guaranteed to be in a state
+ * a14 in the rsil, the machine is guaranteed to be in a state
  * where no register reference will cause an overflow.
  */
 
@@ -157,14 +157,14 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val)
 #else
        unsigned long tmp;
        __asm__ __volatile__(
-                       "       rsil    a15, "__stringify(TOPLEVEL)"\n"
+                       "       rsil    a14, "__stringify(TOPLEVEL)"\n"
                        "       l32i    %[tmp], %[mem]\n"
                        "       s32i    %[val], %[mem]\n"
-                       "       wsr     a15, ps\n"
+                       "       wsr     a14, ps\n"
                        "       rsync\n"
                        : [tmp] "=&a" (tmp), [mem] "+m" (*m)
                        : [val] "a" (val)
-                       : "a15", "memory");
+                       : "a14", "memory");
        return tmp;
 #endif
 }
index 5590b0f688376cd75acbaaed5d991110c909257d..9138077e567ddc333b07f3266a01ad64e903e570 100644 (file)
 #define XCHAL_SPANNING_WAY 0
 #endif
 
+#if XCHAL_HAVE_WINDOWED
+#if defined(CONFIG_USER_ABI_DEFAULT) || defined(CONFIG_USER_ABI_CALL0_PROBE)
+/* Whether windowed ABI is supported in userspace. */
+#define USER_SUPPORT_WINDOWED
+#endif
+#if defined(__XTENSA_WINDOWED_ABI__) || defined(USER_SUPPORT_WINDOWED)
+/* Whether windowed ABI is supported either in userspace or in the kernel. */
+#define SUPPORT_WINDOWED
+#endif
+#endif
+
 #endif
index ad15fbc572838c9f88692f2ce8fd0dc6a3961447..37d3e9887fe7b8e3af67076160048529005d73a7 100644 (file)
 #include <asm/types.h>
 #include <asm/regs.h>
 
-/* Assertions. */
-
-#if (XCHAL_HAVE_WINDOWED != 1)
-# error Linux requires the Xtensa Windowed Registers Option.
-#endif
-
 /* Xtensa ABI requires stack alignment to be at least 16 */
 
 #define STACK_ALIGN (XCHAL_DATA_WIDTH > 16 ? XCHAL_DATA_WIDTH : 16)
 #define WSBITS  (XCHAL_NUM_AREGS / 4)      /* width of WINDOWSTART in bits */
 #define WBBITS  (XCHAL_NUM_AREGS_LOG2 - 2) /* width of WINDOWBASE in bits */
 
+#if defined(__XTENSA_WINDOWED_ABI__)
+#define KERNEL_PS_WOE_MASK PS_WOE_MASK
+#elif defined(__XTENSA_CALL0_ABI__)
+#define KERNEL_PS_WOE_MASK 0
+#else
+#error Unsupported xtensa ABI
+#endif
+
 #ifndef __ASSEMBLY__
 
+#if defined(__XTENSA_WINDOWED_ABI__)
+
 /* Build a valid return address for the specified call winsize.
  * winsize must be 1 (call4), 2 (call8), or 3 (call12)
  */
 #define MAKE_RA_FOR_CALL(ra,ws)   (((ra) & 0x3fffffff) | (ws) << 30)
 
 /* Convert return address to a valid pc
  * Note: We assume that the stack pointer is in the same 1GB ranges as the ra
  */
 #define MAKE_PC_FROM_RA(ra,sp)    (((ra) & 0x3fffffff) | ((sp) & 0xc0000000))
 
+#elif defined(__XTENSA_CALL0_ABI__)
+
+/* Build a valid return address for the specified call winsize.
+ * winsize must be 1 (call4), 2 (call8), or 3 (call12)
+ */
+#define MAKE_RA_FOR_CALL(ra, ws)   (ra)
+
+/* Convert return address to a valid pc
+ * Note: We assume that the stack pointer is in the same 1GB ranges as the ra
+ */
+#define MAKE_PC_FROM_RA(ra, sp)    (ra)
+
+#else
+#error Unsupported Xtensa ABI
+#endif
+
 /* Spill slot location for the register reg in the spill area under the stack
  * pointer sp. reg must be in the range [0..4).
  */
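
The windowed variants exist because a windowed call instruction encodes its
window size in the top two bits of the return address, leaving a 30-bit PC;
call0 return addresses are already complete, so both macros reduce to the
identity. A worked sketch of the windowed reconstruction, relying on the
stated assumption that ra and sp share the same 1GB region:

    #include <stdint.h>

    /* Keep the low 30 bits of ra, take the 1GB region from sp. */
    static inline uint32_t pc_from_windowed_ra(uint32_t ra, uint32_t sp)
    {
            return (ra & 0x3fffffff) | (sp & 0xc0000000);
    }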
diff --git a/arch/xtensa/include/asm/sections.h b/arch/xtensa/include/asm/sections.h
new file mode 100644 (file)
index 0000000..a8c42d0
--- /dev/null
+++ b/arch/xtensa/include/asm/sections.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _XTENSA_SECTIONS_H
+#define _XTENSA_SECTIONS_H
+
+#include <asm-generic/sections.h>
+
+#ifdef CONFIG_VECTORS_ADDR
+extern char _WindowVectors_text_start[];
+extern char _WindowVectors_text_end[];
+extern char _DebugInterruptVector_text_start[];
+extern char _DebugInterruptVector_text_end[];
+extern char _KernelExceptionVector_text_start[];
+extern char _KernelExceptionVector_text_end[];
+extern char _UserExceptionVector_text_start[];
+extern char _UserExceptionVector_text_end[];
+extern char _DoubleExceptionVector_text_start[];
+extern char _DoubleExceptionVector_text_end[];
+extern char _exception_text_start[];
+extern char _exception_text_end[];
+extern char _Level2InterruptVector_text_start[];
+extern char _Level2InterruptVector_text_end[];
+extern char _Level3InterruptVector_text_start[];
+extern char _Level3InterruptVector_text_end[];
+extern char _Level4InterruptVector_text_start[];
+extern char _Level4InterruptVector_text_end[];
+extern char _Level5InterruptVector_text_start[];
+extern char _Level5InterruptVector_text_end[];
+extern char _Level6InterruptVector_text_start[];
+extern char _Level6InterruptVector_text_end[];
+#endif
+#ifdef CONFIG_SMP
+extern char _SecondaryResetVector_text_start[];
+extern char _SecondaryResetVector_text_end[];
+#endif
+#ifdef CONFIG_XIP_KERNEL
+extern char _xip_start[];
+extern char _xip_end[];
+#endif
+
+#endif
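
Declaring the linker-script symbols as unsized char arrays, rather than as
the plain chars the old externs in setup.c used, lets callers treat the
names directly as addresses, which is what the mem_reserve() conversion
later in this series relies on (dropping the '&'). A sketch of the usage,
with a hypothetical helper:

    extern char _DebugInterruptVector_text_start[];
    extern char _DebugInterruptVector_text_end[];

    static unsigned long debug_vector_size(void)
    {
            /* array-style declarations: no '&' needed */
            return _DebugInterruptVector_text_end -
                    _DebugInterruptVector_text_start;
    }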
index f720a57d0a5bb0613c623b4bb894789423f0f5d9..6fa47cd8e02d44ef88b93b235edf094dec0971f4 100644 (file)
@@ -56,6 +56,7 @@ void secondary_trap_init(void);
 
 static inline void spill_registers(void)
 {
+#if defined(__XTENSA_WINDOWED_ABI__)
 #if XCHAL_NUM_AREGS > 16
        __asm__ __volatile__ (
                "       call8   1f\n"
@@ -96,6 +97,7 @@ static inline void spill_registers(void)
                "       mov     a12, a12\n"
                : : : "memory");
 #endif
+#endif
 }
 
 struct debug_table {
index 9301452e521ed7c9752d0a29e0276a90fe8135a0..d062c732ef1864c9eab5730a86682c62a9bb1b3a 100644 (file)
@@ -58,7 +58,9 @@
  *  BE  shift left / mask 0 0 X X
  */
 
+#if XCHAL_HAVE_WINDOWED
 #define UNALIGNED_USER_EXCEPTION
+#endif
 
 #if XCHAL_HAVE_BE
 
index 647b162f959b47fa4aca88b8338b3cfeef1d684f..99ab3c1a33873aa7404d90d397194dc5a437e3a5 100644 (file)
@@ -158,6 +158,7 @@ _user_exception:
        /* Rotate ws so that the current windowbase is at bit0. */
        /* Assume ws = xxwww1yyyy. Rotate ws right, so that a2 = yyyyxxwww1 */
 
+#if defined(USER_SUPPORT_WINDOWED)
        rsr     a2, windowbase
        rsr     a3, windowstart
        ssr     a2
@@ -167,24 +168,33 @@ _user_exception:
        src     a2, a3, a2
        srli    a2, a2, 32-WSBITS
        s32i    a2, a1, PT_WMASK        # needed for restoring registers
+#else
+       movi    a2, 0
+       movi    a3, 1
+       s32i    a2, a1, PT_WINDOWBASE
+       s32i    a3, a1, PT_WINDOWSTART
+       s32i    a3, a1, PT_WMASK
+#endif
 
        /* Save only live registers. */
 
-       _bbsi.l a2, 1, 1f
+UABI_W _bbsi.l a2, 1, 1f
        s32i    a4, a1, PT_AREG4
        s32i    a5, a1, PT_AREG5
        s32i    a6, a1, PT_AREG6
        s32i    a7, a1, PT_AREG7
-       _bbsi.l a2, 2, 1f
+UABI_W _bbsi.l a2, 2, 1f
        s32i    a8, a1, PT_AREG8
        s32i    a9, a1, PT_AREG9
        s32i    a10, a1, PT_AREG10
        s32i    a11, a1, PT_AREG11
-       _bbsi.l a2, 3, 1f
+UABI_W _bbsi.l a2, 3, 1f
        s32i    a12, a1, PT_AREG12
        s32i    a13, a1, PT_AREG13
        s32i    a14, a1, PT_AREG14
        s32i    a15, a1, PT_AREG15
+
+#if defined(USER_SUPPORT_WINDOWED)
        _bnei   a2, 1, 1f               # only one valid frame?
 
        /* Only one valid frame, skip saving regs. */
@@ -239,7 +249,7 @@ _user_exception:
        rsync
 
        /* We are back to the original stack pointer (a1) */
-
+#endif
 2:     /* Now, jump to the common exception handler. */
 
        j       common_exception
@@ -295,6 +305,7 @@ _kernel_exception:
        s32i    a3, a1, PT_SAR
        s32i    a2, a1, PT_ICOUNTLEVEL
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        /* Rotate ws so that the current windowbase is at bit0. */
        /* Assume ws = xxwww1yyyy. Rotate ws right, so that a2 = yyyyxxwww1 */
 
@@ -305,27 +316,28 @@ _kernel_exception:
        src     a2, a3, a2
        srli    a2, a2, 32-WSBITS
        s32i    a2, a1, PT_WMASK        # needed for kernel_exception_exit
+#endif
 
        /* Save only the live window-frame */
 
-       _bbsi.l a2, 1, 1f
+KABI_W _bbsi.l a2, 1, 1f
        s32i    a4, a1, PT_AREG4
        s32i    a5, a1, PT_AREG5
        s32i    a6, a1, PT_AREG6
        s32i    a7, a1, PT_AREG7
-       _bbsi.l a2, 2, 1f
+KABI_W _bbsi.l a2, 2, 1f
        s32i    a8, a1, PT_AREG8
        s32i    a9, a1, PT_AREG9
        s32i    a10, a1, PT_AREG10
        s32i    a11, a1, PT_AREG11
-       _bbsi.l a2, 3, 1f
+KABI_W _bbsi.l a2, 3, 1f
        s32i    a12, a1, PT_AREG12
        s32i    a13, a1, PT_AREG13
        s32i    a14, a1, PT_AREG14
        s32i    a15, a1, PT_AREG15
 
+#ifdef __XTENSA_WINDOWED_ABI__
        _bnei   a2, 1, 1f
-
        /* Copy spill slots of a0 and a1 to imitate movsp
         * in order to keep exception stack continuous
         */
@@ -333,6 +345,7 @@ _kernel_exception:
        l32i    a0, a1, PT_SIZE + 4
        s32e    a3, a1, -16
        s32e    a0, a1, -12
+#endif
 1:
        l32i    a0, a1, PT_AREG0        # restore saved a0
        wsr     a0, depc
@@ -419,16 +432,16 @@ common_exception:
        movi    a3, LOCKLEVEL
 
 .Lexception:
-       movi    a0, PS_WOE_MASK
-       or      a3, a3, a0
+KABI_W movi    a0, PS_WOE_MASK
+KABI_W or      a3, a3, a0
 #else
        addi    a2, a2, -EXCCAUSE_LEVEL1_INTERRUPT
        movi    a0, LOCKLEVEL
        extui   a3, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH
                                        # a3 = PS.INTLEVEL
        moveqz  a3, a0, a2              # a3 = LOCKLEVEL iff interrupt
-       movi    a2, PS_WOE_MASK
-       or      a3, a3, a2
+KABI_W movi    a2, PS_WOE_MASK
+KABI_W or      a3, a3, a2
        rsr     a2, exccause
 #endif
 
@@ -461,14 +474,14 @@ common_exception:
         */
 
        rsr     a4, excsave1
-       mov     a6, a1                  # pass stack frame
-       mov     a7, a2                  # pass EXCCAUSE
        addx4   a4, a2, a4
        l32i    a4, a4, EXC_TABLE_DEFAULT               # load handler
+       mov     abi_arg1, a2                    # pass EXCCAUSE
+       mov     abi_arg0, a1                    # pass stack frame
 
        /* Call the second-level handler */
 
-       callx4  a4
+       abi_callx       a4
 
        /* Jump here for exception exit */
        .global common_exception_return
@@ -482,15 +495,15 @@ common_exception_return:
 1:
        irq_save a2, a3
 #ifdef CONFIG_TRACE_IRQFLAGS
-       call4   trace_hardirqs_off
+       abi_call        trace_hardirqs_off
 #endif
 
        /* Jump if we are returning from kernel exceptions. */
 
-       l32i    a3, a1, PT_PS
+       l32i    abi_saved1, a1, PT_PS
        GET_THREAD_INFO(a2, a1)
        l32i    a4, a2, TI_FLAGS
-       _bbci.l a3, PS_UM_BIT, 6f
+       _bbci.l abi_saved1, PS_UM_BIT, 6f
 
        /* Specific to a user exception exit:
         * We need to check some flags for signal handling and rescheduling,
@@ -509,20 +522,20 @@ common_exception_return:
        /* Call do_signal() */
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-       call4   trace_hardirqs_on
+       abi_call        trace_hardirqs_on
 #endif
        rsil    a2, 0
-       mov     a6, a1
-       call4   do_notify_resume        # int do_notify_resume(struct pt_regs*)
+       mov     abi_arg0, a1
+       abi_call        do_notify_resume        # int do_notify_resume(struct pt_regs*)
        j       1b
 
 3:     /* Reschedule */
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-       call4   trace_hardirqs_on
+       abi_call        trace_hardirqs_on
 #endif
        rsil    a2, 0
-       call4   schedule        # void schedule (void)
+       abi_call        schedule        # void schedule (void)
        j       1b
 
 #ifdef CONFIG_PREEMPTION
@@ -533,33 +546,33 @@ common_exception_return:
 
        l32i    a4, a2, TI_PRE_COUNT
        bnez    a4, 4f
-       call4   preempt_schedule_irq
+       abi_call        preempt_schedule_irq
        j       4f
 #endif
 
 #if XTENSA_FAKE_NMI
 .LNMIexit:
-       l32i    a3, a1, PT_PS
-       _bbci.l a3, PS_UM_BIT, 4f
+       l32i    abi_saved1, a1, PT_PS
+       _bbci.l abi_saved1, PS_UM_BIT, 4f
 #endif
 
 5:
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
        _bbci.l a4, TIF_DB_DISABLED, 7f
-       call4   restore_dbreak
+       abi_call        restore_dbreak
 7:
 #endif
 #ifdef CONFIG_DEBUG_TLB_SANITY
        l32i    a4, a1, PT_DEPC
        bgeui   a4, VALID_DOUBLE_EXCEPTION_ADDRESS, 4f
-       call4   check_tlb_sanity
+       abi_call        check_tlb_sanity
 #endif
 6:
 4:
 #ifdef CONFIG_TRACE_IRQFLAGS
-       extui   a4, a3, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH
+       extui   a4, abi_saved1, PS_INTLEVEL_SHIFT, PS_INTLEVEL_WIDTH
        bgei    a4, LOCKLEVEL, 1f
-       call4   trace_hardirqs_on
+       abi_call        trace_hardirqs_on
 1:
 #endif
        /* Restore optional registers. */
@@ -572,14 +585,15 @@ common_exception_return:
        l32i    a2, a1, PT_SCOMPARE1
        wsr     a2, scompare1
 #endif
-       wsr     a3, ps          /* disable interrupts */
+       wsr     abi_saved1, ps          /* disable interrupts */
 
-       _bbci.l a3, PS_UM_BIT, kernel_exception_exit
+       _bbci.l abi_saved1, PS_UM_BIT, kernel_exception_exit
 
 user_exception_exit:
 
        /* Restore the state of the task and return from the exception. */
 
+#if defined(USER_SUPPORT_WINDOWED)
        /* Switch to the user thread WINDOWBASE. Save SP temporarily in DEPC */
 
        l32i    a2, a1, PT_WINDOWBASE
@@ -634,8 +648,10 @@ user_exception_exit:
         *       frame where we had loaded a2), or at least the lower 4 bits
         *       (if we have restored WSBITS-1 frames).
         */
-
 2:
+#else
+       movi    a2, 1
+#endif
 #if XCHAL_HAVE_THREADPTR
        l32i    a3, a1, PT_THREADPTR
        wur     a3, threadptr
@@ -650,6 +666,7 @@ user_exception_exit:
 
 kernel_exception_exit:
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        /* Check if we have to do a movsp.
         *
         * We only have to do a movsp if the previous window-frame has
@@ -702,6 +719,9 @@ kernel_exception_exit:
         *
         * Note: We expect a2 to hold PT_WMASK
         */
+#else
+       movi    a2, 1
+#endif
 
 common_exception_exit:
 
@@ -920,14 +940,16 @@ unrecoverable_text:
 
 ENTRY(unrecoverable_exception)
 
+#if XCHAL_HAVE_WINDOWED
        movi    a0, 1
        movi    a1, 0
 
        wsr     a0, windowstart
        wsr     a1, windowbase
        rsync
+#endif
 
-       movi    a1, PS_WOE_MASK | LOCKLEVEL
+       movi    a1, KERNEL_PS_WOE_MASK | LOCKLEVEL
        wsr     a1, ps
        rsync
 
@@ -935,8 +957,8 @@ ENTRY(unrecoverable_exception)
        movi    a0, 0
        addi    a1, a1, PT_REGS_OFFSET
 
-       movi    a6, unrecoverable_text
-       call4   panic
+       movi    abi_arg0, unrecoverable_text
+       abi_call        panic
 
 1:     j       1b
 
@@ -947,6 +969,7 @@ ENDPROC(unrecoverable_exception)
        __XTENSA_HANDLER
        .literal_position
 
+#ifdef SUPPORT_WINDOWED
 /*
  * Fast-handler for alloca exceptions
  *
@@ -1010,6 +1033,7 @@ ENTRY(fast_alloca)
 8:     j       _WindowUnderflow8
 4:     j       _WindowUnderflow4
 ENDPROC(fast_alloca)
+#endif
 
 #ifdef CONFIG_USER_ABI_CALL0_PROBE
 /*
@@ -1206,7 +1230,8 @@ ENDPROC(fast_syscall_xtensa)
  * Note: We assume the stack pointer is EXC_TABLE_KSTK in the fixup handler.
  */
 
-#ifdef CONFIG_FAST_SYSCALL_SPILL_REGISTERS
+#if defined(CONFIG_FAST_SYSCALL_SPILL_REGISTERS) && \
+               defined(USER_SUPPORT_WINDOWED)
 
 ENTRY(fast_syscall_spill_registers)
 
@@ -1403,12 +1428,12 @@ ENTRY(fast_syscall_spill_registers)
        rsr     a3, excsave1
        l32i    a1, a3, EXC_TABLE_KSTK
 
-       movi    a4, PS_WOE_MASK | LOCKLEVEL
+       movi    a4, KERNEL_PS_WOE_MASK | LOCKLEVEL
        wsr     a4, ps
        rsync
 
-       movi    a6, SIGSEGV
-       call4   do_exit
+       movi    abi_arg0, SIGSEGV
+       abi_call        do_exit
 
        /* shouldn't return, so panic */
 
@@ -1887,57 +1912,77 @@ ENDPROC(fast_store_prohibited)
 
 ENTRY(system_call)
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        abi_entry_default
+#elif defined(__XTENSA_CALL0_ABI__)
+       abi_entry(12)
+
+       s32i    a0, sp, 0
+       s32i    abi_saved0, sp, 4
+       s32i    abi_saved1, sp, 8
+       mov     abi_saved0, a2
+#else
+#error Unsupported Xtensa ABI
+#endif
 
        /* regs->syscall = regs->areg[2] */
 
-       l32i    a7, a2, PT_AREG2
-       s32i    a7, a2, PT_SYSCALL
+       l32i    a7, abi_saved0, PT_AREG2
+       s32i    a7, abi_saved0, PT_SYSCALL
 
        GET_THREAD_INFO(a4, a1)
-       l32i    a3, a4, TI_FLAGS
+       l32i    abi_saved1, a4, TI_FLAGS
        movi    a4, _TIF_WORK_MASK
-       and     a3, a3, a4
-       beqz    a3, 1f
+       and     abi_saved1, abi_saved1, a4
+       beqz    abi_saved1, 1f
 
-       mov     a6, a2
-       call4   do_syscall_trace_enter
-       beqz    a6, .Lsyscall_exit
-       l32i    a7, a2, PT_SYSCALL
+       mov     abi_arg0, abi_saved0
+       abi_call        do_syscall_trace_enter
+       beqz    abi_rv, .Lsyscall_exit
+       l32i    a7, abi_saved0, PT_SYSCALL
 
 1:
        /* syscall = sys_call_table[syscall_nr] */
 
        movi    a4, sys_call_table
        movi    a5, __NR_syscalls
-       movi    a6, -ENOSYS
+       movi    abi_rv, -ENOSYS
        bgeu    a7, a5, 1f
 
        addx4   a4, a7, a4
-       l32i    a4, a4, 0
+       l32i    abi_tmp0, a4, 0
 
        /* Load args: arg0 - arg5 are passed via regs. */
 
-       l32i    a6, a2, PT_AREG6
-       l32i    a7, a2, PT_AREG3
-       l32i    a8, a2, PT_AREG4
-       l32i    a9, a2, PT_AREG5
-       l32i    a10, a2, PT_AREG8
-       l32i    a11, a2, PT_AREG9
+       l32i    abi_arg0, abi_saved0, PT_AREG6
+       l32i    abi_arg1, abi_saved0, PT_AREG3
+       l32i    abi_arg2, abi_saved0, PT_AREG4
+       l32i    abi_arg3, abi_saved0, PT_AREG5
+       l32i    abi_arg4, abi_saved0, PT_AREG8
+       l32i    abi_arg5, abi_saved0, PT_AREG9
 
-       callx4  a4
+       abi_callx       abi_tmp0
 
 1:     /* regs->areg[2] = return_value */
 
-       s32i    a6, a2, PT_AREG2
-       bnez    a3, 1f
+       s32i    abi_rv, abi_saved0, PT_AREG2
+       bnez    abi_saved1, 1f
 .Lsyscall_exit:
+#if defined(__XTENSA_WINDOWED_ABI__)
        abi_ret_default
+#elif defined(__XTENSA_CALL0_ABI__)
+       l32i    a0, sp, 0
+       l32i    abi_saved0, sp, 4
+       l32i    abi_saved1, sp, 8
+       abi_ret(12)
+#else
+#error Unsupported Xtensa ABI
+#endif
 
 1:
-       mov     a6, a2
-       call4   do_syscall_trace_leave
-       abi_ret_default
+       mov     abi_arg0, abi_saved0
+       abi_call        do_syscall_trace_leave
+       j       .Lsyscall_exit
 
 ENDPROC(system_call)
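
In both ABIs the user's syscall arguments live at fixed pt_regs slots; the
assembly above loads PT_AREG6, 3, 4, 5, 8, 9 into the kernel ABI's argument
registers before the indirect call. A C-level sketch of the same dispatch
(names such as dispatch_syscall are hypothetical; the table and bound are
assumed from the code above):

    extern void *sys_call_table[];

    typedef long (*syscall_fn_t)(long, long, long, long, long, long);

    static long dispatch_syscall(struct pt_regs *regs)
    {
            unsigned long nr = regs->syscall;
            syscall_fn_t fn;

            if (nr >= __NR_syscalls)
                    return -ENOSYS;             /* mirrors movi abi_rv, -ENOSYS */
            fn = (syscall_fn_t)sys_call_table[nr];
            /* argument order mirrors the PT_AREG6/3/4/5/8/9 loads */
            return fn(regs->areg[6], regs->areg[3], regs->areg[4],
                      regs->areg[5], regs->areg[8], regs->areg[9]);
    }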
 
@@ -1988,8 +2033,18 @@ ENDPROC(system_call)
 
 ENTRY(_switch_to)
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        abi_entry(XTENSA_SPILL_STACK_RESERVE)
+#elif defined(__XTENSA_CALL0_ABI__)
+       abi_entry(16)
 
+       s32i    a12, sp, 0
+       s32i    a13, sp, 4
+       s32i    a14, sp, 8
+       s32i    a15, sp, 12
+#else
+#error Unsupported Xtensa ABI
+#endif
        mov     a11, a3                 # and 'next' (a3)
 
        l32i    a4, a2, TASK_THREAD_INFO
@@ -2033,7 +2088,9 @@ ENTRY(_switch_to)
 
        /* Flush register file. */
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        spill_registers_kernel
+#endif
 
        /* Set kernel stack (and leave critical section)
         * Note: It's save to set it here. The stack will not be overwritten
@@ -2055,34 +2112,43 @@ ENTRY(_switch_to)
        wsr     a14, ps
        rsync
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        abi_ret(XTENSA_SPILL_STACK_RESERVE)
+#elif defined(__XTENSA_CALL0_ABI__)
+       l32i    a12, sp, 0
+       l32i    a13, sp, 4
+       l32i    a14, sp, 8
+       l32i    a15, sp, 12
+       abi_ret(16)
+#else
+#error Unsupported Xtensa ABI
+#endif
 
 ENDPROC(_switch_to)
 
 ENTRY(ret_from_fork)
 
        /* void schedule_tail (struct task_struct *prev)
-        * Note: prev is still in a6 (return value from fake call4 frame)
+        * Note: prev is still in abi_arg0 (return value from fake call frame)
         */
-       call4   schedule_tail
-
-       mov     a6, a1
-       call4   do_syscall_trace_leave
+       abi_call        schedule_tail
 
-       j       common_exception_return
+       mov             abi_arg0, a1
+       abi_call        do_syscall_trace_leave
+       j               common_exception_return
 
 ENDPROC(ret_from_fork)
 
 /*
  * Kernel thread creation helper
- * On entry, set up by copy_thread: a2 = thread_fn, a3 = thread_fn arg
- *           left from _switch_to: a6 = prev
+ * On entry, set up by copy_thread: abi_saved0 = thread_fn,
+ * abi_saved1 = thread_fn arg. Left from _switch_to: abi_arg0 = prev
  */
 ENTRY(ret_from_kernel_thread)
 
-       call4   schedule_tail
-       mov     a6, a3
-       callx4  a2
-       j       common_exception_return
+       abi_call        schedule_tail
+       mov             abi_arg0, abi_saved1
+       abi_callx       abi_saved0
+       j               common_exception_return
 
 ENDPROC(ret_from_kernel_thread)
index b9b81e76beeace0b137e97c1298e2db046b2fe9e..8484294bc623f03c8a427aa290402927a8d5517a 100644 (file)
@@ -15,6 +15,7 @@
  * Kevin Chea
  */
 
+#include <asm/asmmacro.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/cacheasm.h>
@@ -66,11 +67,13 @@ _SetupOCD:
         * xt-gdb to single step via DEBUG exceptions received directly
         * by ocd.
         */
+#if XCHAL_HAVE_WINDOWED
        movi    a1, 1
        movi    a0, 0
        wsr     a1, windowstart
        wsr     a0, windowbase
        rsync
+#endif
 
        movi    a1, LOCKLEVEL
        wsr     a1, ps
@@ -193,9 +196,10 @@ ENTRY(_startup)
        movi    a1, start_info
        l32i    a1, a1, 0
 
-       movi    a2, PS_WOE_MASK | LOCKLEVEL
-                                       # WOE=1, INTLEVEL=LOCKLEVEL, UM=0
-       wsr     a2, ps                  # (enable reg-windows; progmode stack)
+       /* Disable interrupts. */
+       /* Enable window exceptions if kernel is built with windowed ABI. */
+       movi    a2, KERNEL_PS_WOE_MASK | LOCKLEVEL
+       wsr     a2, ps
        rsync
 
 #ifdef CONFIG_SMP
@@ -267,13 +271,13 @@ ENTRY(_startup)
        l32i    a1, a1, 0
 #endif
 
-       movi    a6, 0
-       xsr     a6, excsave1
+       movi    abi_arg0, 0
+       xsr     abi_arg0, excsave1
 
        /* init_arch kick-starts the linux kernel */
 
-       call4   init_arch
-       call4   start_kernel
+       abi_call        init_arch
+       abi_call        start_kernel
 
 should_never_return:
        j       should_never_return
@@ -297,10 +301,10 @@ should_never_return:
        s32i    a3, a2, 0
        memw
 
-       movi    a6, 0
-       wsr     a6, excsave1
+       movi    abi_arg0, 0
+       wsr     abi_arg0, excsave1
 
-       call4   secondary_start_kernel
+       abi_call        secondary_start_kernel
        j       should_never_return
 
 #endif  /* CONFIG_SMP */
index 5e4619f52858afdd856be1df8208cacac77f83f7..51daaf4e0b82a5e34a02978b6e05fbe90257d5ca 100644 (file)
 /*
  * Entry condition:
  *
- *   a2:       a0 of the caller
+ *   a2:       a0 of the caller in windowed ABI
+ *   a10:      a0 of the caller in call0 ABI
+ *
+ * In call0 ABI the function _mcount is called with the special ABI:
+ * its argument is in a10 and all the usual argument registers (a2 - a7)
+ * must be preserved in addition to callee-saved a12 - a15.
  */
 
 ENTRY(_mcount)
-
+#if defined(__XTENSA_WINDOWED_ABI__)
        abi_entry_default
 
        movi    a4, ftrace_trace_function
@@ -42,7 +47,36 @@ ENTRY(_mcount)
        callx4  a4
 
        abi_ret_default
+#elif defined(__XTENSA_CALL0_ABI__)
+       abi_entry_default
+
+       movi    a9, ftrace_trace_function
+       l32i    a9, a9, 0
+       movi    a11, ftrace_stub
+       bne     a9, a11, 1f
+       abi_ret_default
 
+1:     abi_entry(28)
+       s32i    a0, sp, 0
+       s32i    a2, sp, 4
+       s32i    a3, sp, 8
+       s32i    a4, sp, 12
+       s32i    a5, sp, 16
+       s32i    a6, sp, 20
+       s32i    a7, sp, 24
+       addi    a2, a10, -MCOUNT_INSN_SIZE
+       callx0  a9
+       l32i    a0, sp, 0
+       l32i    a2, sp, 4
+       l32i    a3, sp, 8
+       l32i    a4, sp, 12
+       l32i    a5, sp, 16
+       l32i    a6, sp, 20
+       l32i    a7, sp, 24
+       abi_ret(28)
+#else
+#error Unsupported Xtensa ABI
+#endif
 ENDPROC(_mcount)
 
 ENTRY(ftrace_stub)
index 47f933fed87005d794c0f50343bfc81720207e8c..bd80df890b1e3a7598dd6a093e2f7ded74dc4552 100644 (file)
@@ -211,11 +211,18 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn,
        struct thread_info *ti;
 #endif
 
+#if defined(__XTENSA_WINDOWED_ABI__)
        /* Create a call4 dummy-frame: a0 = 0, a1 = childregs. */
        SPILL_SLOT(childregs, 1) = (unsigned long)childregs;
        SPILL_SLOT(childregs, 0) = 0;
 
        p->thread.sp = (unsigned long)childregs;
+#elif defined(__XTENSA_CALL0_ABI__)
+       /* Reserve 16 bytes for the _switch_to stack frame. */
+       p->thread.sp = (unsigned long)childregs - 16;
+#else
+#error Unsupported Xtensa ABI
+#endif
 
        if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
                struct pt_regs *regs = current_pt_regs();
@@ -272,11 +279,25 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn,
                p->thread.ra = MAKE_RA_FOR_CALL(
                                (unsigned long)ret_from_kernel_thread, 1);
 
-               /* pass parameters to ret_from_kernel_thread:
-                * a2 = thread_fn, a3 = thread_fn arg
+               /* pass parameters to ret_from_kernel_thread: */
+#if defined(__XTENSA_WINDOWED_ABI__)
+               /*
+                * a2 = thread_fn, a3 = thread_fn arg.
+                * Window underflow will load registers from the
+                * spill slots on the stack on return from _switch_to.
                 */
-               SPILL_SLOT(childregs, 3) = thread_fn_arg;
                SPILL_SLOT(childregs, 2) = usp_thread_fn;
+               SPILL_SLOT(childregs, 3) = thread_fn_arg;
+#elif defined(__XTENSA_CALL0_ABI__)
+               /*
+                * a12 = thread_fn, a13 = thread_fn arg.
+                * _switch_to epilogue will load registers from the stack.
+                */
+               ((unsigned long *)p->thread.sp)[0] = usp_thread_fn;
+               ((unsigned long *)p->thread.sp)[1] = thread_fn_arg;
+#else
+#error Unsupported Xtensa ABI
+#endif
 
                /* Childregs are only used when we're going to userspace
                 * in which case start_thread will set them up.
index ee9082a142feb49cf187c014df9d6df8b605a939..8db20cfb44ab8836db40547f311ca7b64b7aae64 100644 (file)
 #include <asm/bootparam.h>
 #include <asm/kasan.h>
 #include <asm/mmu_context.h>
-#include <asm/processor.h>
-#include <asm/timex.h>
-#include <asm/platform.h>
 #include <asm/page.h>
-#include <asm/setup.h>
 #include <asm/param.h>
+#include <asm/platform.h>
+#include <asm/processor.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
 #include <asm/smp.h>
 #include <asm/sysmem.h>
+#include <asm/timex.h>
 
 #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE)
 struct screen_info screen_info = {
@@ -271,49 +272,6 @@ void __init init_arch(bp_tag_t *bp_start)
  * Initialize system. Setup memory and reserve regions.
  */
 
-extern char _end[];
-extern char _stext[];
-extern char _WindowVectors_text_start;
-extern char _WindowVectors_text_end;
-extern char _DebugInterruptVector_text_start;
-extern char _DebugInterruptVector_text_end;
-extern char _KernelExceptionVector_text_start;
-extern char _KernelExceptionVector_text_end;
-extern char _UserExceptionVector_text_start;
-extern char _UserExceptionVector_text_end;
-extern char _DoubleExceptionVector_text_start;
-extern char _DoubleExceptionVector_text_end;
-extern char _exception_text_start;
-extern char _exception_text_end;
-#if XCHAL_EXCM_LEVEL >= 2
-extern char _Level2InterruptVector_text_start;
-extern char _Level2InterruptVector_text_end;
-#endif
-#if XCHAL_EXCM_LEVEL >= 3
-extern char _Level3InterruptVector_text_start;
-extern char _Level3InterruptVector_text_end;
-#endif
-#if XCHAL_EXCM_LEVEL >= 4
-extern char _Level4InterruptVector_text_start;
-extern char _Level4InterruptVector_text_end;
-#endif
-#if XCHAL_EXCM_LEVEL >= 5
-extern char _Level5InterruptVector_text_start;
-extern char _Level5InterruptVector_text_end;
-#endif
-#if XCHAL_EXCM_LEVEL >= 6
-extern char _Level6InterruptVector_text_start;
-extern char _Level6InterruptVector_text_end;
-#endif
-#ifdef CONFIG_SMP
-extern char _SecondaryResetVector_text_start;
-extern char _SecondaryResetVector_text_end;
-#endif
-#ifdef CONFIG_XIP_KERNEL
-extern char _xip_start[];
-extern char _xip_end[];
-#endif
-
 static inline int __init_memblock mem_reserve(unsigned long start,
                                              unsigned long end)
 {
@@ -349,49 +307,51 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 #ifdef CONFIG_VECTORS_ADDR
-       mem_reserve(__pa(&_WindowVectors_text_start),
-                   __pa(&_WindowVectors_text_end));
+#ifdef SUPPORT_WINDOWED
+       mem_reserve(__pa(_WindowVectors_text_start),
+                   __pa(_WindowVectors_text_end));
+#endif
 
-       mem_reserve(__pa(&_DebugInterruptVector_text_start),
-                   __pa(&_DebugInterruptVector_text_end));
+       mem_reserve(__pa(_DebugInterruptVector_text_start),
+                   __pa(_DebugInterruptVector_text_end));
 
-       mem_reserve(__pa(&_KernelExceptionVector_text_start),
-                   __pa(&_KernelExceptionVector_text_end));
+       mem_reserve(__pa(_KernelExceptionVector_text_start),
+                   __pa(_KernelExceptionVector_text_end));
 
-       mem_reserve(__pa(&_UserExceptionVector_text_start),
-                   __pa(&_UserExceptionVector_text_end));
+       mem_reserve(__pa(_UserExceptionVector_text_start),
+                   __pa(_UserExceptionVector_text_end));
 
-       mem_reserve(__pa(&_DoubleExceptionVector_text_start),
-                   __pa(&_DoubleExceptionVector_text_end));
+       mem_reserve(__pa(_DoubleExceptionVector_text_start),
+                   __pa(_DoubleExceptionVector_text_end));
 
-       mem_reserve(__pa(&_exception_text_start),
-                   __pa(&_exception_text_end));
+       mem_reserve(__pa(_exception_text_start),
+                   __pa(_exception_text_end));
 #if XCHAL_EXCM_LEVEL >= 2
-       mem_reserve(__pa(&_Level2InterruptVector_text_start),
-                   __pa(&_Level2InterruptVector_text_end));
+       mem_reserve(__pa(_Level2InterruptVector_text_start),
+                   __pa(_Level2InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 3
-       mem_reserve(__pa(&_Level3InterruptVector_text_start),
-                   __pa(&_Level3InterruptVector_text_end));
+       mem_reserve(__pa(_Level3InterruptVector_text_start),
+                   __pa(_Level3InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 4
-       mem_reserve(__pa(&_Level4InterruptVector_text_start),
-                   __pa(&_Level4InterruptVector_text_end));
+       mem_reserve(__pa(_Level4InterruptVector_text_start),
+                   __pa(_Level4InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 5
-       mem_reserve(__pa(&_Level5InterruptVector_text_start),
-                   __pa(&_Level5InterruptVector_text_end));
+       mem_reserve(__pa(_Level5InterruptVector_text_start),
+                   __pa(_Level5InterruptVector_text_end));
 #endif
 #if XCHAL_EXCM_LEVEL >= 6
-       mem_reserve(__pa(&_Level6InterruptVector_text_start),
-                   __pa(&_Level6InterruptVector_text_end));
+       mem_reserve(__pa(_Level6InterruptVector_text_start),
+                   __pa(_Level6InterruptVector_text_end));
 #endif
 
 #endif /* CONFIG_VECTORS_ADDR */
 
 #ifdef CONFIG_SMP
-       mem_reserve(__pa(&_SecondaryResetVector_text_start),
-                   __pa(&_SecondaryResetVector_text_end));
+       mem_reserve(__pa(_SecondaryResetVector_text_start),
+                   __pa(_SecondaryResetVector_text_end));
 #endif
        parse_early_param();
        bootmem_init();
index c4d77dbfb61afcf3f94ad8b12f0b9345586387a9..f6c949895b3eb197bcf05b6b1610f860d4304446 100644 (file)
@@ -45,12 +45,13 @@ struct rt_sigframe
        unsigned int window[4];
 };
 
-/* 
+#if defined(USER_SUPPORT_WINDOWED)
+/*
  * Flush register windows stored in pt_regs to stack.
  * Returns 1 for errors.
  */
 
-int
+static int
 flush_window_regs_user(struct pt_regs *regs)
 {
        const unsigned long ws = regs->windowstart;
@@ -121,6 +122,13 @@ flush_window_regs_user(struct pt_regs *regs)
 errout:
        return err;
 }
+#else
+static int
+flush_window_regs_user(struct pt_regs *regs)
+{
+       return 0;
+}
+#endif
 
 /*
  * Note: We don't copy double exception 'regs', we have to finish double exc. 
index 874b6efc6fb3193ee0aacd1ed7c6f36f8d930cd0..35a7d47f28cfcaec5a8ac940aaf6bbceff51e1cb 100644 (file)
@@ -97,7 +97,9 @@ static dispatch_init_table_t __initdata dispatch_init_table[] = {
 /* EXCCAUSE_INSTRUCTION_FETCH unhandled */
 /* EXCCAUSE_LOAD_STORE_ERROR unhandled*/
 { EXCCAUSE_LEVEL1_INTERRUPT,   0,         do_interrupt },
+#ifdef SUPPORT_WINDOWED
 { EXCCAUSE_ALLOCA,             USER|KRNL, fast_alloca },
+#endif
 /* EXCCAUSE_INTEGER_DIVIDE_BY_ZERO unhandled */
 /* EXCCAUSE_PRIVILEGED unhandled */
 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION
@@ -462,12 +464,10 @@ void secondary_trap_init(void)
 
 void show_regs(struct pt_regs * regs)
 {
-       int i, wmask;
+       int i;
 
        show_regs_print_info(KERN_DEFAULT);
 
-       wmask = regs->wmask & ~1;
-
        for (i = 0; i < 16; i++) {
                if ((i % 8) == 0)
                        pr_info("a%02d:", i);
index 1a7538ccfc5a489c25398a432251c2517b12dcaf..407ece204e7caad11c6f070a28b2977d1b60a7e8 100644 (file)
@@ -226,6 +226,7 @@ ENTRY(_DoubleExceptionVector)
 
        xsr     a0, depc                # get DEPC, save a0
 
+#ifdef SUPPORT_WINDOWED
        movi    a2, WINDOW_VECTORS_VADDR
        _bltu   a0, a2, .Lfixup
        addi    a2, a2, WINDOW_VECTORS_SIZE
@@ -275,6 +276,10 @@ _DoubleExceptionVector_WindowUnderflow:
        l32i    a0, a0, EXC_TABLE_FAST_USER
        jx      a0
 
+#else
+       j       .Lfixup
+#endif
+
        /*
         * We only allow the ITLB miss exception if we are in kernel space.
         * All other exceptions are unexpected and thus unrecoverable!
@@ -343,6 +348,7 @@ _DoubleExceptionVector_WindowUnderflow:
        l32i    a0, a0, EXC_TABLE_FAST_USER
        jx      a0
 
+#ifdef SUPPORT_WINDOWED
        /*
         * Restart window OVERFLOW exception.
         * Currently:
@@ -475,9 +481,12 @@ _DoubleExceptionVector_handle_exception:
        rsr     a0, depc
        rotw    -3
        j       1b
+#endif
 
 ENDPROC(_DoubleExceptionVector)
 
+#ifdef SUPPORT_WINDOWED
+
 /*
 * Fixup handler for TLB miss in double exception handler for window overflow.
  * We get here with windowbase set to the window that was being spilled and
@@ -590,6 +599,8 @@ ENTRY(window_overflow_restore_a0_fixup)
 
 ENDPROC(window_overflow_restore_a0_fixup)
 
+#endif
+
 /*
  * Debug interrupt vector
  *
@@ -650,6 +661,25 @@ ENTRY(_Level\level\()InterruptVector)
        irq_entry_level 5
        irq_entry_level 6
 
+#if XCHAL_EXCM_LEVEL >= 2
+       /*
+        *  Continuation of medium priority interrupt dispatch code.
+        *  On entry here, a0 contains PS, and EPC2 contains saved a0:
+        */
+       __XTENSA_HANDLER
+       .align 4
+_SimulateUserKernelVectorException:
+       addi    a0, a0, (1 << PS_EXCM_BIT)
+#if !XTENSA_FAKE_NMI
+       wsr     a0, ps
+#endif
+       bbsi.l  a0, PS_UM_BIT, 1f       # branch if user mode
+       xsr     a0, excsave2            # restore a0
+       j       _KernelExceptionVector  # simulate kernel vector exception
+1:     xsr     a0, excsave2            # restore a0
+       j       _UserExceptionVector    # simulate user vector exception
+#endif
+
 
 /* Window overflow and underflow handlers.
  * The handlers must be 64 bytes apart, first starting with the underflow
@@ -668,6 +698,8 @@ ENTRY(_Level\level\()InterruptVector)
        .section                .WindowVectors.text, "ax"
 
 
+#ifdef SUPPORT_WINDOWED
+
 /* 4-Register Window Overflow Vector (Handler) */
 
 ENTRY_ALIGN64(_WindowOverflow4)
@@ -680,27 +712,6 @@ ENTRY_ALIGN64(_WindowOverflow4)
 
 ENDPROC(_WindowOverflow4)
 
-
-#if XCHAL_EXCM_LEVEL >= 2
-       /*  Not a window vector - but a convenient location
-        *  (where we know there's space) for continuation of
-        *  medium priority interrupt dispatch code.
-        *  On entry here, a0 contains PS, and EPC2 contains saved a0:
-        */
-       .align 4
-_SimulateUserKernelVectorException:
-       addi    a0, a0, (1 << PS_EXCM_BIT)
-#if !XTENSA_FAKE_NMI
-       wsr     a0, ps
-#endif
-       bbsi.l  a0, PS_UM_BIT, 1f       # branch if user mode
-       xsr     a0, excsave2            # restore a0
-       j       _KernelExceptionVector  # simulate kernel vector exception
-1:     xsr     a0, excsave2            # restore a0
-       j       _UserExceptionVector    # simulate user vector exception
-#endif
-
-
 /* 4-Register Window Underflow Vector (Handler) */
 
 ENTRY_ALIGN64(_WindowUnderflow4)
@@ -789,4 +800,6 @@ ENTRY_ALIGN64(_WindowUnderflow12)
 
 ENDPROC(_WindowUnderflow12)
 
+#endif
+
        .text
index d23a6e38f06253d0d21970c92f9da821627eac77..eee270a039a46043c9750fd5f2ec8997a41e2fa0 100644
@@ -94,7 +94,9 @@ SECTIONS
     . = ALIGN(PAGE_SIZE);
     _vecbase = .;
 
+#ifdef SUPPORT_WINDOWED
     SECTION_VECTOR2 (.WindowVectors.text, WINDOW_VECTORS_VADDR)
+#endif
 #if XCHAL_EXCM_LEVEL >= 2
     SECTION_VECTOR2 (.Level2InterruptVector.text, INTLEVEL2_VECTOR_VADDR)
 #endif
@@ -166,8 +168,10 @@ SECTIONS
     __boot_reloc_table_start = ABSOLUTE(.);
 
 #if !MERGED_VECTORS
+#ifdef SUPPORT_WINDOWED
     RELOCATE_ENTRY(_WindowVectors_text,
                   .WindowVectors.text);
+#endif
 #if XCHAL_EXCM_LEVEL >= 2
     RELOCATE_ENTRY(_Level2InterruptVector_text,
                   .Level2InterruptVector.text);
@@ -229,14 +233,18 @@ SECTIONS
 #if !MERGED_VECTORS
   /* The vectors are relocated to the real position at startup time */
 
+#ifdef SUPPORT_WINDOWED
   SECTION_VECTOR4 (_WindowVectors_text,
                  .WindowVectors.text,
                  WINDOW_VECTORS_VADDR,
-                 .dummy)
+                 LAST)
+#undef LAST
+#define LAST   .WindowVectors.text
+#endif
   SECTION_VECTOR4 (_DebugInterruptVector_text,
                  .DebugInterruptVector.text,
                  DEBUG_VECTOR_VADDR,
-                 .WindowVectors.text)
+                 LAST)
 #undef LAST
 #define LAST   .DebugInterruptVector.text
 #if XCHAL_EXCM_LEVEL >= 2
index 4faf46fe3f381a3afed861091ea6ccc1b01f2627..0731912227d37eb12a2616ff845c27c0a47cca9b 100644
@@ -45,7 +45,6 @@
 #   a9/ tmp
 #   a10/ tmp
 #   a11/ dst
-#   a12/ tmp
 
 .text
 ENTRY(__strncpy_user)
@@ -61,7 +60,7 @@ ENTRY(__strncpy_user)
        bbsi.l  a3, 0, .Lsrc1mod2 # if only  8-bit aligned
        bbsi.l  a3, 1, .Lsrc2mod4 # if only 16-bit aligned
 .Lsrcaligned:  # return here when src is word-aligned
-       srli    a12, a4, 2      # number of loop iterations with 4B per loop
+       srli    a10, a4, 2      # number of loop iterations with 4B per loop
        movi    a9, 3
        bnone   a11, a9, .Laligned
        j       .Ldstunaligned
@@ -102,11 +101,11 @@ EX(10f)   s8i     a9, a11, 0              # store byte 0
        .byte   0               # (0 mod 4 alignment for LBEG)
 .Laligned:
 #if XCHAL_HAVE_LOOPS
-       loopnez a12, .Loop1done
+       loopnez a10, .Loop1done
 #else
-       beqz    a12, .Loop1done
-       slli    a12, a12, 2
-       add     a12, a12, a11   # a12 = end of last 4B chunck
+       beqz    a10, .Loop1done
+       slli    a10, a10, 2
+       add     a10, a10, a11   # a10 = end of last 4B chunk
 #endif
 .Loop1:
 EX(11f)        l32i    a9, a3, 0               # get word from src
@@ -118,7 +117,7 @@ EX(10f)     s32i    a9, a11, 0              # store word to dst
        bnone   a9, a8, .Lz3            # if byte 3 is zero
        addi    a11, a11, 4             # advance dst pointer
 #if !XCHAL_HAVE_LOOPS
-       blt     a11, a12, .Loop1
+       blt     a11, a10, .Loop1
 #endif
 
 .Loop1done:
@@ -185,7 +184,7 @@ EX(10f)     s8i     a9, a11, 2
        loopnez a4, .Lunalignedend
 #else
        beqz    a4, .Lunalignedend
-       add     a12, a11, a4            # a12 = ending address
+       add     a10, a11, a4            # a10 = ending address
 #endif /* XCHAL_HAVE_LOOPS */
 .Lnextbyte:
 EX(11f)        l8ui    a9, a3, 0
@@ -194,7 +193,7 @@ EX(10f)     s8i     a9, a11, 0
        beqz    a9, .Lunalignedend
        addi    a11, a11, 1
 #if !XCHAL_HAVE_LOOPS
-       blt     a11, a12, .Lnextbyte
+       blt     a11, a10, .Lnextbyte
 #endif
 
 .Lunalignedend:
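
The point of swapping a12 for a10 here: a windowed callee gets its own register window, but the kernel can also be built for the call0 ABI, where a12..a15 must be preserved across calls while a8..a11 are scratch, so using a10 keeps __strncpy_user frameless. A schematic of that convention (a summary of the Xtensa call0 ABI, not from this patch):

        /*
         * Xtensa call0 ABI register roles, schematically:
         *   a0          return address
         *   a1          stack pointer
         *   a2..a7      arguments and return values
         *   a8..a11     temporaries, caller-saved
         *   a12..a15    callee-saved: a routine that touches one
         *               must save and restore it
         */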
index a0aa4047f94aec5779014c5aaef043f97e76d22c..16128c094c621ed8cdc9c6f8854a9aac611c966e 100644
        .text
 ENTRY(__xtensa_copy_user)
 
-       abi_entry_default
+#if !XCHAL_HAVE_LOOPS && defined(__XTENSA_CALL0_ABI__)
+#define STACK_SIZE 4
+#else
+#define STACK_SIZE 0
+#endif
+       abi_entry(STACK_SIZE)
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
        mov     a11, a4         # preserve original len for error case
@@ -75,7 +80,7 @@ ENTRY(__xtensa_copy_user)
        __ssa8  a3              # set shift amount from byte offset
        bnez    a4, .Lsrcunaligned
        movi    a2, 0           # return success for len==0
-       abi_ret_default
+       abi_ret(STACK_SIZE)
 
 /*
  * Destination is unaligned
@@ -127,7 +132,7 @@ EX(10f)     s8i     a6, a5, 0
 #endif /* !XCHAL_HAVE_LOOPS */
 .Lbytecopydone:
        movi    a2, 0           # return success for len bytes copied
-       abi_ret_default
+       abi_ret(STACK_SIZE)
 
 /*
  * Destination and source are word-aligned.
@@ -187,7 +192,7 @@ EX(10f)     l8ui    a6, a3,  0
 EX(10f)        s8i     a6, a5,  0
 .L5:
        movi    a2, 0           # return success for len bytes copied
-       abi_ret_default
+       abi_ret(STACK_SIZE)
 
 /*
  * Destination is aligned, Source is unaligned
@@ -205,8 +210,14 @@ EX(10f)    l32i    a6, a3, 0       # load first word
        loopnez a7, .Loop2done
 #else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
+#if defined(__XTENSA_CALL0_ABI__)
+       s32i    a10, a1, 0
+       slli    a10, a7, 4
+       add     a10, a10, a3    # a10 = end of last 16B source chunk
+#else
        slli    a12, a7, 4
        add     a12, a12, a3    # a12 = end of last 16B source chunk
+#endif
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop2:
 EX(10f)        l32i    a7, a3,  4
@@ -224,7 +235,12 @@ EX(10f)    s32i    a8, a5,  8
 EX(10f)        s32i    a9, a5, 12
        addi    a5, a5, 16
 #if !XCHAL_HAVE_LOOPS
+#if defined(__XTENSA_CALL0_ABI__)
+       blt     a3, a10, .Loop2
+       l32i    a10, a1, 0
+#else
        blt     a3, a12, .Loop2
+#endif
 #endif /* !XCHAL_HAVE_LOOPS */
 .Loop2done:
        bbci.l  a4, 3, .L12
@@ -264,7 +280,7 @@ EX(10f)     l8ui    a6, a3,  0
 EX(10f)        s8i     a6, a5,  0
 .L15:
        movi    a2, 0           # return success for len bytes copied
-       abi_ret_default
+       abi_ret(STACK_SIZE)
 
 ENDPROC(__xtensa_copy_user)
 
@@ -281,4 +297,4 @@ ENDPROC(__xtensa_copy_user)
 10:
        sub     a2, a5, a2      /* a2 <-- bytes copied */
        sub     a2, a11, a2     /* a2 <-- bytes not copied */
-       abi_ret_default
+       abi_ret(STACK_SIZE)
index c633f15a048136805970627be563680c8d3fdb06..429c4d57458c361ece52378cc682924feeec3ebb 100644
@@ -119,6 +119,8 @@ CFLAGS_aegis128-neon-inner.o += $(aegis128-cflags-y)
 CFLAGS_REMOVE_aegis128-neon-inner.o += -mgeneral-regs-only
 aegis128-$(CONFIG_CRYPTO_AEGIS128_SIMD) += aegis128-neon.o aegis128-neon-inner.o
 endif
+# Enable <arm_neon.h>
+CFLAGS_aegis128-neon-inner.o += -isystem $(shell $(CC) -print-file-name=include)
 
 obj-$(CONFIG_CRYPTO_PCRYPT) += pcrypt.o
 obj-$(CONFIG_CRYPTO_CRYPTD) += cryptd.o
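
<arm_neon.h> is shipped by the compiler, not by the kernel tree, so the added -isystem line asks $(CC) for its own private include directory instead of hard-coding a path. With that directory on the search path, the inner AEGIS-128 object can use NEON intrinsics; a stand-alone illustration (not taken from the patch):

        #include <arm_neon.h>

        uint8x16_t xor_block(uint8x16_t a, uint8x16_t b)
        {
                return veorq_u8(a, b);  /* one 128-bit XOR, the kind of
                                           primitive the SIMD AEGIS-128
                                           code is built from */
        }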
index d7deedf3548e0f3a8cb22462ffe347471b41f761..ab2f7dfb0c44429f2b4296c9435bc25edd97b86c 100644
@@ -199,33 +199,20 @@ static acpi_status acpi_pci_query_osc(struct acpi_pci_root *root,
        acpi_status status;
        u32 result, capbuf[3];
 
-       support &= OSC_PCI_SUPPORT_MASKS;
        support |= root->osc_support_set;
 
        capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE;
        capbuf[OSC_SUPPORT_DWORD] = support;
-       if (control) {
-               *control &= OSC_PCI_CONTROL_MASKS;
-               capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set;
-       } else {
-               /* Run _OSC query only with existing controls. */
-               capbuf[OSC_CONTROL_DWORD] = root->osc_control_set;
-       }
+       capbuf[OSC_CONTROL_DWORD] = *control | root->osc_control_set;
 
        status = acpi_pci_run_osc(root->device->handle, capbuf, &result);
        if (ACPI_SUCCESS(status)) {
                root->osc_support_set = support;
-               if (control)
-                       *control = result;
+               *control = result;
        }
        return status;
 }
 
-static acpi_status acpi_pci_osc_support(struct acpi_pci_root *root, u32 flags)
-{
-       return acpi_pci_query_osc(root, flags, NULL);
-}
-
 struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle)
 {
        struct acpi_pci_root *root;
@@ -348,8 +335,9 @@ EXPORT_SYMBOL_GPL(acpi_get_pci_dev);
  * _OSC bits the BIOS has granted control of, but its contents are meaningless
  * on failure.
  **/
-static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 req)
+static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 support)
 {
+       u32 req = OSC_PCI_EXPRESS_CAPABILITY_CONTROL;
        struct acpi_pci_root *root;
        acpi_status status;
        u32 ctrl, capbuf[3];
@@ -357,22 +345,16 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r
        if (!mask)
                return AE_BAD_PARAMETER;
 
-       ctrl = *mask & OSC_PCI_CONTROL_MASKS;
-       if ((ctrl & req) != req)
-               return AE_TYPE;
-
        root = acpi_pci_find_root(handle);
        if (!root)
                return AE_NOT_EXIST;
 
-       *mask = ctrl | root->osc_control_set;
-       /* No need to evaluate _OSC if the control was already granted. */
-       if ((root->osc_control_set & ctrl) == ctrl)
-               return AE_OK;
+       ctrl   = *mask;
+       *mask |= root->osc_control_set;
 
        /* Need to check the available controls bits before requesting them. */
-       while (*mask) {
-               status = acpi_pci_query_osc(root, root->osc_support_set, mask);
+       do {
+               status = acpi_pci_query_osc(root, support, mask);
                if (ACPI_FAILURE(status))
                        return status;
                if (ctrl == *mask)
@@ -380,7 +362,11 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r
                decode_osc_control(root, "platform does not support",
                                   ctrl & ~(*mask));
                ctrl = *mask;
-       }
+       } while (*mask);
+
+       /* No need to request _OSC if the control was already granted. */
+       if ((root->osc_control_set & ctrl) == ctrl)
+               return AE_OK;
 
        if ((ctrl & req) != req) {
                decode_osc_control(root, "not requesting control; platform does not support",
@@ -399,25 +385,9 @@ static acpi_status acpi_pci_osc_control_set(acpi_handle handle, u32 *mask, u32 r
        return AE_OK;
 }
 
-static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
-                                bool is_pcie)
+static u32 calculate_support(void)
 {
-       u32 support, control, requested;
-       acpi_status status;
-       struct acpi_device *device = root->device;
-       acpi_handle handle = device->handle;
-
-       /*
-        * Apple always returns failure on _OSC calls when _OSI("Darwin") has
-        * been called successfully. We know the feature set supported by the
-        * platform, so avoid calling _OSC at all
-        */
-       if (x86_apple_machine) {
-               root->osc_control_set = ~OSC_PCI_EXPRESS_PME_CONTROL;
-               decode_osc_control(root, "OS assumes control of",
-                                  root->osc_control_set);
-               return;
-       }
+       u32 support;
 
        /*
         * All supported architectures that use ACPI have support for
@@ -434,30 +404,12 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
        if (IS_ENABLED(CONFIG_PCIE_EDR))
                support |= OSC_PCI_EDR_SUPPORT;
 
-       decode_osc_support(root, "OS supports", support);
-       status = acpi_pci_osc_support(root, support);
-       if (ACPI_FAILURE(status)) {
-               *no_aspm = 1;
-
-               /* _OSC is optional for PCI host bridges */
-               if ((status == AE_NOT_FOUND) && !is_pcie)
-                       return;
-
-               dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n",
-                        acpi_format_exception(status));
-               return;
-       }
-
-       if (pcie_ports_disabled) {
-               dev_info(&device->dev, "PCIe port services disabled; not requesting _OSC control\n");
-               return;
-       }
+       return support;
+}
 
-       if ((support & ACPI_PCIE_REQ_SUPPORT) != ACPI_PCIE_REQ_SUPPORT) {
-               decode_osc_support(root, "not requesting OS control; OS requires",
-                                  ACPI_PCIE_REQ_SUPPORT);
-               return;
-       }
+static u32 calculate_control(void)
+{
+       u32 control;
 
        control = OSC_PCI_EXPRESS_CAPABILITY_CONTROL
                | OSC_PCI_EXPRESS_PME_CONTROL;
@@ -483,11 +435,59 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
        if (IS_ENABLED(CONFIG_PCIE_DPC) && IS_ENABLED(CONFIG_PCIE_EDR))
                control |= OSC_PCI_EXPRESS_DPC_CONTROL;
 
-       requested = control;
-       status = acpi_pci_osc_control_set(handle, &control,
-                                         OSC_PCI_EXPRESS_CAPABILITY_CONTROL);
+       return control;
+}
+
+static bool os_control_query_checks(struct acpi_pci_root *root, u32 support)
+{
+       struct acpi_device *device = root->device;
+
+       if (pcie_ports_disabled) {
+               dev_info(&device->dev, "PCIe port services disabled; not requesting _OSC control\n");
+               return false;
+       }
+
+       if ((support & ACPI_PCIE_REQ_SUPPORT) != ACPI_PCIE_REQ_SUPPORT) {
+               decode_osc_support(root, "not requesting OS control; OS requires",
+                                  ACPI_PCIE_REQ_SUPPORT);
+               return false;
+       }
+
+       return true;
+}
+
+static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
+                                bool is_pcie)
+{
+       u32 support, control = 0, requested = 0;
+       acpi_status status;
+       struct acpi_device *device = root->device;
+       acpi_handle handle = device->handle;
+
+       /*
+        * Apple always returns failure on _OSC calls when _OSI("Darwin") has
+        * been called successfully. We know the feature set supported by the
+        * platform, so avoid calling _OSC at all
+        */
+       if (x86_apple_machine) {
+               root->osc_control_set = ~OSC_PCI_EXPRESS_PME_CONTROL;
+               decode_osc_control(root, "OS assumes control of",
+                                  root->osc_control_set);
+               return;
+       }
+
+       support = calculate_support();
+
+       decode_osc_support(root, "OS supports", support);
+
+       if (os_control_query_checks(root, support))
+               requested = control = calculate_control();
+
+       status = acpi_pci_osc_control_set(handle, &control, support);
        if (ACPI_SUCCESS(status)) {
-               decode_osc_control(root, "OS now controls", control);
+               if (control)
+                       decode_osc_control(root, "OS now controls", control);
+
                if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) {
                        /*
                         * We have ASPM control, but the FADT indicates that
@@ -498,11 +498,6 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
                        *no_aspm = 1;
                }
        } else {
-               decode_osc_control(root, "OS requested", requested);
-               decode_osc_control(root, "platform willing to grant", control);
-               dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n",
-                       acpi_format_exception(status));
-
                /*
                 * We want to disable ASPM here, but aspm_disabled
                 * needs to remain in its state from boot so that we
@@ -511,6 +506,18 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm,
                 * root scan.
                 */
                *no_aspm = 1;
+
+               /* _OSC is optional for PCI host bridges */
+               if ((status == AE_NOT_FOUND) && !is_pcie)
+                       return;
+
+               if (control) {
+                       decode_osc_control(root, "OS requested", requested);
+                       decode_osc_control(root, "platform willing to grant", control);
+               }
+
+               dev_info(&device->dev, "_OSC: platform retains control of PCIe features (%s)\n",
+                        acpi_format_exception(status));
        }
 }
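
Taken together, the pci_root.c hunks shrink negotiate_os_control() to three helpers feeding a single _OSC request. A condensed sketch of the resulting flow (helper names are taken from the hunks above; the Apple quirk, the FADT NO_ASPM handling and the diagnostics are trimmed):

        static void negotiate_sketch(struct acpi_pci_root *root, int *no_aspm)
        {
                u32 support = calculate_support();
                u32 control = 0;

                if (os_control_query_checks(root, support))
                        control = calculate_control();

                /* One call now reports support and requests control together. */
                if (ACPI_FAILURE(acpi_pci_osc_control_set(root->device->handle,
                                                          &control, support)))
                        *no_aspm = 1;   /* platform retains control of PCIe */
        }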
 
index 1509cb74705a30ade4f89e8c2f6af9cb9bacb594..64012cda4d1267077c5107354c037f2712ec6122 100644
@@ -25,6 +25,12 @@ config CHARLCD
          This is some character LCD core interface that multiple drivers can
          use.
 
+config LINEDISP
+       tristate "Character line display core support" if COMPILE_TEST
+       help
+         This is the core support for single-line character displays, to be
+         selected by drivers that use it.
+
 config HD44780_COMMON
        tristate "Common functions for HD44780 (and compatibles) LCD displays" if COMPILE_TEST
        select CHARLCD
@@ -155,6 +161,7 @@ config IMG_ASCII_LCD
        depends on HAS_IOMEM
        default y if MIPS_MALTA
        select MFD_SYSCON
+       select LINEDISP
        help
          Enable this to support the simple ASCII LCD displays found on
          development boards such as the MIPS Boston, MIPS Malta & MIPS SEAD3
@@ -162,13 +169,16 @@ config IMG_ASCII_LCD
 
 config HT16K33
        tristate "Holtek Ht16K33 LED controller with keyscan"
-       depends on FB && OF && I2C && INPUT
+       depends on FB && I2C && INPUT
        select FB_SYS_FOPS
        select FB_SYS_FILLRECT
        select FB_SYS_COPYAREA
        select FB_SYS_IMAGEBLIT
        select INPUT_MATRIXKMAP
        select FB_BACKLIGHT
+       select NEW_LEDS
+       select LEDS_CLASS
+       select LINEDISP
        help
          Say yes here to add support for Holtek HT16K33, RAM mapping 16*8
          LED controller driver with keyscan.
index 307771027c893a5bbc5e182b8c7a839e19299797..6968ed4d3f0a89300e1d77df7b462a589ce05165 100644
@@ -13,3 +13,4 @@ obj-$(CONFIG_HD44780)         += hd44780.o
 obj-$(CONFIG_HT16K33)          += ht16k33.o
 obj-$(CONFIG_PARPORT_PANEL)    += panel.o
 obj-$(CONFIG_LCD2S)            += lcd2s.o
+obj-$(CONFIG_LINEDISP)         += line-display.o
index d66821adf45324908cbf87c17022e8e83d764754..0df474506fb927d9a9083d21328bf8d2d9451b02 100644
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/fb.h>
 #include <linux/mm.h>
 #include <linux/platform_device.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
 #include <linux/cfag12864b.h>
 
 #define CFAG12864BFB_NAME "cfag12864bfb"
@@ -41,8 +38,8 @@ static const struct fb_var_screeninfo cfag12864bfb_var = {
        .yres_virtual = CFAG12864B_HEIGHT,
        .bits_per_pixel = 1,
        .red = { 0, 1, 0 },
-       .green = { 0, 1, 0 },
-       .blue = { 0, 1, 0 },
+       .green = { 0, 1, 0 },
+       .blue = { 0, 1, 0 },
        .left_margin = 0,
        .right_margin = 0,
        .upper_margin = 0,
@@ -70,7 +67,7 @@ static const struct fb_ops cfag12864bfb_ops = {
 static int cfag12864bfb_probe(struct platform_device *device)
 {
        int ret = -EINVAL;
-       struct fb_info *info = framebuffer_alloc(0, &device->dev);
+       struct fb_info *info = framebuffer_alloc(0, &device->dev);
 
        if (!info)
                goto none;
index 1e69cc6d21a0dca208267f58476a1dde739203fd..4fab3b2c702390f602edf4ff51fd21681afd0e5a 100644
@@ -5,27 +5,39 @@
  * Author: Robin van der Gracht <robin@protonic.nl>
  *
  * Copyright: (C) 2016 Protonic Holland.
+ * Copyright (C) 2021 Glider bv
  */
 
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/i2c.h>
-#include <linux/of.h>
+#include <linux/property.h>
 #include <linux/fb.h>
-#include <linux/slab.h>
 #include <linux/backlight.h>
 #include <linux/input.h>
 #include <linux/input/matrix_keypad.h>
+#include <linux/leds.h>
 #include <linux/workqueue.h>
 #include <linux/mm.h>
 
+#include <linux/map_to_7segment.h>
+#include <linux/map_to_14segment.h>
+
+#include <asm/unaligned.h>
+
+#include "line-display.h"
+
 /* Registers */
 #define REG_SYSTEM_SETUP               0x20
 #define REG_SYSTEM_SETUP_OSC_ON                BIT(0)
 
 #define REG_DISPLAY_SETUP              0x80
 #define REG_DISPLAY_SETUP_ON           BIT(0)
+#define REG_DISPLAY_SETUP_BLINK_OFF    (0 << 1)
+#define REG_DISPLAY_SETUP_BLINK_2HZ    (1 << 1)
+#define REG_DISPLAY_SETUP_BLINK_1HZ    (2 << 1)
+#define REG_DISPLAY_SETUP_BLINK_0HZ5   (3 << 1)
 
 #define REG_ROWINT_SET                 0xA0
 #define REG_ROWINT_SET_INT_EN          BIT(0)
 #define BYTES_PER_ROW          (HT16K33_MATRIX_LED_MAX_ROWS / 8)
 #define HT16K33_FB_SIZE                (HT16K33_MATRIX_LED_MAX_COLS * BYTES_PER_ROW)
 
+enum display_type {
+       DISP_MATRIX = 0,
+       DISP_QUAD_7SEG,
+       DISP_QUAD_14SEG,
+};
+
 struct ht16k33_keypad {
        struct i2c_client *client;
        struct input_dev *dev;
@@ -65,13 +83,29 @@ struct ht16k33_fbdev {
        uint32_t refresh_rate;
        uint8_t *buffer;
        uint8_t *cache;
-       struct delayed_work work;
+};
+
+struct ht16k33_seg {
+       struct linedisp linedisp;
+       union {
+               struct seg7_conversion_map seg7;
+               struct seg14_conversion_map seg14;
+       } map;
+       unsigned int map_size;
+       char curr[4];
 };
 
 struct ht16k33_priv {
        struct i2c_client *client;
+       struct delayed_work work;
+       struct led_classdev led;
        struct ht16k33_keypad keypad;
-       struct ht16k33_fbdev fbdev;
+       union {
+               struct ht16k33_fbdev fbdev;
+               struct ht16k33_seg seg;
+       };
+       enum display_type type;
+       uint8_t blink;
 };
 
 static const struct fb_fix_screeninfo ht16k33_fb_fix = {
@@ -101,9 +135,36 @@ static const struct fb_var_screeninfo ht16k33_fb_var = {
        .vmode = FB_VMODE_NONINTERLACED,
 };
 
+static const SEG7_DEFAULT_MAP(initial_map_seg7);
+static const SEG14_DEFAULT_MAP(initial_map_seg14);
+
+static ssize_t map_seg_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct ht16k33_priv *priv = dev_get_drvdata(dev);
+
+       memcpy(buf, &priv->seg.map, priv->seg.map_size);
+       return priv->seg.map_size;
+}
+
+static ssize_t map_seg_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t cnt)
+{
+       struct ht16k33_priv *priv = dev_get_drvdata(dev);
+
+       if (cnt != priv->seg.map_size)
+               return -EINVAL;
+
+       memcpy(&priv->seg.map, buf, cnt);
+       return cnt;
+}
+
+static DEVICE_ATTR(map_seg7, 0644, map_seg_show, map_seg_store);
+static DEVICE_ATTR(map_seg14, 0644, map_seg_show, map_seg_store);
+
 static int ht16k33_display_on(struct ht16k33_priv *priv)
 {
-       uint8_t data = REG_DISPLAY_SETUP | REG_DISPLAY_SETUP_ON;
+       uint8_t data = REG_DISPLAY_SETUP | REG_DISPLAY_SETUP_ON | priv->blink;
 
        return i2c_smbus_write_byte(priv->client, data);
 }
@@ -113,11 +174,72 @@ static int ht16k33_display_off(struct ht16k33_priv *priv)
        return i2c_smbus_write_byte(priv->client, REG_DISPLAY_SETUP);
 }
 
+static int ht16k33_brightness_set(struct ht16k33_priv *priv,
+                                 unsigned int brightness)
+{
+       int err;
+
+       if (brightness == 0) {
+               priv->blink = REG_DISPLAY_SETUP_BLINK_OFF;
+               return ht16k33_display_off(priv);
+       }
+
+       err = ht16k33_display_on(priv);
+       if (err)
+               return err;
+
+       return i2c_smbus_write_byte(priv->client,
+                                   REG_BRIGHTNESS | (brightness - 1));
+}
+
+static int ht16k33_brightness_set_blocking(struct led_classdev *led_cdev,
+                                          enum led_brightness brightness)
+{
+       struct ht16k33_priv *priv = container_of(led_cdev, struct ht16k33_priv,
+                                                led);
+
+       return ht16k33_brightness_set(priv, brightness);
+}
+
+static int ht16k33_blink_set(struct led_classdev *led_cdev,
+                            unsigned long *delay_on, unsigned long *delay_off)
+{
+       struct ht16k33_priv *priv = container_of(led_cdev, struct ht16k33_priv,
+                                                led);
+       unsigned int delay;
+       uint8_t blink;
+       int err;
+
+       if (!*delay_on && !*delay_off) {
+               blink = REG_DISPLAY_SETUP_BLINK_1HZ;
+               delay = 1000;
+       } else if (*delay_on <= 750) {
+               blink = REG_DISPLAY_SETUP_BLINK_2HZ;
+               delay = 500;
+       } else if (*delay_on <= 1500) {
+               blink = REG_DISPLAY_SETUP_BLINK_1HZ;
+               delay = 1000;
+       } else {
+               blink = REG_DISPLAY_SETUP_BLINK_0HZ5;
+               delay = 2000;
+       }
+
+       err = i2c_smbus_write_byte(priv->client,
+                                  REG_DISPLAY_SETUP | REG_DISPLAY_SETUP_ON |
+                                  blink);
+       if (err)
+               return err;
+
+       priv->blink = blink;
+       *delay_on = *delay_off = delay;
+       return 0;
+}
+
 static void ht16k33_fb_queue(struct ht16k33_priv *priv)
 {
        struct ht16k33_fbdev *fbdev = &priv->fbdev;
 
-       schedule_delayed_work(&fbdev->work, HZ / fbdev->refresh_rate);
+       schedule_delayed_work(&priv->work, HZ / fbdev->refresh_rate);
 }
 
 /*
@@ -125,10 +247,9 @@ static void ht16k33_fb_queue(struct ht16k33_priv *priv)
  */
 static void ht16k33_fb_update(struct work_struct *work)
 {
-       struct ht16k33_fbdev *fbdev =
-               container_of(work, struct ht16k33_fbdev, work.work);
-       struct ht16k33_priv *priv =
-               container_of(fbdev, struct ht16k33_priv, fbdev);
+       struct ht16k33_priv *priv = container_of(work, struct ht16k33_priv,
+                                                work.work);
+       struct ht16k33_fbdev *fbdev = &priv->fbdev;
 
        uint8_t *p1, *p2;
        int len, pos = 0, first = -1;
@@ -168,9 +289,9 @@ requeue:
 
 static int ht16k33_initialize(struct ht16k33_priv *priv)
 {
+       uint8_t data[HT16K33_FB_SIZE];
        uint8_t byte;
        int err;
-       uint8_t data[HT16K33_MATRIX_LED_MAX_COLS * 2];
 
        /* Clear RAM (8 * 16 bits) */
        memset(data, 0, sizeof(data));
@@ -198,13 +319,10 @@ static int ht16k33_bl_update_status(struct backlight_device *bl)
 
        if (bl->props.power != FB_BLANK_UNBLANK ||
            bl->props.fb_blank != FB_BLANK_UNBLANK ||
-           bl->props.state & BL_CORE_FBBLANK || brightness == 0) {
-               return ht16k33_display_off(priv);
-       }
+           bl->props.state & BL_CORE_FBBLANK)
+               brightness = 0;
 
-       ht16k33_display_on(priv);
-       return i2c_smbus_write_byte(priv->client,
-                                   REG_BRIGHTNESS | (brightness - 1));
+       return ht16k33_brightness_set(priv, brightness);
 }
 
 static int ht16k33_bl_check_fb(struct backlight_device *bl, struct fb_info *fi)
@@ -219,6 +337,15 @@ static const struct backlight_ops ht16k33_bl_ops = {
        .check_fb       = ht16k33_bl_check_fb,
 };
 
+/*
+ * Blank events will be passed to the actual device handling the backlight when
+ * we return zero here.
+ */
+static int ht16k33_blank(int blank, struct fb_info *info)
+{
+       return 0;
+}
+
 static int ht16k33_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
        struct ht16k33_priv *priv = info->par;
@@ -231,6 +358,7 @@ static const struct fb_ops ht16k33_fb_ops = {
        .owner = THIS_MODULE,
        .fb_read = fb_sys_read,
        .fb_write = fb_sys_write,
+       .fb_blank = ht16k33_blank,
        .fb_fillrect = sys_fillrect,
        .fb_copyarea = sys_copyarea,
        .fb_imageblit = sys_imageblit,
@@ -313,10 +441,82 @@ static void ht16k33_keypad_stop(struct input_dev *dev)
        disable_irq(keypad->client->irq);
 }
 
+static void ht16k33_linedisp_update(struct linedisp *linedisp)
+{
+       struct ht16k33_priv *priv = container_of(linedisp, struct ht16k33_priv,
+                                                seg.linedisp);
+
+       schedule_delayed_work(&priv->work, 0);
+}
+
+static void ht16k33_seg7_update(struct work_struct *work)
+{
+       struct ht16k33_priv *priv = container_of(work, struct ht16k33_priv,
+                                                work.work);
+       struct ht16k33_seg *seg = &priv->seg;
+       char *s = seg->curr;
+       uint8_t buf[9];
+
+       buf[0] = map_to_seg7(&seg->map.seg7, *s++);
+       buf[1] = 0;
+       buf[2] = map_to_seg7(&seg->map.seg7, *s++);
+       buf[3] = 0;
+       buf[4] = 0;
+       buf[5] = 0;
+       buf[6] = map_to_seg7(&seg->map.seg7, *s++);
+       buf[7] = 0;
+       buf[8] = map_to_seg7(&seg->map.seg7, *s++);
+
+       i2c_smbus_write_i2c_block_data(priv->client, 0, ARRAY_SIZE(buf), buf);
+}
+
+static void ht16k33_seg14_update(struct work_struct *work)
+{
+       struct ht16k33_priv *priv = container_of(work, struct ht16k33_priv,
+                                                work.work);
+       struct ht16k33_seg *seg = &priv->seg;
+       char *s = seg->curr;
+       uint8_t buf[8];
+
+       put_unaligned_le16(map_to_seg14(&seg->map.seg14, *s++), buf);
+       put_unaligned_le16(map_to_seg14(&seg->map.seg14, *s++), buf + 2);
+       put_unaligned_le16(map_to_seg14(&seg->map.seg14, *s++), buf + 4);
+       put_unaligned_le16(map_to_seg14(&seg->map.seg14, *s++), buf + 6);
+
+       i2c_smbus_write_i2c_block_data(priv->client, 0, ARRAY_SIZE(buf), buf);
+}
+
+static int ht16k33_led_probe(struct device *dev, struct led_classdev *led,
+                            unsigned int brightness)
+{
+       struct led_init_data init_data = {};
+       int err;
+
+       /* The LED is optional */
+       init_data.fwnode = device_get_named_child_node(dev, "led");
+       if (!init_data.fwnode)
+               return 0;
+
+       init_data.devicename = "auxdisplay";
+       init_data.devname_mandatory = true;
+
+       led->brightness_set_blocking = ht16k33_brightness_set_blocking;
+       led->blink_set = ht16k33_blink_set;
+       led->flags = LED_CORE_SUSPENDRESUME;
+       led->brightness = brightness;
+       led->max_brightness = MAX_BRIGHTNESS;
+
+       err = devm_led_classdev_register_ext(dev, led, &init_data);
+       if (err)
+               dev_err(dev, "Failed to register LED\n");
+
+       return err;
+}
+
 static int ht16k33_keypad_probe(struct i2c_client *client,
                                struct ht16k33_keypad *keypad)
 {
-       struct device_node *node = client->dev.of_node;
+       struct device *dev = &client->dev;
        u32 rows = HT16K33_MATRIX_KEYPAD_MAX_ROWS;
        u32 cols = HT16K33_MATRIX_KEYPAD_MAX_COLS;
        int err;
@@ -324,7 +524,7 @@ static int ht16k33_keypad_probe(struct i2c_client *client,
        keypad->client = client;
        init_waitqueue_head(&keypad->wait);
 
-       keypad->dev = devm_input_allocate_device(&client->dev);
+       keypad->dev = devm_input_allocate_device(dev);
        if (!keypad->dev)
                return -ENOMEM;
 
@@ -335,23 +535,23 @@ static int ht16k33_keypad_probe(struct i2c_client *client,
        keypad->dev->open = ht16k33_keypad_start;
        keypad->dev->close = ht16k33_keypad_stop;
 
-       if (!of_get_property(node, "linux,no-autorepeat", NULL))
+       if (!device_property_read_bool(dev, "linux,no-autorepeat"))
                __set_bit(EV_REP, keypad->dev->evbit);
 
-       err = of_property_read_u32(node, "debounce-delay-ms",
-                                  &keypad->debounce_ms);
+       err = device_property_read_u32(dev, "debounce-delay-ms",
+                                      &keypad->debounce_ms);
        if (err) {
-               dev_err(&client->dev, "key debounce delay not specified\n");
+               dev_err(dev, "key debounce delay not specified\n");
                return err;
        }
 
-       err = matrix_keypad_parse_of_params(&client->dev, &rows, &cols);
+       err = matrix_keypad_parse_properties(dev, &rows, &cols);
        if (err)
                return err;
        if (rows > HT16K33_MATRIX_KEYPAD_MAX_ROWS ||
            cols > HT16K33_MATRIX_KEYPAD_MAX_COLS) {
-               dev_err(&client->dev, "%u rows or %u cols out of range in DT\n",
-                       rows, cols);
+               dev_err(dev, "%u rows or %u cols out of range in DT\n", rows,
+                       cols);
                return -ERANGE;
        }
 
@@ -362,56 +562,55 @@ static int ht16k33_keypad_probe(struct i2c_client *client,
        err = matrix_keypad_build_keymap(NULL, NULL, rows, cols, NULL,
                                         keypad->dev);
        if (err) {
-               dev_err(&client->dev, "failed to build keymap\n");
+               dev_err(dev, "failed to build keymap\n");
                return err;
        }
 
-       err = devm_request_threaded_irq(&client->dev, client->irq,
-                                       NULL, ht16k33_keypad_irq_thread,
+       err = devm_request_threaded_irq(dev, client->irq, NULL,
+                                       ht16k33_keypad_irq_thread,
                                        IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
                                        DRIVER_NAME, keypad);
        if (err) {
-               dev_err(&client->dev, "irq request failed %d, error %d\n",
-                       client->irq, err);
+               dev_err(dev, "irq request failed %d, error %d\n", client->irq,
+                       err);
                return err;
        }
 
        ht16k33_keypad_stop(keypad->dev);
 
-       err = input_register_device(keypad->dev);
-       if (err)
-               return err;
-
-       return 0;
+       return input_register_device(keypad->dev);
 }
 
-static int ht16k33_probe(struct i2c_client *client,
-                                 const struct i2c_device_id *id)
+static int ht16k33_fbdev_probe(struct device *dev, struct ht16k33_priv *priv,
+                              uint32_t brightness)
 {
+       struct ht16k33_fbdev *fbdev = &priv->fbdev;
+       struct backlight_device *bl = NULL;
        int err;
-       uint32_t dft_brightness;
-       struct backlight_device *bl;
-       struct backlight_properties bl_props;
-       struct ht16k33_priv *priv;
-       struct ht16k33_fbdev *fbdev;
-       struct device_node *node = client->dev.of_node;
 
-       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-               dev_err(&client->dev, "i2c_check_functionality error\n");
-               return -EIO;
-       }
-
-       priv = devm_kzalloc(&client->dev, sizeof(*priv), GFP_KERNEL);
-       if (!priv)
-               return -ENOMEM;
-
-       priv->client = client;
-       i2c_set_clientdata(client, priv);
-       fbdev = &priv->fbdev;
+       if (priv->led.dev) {
+               err = ht16k33_brightness_set(priv, brightness);
+               if (err)
+                       return err;
+       } else {
+               /* backwards compatibility with DT lacking an led subnode */
+               struct backlight_properties bl_props;
+
+               memset(&bl_props, 0, sizeof(struct backlight_properties));
+               bl_props.type = BACKLIGHT_RAW;
+               bl_props.max_brightness = MAX_BRIGHTNESS;
+
+               bl = devm_backlight_device_register(dev, DRIVER_NAME"-bl", dev,
+                                                   priv, &ht16k33_bl_ops,
+                                                   &bl_props);
+               if (IS_ERR(bl)) {
+                       dev_err(dev, "failed to register backlight\n");
+                       return PTR_ERR(bl);
+               }
 
-       err = ht16k33_initialize(priv);
-       if (err)
-               return err;
+               bl->props.brightness = brightness;
+               ht16k33_bl_update_status(bl);
+       }
 
        /* Framebuffer (2 bytes per column) */
        BUILD_BUG_ON(PAGE_SIZE < HT16K33_FB_SIZE);
@@ -419,32 +618,33 @@ static int ht16k33_probe(struct i2c_client *client,
        if (!fbdev->buffer)
                return -ENOMEM;
 
-       fbdev->cache = devm_kmalloc(&client->dev, HT16K33_FB_SIZE, GFP_KERNEL);
+       fbdev->cache = devm_kmalloc(dev, HT16K33_FB_SIZE, GFP_KERNEL);
        if (!fbdev->cache) {
                err = -ENOMEM;
                goto err_fbdev_buffer;
        }
 
-       fbdev->info = framebuffer_alloc(0, &client->dev);
+       fbdev->info = framebuffer_alloc(0, dev);
        if (!fbdev->info) {
                err = -ENOMEM;
                goto err_fbdev_buffer;
        }
 
-       err = of_property_read_u32(node, "refresh-rate-hz",
-               &fbdev->refresh_rate);
+       err = device_property_read_u32(dev, "refresh-rate-hz",
+                                      &fbdev->refresh_rate);
        if (err) {
-               dev_err(&client->dev, "refresh rate not specified\n");
+               dev_err(dev, "refresh rate not specified\n");
                goto err_fbdev_info;
        }
        fb_bl_default_curve(fbdev->info, 0, MIN_BRIGHTNESS, MAX_BRIGHTNESS);
 
-       INIT_DELAYED_WORK(&fbdev->work, ht16k33_fb_update);
+       INIT_DELAYED_WORK(&priv->work, ht16k33_fb_update);
        fbdev->info->fbops = &ht16k33_fb_ops;
        fbdev->info->screen_base = (char __iomem *) fbdev->buffer;
        fbdev->info->screen_size = HT16K33_FB_SIZE;
        fbdev->info->fix = ht16k33_fb_fix;
        fbdev->info->var = ht16k33_fb_var;
+       fbdev->info->bl_dev = bl;
        fbdev->info->pseudo_palette = NULL;
        fbdev->info->flags = FBINFO_FLAG_DEFAULT;
        fbdev->info->par = priv;
@@ -453,51 +653,125 @@ static int ht16k33_probe(struct i2c_client *client,
        if (err)
                goto err_fbdev_info;
 
-       /* Keypad */
-       if (client->irq > 0) {
-               err = ht16k33_keypad_probe(client, &priv->keypad);
-               if (err)
-                       goto err_fbdev_unregister;
+       ht16k33_fb_queue(priv);
+       return 0;
+
+err_fbdev_info:
+       framebuffer_release(fbdev->info);
+err_fbdev_buffer:
+       free_page((unsigned long) fbdev->buffer);
+
+       return err;
+}
+
+static int ht16k33_seg_probe(struct device *dev, struct ht16k33_priv *priv,
+                            uint32_t brightness)
+{
+       struct ht16k33_seg *seg = &priv->seg;
+       int err;
+
+       err = ht16k33_brightness_set(priv, brightness);
+       if (err)
+               return err;
+
+       switch (priv->type) {
+       case DISP_MATRIX:
+               /* not handled here */
+               err = -EINVAL;
+               break;
+
+       case DISP_QUAD_7SEG:
+               INIT_DELAYED_WORK(&priv->work, ht16k33_seg7_update);
+               seg->map.seg7 = initial_map_seg7;
+               seg->map_size = sizeof(seg->map.seg7);
+               err = device_create_file(dev, &dev_attr_map_seg7);
+               break;
+
+       case DISP_QUAD_14SEG:
+               INIT_DELAYED_WORK(&priv->work, ht16k33_seg14_update);
+               seg->map.seg14 = initial_map_seg14;
+               seg->map_size = sizeof(seg->map.seg14);
+               err = device_create_file(dev, &dev_attr_map_seg14);
+               break;
        }
+       if (err)
+               return err;
+
+       err = linedisp_register(&seg->linedisp, dev, 4, seg->curr,
+                               ht16k33_linedisp_update);
+       if (err)
+               goto err_remove_map_file;
+
+       return 0;
+
+err_remove_map_file:
+       device_remove_file(dev, &dev_attr_map_seg7);
+       device_remove_file(dev, &dev_attr_map_seg14);
+       return err;
+}
+
+static int ht16k33_probe(struct i2c_client *client)
+{
+       struct device *dev = &client->dev;
+       const struct of_device_id *id;
+       struct ht16k33_priv *priv;
+       uint32_t dft_brightness;
+       int err;
 
-       /* Backlight */
-       memset(&bl_props, 0, sizeof(struct backlight_properties));
-       bl_props.type = BACKLIGHT_RAW;
-       bl_props.max_brightness = MAX_BRIGHTNESS;
-
-       bl = devm_backlight_device_register(&client->dev, DRIVER_NAME"-bl",
-                                           &client->dev, priv,
-                                           &ht16k33_bl_ops, &bl_props);
-       if (IS_ERR(bl)) {
-               dev_err(&client->dev, "failed to register backlight\n");
-               err = PTR_ERR(bl);
-               goto err_fbdev_unregister;
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+               dev_err(dev, "i2c_check_functionality error\n");
+               return -EIO;
        }
 
-       err = of_property_read_u32(node, "default-brightness-level",
-                                  &dft_brightness);
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->client = client;
+       id = i2c_of_match_device(dev->driver->of_match_table, client);
+       if (id)
+               priv->type = (uintptr_t)id->data;
+       i2c_set_clientdata(client, priv);
+
+       err = ht16k33_initialize(priv);
+       if (err)
+               return err;
+
+       err = device_property_read_u32(dev, "default-brightness-level",
+                                      &dft_brightness);
        if (err) {
                dft_brightness = MAX_BRIGHTNESS;
        } else if (dft_brightness > MAX_BRIGHTNESS) {
-               dev_warn(&client->dev,
+               dev_warn(dev,
                         "invalid default brightness level: %u, using %u\n",
                         dft_brightness, MAX_BRIGHTNESS);
                dft_brightness = MAX_BRIGHTNESS;
        }
 
-       bl->props.brightness = dft_brightness;
-       ht16k33_bl_update_status(bl);
-
-       ht16k33_fb_queue(priv);
-       return 0;
+       /* LED */
+       err = ht16k33_led_probe(dev, &priv->led, dft_brightness);
+       if (err)
+               return err;
 
-err_fbdev_unregister:
-       unregister_framebuffer(fbdev->info);
-err_fbdev_info:
-       framebuffer_release(fbdev->info);
-err_fbdev_buffer:
-       free_page((unsigned long) fbdev->buffer);
+       /* Keypad */
+       if (client->irq > 0) {
+               err = ht16k33_keypad_probe(client, &priv->keypad);
+               if (err)
+                       return err;
+       }
 
+       switch (priv->type) {
+       case DISP_MATRIX:
+               /* Frame Buffer Display */
+               err = ht16k33_fbdev_probe(dev, priv, dft_brightness);
+               break;
+
+       case DISP_QUAD_7SEG:
+       case DISP_QUAD_14SEG:
+               /* Segment Display */
+               err = ht16k33_seg_probe(dev, priv, dft_brightness);
+               break;
+       }
        return err;
 }
 
@@ -506,10 +780,22 @@ static int ht16k33_remove(struct i2c_client *client)
        struct ht16k33_priv *priv = i2c_get_clientdata(client);
        struct ht16k33_fbdev *fbdev = &priv->fbdev;
 
-       cancel_delayed_work_sync(&fbdev->work);
-       unregister_framebuffer(fbdev->info);
-       framebuffer_release(fbdev->info);
-       free_page((unsigned long) fbdev->buffer);
+       cancel_delayed_work_sync(&priv->work);
+
+       switch (priv->type) {
+       case DISP_MATRIX:
+               unregister_framebuffer(fbdev->info);
+               framebuffer_release(fbdev->info);
+               free_page((unsigned long)fbdev->buffer);
+               break;
+
+       case DISP_QUAD_7SEG:
+       case DISP_QUAD_14SEG:
+               linedisp_unregister(&priv->seg.linedisp);
+               device_remove_file(&client->dev, &dev_attr_map_seg7);
+               device_remove_file(&client->dev, &dev_attr_map_seg14);
+               break;
+       }
 
        return 0;
 }
@@ -521,17 +807,26 @@ static const struct i2c_device_id ht16k33_i2c_match[] = {
 MODULE_DEVICE_TABLE(i2c, ht16k33_i2c_match);
 
 static const struct of_device_id ht16k33_of_match[] = {
-       { .compatible = "holtek,ht16k33", },
+       {
+               /* 0.56" 4-Digit 7-Segment FeatherWing Display (Red) */
+               .compatible = "adafruit,3108", .data = (void *)DISP_QUAD_7SEG,
+       }, {
+               /* 0.54" Quad Alphanumeric FeatherWing Display (Red) */
+               .compatible = "adafruit,3130", .data = (void *)DISP_QUAD_14SEG,
+       }, {
+               /* Generic, assumed Dot-Matrix Display */
+               .compatible = "holtek,ht16k33", .data = (void *)DISP_MATRIX,
+       },
        { }
 };
 MODULE_DEVICE_TABLE(of, ht16k33_of_match);
 
 static struct i2c_driver ht16k33_driver = {
-       .probe          = ht16k33_probe,
+       .probe_new      = ht16k33_probe,
        .remove         = ht16k33_remove,
        .driver         = {
                .name           = DRIVER_NAME,
-               .of_match_table = of_match_ptr(ht16k33_of_match),
+               .of_match_table = ht16k33_of_match,
        },
        .id_table = ht16k33_i2c_match,
 };
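
The new map_seg7/map_seg14 attributes only accept a complete conversion map: map_seg_store() rejects any write whose length differs from the active map size (128 bytes for the SEG7 table). A hypothetical userspace sketch, with an illustrative sysfs path:

        #include <fcntl.h>
        #include <unistd.h>

        /* Load a custom 7-segment font; the i2c device path is made up. */
        static int load_seg7_map(const unsigned char map[128])
        {
                int fd = open("/sys/bus/i2c/devices/1-0070/map_seg7", O_WRONLY);
                ssize_t n;

                if (fd < 0)
                        return -1;
                n = write(fd, map, 128);        /* must match map_size exactly */
                close(fd);
                return n == 128 ? 0 : -1;
        }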
index 1cce409ce5cacbc8a9a7ae271233c981b0ef2dcd..fa23e415f260ec5659be0c15f2f9bdef03903288 100644
@@ -4,7 +4,6 @@
  * Author: Paul Burton <paul.burton@mips.com>
  */
 
-#include <generated/utsrelease.h>
 #include <linux/kernel.h>
 #include <linux/io.h>
 #include <linux/mfd/syscon.h>
@@ -14,7 +13,8 @@
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
-#include <linux/sysfs.h>
+
+#include "line-display.h"
 
 struct img_ascii_lcd_ctx;
 
@@ -27,36 +27,26 @@ struct img_ascii_lcd_ctx;
 struct img_ascii_lcd_config {
        unsigned int num_chars;
        bool external_regmap;
-       void (*update)(struct img_ascii_lcd_ctx *ctx);
+       void (*update)(struct linedisp *linedisp);
 };
 
 /**
  * struct img_ascii_lcd_ctx - Private data structure
- * @pdev: the ASCII LCD platform device
  * @base: the base address of the LCD registers
  * @regmap: the regmap through which LCD registers are accessed
  * @offset: the offset within regmap to the start of the LCD registers
  * @cfg: pointer to the LCD model configuration
- * @message: the full message to display or scroll on the LCD
- * @message_len: the length of the @message string
- * @scroll_pos: index of the first character of @message currently displayed
- * @scroll_rate: scroll interval in jiffies
- * @timer: timer used to implement scrolling
+ * @linedisp: line display structure
  * @curr: the string currently displayed on the LCD
  */
 struct img_ascii_lcd_ctx {
-       struct platform_device *pdev;
        union {
                void __iomem *base;
                struct regmap *regmap;
        };
        u32 offset;
        const struct img_ascii_lcd_config *cfg;
-       char *message;
-       unsigned int message_len;
-       unsigned int scroll_pos;
-       unsigned int scroll_rate;
-       struct timer_list timer;
+       struct linedisp linedisp;
        char curr[] __aligned(8);
 };
 
@@ -64,8 +54,10 @@ struct img_ascii_lcd_ctx {
  * MIPS Boston development board
  */
 
-static void boston_update(struct img_ascii_lcd_ctx *ctx)
+static void boston_update(struct linedisp *linedisp)
 {
+       struct img_ascii_lcd_ctx *ctx =
+               container_of(linedisp, struct img_ascii_lcd_ctx, linedisp);
        ulong val;
 
 #if BITS_PER_LONG == 64
@@ -90,12 +82,14 @@ static struct img_ascii_lcd_config boston_config = {
  * MIPS Malta development board
  */
 
-static void malta_update(struct img_ascii_lcd_ctx *ctx)
+static void malta_update(struct linedisp *linedisp)
 {
+       struct img_ascii_lcd_ctx *ctx =
+               container_of(linedisp, struct img_ascii_lcd_ctx, linedisp);
        unsigned int i;
        int err = 0;
 
-       for (i = 0; i < ctx->cfg->num_chars; i++) {
+       for (i = 0; i < linedisp->num_chars; i++) {
                err = regmap_write(ctx->regmap,
                                   ctx->offset + (i * 8), ctx->curr[i]);
                if (err)
@@ -173,12 +167,14 @@ static int sead3_wait_lcd_idle(struct img_ascii_lcd_ctx *ctx)
        return 0;
 }
 
-static void sead3_update(struct img_ascii_lcd_ctx *ctx)
+static void sead3_update(struct linedisp *linedisp)
 {
+       struct img_ascii_lcd_ctx *ctx =
+               container_of(linedisp, struct img_ascii_lcd_ctx, linedisp);
        unsigned int i;
        int err = 0;
 
-       for (i = 0; i < ctx->cfg->num_chars; i++) {
+       for (i = 0; i < linedisp->num_chars; i++) {
                err = sead3_wait_lcd_idle(ctx);
                if (err)
                        break;
@@ -218,130 +214,6 @@ static const struct of_device_id img_ascii_lcd_matches[] = {
 };
 MODULE_DEVICE_TABLE(of, img_ascii_lcd_matches);
 
-/**
- * img_ascii_lcd_scroll() - scroll the display by a character
- * @t: really a pointer to the private data structure
- *
- * Scroll the current message along the LCD by one character, rearming the
- * timer if required.
- */
-static void img_ascii_lcd_scroll(struct timer_list *t)
-{
-       struct img_ascii_lcd_ctx *ctx = from_timer(ctx, t, timer);
-       unsigned int i, ch = ctx->scroll_pos;
-       unsigned int num_chars = ctx->cfg->num_chars;
-
-       /* update the current message string */
-       for (i = 0; i < num_chars;) {
-               /* copy as many characters from the string as possible */
-               for (; i < num_chars && ch < ctx->message_len; i++, ch++)
-                       ctx->curr[i] = ctx->message[ch];
-
-               /* wrap around to the start of the string */
-               ch = 0;
-       }
-
-       /* update the LCD */
-       ctx->cfg->update(ctx);
-
-       /* move on to the next character */
-       ctx->scroll_pos++;
-       ctx->scroll_pos %= ctx->message_len;
-
-       /* rearm the timer */
-       if (ctx->message_len > ctx->cfg->num_chars)
-               mod_timer(&ctx->timer, jiffies + ctx->scroll_rate);
-}
-
-/**
- * img_ascii_lcd_display() - set the message to be displayed
- * @ctx: pointer to the private data structure
- * @msg: the message to display
- * @count: length of msg, or -1
- *
- * Display a new message @msg on the LCD. @msg can be longer than the number of
- * characters the LCD can display, in which case it will begin scrolling across
- * the LCD display.
- *
- * Return: 0 on success, -ENOMEM on memory allocation failure
- */
-static int img_ascii_lcd_display(struct img_ascii_lcd_ctx *ctx,
-                            const char *msg, ssize_t count)
-{
-       char *new_msg;
-
-       /* stop the scroll timer */
-       del_timer_sync(&ctx->timer);
-
-       if (count == -1)
-               count = strlen(msg);
-
-       /* if the string ends with a newline, trim it */
-       if (msg[count - 1] == '\n')
-               count--;
-
-       new_msg = devm_kmalloc(&ctx->pdev->dev, count + 1, GFP_KERNEL);
-       if (!new_msg)
-               return -ENOMEM;
-
-       memcpy(new_msg, msg, count);
-       new_msg[count] = 0;
-
-       if (ctx->message)
-               devm_kfree(&ctx->pdev->dev, ctx->message);
-
-       ctx->message = new_msg;
-       ctx->message_len = count;
-       ctx->scroll_pos = 0;
-
-       /* update the LCD */
-       img_ascii_lcd_scroll(&ctx->timer);
-
-       return 0;
-}
-
-/**
- * message_show() - read message via sysfs
- * @dev: the LCD device
- * @attr: the LCD message attribute
- * @buf: the buffer to read the message into
- *
- * Read the current message being displayed or scrolled across the LCD display
- * into @buf, for reads from sysfs.
- *
- * Return: the number of characters written to @buf
- */
-static ssize_t message_show(struct device *dev, struct device_attribute *attr,
-                           char *buf)
-{
-       struct img_ascii_lcd_ctx *ctx = dev_get_drvdata(dev);
-
-       return sprintf(buf, "%s\n", ctx->message);
-}
-
-/**
- * message_store() - write a new message via sysfs
- * @dev: the LCD device
- * @attr: the LCD message attribute
- * @buf: the buffer containing the new message
- * @count: the size of the message in @buf
- *
- * Write a new message to display or scroll across the LCD display from sysfs.
- *
- * Return: the size of the message on success, else -ERRNO
- */
-static ssize_t message_store(struct device *dev, struct device_attribute *attr,
-                            const char *buf, size_t count)
-{
-       struct img_ascii_lcd_ctx *ctx = dev_get_drvdata(dev);
-       int err;
-
-       err = img_ascii_lcd_display(ctx, buf, count);
-       return err ?: count;
-}
-
-static DEVICE_ATTR_RW(message);
-
 /**
  * img_ascii_lcd_probe() - probe an LCD display device
  * @pdev: the LCD platform device
@@ -355,26 +227,25 @@ static int img_ascii_lcd_probe(struct platform_device *pdev)
 {
        const struct of_device_id *match;
        const struct img_ascii_lcd_config *cfg;
+       struct device *dev = &pdev->dev;
        struct img_ascii_lcd_ctx *ctx;
        int err;
 
-       match = of_match_device(img_ascii_lcd_matches, &pdev->dev);
+       match = of_match_device(img_ascii_lcd_matches, dev);
        if (!match)
                return -ENODEV;
 
        cfg = match->data;
-       ctx = devm_kzalloc(&pdev->dev, sizeof(*ctx) + cfg->num_chars,
-                          GFP_KERNEL);
+       ctx = devm_kzalloc(dev, sizeof(*ctx) + cfg->num_chars, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;
 
        if (cfg->external_regmap) {
-               ctx->regmap = syscon_node_to_regmap(pdev->dev.parent->of_node);
+               ctx->regmap = syscon_node_to_regmap(dev->parent->of_node);
                if (IS_ERR(ctx->regmap))
                        return PTR_ERR(ctx->regmap);
 
-               if (of_property_read_u32(pdev->dev.of_node, "offset",
-                                        &ctx->offset))
+               if (of_property_read_u32(dev->of_node, "offset", &ctx->offset))
                        return -EINVAL;
        } else {
                ctx->base = devm_platform_ioremap_resource(pdev, 0);
@@ -382,29 +253,23 @@ static int img_ascii_lcd_probe(struct platform_device *pdev)
                        return PTR_ERR(ctx->base);
        }
 
-       ctx->pdev = pdev;
-       ctx->cfg = cfg;
-       ctx->message = NULL;
-       ctx->scroll_pos = 0;
-       ctx->scroll_rate = HZ / 2;
-
-       /* initialise a timer for scrolling the message */
-       timer_setup(&ctx->timer, img_ascii_lcd_scroll, 0);
-
-       platform_set_drvdata(pdev, ctx);
-
-       /* display a default message */
-       err = img_ascii_lcd_display(ctx, "Linux " UTS_RELEASE "       ", -1);
+       err = linedisp_register(&ctx->linedisp, dev, cfg->num_chars, ctx->curr,
+                               cfg->update);
        if (err)
-               goto out_del_timer;
+               return err;
 
-       err = device_create_file(&pdev->dev, &dev_attr_message);
+       /* for backwards compatibility */
+       err = compat_only_sysfs_link_entry_to_kobj(&dev->kobj,
+                                                  &ctx->linedisp.dev.kobj,
+                                                  "message", NULL);
        if (err)
-               goto out_del_timer;
+               goto err_unregister;
 
+       platform_set_drvdata(pdev, ctx);
        return 0;
-out_del_timer:
-       del_timer_sync(&ctx->timer);
+
+err_unregister:
+       linedisp_unregister(&ctx->linedisp);
        return err;
 }
 
@@ -421,8 +286,8 @@ static int img_ascii_lcd_remove(struct platform_device *pdev)
 {
        struct img_ascii_lcd_ctx *ctx = platform_get_drvdata(pdev);
 
-       device_remove_file(&pdev->dev, &dev_attr_message);
-       del_timer_sync(&ctx->timer);
+       sysfs_remove_link(&pdev->dev.kobj, "message");
+       linedisp_unregister(&ctx->linedisp);
        return 0;
 }
 
index e871b94a1911a74b78a332437cb9dc3c3e2ee9d5..234f9dbe6e300542dc4cc1b3f1d62bb01fbf9cbd 100644 (file)
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
-#include <linux/fs.h>
-#include <linux/io.h>
 #include <linux/parport.h>
-#include <linux/uaccess.h>
 #include <linux/ks0108.h>
 
 #define KS0108_NAME "ks0108"
diff --git a/drivers/auxdisplay/line-display.c b/drivers/auxdisplay/line-display.c
new file mode 100644 (file)
index 0000000..03e7f10
--- /dev/null
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Character line display core support
+ *
+ * Copyright (C) 2016 Imagination Technologies
+ * Author: Paul Burton <paul.burton@mips.com>
+ *
+ * Copyright (C) 2021 Glider bv
+ */
+
+#include <generated/utsrelease.h>
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/timer.h>
+
+#include "line-display.h"
+
+#define DEFAULT_SCROLL_RATE    (HZ / 2)
+
+/**
+ * linedisp_scroll() - scroll the display by a character
+ * @t: really a pointer to the private data structure
+ *
+ * Scroll the current message along the display by one character, rearming the
+ * timer if required.
+ */
+static void linedisp_scroll(struct timer_list *t)
+{
+       struct linedisp *linedisp = from_timer(linedisp, t, timer);
+       unsigned int i, ch = linedisp->scroll_pos;
+       unsigned int num_chars = linedisp->num_chars;
+
+       /* update the current message string */
+       for (i = 0; i < num_chars;) {
+               /* copy as many characters from the string as possible */
+               for (; i < num_chars && ch < linedisp->message_len; i++, ch++)
+                       linedisp->buf[i] = linedisp->message[ch];
+
+               /* wrap around to the start of the string */
+               ch = 0;
+       }
+
+       /* update the display */
+       linedisp->update(linedisp);
+
+       /* move on to the next character */
+       linedisp->scroll_pos++;
+       linedisp->scroll_pos %= linedisp->message_len;
+
+       /* rearm the timer */
+       if (linedisp->message_len > num_chars && linedisp->scroll_rate)
+               mod_timer(&linedisp->timer, jiffies + linedisp->scroll_rate);
+}
+
+/**
+ * linedisp_display() - set the message to be displayed
+ * @linedisp: pointer to the private data structure
+ * @msg: the message to display
+ * @count: length of msg, or -1
+ *
+ * Display a new message @msg on the display. @msg can be longer than the
+ * number of characters the display can display, in which case it will begin
+ * scrolling across the display.
+ *
+ * Return: 0 on success, -ENOMEM on memory allocation failure
+ */
+static int linedisp_display(struct linedisp *linedisp, const char *msg,
+                           ssize_t count)
+{
+       char *new_msg;
+
+       /* stop the scroll timer */
+       del_timer_sync(&linedisp->timer);
+
+       if (count == -1)
+               count = strlen(msg);
+
+       /* if the string ends with a newline, trim it */
+       if (msg[count - 1] == '\n')
+               count--;
+
+       if (!count) {
+               /* Clear the display */
+               kfree(linedisp->message);
+               linedisp->message = NULL;
+               linedisp->message_len = 0;
+               memset(linedisp->buf, ' ', linedisp->num_chars);
+               linedisp->update(linedisp);
+               return 0;
+       }
+
+       new_msg = kmemdup_nul(msg, count, GFP_KERNEL);
+       if (!new_msg)
+               return -ENOMEM;
+
+       kfree(linedisp->message);
+
+       linedisp->message = new_msg;
+       linedisp->message_len = count;
+       linedisp->scroll_pos = 0;
+
+       /* update the display */
+       linedisp_scroll(&linedisp->timer);
+
+       return 0;
+}
+
+/**
+ * message_show() - read message via sysfs
+ * @dev: the display device
+ * @attr: the display message attribute
+ * @buf: the buffer to read the message into
+ *
+ * Read the current message being displayed or scrolled across the display into
+ * @buf, for reads from sysfs.
+ *
+ * Return: the number of characters written to @buf
+ */
+static ssize_t message_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct linedisp *linedisp = container_of(dev, struct linedisp, dev);
+
+       return sysfs_emit(buf, "%s\n", linedisp->message);
+}
+
+/**
+ * message_store() - write a new message via sysfs
+ * @dev: the display device
+ * @attr: the display message attribute
+ * @buf: the buffer containing the new message
+ * @count: the size of the message in @buf
+ *
+ * Write a new message to display or scroll across the display from sysfs.
+ *
+ * Return: the size of the message on success, else -ERRNO
+ */
+static ssize_t message_store(struct device *dev, struct device_attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct linedisp *linedisp = container_of(dev, struct linedisp, dev);
+       int err;
+
+       err = linedisp_display(linedisp, buf, count);
+       return err ?: count;
+}
+
+static DEVICE_ATTR_RW(message);
+
+static ssize_t scroll_step_ms_show(struct device *dev,
+                                  struct device_attribute *attr, char *buf)
+{
+       struct linedisp *linedisp = container_of(dev, struct linedisp, dev);
+
+       return sysfs_emit(buf, "%u\n", jiffies_to_msecs(linedisp->scroll_rate));
+}
+
+static ssize_t scroll_step_ms_store(struct device *dev,
+                                   struct device_attribute *attr,
+                                   const char *buf, size_t count)
+{
+       struct linedisp *linedisp = container_of(dev, struct linedisp, dev);
+       unsigned int ms;
+
+       if (kstrtouint(buf, 10, &ms) != 0)
+               return -EINVAL;
+
+       linedisp->scroll_rate = msecs_to_jiffies(ms);
+       if (linedisp->message && linedisp->message_len > linedisp->num_chars) {
+               del_timer_sync(&linedisp->timer);
+               if (linedisp->scroll_rate)
+                       linedisp_scroll(&linedisp->timer);
+       }
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(scroll_step_ms);
+
+static struct attribute *linedisp_attrs[] = {
+       &dev_attr_message.attr,
+       &dev_attr_scroll_step_ms.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(linedisp);
+
+static const struct device_type linedisp_type = {
+       .groups = linedisp_groups,
+};
+
+/**
+ * linedisp_register - register a character line display
+ * @linedisp: pointer to character line display structure
+ * @parent: parent device
+ * @num_chars: the number of characters that can be displayed
+ * @buf: pointer to a buffer that can hold @num_chars characters
+ * @update: Function called to update the display.  This must not sleep!
+ *
+ * Return: zero on success, else a negative error code.
+ */
+int linedisp_register(struct linedisp *linedisp, struct device *parent,
+                     unsigned int num_chars, char *buf,
+                     void (*update)(struct linedisp *linedisp))
+{
+       static atomic_t linedisp_id = ATOMIC_INIT(-1);
+       int err;
+
+       memset(linedisp, 0, sizeof(*linedisp));
+       linedisp->dev.parent = parent;
+       linedisp->dev.type = &linedisp_type;
+       linedisp->update = update;
+       linedisp->buf = buf;
+       linedisp->num_chars = num_chars;
+       linedisp->scroll_rate = DEFAULT_SCROLL_RATE;
+
+       device_initialize(&linedisp->dev);
+       dev_set_name(&linedisp->dev, "linedisp.%lu",
+                    (unsigned long)atomic_inc_return(&linedisp_id));
+
+       /* initialise a timer for scrolling the message */
+       timer_setup(&linedisp->timer, linedisp_scroll, 0);
+
+       err = device_add(&linedisp->dev);
+       if (err)
+               goto out_del_timer;
+
+       /* display a default message */
+       err = linedisp_display(linedisp, "Linux " UTS_RELEASE "       ", -1);
+       if (err)
+               goto out_del_dev;
+
+       return 0;
+
+out_del_dev:
+       device_del(&linedisp->dev);
+out_del_timer:
+       del_timer_sync(&linedisp->timer);
+       put_device(&linedisp->dev);
+       return err;
+}
+EXPORT_SYMBOL_GPL(linedisp_register);
+
+/**
+ * linedisp_unregister - unregister a character line display
+ * @linedisp: pointer to character line display structure registered previously
+ *           with linedisp_register()
+ */
+void linedisp_unregister(struct linedisp *linedisp)
+{
+       device_del(&linedisp->dev);
+       del_timer_sync(&linedisp->timer);
+       kfree(linedisp->message);
+       put_device(&linedisp->dev);
+}
+EXPORT_SYMBOL_GPL(linedisp_unregister);
+
+MODULE_LICENSE("GPL");
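
The fill loop in linedisp_scroll() above is easiest to see with concrete values: the visible window of num_chars characters is filled starting at scroll_pos, wrapping back to the start of the message until the window is full. A minimal stand-alone user-space sketch (not kernel code; names are illustrative) that simulates one full scroll cycle:

/*
 * User-space simulation of the window fill in linedisp_scroll():
 * starting at scroll_pos, copy message characters into a
 * num_chars-wide window, wrapping back to the start of the message
 * until the window is full. msg_len must be non-zero, as in the
 * kernel code, where an empty message clears the display instead.
 */
#include <stdio.h>
#include <string.h>

static void fill_window(char *buf, unsigned int num_chars,
                        const char *msg, unsigned int msg_len,
                        unsigned int scroll_pos)
{
        unsigned int i = 0, ch = scroll_pos;

        while (i < num_chars) {
                /* copy as many characters from the message as possible */
                for (; i < num_chars && ch < msg_len; i++, ch++)
                        buf[i] = msg[ch];
                /* wrap around to the start of the message */
                ch = 0;
        }
}

int main(void)
{
        const char *msg = "HELLO ";
        char buf[5] = { 0 };    /* 4-character window plus terminator */
        unsigned int pos;

        for (pos = 0; pos < strlen(msg); pos++) {
                fill_window(buf, sizeof(buf) - 1, msg, strlen(msg), pos);
                printf("pos %u: [%s]\n", pos, buf);
        }
        return 0;
}

For a 4-character window over "HELLO " this prints [HELL], [ELLO], [LLO ], [LO H], [O HE] and [ HEL] before the cycle repeats.
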
diff --git a/drivers/auxdisplay/line-display.h b/drivers/auxdisplay/line-display.h
new file mode 100644 (file)
index 0000000..0f5891d
--- /dev/null
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Character line display core support
+ *
+ * Copyright (C) 2016 Imagination Technologies
+ * Author: Paul Burton <paul.burton@mips.com>
+ *
+ * Copyright (C) 2021 Glider bv
+ */
+
+#ifndef _LINEDISP_H
+#define _LINEDISP_H
+
+/**
+ * struct linedisp - character line display private data structure
+ * @dev: the line display device
+ * @timer: timer used to implement scrolling
+ * @update: function called to update the display
+ * @buf: pointer to the buffer for the string currently displayed
+ * @message: the full message to display or scroll on the display
+ * @num_chars: the number of characters that can be displayed
+ * @message_len: the length of the @message string
+ * @scroll_pos: index of the first character of @message currently displayed
+ * @scroll_rate: scroll interval in jiffies
+ */
+struct linedisp {
+       struct device dev;
+       struct timer_list timer;
+       void (*update)(struct linedisp *linedisp);
+       char *buf;
+       char *message;
+       unsigned int num_chars;
+       unsigned int message_len;
+       unsigned int scroll_pos;
+       unsigned int scroll_rate;
+};
+
+int linedisp_register(struct linedisp *linedisp, struct device *parent,
+                     unsigned int num_chars, char *buf,
+                     void (*update)(struct linedisp *linedisp));
+void linedisp_unregister(struct linedisp *linedisp);
+
+#endif /* _LINEDISP_H */
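
For reference, a hypothetical consumer of this API might look as follows; the foo_* names and the foo_write_char() hardware accessor are illustrative assumptions, not part of this series. The driver embeds a struct linedisp plus a character buffer (after including line-display.h) and supplies a non-sleeping update() callback:

/*
 * Hypothetical consumer sketch; foo_* names and foo_write_char()
 * are illustrative assumptions.
 */
struct foo_lcd {
        struct linedisp linedisp;
        char curr[16];                  /* one slot per display character */
};

static void foo_lcd_update(struct linedisp *linedisp)
{
        struct foo_lcd *foo = container_of(linedisp, struct foo_lcd,
                                           linedisp);
        unsigned int i;

        /* push the current window to the hardware; must not sleep */
        for (i = 0; i < linedisp->num_chars; i++)
                foo_write_char(foo, i, foo->curr[i]);
}

static int foo_lcd_probe(struct platform_device *pdev)
{
        struct foo_lcd *foo;

        foo = devm_kzalloc(&pdev->dev, sizeof(*foo), GFP_KERNEL);
        if (!foo)
                return -ENOMEM;

        platform_set_drvdata(pdev, foo);
        return linedisp_register(&foo->linedisp, &pdev->dev,
                                 sizeof(foo->curr), foo->curr,
                                 foo_lcd_update);
}

Once registered, the core exposes the message and scroll_step_ms attributes under the new linedisp.N device; the img-ascii-lcd conversion above adds a sysfs link so the old per-platform-device "message" path keeps working.
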
index ef8e44a7d2881c6b9937662af30c07b8e29bccfa..02f7f1358e865a2b8d8b6003c9f3edd92baf472c 100644 (file)
@@ -13,7 +13,7 @@ obj-y                 += power/
 obj-$(CONFIG_ISA_BUS_API)      += isa.o
 obj-y                          += firmware_loader/
 obj-$(CONFIG_NUMA)     += node.o
-obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 ifeq ($(CONFIG_SYSFS),y)
 obj-$(CONFIG_MODULES)  += module.o
 endif
index 00fb4120a5b3a8166882216b91316d86ab0c9950..bc1876915457d47e2ea3e7c26729964fc4a97ef4 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/of.h>
 
 #include <asm/sections.h>
+#include <asm/pgalloc.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -165,25 +166,86 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
 {
-       memblock_free_early(__pa(ptr), size);
+       memblock_free(ptr, size);
 }
 
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+static void __init pcpu_populate_pte(unsigned long addr)
+{
+       pgd_t *pgd = pgd_offset_k(addr);
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       p4d = p4d_offset(pgd, addr);
+       if (p4d_none(*p4d)) {
+               pud_t *new;
+
+               new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+               if (!new)
+                       goto err_alloc;
+               p4d_populate(&init_mm, p4d, new);
+       }
+
+       pud = pud_offset(p4d, addr);
+       if (pud_none(*pud)) {
+               pmd_t *new;
+
+               new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+               if (!new)
+                       goto err_alloc;
+               pud_populate(&init_mm, pud, new);
+       }
+
+       pmd = pmd_offset(pud, addr);
+       if (!pmd_present(*pmd)) {
+               pte_t *new;
+
+               new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+               if (!new)
+                       goto err_alloc;
+               pmd_populate_kernel(&init_mm, pmd, new);
+       }
+
+       return;
+
+err_alloc:
+       panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
+             __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
+}
+#endif
+
 void __init setup_per_cpu_areas(void)
 {
        unsigned long delta;
        unsigned int cpu;
-       int rc;
+       int rc = -EINVAL;
+
+       if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+               /*
+                * Always reserve area for module percpu variables.  That's
+                * what the legacy allocator did.
+                */
+               rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
+                                           PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
+                                           pcpu_cpu_distance,
+                                           pcpu_fc_alloc, pcpu_fc_free);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+               if (rc < 0)
+                       pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
+                                  pcpu_fc_names[pcpu_chosen_fc], rc);
+#endif
+       }
 
-       /*
-        * Always reserve area for module percpu variables.  That's
-        * what the legacy allocator did.
-        */
-       rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
-                                   PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
-                                   pcpu_cpu_distance,
-                                   pcpu_fc_alloc, pcpu_fc_free);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+       if (rc < 0)
+               rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
+                                          pcpu_fc_alloc,
+                                          pcpu_fc_free,
+                                          pcpu_populate_pte);
+#endif
        if (rc < 0)
-               panic("Failed to initialize percpu areas.");
+               panic("Failed to initialize percpu areas (err=%d).", rc);
 
        delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
        for_each_possible_cpu(cpu)
@@ -264,7 +326,7 @@ void __init numa_free_distance(void)
        size = numa_distance_cnt * numa_distance_cnt *
                sizeof(numa_distance[0]);
 
-       memblock_free_ptr(numa_distance, size);
+       memblock_free(numa_distance, size);
        numa_distance_cnt = 0;
        numa_distance = NULL;
 }
@@ -275,15 +337,13 @@ void __init numa_free_distance(void)
 static int __init numa_alloc_distance(void)
 {
        size_t size;
-       u64 phys;
        int i, j;
 
        size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
-       phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn));
-       if (WARN_ON(!phys))
+       numa_distance = memblock_alloc(size, PAGE_SIZE);
+       if (WARN_ON(!numa_distance))
                return -ENOMEM;
 
-       numa_distance = __va(phys);
        numa_distance_cnt = nr_node_ids;
 
        /* fill with the default distances */
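
The memblock changes in this file follow the interface split visible throughout this merge (assuming the v5.16 memblock API): memblock_free() now takes a virtual pointer, mirroring memblock_alloc(), while memblock_phys_free() takes a physical address, mirroring memblock_phys_alloc(). A minimal sketch of the pairing:

/* Sketch of the alloc/free pairing under the renamed v5.16 API. */
static void __init memblock_pairing_demo(size_t size)
{
        void *virt = memblock_alloc(size, PAGE_SIZE);
        phys_addr_t phys = memblock_phys_alloc(size, PAGE_SIZE);

        if (virt)
                memblock_free(virt, size);      /* takes a virtual pointer */
        if (phys)
                memblock_phys_free(phys, size); /* takes a physical address */
}
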
index c56d34f8158f7c21473535c3decac896878f8f12..b5a4ba18f9f9071d8f24a210af6d5f33450ad758 100644 (file)
@@ -629,7 +629,7 @@ static void node_device_release(struct device *dev)
 {
        struct node *node = to_node(dev);
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
        /*
         * We schedule the work only when a memory section is
         * onlined/offlined on this node. When we come here,
@@ -782,7 +782,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
        return 0;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 static int __ref get_nid_for_pfn(unsigned long pfn)
 {
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -958,10 +958,9 @@ static int node_memory_callback(struct notifier_block *self,
        return NOTIFY_OK;
 }
 #endif /* CONFIG_HUGETLBFS */
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
-#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
-    !defined(CONFIG_HUGETLBFS)
+#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS)
 static inline int node_memory_callback(struct notifier_block *self,
                                unsigned long action, void *arg)
 {
index 69c10a7b7c617b0155637c42e53821693445b826..960632197b054c9bb6a16e3ef9ce779b2388ee1f 100644 (file)
@@ -162,7 +162,6 @@ static int bcma_host_pci_probe(struct pci_dev *dev,
 {
        struct bcma_bus *bus;
        int err = -ENOMEM;
-       const char *name;
        u32 val;
 
        /* Alloc */
@@ -175,10 +174,7 @@ static int bcma_host_pci_probe(struct pci_dev *dev,
        if (err)
                goto err_kfree_bus;
 
-       name = dev_name(&dev->dev);
-       if (dev->driver && dev->driver->name)
-               name = dev->driver->name;
-       err = pci_request_regions(dev, name);
+       err = pci_request_regions(dev, "bcma-pci-bridge");
        if (err)
                goto err_pci_disable;
        pci_set_master(dev);
index a68297fb51a2f4febbadabe3a1802a8dc5559253..c8931ba91b5742d3f1e1f57e82d4e73c4af3395f 100644 (file)
@@ -291,22 +291,16 @@ static ssize_t mem_used_max_store(struct device *dev,
        return len;
 }
 
-static ssize_t idle_store(struct device *dev,
-               struct device_attribute *attr, const char *buf, size_t len)
+/*
+ * Mark all pages which are older than or equal to cutoff as IDLE.
+ * Callers should hold the zram init lock in read mode.
+ */
+static void mark_idle(struct zram *zram, ktime_t cutoff)
 {
-       struct zram *zram = dev_to_zram(dev);
+       int is_idle = 1;
        unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
        int index;
 
-       if (!sysfs_streq(buf, "all"))
-               return -EINVAL;
-
-       down_read(&zram->init_lock);
-       if (!init_done(zram)) {
-               up_read(&zram->init_lock);
-               return -EINVAL;
-       }
-
        for (index = 0; index < nr_pages; index++) {
                /*
                 * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
@@ -314,14 +308,50 @@ static ssize_t idle_store(struct device *dev,
                 */
                zram_slot_lock(zram, index);
                if (zram_allocated(zram, index) &&
-                               !zram_test_flag(zram, index, ZRAM_UNDER_WB))
-                       zram_set_flag(zram, index, ZRAM_IDLE);
+                               !zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+                       is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
+#endif
+                       if (is_idle)
+                               zram_set_flag(zram, index, ZRAM_IDLE);
+               }
                zram_slot_unlock(zram, index);
        }
+}
 
-       up_read(&zram->init_lock);
+static ssize_t idle_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t len)
+{
+       struct zram *zram = dev_to_zram(dev);
+       ktime_t cutoff_time = 0;
+       ssize_t rv = -EINVAL;
 
-       return len;
+       if (!sysfs_streq(buf, "all")) {
+               /*
+                * If it did not parse as 'all', try to treat it as an
+                * integer when memory tracking is enabled.
+                */
+               u64 age_sec;
+
+               if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
+                       cutoff_time = ktime_sub(ktime_get_boottime(),
+                                       ns_to_ktime(age_sec * NSEC_PER_SEC));
+               else
+                       goto out;
+       }
+
+       down_read(&zram->init_lock);
+       if (!init_done(zram))
+               goto out_unlock;
+
+       /* A cutoff_time of 0 marks everything as idle; this matches the "all" behavior */
+       mark_idle(zram, cutoff_time);
+       rv = len;
+
+out_unlock:
+       up_read(&zram->init_lock);
+out:
+       return rv;
 }
 
 #ifdef CONFIG_ZRAM_WRITEBACK
@@ -587,7 +617,7 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
 {
        struct bio *bio;
 
-       bio = bio_alloc(GFP_ATOMIC, 1);
+       bio = bio_alloc(GFP_NOIO, 1);
        if (!bio)
                return -ENOMEM;
 
@@ -910,7 +940,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
                        zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
                        zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
 
-               if (count < copied) {
+               if (count <= copied) {
                        zram_slot_unlock(zram, index);
                        break;
                }
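
The cutoff arithmetic above is worth spelling out: writing a number of seconds N to the idle attribute (with CONFIG_ZRAM_MEMORY_TRACKING enabled, so per-slot ac_time exists) computes a boottime cutoff, and mark_idle() flags only slots last accessed before it; writing "all" leaves the cutoff at 0 and marks everything. A sketch of the helper this boils down to:

/* Sketch of the age-to-cutoff conversion done in idle_store() above. */
static ktime_t idle_cutoff_from_secs(u64 age_sec)
{
        /* "now" on the boottime clock, minus the requested age */
        return ktime_sub(ktime_get_boottime(),
                         ns_to_ktime(age_sec * NSEC_PER_SEC));
}

So, for example, writing 3600 to the attribute marks only pages untouched for at least an hour.
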
index fed52ae516ba837a53edf9d2b53fabc67cb5d8d7..52d6cca6262e26c42e346d90749fdde69e29993a 100644 (file)
@@ -3118,7 +3118,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
        };
        int ret;
 
-       ret = strscpy(interface.name, pdev->driver->name,
+       ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
                      sizeof(interface.name));
        if (ret < 0)
                return -ENAMETOOLONG;
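
This hunk, like several network-driver hunks further down, replaces a dereference of pdev->driver->name with the generic helper: dev_driver_string() returns the bound driver's name and falls back to the bus or class name (never NULL), avoiding the pdev->driver pointer this series is working to retire. A one-line sketch:

/* Sketch: driver name without touching pdev->driver. */
static const char *drv_name(struct pci_dev *pdev)
{
        /* driver name if bound, else bus/class name, else "" */
        return dev_driver_string(&pdev->dev);
}
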
index 359fb7989dfbfd4fd25e47892a695e5442abf302..71ef065914b22d319779609e71a04a2c04202a77 100644 (file)
@@ -247,11 +247,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        pci_set_master(pdev);
 
-       if (adf_enable_aer(accel_dev)) {
-               dev_err(&pdev->dev, "Failed to enable aer.\n");
-               ret = -EFAULT;
-               goto out_err;
-       }
+       adf_enable_aer(accel_dev);
 
        if (pci_save_state(pdev)) {
                dev_err(&pdev->dev, "Failed to save pci state.\n");
@@ -304,6 +300,7 @@ static struct pci_driver adf_driver = {
        .probe = adf_probe,
        .remove = adf_remove,
        .sriov_configure = adf_sriov_configure,
+       .err_handler = &adf_err_handler,
 };
 
 module_pci_driver(adf_driver);
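
Across the QAT drivers the error handlers are now wired up statically in struct pci_driver instead of being patched into pdev->driver->err_handler from adf_enable_aer() at probe time, removing another write through pdev->driver. A generic sketch (example_* names are illustrative):

/* Sketch: error handlers declared statically at registration time. */
static struct pci_driver example_driver = {
        .name           = "example",            /* illustrative */
        .id_table       = example_ids,          /* illustrative */
        .probe          = example_probe,
        .remove         = example_remove,
        .err_handler    = &adf_err_handler,     /* const, set at build time */
};
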
index cc6e75dc60de94a033aa6a3a42bd1e5e30f38823..2aef0bb791dfa3f39867662c55210bd47c96ec6e 100644 (file)
@@ -33,6 +33,7 @@ static struct pci_driver adf_driver = {
        .probe = adf_probe,
        .remove = adf_remove,
        .sriov_configure = adf_sriov_configure,
+       .err_handler = &adf_err_handler,
 };
 
 static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev)
@@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        }
        pci_set_master(pdev);
 
-       if (adf_enable_aer(accel_dev)) {
-               dev_err(&pdev->dev, "Failed to enable aer\n");
-               ret = -EFAULT;
-               goto out_err_free_reg;
-       }
+       adf_enable_aer(accel_dev);
 
        if (pci_save_state(pdev)) {
                dev_err(&pdev->dev, "Failed to save pci state\n");
index bf251dfe74b36603e6802a84f330184d86e6b94a..56163083f16163c878f02cf1859982492426a325 100644 (file)
@@ -33,6 +33,7 @@ static struct pci_driver adf_driver = {
        .probe = adf_probe,
        .remove = adf_remove,
        .sriov_configure = adf_sriov_configure,
+       .err_handler = &adf_err_handler,
 };
 
 static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev)
@@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        }
        pci_set_master(pdev);
 
-       if (adf_enable_aer(accel_dev)) {
-               dev_err(&pdev->dev, "Failed to enable aer\n");
-               ret = -EFAULT;
-               goto out_err_free_reg;
-       }
+       adf_enable_aer(accel_dev);
 
        if (pci_save_state(pdev)) {
                dev_err(&pdev->dev, "Failed to save pci state\n");
index ed3e40bc56eb2b4f9db0d6df77a641ad511b702a..fe9bb2f3536a978439aa55700f649db8b43f3805 100644 (file)
@@ -166,11 +166,12 @@ static void adf_resume(struct pci_dev *pdev)
        dev_info(&pdev->dev, "Device is up and running\n");
 }
 
-static const struct pci_error_handlers adf_err_handler = {
+const struct pci_error_handlers adf_err_handler = {
        .error_detected = adf_error_detected,
        .slot_reset = adf_slot_reset,
        .resume = adf_resume,
 };
+EXPORT_SYMBOL_GPL(adf_err_handler);
 
 /**
  * adf_enable_aer() - Enable Advance Error Reporting for acceleration device
@@ -179,17 +180,12 @@ static const struct pci_error_handlers adf_err_handler = {
  * Function enables PCI Advance Error Reporting for the
  * QAT acceleration device accel_dev.
  * To be used by QAT device specific drivers.
- *
- * Return: 0 on success, error code otherwise.
  */
-int adf_enable_aer(struct adf_accel_dev *accel_dev)
+void adf_enable_aer(struct adf_accel_dev *accel_dev)
 {
        struct pci_dev *pdev = accel_to_pci_dev(accel_dev);
-       struct pci_driver *pdrv = pdev->driver;
 
-       pdrv->err_handler = &adf_err_handler;
        pci_enable_pcie_error_reporting(pdev);
-       return 0;
 }
 EXPORT_SYMBOL_GPL(adf_enable_aer);
 
index 2cc6622833c449471af0e3237a81a364f174efe4..de94b76a6d2ce4f686d50a534c353b9360af37bd 100644 (file)
@@ -94,7 +94,8 @@ void adf_ae_fw_release(struct adf_accel_dev *accel_dev);
 int adf_ae_start(struct adf_accel_dev *accel_dev);
 int adf_ae_stop(struct adf_accel_dev *accel_dev);
 
-int adf_enable_aer(struct adf_accel_dev *accel_dev);
+extern const struct pci_error_handlers adf_err_handler;
+void adf_enable_aer(struct adf_accel_dev *accel_dev);
 void adf_disable_aer(struct adf_accel_dev *accel_dev);
 void adf_reset_sbr(struct adf_accel_dev *accel_dev);
 void adf_reset_flr(struct adf_accel_dev *accel_dev);
index 3976a81bd99b8b34fd2df8dc0a211063cdeaba1b..acca56752aa02dec61fcc901ace74edecdf63c32 100644 (file)
@@ -33,6 +33,7 @@ static struct pci_driver adf_driver = {
        .probe = adf_probe,
        .remove = adf_remove,
        .sriov_configure = adf_sriov_configure,
+       .err_handler = &adf_err_handler,
 };
 
 static void adf_cleanup_pci_dev(struct adf_accel_dev *accel_dev)
@@ -192,11 +193,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        }
        pci_set_master(pdev);
 
-       if (adf_enable_aer(accel_dev)) {
-               dev_err(&pdev->dev, "Failed to enable aer\n");
-               ret = -EFAULT;
-               goto out_err_free_reg;
-       }
+       adf_enable_aer(accel_dev);
 
        if (pci_save_state(pdev)) {
                dev_err(&pdev->dev, "Failed to save pci state\n");
index 2ff1883dc788d2954761e894e6edfd267a576535..4df55a55da841117568a05783921b4ed5a4b64e1 100644 (file)
@@ -35,7 +35,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
                if (slab_is_available())
                        memblock_free_late(phys, size);
                else
-                       memblock_free(phys, size);
+                       memblock_phys_free(phys, size);
        } else if (flags & EFI_MEMMAP_SLAB) {
                struct page *p = pfn_to_page(PHYS_PFN(phys));
                unsigned int order = get_order(size);
index e50243580269a045b0a76cb0ba0dfca68667e18e..49b13cc01073a51e6d7aa1a77859899237634ec8 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/device.h>
 #include <linux/errno.h>
+#include <linux/slab.h>
 #include <linux/fsi-occ.h>
 #include <linux/mm.h>
 #include <linux/module.h>
index 1a19ebad60adf094a6990305ba0cbf99864003c1..63259b3ea5abd7aa43de6b22d0ec7861a47e7489 100644 (file)
@@ -487,7 +487,7 @@ static int xgene_slimpro_i2c_probe(struct platform_device *pdev)
                pcc_chan = pcc_mbox_request_channel(cl, ctx->mbox_idx);
                if (IS_ERR(pcc_chan)) {
                        dev_err(&pdev->dev, "PCC mailbox channel request failed\n");
-                       return PTR_ERR(ctx->pcc_chan);
+                       return PTR_ERR(pcc_chan);
                }
 
                ctx->pcc_chan = pcc_chan;
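
The one-liner above fixes a classic slip: PTR_ERR() was taken of the field the result was about to be stored in (ctx->pcc_chan, still unset) rather than of the local that actually holds the ERR_PTR. The corrected shape, sketched with a hypothetical context structure (my_ctx is illustrative):

static int request_pcc_chan(struct my_ctx *ctx, struct mbox_client *cl,
                            int idx)
{
        struct pcc_mbox_chan *chan;

        chan = pcc_mbox_request_channel(cl, idx);
        if (IS_ERR(chan))
                return PTR_ERR(chan);   /* decode the pointer just checked */

        ctx->pcc_chan = chan;
        return 0;
}
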
index 96d4a1f8de79777ec641e5c81c1094b5d28eae2c..565ef55988112c910dffd41d0e9c866515f1cda3 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/bitfield.h>
 #include <linux/clk.h>
 #include <linux/dev_printk.h>
+#include <linux/dma-iommu.h>
 #include <linux/dma-mapping.h>
 #include <linux/err.h>
 #include <linux/interrupt.h>
@@ -737,6 +738,31 @@ static int apple_dart_def_domain_type(struct device *dev)
        return 0;
 }
 
+#ifndef CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR
+/* Keep things compiling when CONFIG_PCIE_APPLE isn't selected */
+#define CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR    0
+#endif
+#define DOORBELL_ADDR  (CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR & PAGE_MASK)
+
+static void apple_dart_get_resv_regions(struct device *dev,
+                                       struct list_head *head)
+{
+       if (IS_ENABLED(CONFIG_PCIE_APPLE) && dev_is_pci(dev)) {
+               struct iommu_resv_region *region;
+               int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+               region = iommu_alloc_resv_region(DOORBELL_ADDR,
+                                                PAGE_SIZE, prot,
+                                                IOMMU_RESV_MSI);
+               if (!region)
+                       return;
+
+               list_add_tail(&region->list, head);
+       }
+
+       iommu_dma_get_resv_regions(dev, head);
+}
+
 static const struct iommu_ops apple_dart_iommu_ops = {
        .domain_alloc = apple_dart_domain_alloc,
        .domain_free = apple_dart_domain_free,
@@ -753,6 +779,8 @@ static const struct iommu_ops apple_dart_iommu_ops = {
        .device_group = apple_dart_device_group,
        .of_xlate = apple_dart_of_xlate,
        .def_domain_type = apple_dart_def_domain_type,
+       .get_resv_regions = apple_dart_get_resv_regions,
+       .put_resv_regions = generic_iommu_put_resv_regions,
        .pgsize_bitmap = -1UL, /* Restricted during dart probe */
 };
 
index d33913d523c1d5fcaf9fab4bbc09e81c014e1e74..a4fbc3fc713d8ccfc3bcd55a9f36b21393ca7df9 100644 (file)
@@ -570,7 +570,7 @@ fail_msg_node:
 fail_db_node:
        of_node_put(smu->db_node);
 fail_bootmem:
-       memblock_free_ptr(smu, sizeof(struct smu_device));
+       memblock_free(smu, sizeof(struct smu_device));
        smu = NULL;
 fail_np:
        of_node_put(np);
index 7f7abc9069f7c8153b75bcd81e8d97ca9b3ddabd..b94d5e4fdc233b8eec84b1d95bb4b8dc6dae682f 100644 (file)
@@ -829,7 +829,6 @@ int
 mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, u8 cb_idx)
 {
        MPT_ADAPTER     *ioc;
-       const struct pci_device_id *id;
 
        if (!cb_idx || cb_idx >= MPT_MAX_PROTOCOL_DRIVERS)
                return -EINVAL;
@@ -838,10 +837,8 @@ mpt_device_driver_register(struct mpt_pci_driver * dd_cbfunc, u8 cb_idx)
 
        /* call per pci device probe entry point */
        list_for_each_entry(ioc, &ioc_list, list) {
-               id = ioc->pcidev->driver ?
-                   ioc->pcidev->driver->id_table : NULL;
                if (dd_cbfunc->probe)
-                       dd_cbfunc->probe(ioc->pcidev, id);
+                       dd_cbfunc->probe(ioc->pcidev);
         }
 
        return 0;
@@ -2032,7 +2029,7 @@ mpt_attach(struct pci_dev *pdev, const struct pci_device_id *id)
        for(cb_idx = 0; cb_idx < MPT_MAX_PROTOCOL_DRIVERS; cb_idx++) {
                if(MptDeviceDriverHandlers[cb_idx] &&
                  MptDeviceDriverHandlers[cb_idx]->probe) {
-                       MptDeviceDriverHandlers[cb_idx]->probe(pdev,id);
+                       MptDeviceDriverHandlers[cb_idx]->probe(pdev);
                }
        }
 
index b9e0376be7232acc9bad4996c0d8dd897cc25707..4bd0682c65d3be75727c9d414df728a50d2cfdfa 100644 (file)
@@ -257,7 +257,7 @@ typedef enum {
 } MPT_DRIVER_CLASS;
 
 struct mpt_pci_driver{
-       int  (*probe) (struct pci_dev *dev, const struct pci_device_id *id);
+       int  (*probe) (struct pci_dev *dev);
        void (*remove) (struct pci_dev *dev);
 };
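
The sub-driver probe callback loses its pci_device_id argument because the id was fetched via ioc->pcidev->driver (being phased out) and no mpt sub-driver actually used it. Registration then reduces to this shape (illustrative names):

/* Sketch: the slimmed-down mpt sub-driver callbacks. */
static struct mpt_pci_driver example_mpt_driver = {
        .probe  = example_mpt_probe,    /* int (*)(struct pci_dev *) */
        .remove = example_mpt_remove,
};
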
 
index 72025996cd70c0a8fcbb79d3bc4280d11ba6a7bd..ae433c150b37aa0bf9b5267bfbf8d3fe6f69697d 100644 (file)
@@ -114,7 +114,7 @@ static int mptctl_do_reset(MPT_ADAPTER *iocp, unsigned long arg);
 static int mptctl_hp_hostinfo(MPT_ADAPTER *iocp, unsigned long arg, unsigned int cmd);
 static int mptctl_hp_targetinfo(MPT_ADAPTER *iocp, unsigned long arg);
 
-static int  mptctl_probe(struct pci_dev *, const struct pci_device_id *);
+static int  mptctl_probe(struct pci_dev *);
 static void mptctl_remove(struct pci_dev *);
 
 #ifdef CONFIG_COMPAT
@@ -2838,7 +2838,7 @@ static long compat_mpctl_ioctl(struct file *f, unsigned int cmd, unsigned long a
  */
 
 static int
-mptctl_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+mptctl_probe(struct pci_dev *pdev)
 {
        MPT_ADAPTER *ioc = pci_get_drvdata(pdev);
 
index acdc257a900eb83bcf9be3bd461375a9d0b160ce..117fa4ebf6d79d836899db56e6cb15bd1d05ae22 100644 (file)
@@ -1377,7 +1377,7 @@ mpt_register_lan_device (MPT_ADAPTER *mpt_dev, int pnum)
 }
 
 static int
-mptlan_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+mptlan_probe(struct pci_dev *pdev)
 {
        MPT_ADAPTER             *ioc = pci_get_drvdata(pdev);
        struct net_device       *dev;
index 186308f1f8eba38ba09a8d511114b8ee0e845d78..9d485c9e3fff0de8b373292f9c5fff8435e70afa 100644 (file)
@@ -20,34 +20,38 @@ static void pci_error_handlers(struct cxl_afu *afu,
                                pci_channel_state_t state)
 {
        struct pci_dev *afu_dev;
+       struct pci_driver *afu_drv;
+       const struct pci_error_handlers *err_handler;
 
        if (afu->phb == NULL)
                return;
 
        list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
-               if (!afu_dev->driver)
+               afu_drv = to_pci_driver(afu_dev->dev.driver);
+               if (!afu_drv)
                        continue;
 
+               err_handler = afu_drv->err_handler;
                switch (bus_error_event) {
                case CXL_ERROR_DETECTED_EVENT:
                        afu_dev->error_state = state;
 
-                       if (afu_dev->driver->err_handler &&
-                           afu_dev->driver->err_handler->error_detected)
-                               afu_dev->driver->err_handler->error_detected(afu_dev, state);
-               break;
+                       if (err_handler &&
+                           err_handler->error_detected)
+                               err_handler->error_detected(afu_dev, state);
+                       break;
                case CXL_SLOT_RESET_EVENT:
                        afu_dev->error_state = state;
 
-                       if (afu_dev->driver->err_handler &&
-                           afu_dev->driver->err_handler->slot_reset)
-                               afu_dev->driver->err_handler->slot_reset(afu_dev);
-               break;
+                       if (err_handler &&
+                           err_handler->slot_reset)
+                               err_handler->slot_reset(afu_dev);
+                       break;
                case CXL_RESUME_EVENT:
-                       if (afu_dev->driver->err_handler &&
-                           afu_dev->driver->err_handler->resume)
-                               afu_dev->driver->err_handler->resume(afu_dev);
-               break;
+                       if (err_handler &&
+                           err_handler->resume)
+                               err_handler->resume(afu_dev);
+                       break;
                }
        }
 }
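
The cxl changes standardize on deriving the bound driver from the generic device_driver pointer; with the v5.16 to_pci_driver(), which tolerates a NULL argument, the lookup can be factored as in this sketch:

/* Sketch: fetch a device's PCI error handlers via to_pci_driver(). */
static const struct pci_error_handlers *afu_err_handler(struct pci_dev *pdev)
{
        struct pci_driver *drv = to_pci_driver(pdev->dev.driver);

        return drv ? drv->err_handler : NULL;
}
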
index 2ba899f5659ff46f060384dab247147e74376de5..3de0aea62ade4bbc33bf50250da5783e30f15319 100644 (file)
@@ -1795,6 +1795,8 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
                                                pci_channel_state_t state)
 {
        struct pci_dev *afu_dev;
+       struct pci_driver *afu_drv;
+       const struct pci_error_handlers *err_handler;
        pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
        pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
 
@@ -1805,14 +1807,16 @@ static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
                return result;
 
        list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
-               if (!afu_dev->driver)
+               afu_drv = to_pci_driver(afu_dev->dev.driver);
+               if (!afu_drv)
                        continue;
 
                afu_dev->error_state = state;
 
-               if (afu_dev->driver->err_handler)
-                       afu_result = afu_dev->driver->err_handler->error_detected(afu_dev,
-                                                                                 state);
+               err_handler = afu_drv->err_handler;
+               if (err_handler)
+                       afu_result = err_handler->error_detected(afu_dev,
+                                                                state);
                /* Disconnect trumps all, NONE trumps NEED_RESET */
                if (afu_result == PCI_ERS_RESULT_DISCONNECT)
                        result = PCI_ERS_RESULT_DISCONNECT;
@@ -1972,6 +1976,8 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
        struct cxl_afu *afu;
        struct cxl_context *ctx;
        struct pci_dev *afu_dev;
+       struct pci_driver *afu_drv;
+       const struct pci_error_handlers *err_handler;
        pci_ers_result_t afu_result = PCI_ERS_RESULT_RECOVERED;
        pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
        int i;
@@ -2028,12 +2034,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
                         * shouldn't start new work until we call
                         * their resume function.
                         */
-                       if (!afu_dev->driver)
+                       afu_drv = to_pci_driver(afu_dev->dev.driver);
+                       if (!afu_drv)
                                continue;
 
-                       if (afu_dev->driver->err_handler &&
-                           afu_dev->driver->err_handler->slot_reset)
-                               afu_result = afu_dev->driver->err_handler->slot_reset(afu_dev);
+                       err_handler = afu_drv->err_handler;
+                       if (err_handler && err_handler->slot_reset)
+                               afu_result = err_handler->slot_reset(afu_dev);
 
                        if (afu_result == PCI_ERS_RESULT_DISCONNECT)
                                result = PCI_ERS_RESULT_DISCONNECT;
@@ -2060,6 +2067,8 @@ static void cxl_pci_resume(struct pci_dev *pdev)
        struct cxl *adapter = pci_get_drvdata(pdev);
        struct cxl_afu *afu;
        struct pci_dev *afu_dev;
+       struct pci_driver *afu_drv;
+       const struct pci_error_handlers *err_handler;
        int i;
 
        /* Everything is back now. Drivers should restart work now.
@@ -2074,9 +2083,13 @@ static void cxl_pci_resume(struct pci_dev *pdev)
                        continue;
 
                list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
-                       if (afu_dev->driver && afu_dev->driver->err_handler &&
-                           afu_dev->driver->err_handler->resume)
-                               afu_dev->driver->err_handler->resume(afu_dev);
+                       afu_drv = to_pci_driver(afu_dev->dev.driver);
+                       if (!afu_drv)
+                               continue;
+
+                       err_handler = afu_drv->err_handler;
+                       if (err_handler && err_handler->resume)
+                               err_handler->resume(afu_dev);
                }
        }
        spin_unlock(&adapter->afu_list_lock);
index 63524551a13a16baa256428d9ac93ee461481edb..e6a2fd2c6d5c94b62374314c3cd11ed19502abdb 100644 (file)
@@ -10,7 +10,6 @@
 #include <linux/slab.h>
 
 #include <linux/scatterlist.h>
-#include <linux/swap.h>                /* For nr_free_buffer_pages() */
 #include <linux/list.h>
 
 #include <linux/debugfs.h>
index c8fd7f758938bfd35d70968b1b8115eb589facd8..c904e23c82379255cb089d0550810caa7e8f4da8 100644 (file)
@@ -2409,6 +2409,7 @@ static void __exit cleanup_mtd(void)
        if (proc_mtd)
                remove_proc_entry("mtd", NULL);
        class_unregister(&mtd_class);
+       bdi_unregister(mtd_bdi);
        bdi_put(mtd_bdi);
        idr_destroy(&mtd_idr);
 }
index a309016f7f8cb03582056eef4775623938a71d32..ecd025dda8d6870c56f945df0f292fc468b0a425 100644 (file)
@@ -676,8 +676,6 @@ void t3_link_changed(struct adapter *adapter, int port_id);
 void t3_link_fault(struct adapter *adapter, int port_id);
 int t3_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc);
 const struct adapter_info *t3_get_adapter_info(unsigned int board_id);
-int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data);
-int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data);
 int t3_seeprom_wp(struct adapter *adapter, int enable);
 int t3_get_tp_version(struct adapter *adapter, u32 *vers);
 int t3_check_tpsram_version(struct adapter *adapter);
index 9cf9e33664e4f7dd0b10507664e137211a58204e..bfffcaeee62413ad64fc0060fd5fce5e0d49fbd0 100644 (file)
@@ -2036,20 +2036,16 @@ static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e,
 {
        struct port_info *pi = netdev_priv(dev);
        struct adapter *adapter = pi->adapter;
-       int i, err = 0;
-
-       u8 *buf = kmalloc(EEPROMSIZE, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
+       int cnt;
 
        e->magic = EEPROM_MAGIC;
-       for (i = e->offset & ~3; !err && i < e->offset + e->len; i += 4)
-               err = t3_seeprom_read(adapter, i, (__le32 *) & buf[i]);
+       cnt = pci_read_vpd(adapter->pdev, e->offset, e->len, data);
+       if (cnt < 0)
+               return cnt;
 
-       if (!err)
-               memcpy(data, buf + e->offset, e->len);
-       kfree(buf);
-       return err;
+       e->len = cnt;
+
+       return 0;
 }
 
 static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
@@ -2058,7 +2054,6 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
        struct port_info *pi = netdev_priv(dev);
        struct adapter *adapter = pi->adapter;
        u32 aligned_offset, aligned_len;
-       __le32 *p;
        u8 *buf;
        int err;
 
@@ -2072,12 +2067,9 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
                buf = kmalloc(aligned_len, GFP_KERNEL);
                if (!buf)
                        return -ENOMEM;
-               err = t3_seeprom_read(adapter, aligned_offset, (__le32 *) buf);
-               if (!err && aligned_len > 4)
-                       err = t3_seeprom_read(adapter,
-                                             aligned_offset + aligned_len - 4,
-                                             (__le32 *) & buf[aligned_len - 4]);
-               if (err)
+               err = pci_read_vpd(adapter->pdev, aligned_offset, aligned_len,
+                                  buf);
+               if (err < 0)
                        goto out;
                memcpy(buf + (eeprom->offset & 3), data, eeprom->len);
        } else
@@ -2087,17 +2079,13 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom,
        if (err)
                goto out;
 
-       for (p = (__le32 *) buf; !err && aligned_len; aligned_len -= 4, p++) {
-               err = t3_seeprom_write(adapter, aligned_offset, *p);
-               aligned_offset += 4;
-       }
-
-       if (!err)
+       err = pci_write_vpd(adapter->pdev, aligned_offset, aligned_len, buf);
+       if (err >= 0)
                err = t3_seeprom_wp(adapter, 1);
 out:
        if (buf != data)
                kfree(buf);
-       return err;
+       return err < 0 ? err : 0;
 }
 
 static void get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
index 53feac8da503f6478ce043554faa31894eb46b2f..da41eee2f25c7933ee1f72164508e0d23d49a313 100644 (file)
@@ -596,80 +596,9 @@ struct t3_vpd {
        u32 pad;                /* for multiple-of-4 sizing and alignment */
 };
 
-#define EEPROM_MAX_POLL   40
 #define EEPROM_STAT_ADDR  0x4000
 #define VPD_BASE          0xc00
 
-/**
- *     t3_seeprom_read - read a VPD EEPROM location
- *     @adapter: adapter to read
- *     @addr: EEPROM address
- *     @data: where to store the read data
- *
- *     Read a 32-bit word from a location in VPD EEPROM using the card's PCI
- *     VPD ROM capability.  A zero is written to the flag bit when the
- *     address is written to the control register.  The hardware device will
- *     set the flag to 1 when 4 bytes have been read into the data register.
- */
-int t3_seeprom_read(struct adapter *adapter, u32 addr, __le32 *data)
-{
-       u16 val;
-       int attempts = EEPROM_MAX_POLL;
-       u32 v;
-       unsigned int base = adapter->params.pci.vpd_cap_addr;
-
-       if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3))
-               return -EINVAL;
-
-       pci_write_config_word(adapter->pdev, base + PCI_VPD_ADDR, addr);
-       do {
-               udelay(10);
-               pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val);
-       } while (!(val & PCI_VPD_ADDR_F) && --attempts);
-
-       if (!(val & PCI_VPD_ADDR_F)) {
-               CH_ERR(adapter, "reading EEPROM address 0x%x failed\n", addr);
-               return -EIO;
-       }
-       pci_read_config_dword(adapter->pdev, base + PCI_VPD_DATA, &v);
-       *data = cpu_to_le32(v);
-       return 0;
-}
-
-/**
- *     t3_seeprom_write - write a VPD EEPROM location
- *     @adapter: adapter to write
- *     @addr: EEPROM address
- *     @data: value to write
- *
- *     Write a 32-bit word to a location in VPD EEPROM using the card's PCI
- *     VPD ROM capability.
- */
-int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data)
-{
-       u16 val;
-       int attempts = EEPROM_MAX_POLL;
-       unsigned int base = adapter->params.pci.vpd_cap_addr;
-
-       if ((addr >= EEPROMSIZE && addr != EEPROM_STAT_ADDR) || (addr & 3))
-               return -EINVAL;
-
-       pci_write_config_dword(adapter->pdev, base + PCI_VPD_DATA,
-                              le32_to_cpu(data));
-       pci_write_config_word(adapter->pdev,base + PCI_VPD_ADDR,
-                             addr | PCI_VPD_ADDR_F);
-       do {
-               msleep(1);
-               pci_read_config_word(adapter->pdev, base + PCI_VPD_ADDR, &val);
-       } while ((val & PCI_VPD_ADDR_F) && --attempts);
-
-       if (val & PCI_VPD_ADDR_F) {
-               CH_ERR(adapter, "write to EEPROM address 0x%x failed\n", addr);
-               return -EIO;
-       }
-       return 0;
-}
-
 /**
  *     t3_seeprom_wp - enable/disable EEPROM write protection
  *     @adapter: the adapter
@@ -679,7 +608,14 @@ int t3_seeprom_write(struct adapter *adapter, u32 addr, __le32 data)
  */
 int t3_seeprom_wp(struct adapter *adapter, int enable)
 {
-       return t3_seeprom_write(adapter, EEPROM_STAT_ADDR, enable ? 0xc : 0);
+       u32 data = enable ? 0xc : 0;
+       int ret;
+
+       /* EEPROM_STAT_ADDR is outside the VPD area, so use pci_write_vpd_any() */
+       ret = pci_write_vpd_any(adapter->pdev, EEPROM_STAT_ADDR, sizeof(u32),
+                               &data);
+
+       return ret < 0 ? ret : 0;
 }
 
 static int vpdstrtouint(char *s, u8 len, unsigned int base, unsigned int *val)
@@ -709,24 +645,22 @@ static int vpdstrtou16(char *s, u8 len, unsigned int base, u16 *val)
  */
 static int get_vpd_params(struct adapter *adapter, struct vpd_params *p)
 {
-       int i, addr, ret;
        struct t3_vpd vpd;
+       u8 base_val = 0;
+       int addr, ret;
 
        /*
         * Card information is normally at VPD_BASE but some early cards had
         * it at 0.
         */
-       ret = t3_seeprom_read(adapter, VPD_BASE, (__le32 *)&vpd);
-       if (ret)
+       ret = pci_read_vpd(adapter->pdev, VPD_BASE, 1, &base_val);
+       if (ret < 0)
                return ret;
-       addr = vpd.id_tag == 0x82 ? VPD_BASE : 0;
+       addr = base_val == PCI_VPD_LRDT_ID_STRING ? VPD_BASE : 0;
 
-       for (i = 0; i < sizeof(vpd); i += 4) {
-               ret = t3_seeprom_read(adapter, addr + i,
-                                     (__le32 *)((u8 *)&vpd + i));
-               if (ret)
-                       return ret;
-       }
+       ret = pci_read_vpd(adapter->pdev, addr, sizeof(vpd), &vpd);
+       if (ret < 0)
+               return ret;
 
        ret = vpdstrtouint(vpd.cclk_data, vpd.cclk_len, 10, &p->cclk);
        if (ret)
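
The cxgb3 conversion relies on the VPD helper convention: pci_read_vpd() and pci_write_vpd() return the number of bytes transferred or a negative errno, so callers map the result to 0/-errno themselves, as get_eeprom() and set_eeprom() above now do. A sketch of a strict wrapper:

/* Sketch: a caller that insists on a full-length VPD read. */
static int read_vpd_exact(struct pci_dev *pdev, loff_t off, size_t len,
                          void *buf)
{
        ssize_t cnt = pci_read_vpd(pdev, off, len, buf);

        if (cnt < 0)
                return cnt;
        return (size_t)cnt == len ? 0 : -EIO;   /* short read is an error here */
}
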
index 5ebd96f6833d6ebb8dc176af3374379d0556d35f..9fdedd83f39225592cd09b6b4e0f8c556569b135 100644 (file)
@@ -608,7 +608,7 @@ static void hns3_get_drvinfo(struct net_device *netdev,
                return;
        }
 
-       strncpy(drvinfo->driver, h->pdev->driver->name,
+       strncpy(drvinfo->driver, dev_driver_string(&h->pdev->dev),
                sizeof(drvinfo->driver));
        drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0';
 
index 5d4d410b07c8cccf812fb1144cfe9bd302909c78..d650082496d64cfa53914c90eb20559599998f0c 100644 (file)
@@ -776,7 +776,7 @@ out_release:
 static int prestera_pci_probe(struct pci_dev *pdev,
                              const struct pci_device_id *id)
 {
-       const char *driver_name = pdev->driver->name;
+       const char *driver_name = dev_driver_string(&pdev->dev);
        struct prestera_fw *fw;
        int err;
 
index fcace73eae40ff09818463f91036b22e21d0abff..a15c95a10bae4d8f09712f2772e40ba313bdc009 100644 (file)
@@ -1875,7 +1875,7 @@ static void mlxsw_pci_cmd_fini(struct mlxsw_pci *mlxsw_pci)
 
 static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-       const char *driver_name = pdev->driver->name;
+       const char *driver_name = dev_driver_string(&pdev->dev);
        struct mlxsw_pci *mlxsw_pci;
        int err;
 
index 0685ece1f155d974a889d493666bff922159f22c..1de076f557405e9e48df97eb5edf39acc363efb5 100644 (file)
@@ -202,7 +202,8 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev *pdev,
 {
        char nsp_version[ETHTOOL_FWVERS_LEN] = {};
 
-       strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver));
+       strlcpy(drvinfo->driver, dev_driver_string(&pdev->dev),
+               sizeof(drvinfo->driver));
        nfp_net_get_nspinfo(app, nsp_version);
        snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
                 "%s %s %s %s", vnic_version, nsp_version,
index 352e14b007e784f92f037af808045c52f9b67223..32be5a03951fa000ede9445c60d904a9e6de2c2b 100644 (file)
@@ -156,10 +156,14 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq)
 
        /* Now start the actual "proper" walk of the interrupt tree */
        while (ipar != NULL) {
-               /* Now check if cursor is an interrupt-controller and if it is
-                * then we are done
+               /*
+                * Now check if cursor is an interrupt-controller and
+                * if it is then we are done, unless there is an
+                * interrupt-map which takes precedence.
                 */
-               if (of_property_read_bool(ipar, "interrupt-controller")) {
+               imap = of_get_property(ipar, "interrupt-map", &imaplen);
+               if (imap == NULL &&
+                   of_property_read_bool(ipar, "interrupt-controller")) {
                        pr_debug(" -> got it !\n");
                        return 0;
                }
@@ -173,8 +177,6 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq)
                        goto fail;
                }
 
-               /* Now look for an interrupt-map */
-               imap = of_get_property(ipar, "interrupt-map", &imaplen);
                /* No interrupt map, check for an interrupt parent */
                if (imap == NULL) {
                        pr_debug(" -> no map, getting parent\n");
@@ -255,6 +257,11 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq)
                out_irq->args_count = intsize = newintsize;
                addrsize = newaddrsize;
 
+               if (ipar == newpar) {
+                       pr_debug("%pOF interrupt-map entry to self\n", ipar);
+                       return 0;
+               }
+
        skiplevel:
                /* Iterate again with new parent */
                out_irq->np = newpar;
index 761fd870d1db277f79ff199c49729c6450e6d7fe..b9bd1cff179388c10705ac7cd817f241a8023082 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 #define RNG_SEED_SIZE          128
@@ -170,8 +171,7 @@ int ima_free_kexec_buffer(void)
        if (ret)
                return ret;
 
-       return memblock_free(addr, size);
-
+       return memblock_phys_free(addr, size);
 }
 
 /**
index 9da8835ba5a58148f8e6277700d5b152ff9531c9..9c0fb962c22b097ce185c6f1b3d946e962a93d10 100644 (file)
@@ -46,7 +46,7 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
        if (nomap) {
                err = memblock_mark_nomap(base, size);
                if (err)
-                       memblock_free(base, size);
+                       memblock_phys_free(base, size);
                kmemleak_ignore_phys(base);
        }
 
@@ -284,7 +284,8 @@ void __init fdt_init_reserved_mem(void)
                                if (nomap)
                                        memblock_clear_nomap(rmem->base, rmem->size);
                                else
-                                       memblock_free(rmem->base, rmem->size);
+                                       memblock_phys_free(rmem->base,
+                                                          rmem->size);
                        }
                }
        }
index 326f7d13024f9037572cabf4bada954c732ebb5d..e917bb3652bb3ea242de7294e66fb403e382f028 100644 (file)
@@ -254,7 +254,7 @@ config PCIE_MEDIATEK_GEN3
          MediaTek SoCs.
 
 config VMD
-       depends on PCI_MSI && X86_64 && SRCU
+       depends on PCI_MSI && X86_64 && SRCU && !UML
        tristate "Intel Volume Management Device Driver"
        help
          Adds support for the Intel Volume Management Device (VMD). VMD is a
@@ -312,6 +312,32 @@ config PCIE_HISI_ERR
          Say Y here if you want error handling support
          for the PCIe controller's errors on HiSilicon HIP SoCs
 
+config PCIE_APPLE_MSI_DOORBELL_ADDR
+       hex
+       default 0xfffff000
+       depends on PCIE_APPLE
+
+config PCIE_APPLE
+       tristate "Apple PCIe controller"
+       depends on ARCH_APPLE || COMPILE_TEST
+       depends on OF
+       depends on PCI_MSI_IRQ_DOMAIN
+       select PCI_HOST_COMMON
+       help
+         Say Y here if you want to enable PCIe controller support on Apple
+         system-on-chips, like the Apple M1. This is required for the USB
+         type-A ports, Ethernet, Wi-Fi, and Bluetooth.
+
+         If unsure, say Y if you have an Apple Silicon system.
+
+config PCIE_MT7621
+       tristate "MediaTek MT7621 PCIe Controller"
+       depends on (RALINK && SOC_MT7621) || (MIPS && COMPILE_TEST)
+       select PHY_MT7621_PCI
+       default SOC_MT7621
+       help
+         This selects a driver for the MediaTek MT7621 PCIe Controller.
+
 source "drivers/pci/controller/dwc/Kconfig"
 source "drivers/pci/controller/mobiveil/Kconfig"
 source "drivers/pci/controller/cadence/Kconfig"
index aaf30b3dcc143aaa8c59d7727cd42e7e849accc3..37c8663de7fe1ff7c9c948cd39f4b6ce1a912f5b 100644
@@ -37,6 +37,9 @@ obj-$(CONFIG_VMD) += vmd.o
 obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o
 obj-$(CONFIG_PCI_LOONGSON) += pci-loongson.o
 obj-$(CONFIG_PCIE_HISI_ERR) += pcie-hisi-error.o
+obj-$(CONFIG_PCIE_APPLE) += pcie-apple.o
+obj-$(CONFIG_PCIE_MT7621) += pcie-mt7621.o
+
 # pcie-hisi.o quirks are needed even without CONFIG_PCIE_DW
 obj-y                          += dwc/
 obj-y                          += mobiveil/
index ffb176d288cd9e6b48bac56cfa26899ee7ef0b62..918e11082e6a747aa75330d6bea4817765c7b030 100644
@@ -474,7 +474,7 @@ static int j721e_pcie_probe(struct platform_device *pdev)
                ret = clk_prepare_enable(clk);
                if (ret) {
                        dev_err(dev, "failed to enable pcie_refclk\n");
-                       goto err_get_sync;
+                       goto err_pcie_setup;
                }
                pcie->refclk = clk;
 
index 5fee0f89ab594888e55a154c5e133e11262b97d4..a224afadbcc005099efe20f3a429cec96183b2d0 100644
@@ -127,6 +127,8 @@ static int cdns_plat_pcie_probe(struct platform_device *pdev)
                        goto err_init;
        }
 
+       return 0;
+
  err_init:
  err_get_sync:
        pm_runtime_put_sync(dev);
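
The two fixes above are the same class of probe() error-path bug approached from opposite ends: j721e jumped to a label that unwound more than had been set up at that point, and cdns_plat fell through into its error labels even on success. The usual shape of the idiom, with hypothetical names:

static int foo_probe(struct platform_device *pdev)
{
	int ret;

	ret = acquire_a(pdev);		/* hypothetical resource A */
	if (ret)
		return ret;

	ret = acquire_b(pdev);		/* hypothetical resource B */
	if (ret)
		goto err_release_a;	/* undo only what already succeeded */

	return 0;			/* never fall through into the labels */

err_release_a:
	release_a(pdev);
	return ret;
}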
index 76c0a63a3f6486477cf1650eb4f7d7f878efecd4..62ce3abf0f19607a7ad8c555692e932d7e31b110 100644
@@ -8,22 +8,20 @@ config PCIE_DW
 
 config PCIE_DW_HOST
        bool
-       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW
 
 config PCIE_DW_EP
        bool
-       depends on PCI_ENDPOINT
        select PCIE_DW
 
 config PCI_DRA7XX
-       bool
+       tristate
 
 config PCI_DRA7XX_HOST
-       bool "TI DRA7xx PCIe controller Host Mode"
+       tristate "TI DRA7xx PCIe controller Host Mode"
        depends on SOC_DRA7XX || COMPILE_TEST
-       depends on PCI_MSI_IRQ_DOMAIN
        depends on OF && HAS_IOMEM && TI_PIPE3
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW_HOST
        select PCI_DRA7XX
        default y if SOC_DRA7XX
@@ -36,10 +34,10 @@ config PCI_DRA7XX_HOST
          This uses the DesignWare core.
 
 config PCI_DRA7XX_EP
-       bool "TI DRA7xx PCIe controller Endpoint Mode"
+       tristate "TI DRA7xx PCIe controller Endpoint Mode"
        depends on SOC_DRA7XX || COMPILE_TEST
-       depends on PCI_ENDPOINT
        depends on OF && HAS_IOMEM && TI_PIPE3
+       depends on PCI_ENDPOINT
        select PCIE_DW_EP
        select PCI_DRA7XX
        help
@@ -55,7 +53,7 @@ config PCIE_DW_PLAT
 
 config PCIE_DW_PLAT_HOST
        bool "Platform bus based DesignWare PCIe Controller - Host mode"
-       depends on PCI && PCI_MSI_IRQ_DOMAIN
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW_HOST
        select PCIE_DW_PLAT
        help
@@ -138,8 +136,8 @@ config PCI_LAYERSCAPE
        bool "Freescale Layerscape PCIe controller - Host mode"
        depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST)
        depends on PCI_MSI_IRQ_DOMAIN
-       select MFD_SYSCON
        select PCIE_DW_HOST
+       select MFD_SYSCON
        help
          Say Y here if you want to enable PCIe controller support on Layerscape
          SoCs to work in Host mode.
@@ -180,6 +178,16 @@ config PCIE_QCOM
          PCIe controller uses the DesignWare core plus Qualcomm-specific
          hardware wrappers.
 
+config PCIE_QCOM_EP
+       tristate "Qualcomm PCIe controller - Endpoint mode"
+       depends on OF && (ARCH_QCOM || COMPILE_TEST)
+       depends on PCI_ENDPOINT
+       select PCIE_DW_EP
+       help
+         Say Y here to enable support for the PCIe controllers on Qualcomm SoCs
+         to work in endpoint mode. The PCIe controller uses the DesignWare core
+         plus Qualcomm-specific hardware wrappers.
+
 config PCIE_ARMADA_8K
        bool "Marvell Armada-8K PCIe controller"
        depends on ARCH_MVEBU || COMPILE_TEST
@@ -266,7 +274,7 @@ config PCIE_KEEMBAY_EP
 
 config PCIE_KIRIN
        depends on OF && (ARM64 || COMPILE_TEST)
-       bool "HiSilicon Kirin series SoCs PCIe controllers"
+       tristate "HiSilicon Kirin series SoCs PCIe controllers"
        depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW_HOST
        help
@@ -283,8 +291,8 @@ config PCIE_HISI_STB
 
 config PCI_MESON
        tristate "MESON PCIe controller"
-       depends on PCI_MSI_IRQ_DOMAIN
        default m if ARCH_MESON
+       depends on PCI_MSI_IRQ_DOMAIN
        select PCIE_DW_HOST
        help
          Say Y here if you want to enable PCI controller support on Amlogic
index 73244409792cb5c885a57ff3e00109b171ca8d0e..8ba7b67f5e50a9ab268728cf8563c5143e4f1f55 100644
@@ -12,6 +12,7 @@ obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o
 obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o
 obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o
 obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
+obj-$(CONFIG_PCIE_QCOM_EP) += pcie-qcom-ep.o
 obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
 obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
 obj-$(CONFIG_PCIE_ROCKCHIP_DW_HOST) += pcie-dw-rockchip.o
index fbbb78f6885e700ed4bddd8176f2b926b6cf9832..a4221f6f362912c71822ac02fffd281eb3124374 100644
@@ -7,6 +7,7 @@
  * Authors: Kishon Vijay Abraham I <kishon@ti.com>
  */
 
+#include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -14,7 +15,7 @@
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
+#include <linux/module.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
 #include <linux/of_pci.h>
@@ -90,6 +91,7 @@ struct dra7xx_pcie {
        int                     phy_count;      /* DT phy-names count */
        struct phy              **phy;
        struct irq_domain       *irq_domain;
+       struct clk              *clk;
        enum dw_pcie_device_mode mode;
 };
 
@@ -607,6 +609,7 @@ static const struct of_device_id of_dra7xx_pcie_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, of_dra7xx_pcie_match);
 
 /*
  * dra7xx_pcie_unaligned_memaccess: workaround for AM572x/AM571x Errata i870
@@ -740,6 +743,15 @@ static int dra7xx_pcie_probe(struct platform_device *pdev)
        if (!link)
                return -ENOMEM;
 
+       dra7xx->clk = devm_clk_get_optional(dev, NULL);
+       if (IS_ERR(dra7xx->clk))
+               return dev_err_probe(dev, PTR_ERR(dra7xx->clk),
+                                    "clock request failed");
+
+       ret = clk_prepare_enable(dra7xx->clk);
+       if (ret)
+               return ret;
+
        for (i = 0; i < phy_count; i++) {
                snprintf(name, sizeof(name), "pcie-phy%d", i);
                phy[i] = devm_phy_get(dev, name);
@@ -925,6 +937,8 @@ static void dra7xx_pcie_shutdown(struct platform_device *pdev)
 
        pm_runtime_disable(dev);
        dra7xx_pcie_disable_phy(dra7xx);
+
+       clk_disable_unprepare(dra7xx->clk);
 }
 
 static const struct dev_pm_ops dra7xx_pcie_pm_ops = {
@@ -943,4 +957,8 @@ static struct platform_driver dra7xx_pcie_driver = {
        },
        .shutdown = dra7xx_pcie_shutdown,
 };
-builtin_platform_driver(dra7xx_pcie_driver);
+module_platform_driver(dra7xx_pcie_driver);
+
+MODULE_AUTHOR("Kishon Vijay Abraham I <kishon@ti.com>");
+MODULE_DESCRIPTION("PCIe controller driver for TI DRA7xx SoCs");
+MODULE_LICENSE("GPL v2");
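
Beyond the Kconfig bool-to-tristate flip, the dra7xx conversion above is the standard builtin-to-module recipe: <linux/module.h> replaces <linux/init.h>, MODULE_DEVICE_TABLE() lets udev autoload on the OF compatible, module_platform_driver() replaces builtin_platform_driver(), and the MODULE_* tags close the file. The new clock handling is the usual optional-resource probe idiom, condensed here as a sketch (clk, dev and ret are assumed probe-function locals):

	/* NULL name: the device's sole clock; a no-op clk if the DT has none. */
	clk = devm_clk_get_optional(dev, NULL);
	if (IS_ERR(clk))
		return dev_err_probe(dev, PTR_ERR(clk), "clock request failed");

	ret = clk_prepare_enable(clk);
	if (ret)
		return ret;
	/* ...balanced by clk_disable_unprepare(clk) on the teardown path. */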
index 80fc98acf097f6ff85b2a34276a7cca80825e8d7..26f49f797b0fee27c0fdb0514fcaffb72a61ef56 100644
@@ -1132,7 +1132,7 @@ static int imx6_pcie_probe(struct platform_device *pdev)
 
        /* Limit link speed */
        pci->link_gen = 1;
-       ret = of_property_read_u32(node, "fsl,max-link-speed", &pci->link_gen);
+       of_property_read_u32(node, "fsl,max-link-speed", &pci->link_gen);
 
        imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie");
        if (IS_ERR(imx6_pcie->vpcie)) {
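
Dropping the stored return value in the imx6 hunk is deliberate rather than sloppy: of_property_read_u32() leaves the output variable untouched on failure, so seeding the default first makes the property optional with no error handling at all. The idiom, sketched:

	u32 link_gen = 1;	/* default when the property is absent */

	/* Overwrites link_gen only if "fsl,max-link-speed" exists and parses. */
	of_property_read_u32(node, "fsl,max-link-speed", &link_gen);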
index 998b698f40858b704e83aa545d23e49673780e9f..0eda8236c125a00cb157e275e2b6249852a2da60 100644
@@ -83,6 +83,7 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar)
        for (func_no = 0; func_no < funcs; func_no++)
                __dw_pcie_ep_reset_bar(pci, func_no, bar, 0);
 }
+EXPORT_SYMBOL_GPL(dw_pcie_ep_reset_bar);
 
 static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie_ep *ep, u8 func_no,
                u8 cap_ptr, u8 cap)
@@ -485,6 +486,7 @@ int dw_pcie_ep_raise_legacy_irq(struct dw_pcie_ep *ep, u8 func_no)
 
        return -EINVAL;
 }
+EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_legacy_irq);
 
 int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
                             u8 interrupt_num)
@@ -536,6 +538,7 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 func_no,
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(dw_pcie_ep_raise_msi_irq);
 
 int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,
                                       u16 interrupt_num)
index d1d9b8344ec9cf7d8215992b8bcd10b2003f745a..f4755f3a03bea1bc1b181fee82fd006af1891a4f 100644
@@ -335,6 +335,16 @@ int dw_pcie_host_init(struct pcie_port *pp)
        if (pci->link_gen < 1)
                pci->link_gen = of_pci_get_max_link_speed(np);
 
+       /* Set default bus ops */
+       bridge->ops = &dw_pcie_ops;
+       bridge->child_ops = &dw_child_pcie_ops;
+
+       if (pp->ops->host_init) {
+               ret = pp->ops->host_init(pp);
+               if (ret)
+                       return ret;
+       }
+
        if (pci_msi_enabled()) {
                pp->has_msi_ctrl = !(pp->ops->msi_host_init ||
                                     of_property_read_bool(np, "msi-parent") ||
@@ -388,15 +398,6 @@ int dw_pcie_host_init(struct pcie_port *pp)
                }
        }
 
-       /* Set default bus ops */
-       bridge->ops = &dw_pcie_ops;
-       bridge->child_ops = &dw_child_pcie_ops;
-
-       if (pp->ops->host_init) {
-               ret = pp->ops->host_init(pp);
-               if (ret)
-                       goto err_free_msi;
-       }
        dw_pcie_iatu_detect(pci);
 
        dw_pcie_setup_rc(pp);
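
The paired hunks above move the bridge-ops assignment and the driver's host_init() callback in front of the MSI setup in dw_pcie_host_init(). Two consequences: a driver can now prepare state that the MSI path depends on (clocks, regmaps, msi_host_init decisions) from host_init(), and a host_init() failure returns before any MSI state exists, so it no longer needs the err_free_msi unwind. The resulting order, roughly:

	/* Sketch of the reordered sequence (simplified from the hunk above). */
	bridge->ops = &dw_pcie_ops;
	bridge->child_ops = &dw_child_pcie_ops;

	if (pp->ops->host_init) {
		ret = pp->ops->host_init(pp);
		if (ret)
			return ret;	/* nothing allocated yet to unwind */
	}

	/* ...MSI controller setup, free to rely on host_init()'s work... */
	dw_pcie_iatu_detect(pci);
	dw_pcie_setup_rc(pp);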
index a945f0c0e73dc3bccb71e6a594058aa07e474e59..850b4533f4ef54faadf9835df524857d23e660eb 100644
@@ -538,6 +538,7 @@ int dw_pcie_link_up(struct dw_pcie *pci)
        return ((val & PCIE_PORT_DEBUG1_LINK_UP) &&
                (!(val & PCIE_PORT_DEBUG1_LINK_IN_TRAINING)));
 }
+EXPORT_SYMBOL_GPL(dw_pcie_link_up);
 
 void dw_pcie_upconfig_setup(struct dw_pcie *pci)
 {
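
The EXPORT_SYMBOL_GPL() additions sprinkled through pcie-designware-ep.c and pcie-designware.c are a prerequisite for the bool-to-tristate conversions elsewhere in this series: once a user such as PCI_DRA7XX, PCIE_KIRIN or PCIE_QCOM_EP can be built as =m, every core helper it calls must be exported or the module fails to link. The pattern, using one of the symbols exported above:

/* In the DWC core, right after the function definition: */
EXPORT_SYMBOL_GPL(dw_pcie_link_up);	/* visible to GPL modules only */

/* In a tristate driver (e.g. CONFIG_PCIE_KIRIN=m): */
if (dw_pcie_link_up(pci))
	dev_info(pci->dev, "link up\n");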
index 026fd1e42a555d7abcc9fa47ee85a9e2de15f6b4..095afbccf9c16020f9115130f1ece279096ba33b 100644
@@ -8,16 +8,18 @@
  * Author: Xiaowei Song <songxiaowei@huawei.com>
  */
 
-#include <linux/compiler.h>
 #include <linux/clk.h>
+#include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/gpio.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/syscon.h>
 #include <linux/of_address.h>
+#include <linux/of_device.h>
 #include <linux/of_gpio.h>
 #include <linux/of_pci.h>
+#include <linux/phy/phy.h>
 #include <linux/pci.h>
 #include <linux/pci_regs.h>
 #include <linux/platform_device.h>
 
 #define to_kirin_pcie(x) dev_get_drvdata((x)->dev)
 
-#define REF_CLK_FREQ                   100000000
-
 /* PCIe ELBI registers */
 #define SOC_PCIECTRL_CTRL0_ADDR                0x000
 #define SOC_PCIECTRL_CTRL1_ADDR                0x004
-#define SOC_PCIEPHY_CTRL2_ADDR         0x008
-#define SOC_PCIEPHY_CTRL3_ADDR         0x00c
 #define PCIE_ELBI_SLV_DBI_ENABLE       (0x1 << 21)
 
 /* info located in APB */
 #define PCIE_APP_LTSSM_ENABLE  0x01c
-#define PCIE_APB_PHY_CTRL0     0x0
-#define PCIE_APB_PHY_CTRL1     0x4
 #define PCIE_APB_PHY_STATUS0   0x400
 #define PCIE_LINKUP_ENABLE     (0x8020)
 #define PCIE_LTSSM_ENABLE_BIT  (0x1 << 11)
-#define PIPE_CLK_STABLE                (0x1 << 19)
-#define PHY_REF_PAD_BIT                (0x1 << 8)
-#define PHY_PWR_DOWN_BIT       (0x1 << 22)
-#define PHY_RST_ACK_BIT                (0x1 << 16)
 
 /* info located in sysctrl */
 #define SCTRL_PCIE_CMOS_OFFSET 0x60
 #define PCIE_DEBOUNCE_PARAM    0xF0F400
 #define PCIE_OE_BYPASS         (0x3 << 28)
 
+/*
+ * Max number of connected PCI slots at an external PCI bridge
+ *
+ * This is used on HiKey 970, which has a PEX 8606 bridge with 4 connected
+ * lanes (lane 0 upstream; of the other three lanes, one is connected to an
+ * on-board Ethernet adapter and the other two to M.2 and mini PCI slots).
+ *
+ * Each slot has a different clock source and uses a separate PERST# pin.
+ */
+#define MAX_PCI_SLOTS          3
+
+enum pcie_kirin_phy_type {
+       PCIE_KIRIN_INTERNAL_PHY,
+       PCIE_KIRIN_EXTERNAL_PHY
+};
+
+struct kirin_pcie {
+       enum pcie_kirin_phy_type        type;
+
+       struct dw_pcie  *pci;
+       struct regmap   *apb;
+       struct phy      *phy;
+       void            *phy_priv;      /* only for PCIE_KIRIN_INTERNAL_PHY */
+
+       /* DWC PERST# */
+       int             gpio_id_dwc_perst;
+
+       /* Per-slot PERST# */
+       int             num_slots;
+       int             gpio_id_reset[MAX_PCI_SLOTS];
+       const char      *reset_names[MAX_PCI_SLOTS];
+
+       /* Per-slot clkreq */
+       int             n_gpio_clkreq;
+       int             gpio_id_clkreq[MAX_PCI_SLOTS];
+       const char      *clkreq_names[MAX_PCI_SLOTS];
+};
+
+/*
+ * Kirin 960 PHY. Can't be split into a PHY driver without changing the
+ * DT schema.
+ */
+
+#define REF_CLK_FREQ                   100000000
+
+/* PHY info located in APB */
+#define PCIE_APB_PHY_CTRL0     0x0
+#define PCIE_APB_PHY_CTRL1     0x4
+#define PCIE_APB_PHY_STATUS0   0x400
+#define PIPE_CLK_STABLE                BIT(19)
+#define PHY_REF_PAD_BIT                BIT(8)
+#define PHY_PWR_DOWN_BIT       BIT(22)
+#define PHY_RST_ACK_BIT                BIT(16)
+
 /* peri_crg ctrl */
 #define CRGCTRL_PCIE_ASSERT_OFFSET     0x88
 #define CRGCTRL_PCIE_ASSERT_BIT                0x8c000000
 
 /* Time for delay */
-#define REF_2_PERST_MIN                20000
+#define REF_2_PERST_MIN                21000
 #define REF_2_PERST_MAX                25000
 #define PERST_2_ACCESS_MIN     10000
 #define PERST_2_ACCESS_MAX     12000
-#define LINK_WAIT_MIN          900
-#define LINK_WAIT_MAX          1000
 #define PIPE_CLK_WAIT_MIN      550
 #define PIPE_CLK_WAIT_MAX      600
 #define TIME_CMOS_MIN          100
 #define TIME_PHY_PD_MIN                10
 #define TIME_PHY_PD_MAX                11
 
-struct kirin_pcie {
-       struct dw_pcie  *pci;
-       void __iomem    *apb_base;
-       void __iomem    *phy_base;
+struct hi3660_pcie_phy {
+       struct device   *dev;
+       void __iomem    *base;
        struct regmap   *crgctrl;
        struct regmap   *sysctrl;
        struct clk      *apb_sys_clk;
        struct clk      *apb_phy_clk;
        struct clk      *phy_ref_clk;
-       struct clk      *pcie_aclk;
-       struct clk      *pcie_aux_clk;
-       int             gpio_id_reset;
+       struct clk      *aclk;
+       struct clk      *aux_clk;
 };
 
-/* Registers in PCIeCTRL */
-static inline void kirin_apb_ctrl_writel(struct kirin_pcie *kirin_pcie,
-                                        u32 val, u32 reg)
-{
-       writel(val, kirin_pcie->apb_base + reg);
-}
-
-static inline u32 kirin_apb_ctrl_readl(struct kirin_pcie *kirin_pcie, u32 reg)
-{
-       return readl(kirin_pcie->apb_base + reg);
-}
-
 /* Registers in PCIePHY */
-static inline void kirin_apb_phy_writel(struct kirin_pcie *kirin_pcie,
+static inline void kirin_apb_phy_writel(struct hi3660_pcie_phy *hi3660_pcie_phy,
                                        u32 val, u32 reg)
 {
-       writel(val, kirin_pcie->phy_base + reg);
+       writel(val, hi3660_pcie_phy->base + reg);
 }
 
-static inline u32 kirin_apb_phy_readl(struct kirin_pcie *kirin_pcie, u32 reg)
+static inline u32 kirin_apb_phy_readl(struct hi3660_pcie_phy *hi3660_pcie_phy,
+                                     u32 reg)
 {
-       return readl(kirin_pcie->phy_base + reg);
+       return readl(hi3660_pcie_phy->base + reg);
 }
 
-static long kirin_pcie_get_clk(struct kirin_pcie *kirin_pcie,
-                              struct platform_device *pdev)
+static int hi3660_pcie_phy_get_clk(struct hi3660_pcie_phy *phy)
 {
-       struct device *dev = &pdev->dev;
+       struct device *dev = phy->dev;
 
-       kirin_pcie->phy_ref_clk = devm_clk_get(dev, "pcie_phy_ref");
-       if (IS_ERR(kirin_pcie->phy_ref_clk))
-               return PTR_ERR(kirin_pcie->phy_ref_clk);
+       phy->phy_ref_clk = devm_clk_get(dev, "pcie_phy_ref");
+       if (IS_ERR(phy->phy_ref_clk))
+               return PTR_ERR(phy->phy_ref_clk);
 
-       kirin_pcie->pcie_aux_clk = devm_clk_get(dev, "pcie_aux");
-       if (IS_ERR(kirin_pcie->pcie_aux_clk))
-               return PTR_ERR(kirin_pcie->pcie_aux_clk);
+       phy->aux_clk = devm_clk_get(dev, "pcie_aux");
+       if (IS_ERR(phy->aux_clk))
+               return PTR_ERR(phy->aux_clk);
 
-       kirin_pcie->apb_phy_clk = devm_clk_get(dev, "pcie_apb_phy");
-       if (IS_ERR(kirin_pcie->apb_phy_clk))
-               return PTR_ERR(kirin_pcie->apb_phy_clk);
+       phy->apb_phy_clk = devm_clk_get(dev, "pcie_apb_phy");
+       if (IS_ERR(phy->apb_phy_clk))
+               return PTR_ERR(phy->apb_phy_clk);
 
-       kirin_pcie->apb_sys_clk = devm_clk_get(dev, "pcie_apb_sys");
-       if (IS_ERR(kirin_pcie->apb_sys_clk))
-               return PTR_ERR(kirin_pcie->apb_sys_clk);
+       phy->apb_sys_clk = devm_clk_get(dev, "pcie_apb_sys");
+       if (IS_ERR(phy->apb_sys_clk))
+               return PTR_ERR(phy->apb_sys_clk);
 
-       kirin_pcie->pcie_aclk = devm_clk_get(dev, "pcie_aclk");
-       if (IS_ERR(kirin_pcie->pcie_aclk))
-               return PTR_ERR(kirin_pcie->pcie_aclk);
+       phy->aclk = devm_clk_get(dev, "pcie_aclk");
+       if (IS_ERR(phy->aclk))
+               return PTR_ERR(phy->aclk);
 
        return 0;
 }
 
-static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie,
-                                   struct platform_device *pdev)
+static int hi3660_pcie_phy_get_resource(struct hi3660_pcie_phy *phy)
 {
-       kirin_pcie->apb_base =
-               devm_platform_ioremap_resource_byname(pdev, "apb");
-       if (IS_ERR(kirin_pcie->apb_base))
-               return PTR_ERR(kirin_pcie->apb_base);
-
-       kirin_pcie->phy_base =
-               devm_platform_ioremap_resource_byname(pdev, "phy");
-       if (IS_ERR(kirin_pcie->phy_base))
-               return PTR_ERR(kirin_pcie->phy_base);
-
-       kirin_pcie->crgctrl =
-               syscon_regmap_lookup_by_compatible("hisilicon,hi3660-crgctrl");
-       if (IS_ERR(kirin_pcie->crgctrl))
-               return PTR_ERR(kirin_pcie->crgctrl);
-
-       kirin_pcie->sysctrl =
-               syscon_regmap_lookup_by_compatible("hisilicon,hi3660-sctrl");
-       if (IS_ERR(kirin_pcie->sysctrl))
-               return PTR_ERR(kirin_pcie->sysctrl);
+       struct device *dev = phy->dev;
+       struct platform_device *pdev;
+
+       /* registers */
+       pdev = container_of(dev, struct platform_device, dev);
+
+       phy->base = devm_platform_ioremap_resource_byname(pdev, "phy");
+       if (IS_ERR(phy->base))
+               return PTR_ERR(phy->base);
+
+       phy->crgctrl = syscon_regmap_lookup_by_compatible("hisilicon,hi3660-crgctrl");
+       if (IS_ERR(phy->crgctrl))
+               return PTR_ERR(phy->crgctrl);
+
+       phy->sysctrl = syscon_regmap_lookup_by_compatible("hisilicon,hi3660-sctrl");
+       if (IS_ERR(phy->sysctrl))
+               return PTR_ERR(phy->sysctrl);
 
        return 0;
 }
 
-static int kirin_pcie_phy_init(struct kirin_pcie *kirin_pcie)
+static int hi3660_pcie_phy_start(struct hi3660_pcie_phy *phy)
 {
-       struct device *dev = kirin_pcie->pci->dev;
+       struct device *dev = phy->dev;
        u32 reg_val;
 
-       reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL1);
+       reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL1);
        reg_val &= ~PHY_REF_PAD_BIT;
-       kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL1);
+       kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL1);
 
-       reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL0);
+       reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL0);
        reg_val &= ~PHY_PWR_DOWN_BIT;
-       kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL0);
+       kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL0);
        usleep_range(TIME_PHY_PD_MIN, TIME_PHY_PD_MAX);
 
-       reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_CTRL1);
+       reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_CTRL1);
        reg_val &= ~PHY_RST_ACK_BIT;
-       kirin_apb_phy_writel(kirin_pcie, reg_val, PCIE_APB_PHY_CTRL1);
+       kirin_apb_phy_writel(phy, reg_val, PCIE_APB_PHY_CTRL1);
 
        usleep_range(PIPE_CLK_WAIT_MIN, PIPE_CLK_WAIT_MAX);
-       reg_val = kirin_apb_phy_readl(kirin_pcie, PCIE_APB_PHY_STATUS0);
+       reg_val = kirin_apb_phy_readl(phy, PCIE_APB_PHY_STATUS0);
        if (reg_val & PIPE_CLK_STABLE) {
                dev_err(dev, "PIPE clk is not stable\n");
                return -EINVAL;
@@ -198,102 +226,274 @@ static int kirin_pcie_phy_init(struct kirin_pcie *kirin_pcie)
        return 0;
 }
 
-static void kirin_pcie_oe_enable(struct kirin_pcie *kirin_pcie)
+static void hi3660_pcie_phy_oe_enable(struct hi3660_pcie_phy *phy)
 {
        u32 val;
 
-       regmap_read(kirin_pcie->sysctrl, SCTRL_PCIE_OE_OFFSET, &val);
+       regmap_read(phy->sysctrl, SCTRL_PCIE_OE_OFFSET, &val);
        val |= PCIE_DEBOUNCE_PARAM;
        val &= ~PCIE_OE_BYPASS;
-       regmap_write(kirin_pcie->sysctrl, SCTRL_PCIE_OE_OFFSET, val);
+       regmap_write(phy->sysctrl, SCTRL_PCIE_OE_OFFSET, val);
 }
 
-static int kirin_pcie_clk_ctrl(struct kirin_pcie *kirin_pcie, bool enable)
+static int hi3660_pcie_phy_clk_ctrl(struct hi3660_pcie_phy *phy, bool enable)
 {
        int ret = 0;
 
        if (!enable)
                goto close_clk;
 
-       ret = clk_set_rate(kirin_pcie->phy_ref_clk, REF_CLK_FREQ);
+       ret = clk_set_rate(phy->phy_ref_clk, REF_CLK_FREQ);
        if (ret)
                return ret;
 
-       ret = clk_prepare_enable(kirin_pcie->phy_ref_clk);
+       ret = clk_prepare_enable(phy->phy_ref_clk);
        if (ret)
                return ret;
 
-       ret = clk_prepare_enable(kirin_pcie->apb_sys_clk);
+       ret = clk_prepare_enable(phy->apb_sys_clk);
        if (ret)
                goto apb_sys_fail;
 
-       ret = clk_prepare_enable(kirin_pcie->apb_phy_clk);
+       ret = clk_prepare_enable(phy->apb_phy_clk);
        if (ret)
                goto apb_phy_fail;
 
-       ret = clk_prepare_enable(kirin_pcie->pcie_aclk);
+       ret = clk_prepare_enable(phy->aclk);
        if (ret)
                goto aclk_fail;
 
-       ret = clk_prepare_enable(kirin_pcie->pcie_aux_clk);
+       ret = clk_prepare_enable(phy->aux_clk);
        if (ret)
                goto aux_clk_fail;
 
        return 0;
 
 close_clk:
-       clk_disable_unprepare(kirin_pcie->pcie_aux_clk);
+       clk_disable_unprepare(phy->aux_clk);
 aux_clk_fail:
-       clk_disable_unprepare(kirin_pcie->pcie_aclk);
+       clk_disable_unprepare(phy->aclk);
 aclk_fail:
-       clk_disable_unprepare(kirin_pcie->apb_phy_clk);
+       clk_disable_unprepare(phy->apb_phy_clk);
 apb_phy_fail:
-       clk_disable_unprepare(kirin_pcie->apb_sys_clk);
+       clk_disable_unprepare(phy->apb_sys_clk);
 apb_sys_fail:
-       clk_disable_unprepare(kirin_pcie->phy_ref_clk);
+       clk_disable_unprepare(phy->phy_ref_clk);
 
        return ret;
 }
 
-static int kirin_pcie_power_on(struct kirin_pcie *kirin_pcie)
+static int hi3660_pcie_phy_power_on(struct kirin_pcie *pcie)
 {
+       struct hi3660_pcie_phy *phy = pcie->phy_priv;
        int ret;
 
        /* Power supply for Host */
-       regmap_write(kirin_pcie->sysctrl,
+       regmap_write(phy->sysctrl,
                     SCTRL_PCIE_CMOS_OFFSET, SCTRL_PCIE_CMOS_BIT);
        usleep_range(TIME_CMOS_MIN, TIME_CMOS_MAX);
-       kirin_pcie_oe_enable(kirin_pcie);
 
-       ret = kirin_pcie_clk_ctrl(kirin_pcie, true);
+       hi3660_pcie_phy_oe_enable(phy);
+
+       ret = hi3660_pcie_phy_clk_ctrl(phy, true);
        if (ret)
                return ret;
 
        /* ISO disable, PCIeCtrl, PHY assert and clk gate clear */
-       regmap_write(kirin_pcie->sysctrl,
+       regmap_write(phy->sysctrl,
                     SCTRL_PCIE_ISO_OFFSET, SCTRL_PCIE_ISO_BIT);
-       regmap_write(kirin_pcie->crgctrl,
+       regmap_write(phy->crgctrl,
                     CRGCTRL_PCIE_ASSERT_OFFSET, CRGCTRL_PCIE_ASSERT_BIT);
-       regmap_write(kirin_pcie->sysctrl,
+       regmap_write(phy->sysctrl,
                     SCTRL_PCIE_HPCLK_OFFSET, SCTRL_PCIE_HPCLK_BIT);
 
-       ret = kirin_pcie_phy_init(kirin_pcie);
+       ret = hi3660_pcie_phy_start(phy);
        if (ret)
-               goto close_clk;
+               goto disable_clks;
 
-       /* perst assert Endpoint */
-       if (!gpio_request(kirin_pcie->gpio_id_reset, "pcie_perst")) {
-               usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX);
-               ret = gpio_direction_output(kirin_pcie->gpio_id_reset, 1);
-               if (ret)
-                       goto close_clk;
-               usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX);
+       return 0;
+
+disable_clks:
+       hi3660_pcie_phy_clk_ctrl(phy, false);
+       return ret;
+}
+
+static int hi3660_pcie_phy_init(struct platform_device *pdev,
+                               struct kirin_pcie *pcie)
+{
+       struct device *dev = &pdev->dev;
+       struct hi3660_pcie_phy *phy;
+       int ret;
 
+       phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL);
+       if (!phy)
+               return -ENOMEM;
+
+       pcie->phy_priv = phy;
+       phy->dev = dev;
+
+       /* registers */
+       pdev = container_of(dev, struct platform_device, dev);
+
+       ret = hi3660_pcie_phy_get_clk(phy);
+       if (ret)
+               return ret;
+
+       return hi3660_pcie_phy_get_resource(phy);
+}
+
+static int hi3660_pcie_phy_power_off(struct kirin_pcie *pcie)
+{
+       struct hi3660_pcie_phy *phy = pcie->phy_priv;
+
+       /* Drop power supply for Host */
+       regmap_write(phy->sysctrl, SCTRL_PCIE_CMOS_OFFSET, 0x00);
+
+       hi3660_pcie_phy_clk_ctrl(phy, false);
+
+       return 0;
+}
+
+/*
+ * The non-PHY part starts here
+ */
+
+static const struct regmap_config pcie_kirin_regmap_conf = {
+       .name = "kirin_pcie_apb",
+       .reg_bits = 32,
+       .val_bits = 32,
+       .reg_stride = 4,
+};
+
+static int kirin_pcie_get_gpio_enable(struct kirin_pcie *pcie,
+                                     struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *np = dev->of_node;
+       char name[32];
+       int ret, i;
+
+       /* This is an optional property */
+       ret = of_gpio_named_count(np, "hisilicon,clken-gpios");
+       if (ret < 0)
                return 0;
+
+       if (ret > MAX_PCI_SLOTS) {
+               dev_err(dev, "Too many GPIO clock requests!\n");
+               return -EINVAL;
        }
 
-close_clk:
-       kirin_pcie_clk_ctrl(kirin_pcie, false);
+       pcie->n_gpio_clkreq = ret;
+
+       for (i = 0; i < pcie->n_gpio_clkreq; i++) {
+               pcie->gpio_id_clkreq[i] = of_get_named_gpio(dev->of_node,
+                                                   "hisilicon,clken-gpios", i);
+               if (pcie->gpio_id_clkreq[i] < 0)
+                       return pcie->gpio_id_clkreq[i];
+
+               sprintf(name, "pcie_clkreq_%d", i);
+               pcie->clkreq_names[i] = devm_kstrdup_const(dev, name,
+                                                           GFP_KERNEL);
+               if (!pcie->clkreq_names[i])
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static int kirin_pcie_parse_port(struct kirin_pcie *pcie,
+                                struct platform_device *pdev,
+                                struct device_node *node)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *parent, *child;
+       int ret, slot, i;
+       char name[32];
+
+       for_each_available_child_of_node(node, parent) {
+               for_each_available_child_of_node(parent, child) {
+                       i = pcie->num_slots;
+
+                       pcie->gpio_id_reset[i] = of_get_named_gpio(child,
+                                                       "reset-gpios", 0);
+                       if (pcie->gpio_id_reset[i] < 0)
+                               continue;
+
+                       pcie->num_slots++;
+                       if (pcie->num_slots > MAX_PCI_SLOTS) {
+                               dev_err(dev, "Too many PCI slots!\n");
+                               ret = -EINVAL;
+                               goto put_node;
+                       }
+
+                       ret = of_pci_get_devfn(child);
+                       if (ret < 0) {
+                               dev_err(dev, "failed to parse devfn: %d\n", ret);
+                               goto put_node;
+                       }
+
+                       slot = PCI_SLOT(ret);
+
+                       sprintf(name, "pcie_perst_%d", slot);
+                       pcie->reset_names[i] = devm_kstrdup_const(dev, name,
+                                                               GFP_KERNEL);
+                       if (!pcie->reset_names[i]) {
+                               ret = -ENOMEM;
+                               goto put_node;
+                       }
+               }
+       }
+
+       return 0;
+
+put_node:
+       of_node_put(child);
+       of_node_put(parent);
+       return ret;
+}
+
+static long kirin_pcie_get_resource(struct kirin_pcie *kirin_pcie,
+                                   struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct device_node *child, *node = dev->of_node;
+       void __iomem *apb_base;
+       int ret;
+
+       apb_base = devm_platform_ioremap_resource_byname(pdev, "apb");
+       if (IS_ERR(apb_base))
+               return PTR_ERR(apb_base);
+
+       kirin_pcie->apb = devm_regmap_init_mmio(dev, apb_base,
+                                               &pcie_kirin_regmap_conf);
+       if (IS_ERR(kirin_pcie->apb))
+               return PTR_ERR(kirin_pcie->apb);
+
+       /* pcie internal PERST# gpio */
+       kirin_pcie->gpio_id_dwc_perst = of_get_named_gpio(dev->of_node,
+                                                         "reset-gpios", 0);
+       if (kirin_pcie->gpio_id_dwc_perst == -EPROBE_DEFER) {
+               return -EPROBE_DEFER;
+       } else if (!gpio_is_valid(kirin_pcie->gpio_id_dwc_perst)) {
+               dev_err(dev, "unable to get a valid gpio pin\n");
+               return -ENODEV;
+       }
+
+       ret = kirin_pcie_get_gpio_enable(kirin_pcie, pdev);
+       if (ret)
+               return ret;
+
+       /* Parse OF children */
+       for_each_available_child_of_node(node, child) {
+               ret = kirin_pcie_parse_port(kirin_pcie, pdev, child);
+               if (ret)
+                       goto put_node;
+       }
+
+       return 0;
+
+put_node:
+       of_node_put(child);
        return ret;
 }
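
Both kirin_pcie_parse_port() and its caller above follow the reference-counting rule for for_each_available_child_of_node(): the iterator releases the previous node's reference on each step and the last one at normal loop exit, but any early break or goto leaves a reference held that must be dropped by hand. The idiom in isolation:

	for_each_available_child_of_node(node, child) {
		ret = do_something(child);	/* hypothetical per-child work */
		if (ret) {
			of_node_put(child);	/* balance the ref the loop holds */
			return ret;
		}
	}
	/* Normal termination: nothing to put, the iterator released it. */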
 
@@ -302,13 +502,13 @@ static void kirin_pcie_sideband_dbi_w_mode(struct kirin_pcie *kirin_pcie,
 {
        u32 val;
 
-       val = kirin_apb_ctrl_readl(kirin_pcie, SOC_PCIECTRL_CTRL0_ADDR);
+       regmap_read(kirin_pcie->apb, SOC_PCIECTRL_CTRL0_ADDR, &val);
        if (on)
                val = val | PCIE_ELBI_SLV_DBI_ENABLE;
        else
                val = val & ~PCIE_ELBI_SLV_DBI_ENABLE;
 
-       kirin_apb_ctrl_writel(kirin_pcie, val, SOC_PCIECTRL_CTRL0_ADDR);
+       regmap_write(kirin_pcie->apb, SOC_PCIECTRL_CTRL0_ADDR, val);
 }
 
 static void kirin_pcie_sideband_dbi_r_mode(struct kirin_pcie *kirin_pcie,
@@ -316,13 +516,13 @@ static void kirin_pcie_sideband_dbi_r_mode(struct kirin_pcie *kirin_pcie,
 {
        u32 val;
 
-       val = kirin_apb_ctrl_readl(kirin_pcie, SOC_PCIECTRL_CTRL1_ADDR);
+       regmap_read(kirin_pcie->apb, SOC_PCIECTRL_CTRL1_ADDR, &val);
        if (on)
                val = val | PCIE_ELBI_SLV_DBI_ENABLE;
        else
                val = val & ~PCIE_ELBI_SLV_DBI_ENABLE;
 
-       kirin_apb_ctrl_writel(kirin_pcie, val, SOC_PCIECTRL_CTRL1_ADDR);
+       regmap_write(kirin_pcie->apb, SOC_PCIECTRL_CTRL1_ADDR, val);
 }
 
 static int kirin_pcie_rd_own_conf(struct pci_bus *bus, unsigned int devfn,
@@ -351,9 +551,32 @@ static int kirin_pcie_wr_own_conf(struct pci_bus *bus, unsigned int devfn,
        return PCIBIOS_SUCCESSFUL;
 }
 
+static int kirin_pcie_add_bus(struct pci_bus *bus)
+{
+       struct dw_pcie *pci = to_dw_pcie_from_pp(bus->sysdata);
+       struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci);
+       int i, ret;
+
+       if (!kirin_pcie->num_slots)
+               return 0;
+
+       /* Send PERST# to each slot */
+       for (i = 0; i < kirin_pcie->num_slots; i++) {
+               ret = gpio_direction_output(kirin_pcie->gpio_id_reset[i], 1);
+               if (ret) {
+                       dev_err(pci->dev, "PERST# %s error: %d\n",
+                               kirin_pcie->reset_names[i], ret);
+               }
+       }
+       usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX);
+
+       return 0;
+}
+
 static struct pci_ops kirin_pci_ops = {
        .read = kirin_pcie_rd_own_conf,
        .write = kirin_pcie_wr_own_conf,
+       .add_bus = kirin_pcie_add_bus,
 };
 
 static u32 kirin_pcie_read_dbi(struct dw_pcie *pci, void __iomem *base,
@@ -382,8 +605,9 @@ static void kirin_pcie_write_dbi(struct dw_pcie *pci, void __iomem *base,
 static int kirin_pcie_link_up(struct dw_pcie *pci)
 {
        struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci);
-       u32 val = kirin_apb_ctrl_readl(kirin_pcie, PCIE_APB_PHY_STATUS0);
+       u32 val;
 
+       regmap_read(kirin_pcie->apb, PCIE_APB_PHY_STATUS0, &val);
        if ((val & PCIE_LINKUP_ENABLE) == PCIE_LINKUP_ENABLE)
                return 1;
 
@@ -395,8 +619,8 @@ static int kirin_pcie_start_link(struct dw_pcie *pci)
        struct kirin_pcie *kirin_pcie = to_kirin_pcie(pci);
 
        /* assert LTSSM enable */
-       kirin_apb_ctrl_writel(kirin_pcie, PCIE_LTSSM_ENABLE_BIT,
-                             PCIE_APP_LTSSM_ENABLE);
+       regmap_write(kirin_pcie->apb, PCIE_APP_LTSSM_ENABLE,
+                    PCIE_LTSSM_ENABLE_BIT);
 
        return 0;
 }
@@ -408,6 +632,44 @@ static int kirin_pcie_host_init(struct pcie_port *pp)
        return 0;
 }
 
+static int kirin_pcie_gpio_request(struct kirin_pcie *kirin_pcie,
+                                  struct device *dev)
+{
+       int ret, i;
+
+       for (i = 0; i < kirin_pcie->num_slots; i++) {
+               if (!gpio_is_valid(kirin_pcie->gpio_id_reset[i])) {
+                       dev_err(dev, "unable to get a valid %s gpio\n",
+                               kirin_pcie->reset_names[i]);
+                       return -ENODEV;
+               }
+
+               ret = devm_gpio_request(dev, kirin_pcie->gpio_id_reset[i],
+                                       kirin_pcie->reset_names[i]);
+               if (ret)
+                       return ret;
+       }
+
+       for (i = 0; i < kirin_pcie->n_gpio_clkreq; i++) {
+               if (!gpio_is_valid(kirin_pcie->gpio_id_clkreq[i])) {
+                       dev_err(dev, "unable to get a valid %s gpio\n",
+                               kirin_pcie->clkreq_names[i]);
+                       return -ENODEV;
+               }
+
+               ret = devm_gpio_request(dev, kirin_pcie->gpio_id_clkreq[i],
+                                       kirin_pcie->clkreq_names[i]);
+               if (ret)
+                       return ret;
+
+               ret = gpio_direction_output(kirin_pcie->gpio_id_clkreq[i], 0);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 static const struct dw_pcie_ops kirin_dw_pcie_ops = {
        .read_dbi = kirin_pcie_read_dbi,
        .write_dbi = kirin_pcie_write_dbi,
@@ -419,8 +681,99 @@ static const struct dw_pcie_host_ops kirin_pcie_host_ops = {
        .host_init = kirin_pcie_host_init,
 };
 
+static int kirin_pcie_power_off(struct kirin_pcie *kirin_pcie)
+{
+       int i;
+
+       if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY)
+               return hi3660_pcie_phy_power_off(kirin_pcie);
+
+       for (i = 0; i < kirin_pcie->n_gpio_clkreq; i++)
+               gpio_direction_output(kirin_pcie->gpio_id_clkreq[i], 1);
+
+       phy_power_off(kirin_pcie->phy);
+       phy_exit(kirin_pcie->phy);
+
+       return 0;
+}
+
+static int kirin_pcie_power_on(struct platform_device *pdev,
+                              struct kirin_pcie *kirin_pcie)
+{
+       struct device *dev = &pdev->dev;
+       int ret;
+
+       if (kirin_pcie->type == PCIE_KIRIN_INTERNAL_PHY) {
+               ret = hi3660_pcie_phy_init(pdev, kirin_pcie);
+               if (ret)
+                       return ret;
+
+               ret = hi3660_pcie_phy_power_on(kirin_pcie);
+               if (ret)
+                       return ret;
+       } else {
+               kirin_pcie->phy = devm_of_phy_get(dev, dev->of_node, NULL);
+               if (IS_ERR(kirin_pcie->phy))
+                       return PTR_ERR(kirin_pcie->phy);
+
+               ret = kirin_pcie_gpio_request(kirin_pcie, dev);
+               if (ret)
+                       return ret;
+
+               ret = phy_init(kirin_pcie->phy);
+               if (ret)
+                       goto err;
+
+               ret = phy_power_on(kirin_pcie->phy);
+               if (ret)
+                       goto err;
+       }
+
+       /* Assert PERST# to the endpoint */
+       usleep_range(REF_2_PERST_MIN, REF_2_PERST_MAX);
+
+       if (!gpio_request(kirin_pcie->gpio_id_dwc_perst, "pcie_perst_bridge")) {
+               ret = gpio_direction_output(kirin_pcie->gpio_id_dwc_perst, 1);
+               if (ret)
+                       goto err;
+       }
+
+       usleep_range(PERST_2_ACCESS_MIN, PERST_2_ACCESS_MAX);
+
+       return 0;
+err:
+       kirin_pcie_power_off(kirin_pcie);
+
+       return ret;
+}
+
+static int __exit kirin_pcie_remove(struct platform_device *pdev)
+{
+       struct kirin_pcie *kirin_pcie = platform_get_drvdata(pdev);
+
+       dw_pcie_host_deinit(&kirin_pcie->pci->pp);
+
+       kirin_pcie_power_off(kirin_pcie);
+
+       return 0;
+}
+
+static const struct of_device_id kirin_pcie_match[] = {
+       {
+               .compatible = "hisilicon,kirin960-pcie",
+               .data = (void *)PCIE_KIRIN_INTERNAL_PHY
+       },
+       {
+               .compatible = "hisilicon,kirin970-pcie",
+               .data = (void *)PCIE_KIRIN_EXTERNAL_PHY
+       },
+       {},
+};
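
The per-compatible PHY selection travels in the of_device_id .data pointer; stuffing a small enum through void * and recovering it with a cast through long is the conventional trick. The probe function below uses of_match_device() plus the cast; the of_device_get_match_data() helper collapses the same lookup into one call, as a sketch:

	/* Equivalent retrieval, assuming probe only runs on a matched device: */
	phy_type = (long)of_device_get_match_data(dev);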
+
 static int kirin_pcie_probe(struct platform_device *pdev)
 {
+       enum pcie_kirin_phy_type phy_type;
+       const struct of_device_id *of_id;
        struct device *dev = &pdev->dev;
        struct kirin_pcie *kirin_pcie;
        struct dw_pcie *pci;
@@ -431,6 +784,14 @@ static int kirin_pcie_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
+       of_id = of_match_device(kirin_pcie_match, dev);
+       if (!of_id) {
+               dev_err(dev, "OF data missing\n");
+               return -EINVAL;
+       }
+
+       phy_type = (long)of_id->data;
+
        kirin_pcie = devm_kzalloc(dev, sizeof(struct kirin_pcie), GFP_KERNEL);
        if (!kirin_pcie)
                return -ENOMEM;
@@ -443,44 +804,33 @@ static int kirin_pcie_probe(struct platform_device *pdev)
        pci->ops = &kirin_dw_pcie_ops;
        pci->pp.ops = &kirin_pcie_host_ops;
        kirin_pcie->pci = pci;
-
-       ret = kirin_pcie_get_clk(kirin_pcie, pdev);
-       if (ret)
-               return ret;
+       kirin_pcie->type = phy_type;
 
        ret = kirin_pcie_get_resource(kirin_pcie, pdev);
        if (ret)
                return ret;
 
-       kirin_pcie->gpio_id_reset = of_get_named_gpio(dev->of_node,
-                                                     "reset-gpios", 0);
-       if (kirin_pcie->gpio_id_reset == -EPROBE_DEFER) {
-               return -EPROBE_DEFER;
-       } else if (!gpio_is_valid(kirin_pcie->gpio_id_reset)) {
-               dev_err(dev, "unable to get a valid gpio pin\n");
-               return -ENODEV;
-       }
+       platform_set_drvdata(pdev, kirin_pcie);
 
-       ret = kirin_pcie_power_on(kirin_pcie);
+       ret = kirin_pcie_power_on(pdev, kirin_pcie);
        if (ret)
                return ret;
 
-       platform_set_drvdata(pdev, kirin_pcie);
-
        return dw_pcie_host_init(&pci->pp);
 }
 
-static const struct of_device_id kirin_pcie_match[] = {
-       { .compatible = "hisilicon,kirin960-pcie" },
-       {},
-};
-
 static struct platform_driver kirin_pcie_driver = {
        .probe                  = kirin_pcie_probe,
+       .remove                 = __exit_p(kirin_pcie_remove),
        .driver                 = {
                .name                   = "kirin-pcie",
-               .of_match_table = kirin_pcie_match,
-               .suppress_bind_attrs = true,
+               .of_match_table         = kirin_pcie_match,
+               .suppress_bind_attrs    = true,
        },
 };
-builtin_platform_driver(kirin_pcie_driver);
+module_platform_driver(kirin_pcie_driver);
+
+MODULE_DEVICE_TABLE(of, kirin_pcie_match);
+MODULE_DESCRIPTION("PCIe host controller driver for Kirin Phone SoCs");
+MODULE_AUTHOR("Xiaowei Song <songxiaowei@huawei.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
new file mode 100644
index 0000000..7b17da2
--- /dev/null
+++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
@@ -0,0 +1,721 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Qualcomm PCIe Endpoint controller driver
+ *
+ * Copyright (c) 2020, The Linux Foundation. All rights reserved.
+ * Author: Siddartha Mohanadoss <smohanad@codeaurora.org>
+ *
+ * Copyright (c) 2021, Linaro Ltd.
+ * Author: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/mfd/syscon.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/regmap.h>
+#include <linux/reset.h>
+
+#include "pcie-designware.h"
+
+/* PARF registers */
+#define PARF_SYS_CTRL                          0x00
+#define PARF_DB_CTRL                           0x10
+#define PARF_PM_CTRL                           0x20
+#define PARF_MHI_BASE_ADDR_LOWER               0x178
+#define PARF_MHI_BASE_ADDR_UPPER               0x17c
+#define PARF_DEBUG_INT_EN                      0x190
+#define PARF_AXI_MSTR_RD_HALT_NO_WRITES                0x1a4
+#define PARF_AXI_MSTR_WR_ADDR_HALT             0x1a8
+#define PARF_Q2A_FLUSH                         0x1ac
+#define PARF_LTSSM                             0x1b0
+#define PARF_CFG_BITS                          0x210
+#define PARF_INT_ALL_STATUS                    0x224
+#define PARF_INT_ALL_CLEAR                     0x228
+#define PARF_INT_ALL_MASK                      0x22c
+#define PARF_SLV_ADDR_MSB_CTRL                 0x2c0
+#define PARF_DBI_BASE_ADDR                     0x350
+#define PARF_DBI_BASE_ADDR_HI                  0x354
+#define PARF_SLV_ADDR_SPACE_SIZE               0x358
+#define PARF_SLV_ADDR_SPACE_SIZE_HI            0x35c
+#define PARF_ATU_BASE_ADDR                     0x634
+#define PARF_ATU_BASE_ADDR_HI                  0x638
+#define PARF_SRIS_MODE                         0x644
+#define PARF_DEVICE_TYPE                       0x1000
+#define PARF_BDF_TO_SID_CFG                    0x2c00
+
+/* PARF_INT_ALL_{STATUS/CLEAR/MASK} register fields */
+#define PARF_INT_ALL_LINK_DOWN                 BIT(1)
+#define PARF_INT_ALL_BME                       BIT(2)
+#define PARF_INT_ALL_PM_TURNOFF                        BIT(3)
+#define PARF_INT_ALL_DEBUG                     BIT(4)
+#define PARF_INT_ALL_LTR                       BIT(5)
+#define PARF_INT_ALL_MHI_Q6                    BIT(6)
+#define PARF_INT_ALL_MHI_A7                    BIT(7)
+#define PARF_INT_ALL_DSTATE_CHANGE             BIT(8)
+#define PARF_INT_ALL_L1SUB_TIMEOUT             BIT(9)
+#define PARF_INT_ALL_MMIO_WRITE                        BIT(10)
+#define PARF_INT_ALL_CFG_WRITE                 BIT(11)
+#define PARF_INT_ALL_BRIDGE_FLUSH_N            BIT(12)
+#define PARF_INT_ALL_LINK_UP                   BIT(13)
+#define PARF_INT_ALL_AER_LEGACY                        BIT(14)
+#define PARF_INT_ALL_PLS_ERR                   BIT(15)
+#define PARF_INT_ALL_PME_LEGACY                        BIT(16)
+#define PARF_INT_ALL_PLS_PME                   BIT(17)
+
+/* PARF_BDF_TO_SID_CFG register fields */
+#define PARF_BDF_TO_SID_BYPASS                 BIT(0)
+
+/* PARF_DEBUG_INT_EN register fields */
+#define PARF_DEBUG_INT_PM_DSTATE_CHANGE                BIT(1)
+#define PARF_DEBUG_INT_CFG_BUS_MASTER_EN       BIT(2)
+#define PARF_DEBUG_INT_RADM_PM_TURNOFF         BIT(3)
+
+/* PARF_DEVICE_TYPE register fields */
+#define PARF_DEVICE_TYPE_EP                    0x0
+
+/* PARF_PM_CTRL register fields */
+#define PARF_PM_CTRL_REQ_EXIT_L1               BIT(1)
+#define PARF_PM_CTRL_READY_ENTR_L23            BIT(2)
+#define PARF_PM_CTRL_REQ_NOT_ENTR_L1           BIT(5)
+
+/* PARF_AXI_MSTR_RD_HALT_NO_WRITES register fields */
+#define PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN      BIT(0)
+
+/* PARF_AXI_MSTR_WR_ADDR_HALT register fields */
+#define PARF_AXI_MSTR_WR_ADDR_HALT_EN          BIT(31)
+
+/* PARF_Q2A_FLUSH register fields */
+#define PARF_Q2A_FLUSH_EN                      BIT(16)
+
+/* PARF_SYS_CTRL register fields */
+#define PARF_SYS_CTRL_AUX_PWR_DET              BIT(4)
+#define PARF_SYS_CTRL_CORE_CLK_CGC_DIS         BIT(6)
+#define PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE     BIT(11)
+
+/* PARF_DB_CTRL register fields */
+#define PARF_DB_CTRL_INSR_DBNCR_BLOCK          BIT(0)
+#define PARF_DB_CTRL_RMVL_DBNCR_BLOCK          BIT(1)
+#define PARF_DB_CTRL_DBI_WKP_BLOCK             BIT(4)
+#define PARF_DB_CTRL_SLV_WKP_BLOCK             BIT(5)
+#define PARF_DB_CTRL_MST_WKP_BLOCK             BIT(6)
+
+/* PARF_CFG_BITS register fields */
+#define PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN BIT(1)
+
+/* ELBI registers */
+#define ELBI_SYS_STTS                          0x08
+
+/* DBI registers */
+#define DBI_CON_STATUS                         0x44
+
+/* DBI register fields */
+#define DBI_CON_STATUS_POWER_STATE_MASK                GENMASK(1, 0)
+
+#define XMLH_LINK_UP                           0x400
+#define CORE_RESET_TIME_US_MIN                 1000
+#define CORE_RESET_TIME_US_MAX                 1005
+#define WAKE_DELAY_US                          2000 /* 2 ms */
+
+#define to_pcie_ep(x)                          dev_get_drvdata((x)->dev)
+
+enum qcom_pcie_ep_link_status {
+       QCOM_PCIE_EP_LINK_DISABLED,
+       QCOM_PCIE_EP_LINK_ENABLED,
+       QCOM_PCIE_EP_LINK_UP,
+       QCOM_PCIE_EP_LINK_DOWN,
+};
+
+static struct clk_bulk_data qcom_pcie_ep_clks[] = {
+       { .id = "cfg" },
+       { .id = "aux" },
+       { .id = "bus_master" },
+       { .id = "bus_slave" },
+       { .id = "ref" },
+       { .id = "sleep" },
+       { .id = "slave_q2a" },
+};
+
+struct qcom_pcie_ep {
+       struct dw_pcie pci;
+
+       void __iomem *parf;
+       void __iomem *elbi;
+       struct regmap *perst_map;
+       struct resource *mmio_res;
+
+       struct reset_control *core_reset;
+       struct gpio_desc *reset;
+       struct gpio_desc *wake;
+       struct phy *phy;
+
+       u32 perst_en;
+       u32 perst_sep_en;
+
+       enum qcom_pcie_ep_link_status link_status;
+       int global_irq;
+       int perst_irq;
+};
+
+static int qcom_pcie_ep_core_reset(struct qcom_pcie_ep *pcie_ep)
+{
+       struct dw_pcie *pci = &pcie_ep->pci;
+       struct device *dev = pci->dev;
+       int ret;
+
+       ret = reset_control_assert(pcie_ep->core_reset);
+       if (ret) {
+               dev_err(dev, "Cannot assert core reset\n");
+               return ret;
+       }
+
+       usleep_range(CORE_RESET_TIME_US_MIN, CORE_RESET_TIME_US_MAX);
+
+       ret = reset_control_deassert(pcie_ep->core_reset);
+       if (ret) {
+               dev_err(dev, "Cannot de-assert core reset\n");
+               return ret;
+       }
+
+       usleep_range(CORE_RESET_TIME_US_MIN, CORE_RESET_TIME_US_MAX);
+
+       return 0;
+}
+
+/*
+ * Delatch PERST_EN and PERST_SEPARATION_ENABLE with TCSR to avoid
+ * device reset during host reboot and hibernation. The driver is
+ * expected to handle this situation.
+ */
+static void qcom_pcie_ep_configure_tcsr(struct qcom_pcie_ep *pcie_ep)
+{
+       regmap_write(pcie_ep->perst_map, pcie_ep->perst_en, 0);
+       regmap_write(pcie_ep->perst_map, pcie_ep->perst_sep_en, 0);
+}
+
+static int qcom_pcie_dw_link_up(struct dw_pcie *pci)
+{
+       struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
+       u32 reg;
+
+       reg = readl_relaxed(pcie_ep->elbi + ELBI_SYS_STTS);
+
+       return reg & XMLH_LINK_UP;
+}
+
+static int qcom_pcie_dw_start_link(struct dw_pcie *pci)
+{
+       struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
+
+       enable_irq(pcie_ep->perst_irq);
+
+       return 0;
+}
+
+static void qcom_pcie_dw_stop_link(struct dw_pcie *pci)
+{
+       struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
+
+       disable_irq(pcie_ep->perst_irq);
+}
+
+static int qcom_pcie_perst_deassert(struct dw_pcie *pci)
+{
+       struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
+       struct device *dev = pci->dev;
+       u32 val, offset;
+       int ret;
+
+       ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks),
+                                     qcom_pcie_ep_clks);
+       if (ret)
+               return ret;
+
+       ret = qcom_pcie_ep_core_reset(pcie_ep);
+       if (ret)
+               goto err_disable_clk;
+
+       ret = phy_init(pcie_ep->phy);
+       if (ret)
+               goto err_disable_clk;
+
+       ret = phy_power_on(pcie_ep->phy);
+       if (ret)
+               goto err_phy_exit;
+
+       /* Assert WAKE# to RC to indicate device is ready */
+       gpiod_set_value_cansleep(pcie_ep->wake, 1);
+       usleep_range(WAKE_DELAY_US, WAKE_DELAY_US + 500);
+       gpiod_set_value_cansleep(pcie_ep->wake, 0);
+
+       qcom_pcie_ep_configure_tcsr(pcie_ep);
+
+       /* Disable BDF to SID mapping */
+       val = readl_relaxed(pcie_ep->parf + PARF_BDF_TO_SID_CFG);
+       val |= PARF_BDF_TO_SID_BYPASS;
+       writel_relaxed(val, pcie_ep->parf + PARF_BDF_TO_SID_CFG);
+
+       /* Enable debug IRQ */
+       val = readl_relaxed(pcie_ep->parf + PARF_DEBUG_INT_EN);
+       val |= PARF_DEBUG_INT_RADM_PM_TURNOFF |
+              PARF_DEBUG_INT_CFG_BUS_MASTER_EN |
+              PARF_DEBUG_INT_PM_DSTATE_CHANGE;
+       writel_relaxed(val, pcie_ep->parf + PARF_DEBUG_INT_EN);
+
+       /* Configure PCIe to endpoint mode */
+       writel_relaxed(PARF_DEVICE_TYPE_EP, pcie_ep->parf + PARF_DEVICE_TYPE);
+
+       /* Allow entering L1 state */
+       val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL);
+       val &= ~PARF_PM_CTRL_REQ_NOT_ENTR_L1;
+       writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL);
+
+       /* Read halts write */
+       val = readl_relaxed(pcie_ep->parf + PARF_AXI_MSTR_RD_HALT_NO_WRITES);
+       val &= ~PARF_AXI_MSTR_RD_HALT_NO_WRITE_EN;
+       writel_relaxed(val, pcie_ep->parf + PARF_AXI_MSTR_RD_HALT_NO_WRITES);
+
+       /* Write after write halt */
+       val = readl_relaxed(pcie_ep->parf + PARF_AXI_MSTR_WR_ADDR_HALT);
+       val |= PARF_AXI_MSTR_WR_ADDR_HALT_EN;
+       writel_relaxed(val, pcie_ep->parf + PARF_AXI_MSTR_WR_ADDR_HALT);
+
+       /* Q2A flush disable */
+       val = readl_relaxed(pcie_ep->parf + PARF_Q2A_FLUSH);
+       val &= ~PARF_Q2A_FLUSH_EN;
+       writel_relaxed(val, pcie_ep->parf + PARF_Q2A_FLUSH);
+
+       /* Disable DBI Wakeup, core clock CGC and enable AUX power */
+       val = readl_relaxed(pcie_ep->parf + PARF_SYS_CTRL);
+       val |= PARF_SYS_CTRL_SLV_DBI_WAKE_DISABLE |
+              PARF_SYS_CTRL_CORE_CLK_CGC_DIS |
+              PARF_SYS_CTRL_AUX_PWR_DET;
+       writel_relaxed(val, pcie_ep->parf + PARF_SYS_CTRL);
+
+       /* Disable the debouncers */
+       val = readl_relaxed(pcie_ep->parf + PARF_DB_CTRL);
+       val |= PARF_DB_CTRL_INSR_DBNCR_BLOCK | PARF_DB_CTRL_RMVL_DBNCR_BLOCK |
+              PARF_DB_CTRL_DBI_WKP_BLOCK | PARF_DB_CTRL_SLV_WKP_BLOCK |
+              PARF_DB_CTRL_MST_WKP_BLOCK;
+       writel_relaxed(val, pcie_ep->parf + PARF_DB_CTRL);
+
+       /* Request to exit from L1SS for MSI and LTR MSG */
+       val = readl_relaxed(pcie_ep->parf + PARF_CFG_BITS);
+       val |= PARF_CFG_BITS_REQ_EXIT_L1SS_MSI_LTR_EN;
+       writel_relaxed(val, pcie_ep->parf + PARF_CFG_BITS);
+
+       dw_pcie_dbi_ro_wr_en(pci);
+
+       /* Set the L0s Exit Latency to 2us-4us = 0x6 */
+       offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+       val = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
+       val &= ~PCI_EXP_LNKCAP_L0SEL;
+       val |= FIELD_PREP(PCI_EXP_LNKCAP_L0SEL, 0x6);
+       dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, val);
+
+       /* Set the L1 Exit Latency to 32us-64us = 0x6 */
+       offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+       val = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP);
+       val &= ~PCI_EXP_LNKCAP_L1EL;
+       val |= FIELD_PREP(PCI_EXP_LNKCAP_L1EL, 0x6);
+       dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, val);
+
+       dw_pcie_dbi_ro_wr_dis(pci);
+
+       writel_relaxed(0, pcie_ep->parf + PARF_INT_ALL_MASK);
+       val = PARF_INT_ALL_LINK_DOWN | PARF_INT_ALL_BME |
+             PARF_INT_ALL_PM_TURNOFF | PARF_INT_ALL_DSTATE_CHANGE |
+             PARF_INT_ALL_LINK_UP;
+       writel_relaxed(val, pcie_ep->parf + PARF_INT_ALL_MASK);
+
+       ret = dw_pcie_ep_init_complete(&pcie_ep->pci.ep);
+       if (ret) {
+               dev_err(dev, "Failed to complete initialization: %d\n", ret);
+               goto err_phy_power_off;
+       }
+
+       /*
+        * The physical address of the MMIO region which is exposed as the BAR
+        * should be written to MHI BASE registers.
+        */
+       writel_relaxed(pcie_ep->mmio_res->start,
+                      pcie_ep->parf + PARF_MHI_BASE_ADDR_LOWER);
+       writel_relaxed(0, pcie_ep->parf + PARF_MHI_BASE_ADDR_UPPER);
+
+       dw_pcie_ep_init_notify(&pcie_ep->pci.ep);
+
+       /* Enable LTSSM */
+       val = readl_relaxed(pcie_ep->parf + PARF_LTSSM);
+       val |= BIT(8);
+       writel_relaxed(val, pcie_ep->parf + PARF_LTSSM);
+
+       return 0;
+
+err_phy_power_off:
+       phy_power_off(pcie_ep->phy);
+err_phy_exit:
+       phy_exit(pcie_ep->phy);
+err_disable_clk:
+       clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks),
+                                  qcom_pcie_ep_clks);
+
+       return ret;
+}
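+
+/*
+ * qcom_pcie_perst_deassert() above is dominated by readl_relaxed()/
+ * writel_relaxed() read-modify-write sequences on PARF registers. A
+ * hypothetical helper capturing the repeated shape, shown only to make
+ * the pattern explicit:
+ *
+ *	static void parf_rmw(struct qcom_pcie_ep *pcie_ep, u32 off,
+ *			     u32 clr, u32 set)
+ *	{
+ *		u32 val = readl_relaxed(pcie_ep->parf + off);
+ *
+ *		val &= ~clr;
+ *		val |= set;
+ *		writel_relaxed(val, pcie_ep->parf + off);
+ *	}
+ *
+ * e.g. the "allow entering L1" step would read:
+ *	parf_rmw(pcie_ep, PARF_PM_CTRL, PARF_PM_CTRL_REQ_NOT_ENTR_L1, 0);
+ */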
+
+static void qcom_pcie_perst_assert(struct dw_pcie *pci)
+{
+       struct qcom_pcie_ep *pcie_ep = to_pcie_ep(pci);
+       struct device *dev = pci->dev;
+
+       if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED) {
+               dev_dbg(dev, "Link is already disabled\n");
+               return;
+       }
+
+       phy_power_off(pcie_ep->phy);
+       phy_exit(pcie_ep->phy);
+       clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks),
+                                  qcom_pcie_ep_clks);
+       pcie_ep->link_status = QCOM_PCIE_EP_LINK_DISABLED;
+}
+
+/* Common DWC controller ops */
+static const struct dw_pcie_ops pci_ops = {
+       .link_up = qcom_pcie_dw_link_up,
+       .start_link = qcom_pcie_dw_start_link,
+       .stop_link = qcom_pcie_dw_stop_link,
+};
+
+static int qcom_pcie_ep_get_io_resources(struct platform_device *pdev,
+                                        struct qcom_pcie_ep *pcie_ep)
+{
+       struct device *dev = &pdev->dev;
+       struct dw_pcie *pci = &pcie_ep->pci;
+       struct device_node *syscon;
+       struct resource *res;
+       int ret;
+
+       pcie_ep->parf = devm_platform_ioremap_resource_byname(pdev, "parf");
+       if (IS_ERR(pcie_ep->parf))
+               return PTR_ERR(pcie_ep->parf);
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
+       pci->dbi_base = devm_pci_remap_cfg_resource(dev, res);
+       if (IS_ERR(pci->dbi_base))
+               return PTR_ERR(pci->dbi_base);
+       pci->dbi_base2 = pci->dbi_base;
+
+       res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
+       pcie_ep->elbi = devm_pci_remap_cfg_resource(dev, res);
+       if (IS_ERR(pcie_ep->elbi))
+               return PTR_ERR(pcie_ep->elbi);
+
+       pcie_ep->mmio_res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+                                                        "mmio");
+
+       syscon = of_parse_phandle(dev->of_node, "qcom,perst-regs", 0);
+       if (!syscon) {
+               dev_err(dev, "Failed to parse qcom,perst-regs\n");
+               return -EINVAL;
+       }
+
+       pcie_ep->perst_map = syscon_node_to_regmap(syscon);
+       of_node_put(syscon);
+       if (IS_ERR(pcie_ep->perst_map))
+               return PTR_ERR(pcie_ep->perst_map);
+
+       ret = of_property_read_u32_index(dev->of_node, "qcom,perst-regs",
+                                        1, &pcie_ep->perst_en);
+       if (ret < 0) {
+               dev_err(dev, "No Perst Enable offset in syscon\n");
+               return ret;
+       }
+
+       ret = of_property_read_u32_index(dev->of_node, "qcom,perst-regs",
+                                        2, &pcie_ep->perst_sep_en);
+       if (ret < 0) {
+               dev_err(dev, "No Perst Separation Enable offset in syscon\n");
+               return ret;
+       }
+
+       return 0;
+}
+
+static int qcom_pcie_ep_get_resources(struct platform_device *pdev,
+                                     struct qcom_pcie_ep *pcie_ep)
+{
+       struct device *dev = &pdev->dev;
+       int ret;
+
+       ret = qcom_pcie_ep_get_io_resources(pdev, pcie_ep);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to get io resources %d\n", ret);
+               return ret;
+       }
+
+       ret = devm_clk_bulk_get(dev, ARRAY_SIZE(qcom_pcie_ep_clks),
+                               qcom_pcie_ep_clks);
+       if (ret)
+               return ret;
+
+       pcie_ep->core_reset = devm_reset_control_get_exclusive(dev, "core");
+       if (IS_ERR(pcie_ep->core_reset))
+               return PTR_ERR(pcie_ep->core_reset);
+
+       pcie_ep->reset = devm_gpiod_get(dev, "reset", GPIOD_IN);
+       if (IS_ERR(pcie_ep->reset))
+               return PTR_ERR(pcie_ep->reset);
+
+       pcie_ep->wake = devm_gpiod_get_optional(dev, "wake", GPIOD_OUT_LOW);
+       if (IS_ERR(pcie_ep->wake))
+               return PTR_ERR(pcie_ep->wake);
+
+       pcie_ep->phy = devm_phy_optional_get(&pdev->dev, "pciephy");
+       if (IS_ERR(pcie_ep->phy))
+               ret = PTR_ERR(pcie_ep->phy);
+
+       return ret;
+}
+
+/* TODO: Notify clients about PCIe state change */
+static irqreturn_t qcom_pcie_ep_global_irq_thread(int irq, void *data)
+{
+       struct qcom_pcie_ep *pcie_ep = data;
+       struct dw_pcie *pci = &pcie_ep->pci;
+       struct device *dev = pci->dev;
+       u32 status = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_STATUS);
+       u32 mask = readl_relaxed(pcie_ep->parf + PARF_INT_ALL_MASK);
+       u32 dstate, val;
+
+       writel_relaxed(status, pcie_ep->parf + PARF_INT_ALL_CLEAR);
+       status &= mask;
+
+       if (FIELD_GET(PARF_INT_ALL_LINK_DOWN, status)) {
+               dev_dbg(dev, "Received Linkdown event\n");
+               pcie_ep->link_status = QCOM_PCIE_EP_LINK_DOWN;
+       } else if (FIELD_GET(PARF_INT_ALL_BME, status)) {
+               dev_dbg(dev, "Received BME event. Link is enabled!\n");
+               pcie_ep->link_status = QCOM_PCIE_EP_LINK_ENABLED;
+       } else if (FIELD_GET(PARF_INT_ALL_PM_TURNOFF, status)) {
+               dev_dbg(dev, "Received PM Turn-off event! Entering L23\n");
+               val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL);
+               val |= PARF_PM_CTRL_READY_ENTR_L23;
+               writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL);
+       } else if (FIELD_GET(PARF_INT_ALL_DSTATE_CHANGE, status)) {
+               dstate = dw_pcie_readl_dbi(pci, DBI_CON_STATUS) &
+                                          DBI_CON_STATUS_POWER_STATE_MASK;
+               dev_dbg(dev, "Received D%d state event\n", dstate);
+               if (dstate == 3) {
+                       val = readl_relaxed(pcie_ep->parf + PARF_PM_CTRL);
+                       val |= PARF_PM_CTRL_REQ_EXIT_L1;
+                       writel_relaxed(val, pcie_ep->parf + PARF_PM_CTRL);
+               }
+       } else if (FIELD_GET(PARF_INT_ALL_LINK_UP, status)) {
+               dev_dbg(dev, "Received Linkup event. Enumeration complete!\n");
+               dw_pcie_ep_linkup(&pci->ep);
+               pcie_ep->link_status = QCOM_PCIE_EP_LINK_UP;
+       } else {
+               dev_dbg(dev, "Received unknown event: %d\n", status);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t qcom_pcie_ep_perst_irq_thread(int irq, void *data)
+{
+       struct qcom_pcie_ep *pcie_ep = data;
+       struct dw_pcie *pci = &pcie_ep->pci;
+       struct device *dev = pci->dev;
+       u32 perst;
+
+       perst = gpiod_get_value(pcie_ep->reset);
+       if (perst) {
+               dev_dbg(dev, "PERST asserted by host. Shutting down the PCIe link!\n");
+               qcom_pcie_perst_assert(pci);
+       } else {
+               dev_dbg(dev, "PERST de-asserted by host. Starting link training!\n");
+               qcom_pcie_perst_deassert(pci);
+       }
+
+       irq_set_irq_type(gpiod_to_irq(pcie_ep->reset),
+                        (perst ? IRQF_TRIGGER_HIGH : IRQF_TRIGGER_LOW));
+
+       return IRQ_HANDLED;
+}
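The thread above emulates edge detection with a level-triggered IRQ: after servicing one PERST# level it re-arms the trigger for the opposite level so that the next transition also fires. A standalone sketch of that re-arming pattern (polarity details such as active-low GPIO inversion are elided):

#include <stdbool.h>
#include <stdio.h>

enum trigger { TRIGGER_HIGH, TRIGGER_LOW };

/* Handle one event on a level-triggered line and return the new trigger. */
static enum trigger handle_event(bool line_high, enum trigger armed)
{
        printf("line is %s (was armed for %s)\n",
               line_high ? "high" : "low",
               armed == TRIGGER_HIGH ? "high" : "low");
        /* re-arm for the opposite level so the next transition fires too */
        return line_high ? TRIGGER_LOW : TRIGGER_HIGH;
}

int main(void)
{
        enum trigger armed = TRIGGER_HIGH;
        bool trace[] = { true, false, true };

        for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++)
                armed = handle_event(trace[i], armed);
        return 0;
}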
+
+static int qcom_pcie_ep_enable_irq_resources(struct platform_device *pdev,
+                                            struct qcom_pcie_ep *pcie_ep)
+{
+       int irq, ret;
+
+       irq = platform_get_irq_byname(pdev, "global");
+       if (irq < 0) {
+               dev_err(&pdev->dev, "Failed to get Global IRQ\n");
+               return irq;
+       }
+
+       ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
+                                       qcom_pcie_ep_global_irq_thread,
+                                       IRQF_ONESHOT,
+                                       "global_irq", pcie_ep);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to request Global IRQ\n");
+               return ret;
+       }
+
+       pcie_ep->perst_irq = gpiod_to_irq(pcie_ep->reset);
+       irq_set_status_flags(pcie_ep->perst_irq, IRQ_NOAUTOEN);
+       ret = devm_request_threaded_irq(&pdev->dev, pcie_ep->perst_irq, NULL,
+                                       qcom_pcie_ep_perst_irq_thread,
+                                       IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
+                                       "perst_irq", pcie_ep);
+       if (ret) {
+               dev_err(&pdev->dev, "Failed to request PERST IRQ\n");
+               disable_irq(irq);
+               return ret;
+       }
+
+       return 0;
+}
+
+static int qcom_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
+                                 enum pci_epc_irq_type type, u16 interrupt_num)
+{
+       struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+
+       switch (type) {
+       case PCI_EPC_IRQ_LEGACY:
+               return dw_pcie_ep_raise_legacy_irq(ep, func_no);
+       case PCI_EPC_IRQ_MSI:
+               return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
+       default:
+               dev_err(pci->dev, "Unknown IRQ type\n");
+               return -EINVAL;
+       }
+}
+
+static const struct pci_epc_features qcom_pcie_epc_features = {
+       .linkup_notifier = true,
+       .core_init_notifier = true,
+       .msi_capable = true,
+       .msix_capable = false,
+};
+
+static const struct pci_epc_features *
+qcom_pcie_epc_get_features(struct dw_pcie_ep *pci_ep)
+{
+       return &qcom_pcie_epc_features;
+}
+
+static void qcom_pcie_ep_init(struct dw_pcie_ep *ep)
+{
+       struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+       enum pci_barno bar;
+
+       for (bar = BAR_0; bar <= BAR_5; bar++)
+               dw_pcie_ep_reset_bar(pci, bar);
+}
+
+static struct dw_pcie_ep_ops pci_ep_ops = {
+       .ep_init = qcom_pcie_ep_init,
+       .raise_irq = qcom_pcie_ep_raise_irq,
+       .get_features = qcom_pcie_epc_get_features,
+};
+
+static int qcom_pcie_ep_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       struct qcom_pcie_ep *pcie_ep;
+       int ret;
+
+       pcie_ep = devm_kzalloc(dev, sizeof(*pcie_ep), GFP_KERNEL);
+       if (!pcie_ep)
+               return -ENOMEM;
+
+       pcie_ep->pci.dev = dev;
+       pcie_ep->pci.ops = &pci_ops;
+       pcie_ep->pci.ep.ops = &pci_ep_ops;
+       platform_set_drvdata(pdev, pcie_ep);
+
+       ret = qcom_pcie_ep_get_resources(pdev, pcie_ep);
+       if (ret)
+               return ret;
+
+       ret = clk_bulk_prepare_enable(ARRAY_SIZE(qcom_pcie_ep_clks),
+                                     qcom_pcie_ep_clks);
+       if (ret)
+               return ret;
+
+       ret = qcom_pcie_ep_core_reset(pcie_ep);
+       if (ret)
+               goto err_disable_clk;
+
+       ret = phy_init(pcie_ep->phy);
+       if (ret)
+               goto err_disable_clk;
+
+       /* PHY needs to be powered on for dw_pcie_ep_init() */
+       ret = phy_power_on(pcie_ep->phy);
+       if (ret)
+               goto err_phy_exit;
+
+       ret = dw_pcie_ep_init(&pcie_ep->pci.ep);
+       if (ret) {
+               dev_err(dev, "Failed to initialize endpoint: %d\n", ret);
+               goto err_phy_power_off;
+       }
+
+       ret = qcom_pcie_ep_enable_irq_resources(pdev, pcie_ep);
+       if (ret)
+               goto err_phy_power_off;
+
+       return 0;
+
+err_phy_power_off:
+       phy_power_off(pcie_ep->phy);
+err_phy_exit:
+       phy_exit(pcie_ep->phy);
+err_disable_clk:
+       clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks),
+                                  qcom_pcie_ep_clks);
+
+       return ret;
+}
+
+static int qcom_pcie_ep_remove(struct platform_device *pdev)
+{
+       struct qcom_pcie_ep *pcie_ep = platform_get_drvdata(pdev);
+
+       if (pcie_ep->link_status == QCOM_PCIE_EP_LINK_DISABLED)
+               return 0;
+
+       phy_power_off(pcie_ep->phy);
+       phy_exit(pcie_ep->phy);
+       clk_bulk_disable_unprepare(ARRAY_SIZE(qcom_pcie_ep_clks),
+                                  qcom_pcie_ep_clks);
+
+       return 0;
+}
+
+static const struct of_device_id qcom_pcie_ep_match[] = {
+       { .compatible = "qcom,sdx55-pcie-ep", },
+       { }
+};
+
+static struct platform_driver qcom_pcie_ep_driver = {
+       .probe  = qcom_pcie_ep_probe,
+       .remove = qcom_pcie_ep_remove,
+       .driver = {
+               .name = "qcom-pcie-ep",
+               .of_match_table = qcom_pcie_ep_match,
+       },
+};
+builtin_platform_driver(qcom_pcie_ep_driver);
+
+MODULE_AUTHOR("Siddartha Mohanadoss <smohanad@codeaurora.org>");
+MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>");
+MODULE_DESCRIPTION("Qualcomm PCIe Endpoint controller driver");
+MODULE_LICENSE("GPL v2");
index 8a7a300163e5cc9d8d8db00bcf9c927bef94899b..1c3d1116bb60c83716b8300838f2669471d6074c 100644 (file)
@@ -166,6 +166,9 @@ struct qcom_pcie_resources_2_7_0 {
        struct regulator_bulk_data supplies[2];
        struct reset_control *pci_reset;
        struct clk *pipe_clk;
+       struct clk *pipe_clk_src;
+       struct clk *phy_pipe_clk;
+       struct clk *ref_clk_src;
 };
 
 union qcom_pcie_resources {
@@ -189,6 +192,11 @@ struct qcom_pcie_ops {
        int (*config_sid)(struct qcom_pcie *pcie);
 };
 
+struct qcom_pcie_cfg {
+       const struct qcom_pcie_ops *ops;
+       unsigned int pipe_clk_need_muxing:1;
+};
+
 struct qcom_pcie {
        struct dw_pcie *pci;
        void __iomem *parf;                     /* DT parf */
@@ -197,6 +205,7 @@ struct qcom_pcie {
        struct phy *phy;
        struct gpio_desc *reset;
        const struct qcom_pcie_ops *ops;
+       unsigned int pipe_clk_need_muxing:1;
 };
 
 #define to_qcom_pcie(x)                dev_get_drvdata((x)->dev)
@@ -1167,6 +1176,20 @@ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie)
        if (ret < 0)
                return ret;
 
+       if (pcie->pipe_clk_need_muxing) {
+               res->pipe_clk_src = devm_clk_get(dev, "pipe_mux");
+               if (IS_ERR(res->pipe_clk_src))
+                       return PTR_ERR(res->pipe_clk_src);
+
+               res->phy_pipe_clk = devm_clk_get(dev, "phy_pipe");
+               if (IS_ERR(res->phy_pipe_clk))
+                       return PTR_ERR(res->phy_pipe_clk);
+
+               res->ref_clk_src = devm_clk_get(dev, "ref");
+               if (IS_ERR(res->ref_clk_src))
+                       return PTR_ERR(res->ref_clk_src);
+       }
+
        res->pipe_clk = devm_clk_get(dev, "pipe");
        return PTR_ERR_OR_ZERO(res->pipe_clk);
 }
@@ -1185,6 +1208,10 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie)
                return ret;
        }
 
+       /* Set TCXO as clock source for pcie_pipe_clk_src */
+       if (pcie->pipe_clk_need_muxing)
+               clk_set_parent(res->pipe_clk_src, res->ref_clk_src);
+
        ret = clk_bulk_prepare_enable(res->num_clks, res->clks);
        if (ret < 0)
                goto err_disable_regulators;
@@ -1256,6 +1283,10 @@ static int qcom_pcie_post_init_2_7_0(struct qcom_pcie *pcie)
 {
        struct qcom_pcie_resources_2_7_0 *res = &pcie->res.v2_7_0;
 
+       /* Set pipe clock as clock source for pcie_pipe_clk_src */
+       if (pcie->pipe_clk_need_muxing)
+               clk_set_parent(res->pipe_clk_src, res->phy_pipe_clk);
+
        return clk_prepare_enable(res->pipe_clk);
 }
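The two clk_set_parent() calls above follow a parking pattern: at init time the pipe clock mux is fed from the always-running TCXO reference, presumably because the PHY's pipe clock output is not yet toggling, and only in post-init is it switched back to the PHY pipe clock. A toy model of that ordering (names are illustrative, not driver API):

#include <stdio.h>

enum parent { REF_TCXO, PHY_PIPE };

/* toy stand-in for clk_set_parent(pipe_clk_src, ...) */
static void set_pipe_clk_parent(enum parent p)
{
        printf("pcie_pipe_clk_src <- %s\n",
               p == REF_TCXO ? "ref (TCXO)" : "phy_pipe");
}

int main(void)
{
        set_pipe_clk_parent(REF_TCXO);  /* init: PHY pipe clock not running */
        /* ... PHY powered on, pipe clock toggling ... */
        set_pipe_clk_parent(PHY_PIPE);  /* post-init: safe to switch back */
        return 0;
}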
 
@@ -1456,6 +1487,39 @@ static const struct qcom_pcie_ops ops_1_9_0 = {
        .config_sid = qcom_pcie_config_sid_sm8250,
 };
 
+static const struct qcom_pcie_cfg apq8084_cfg = {
+       .ops = &ops_1_0_0,
+};
+
+static const struct qcom_pcie_cfg ipq8064_cfg = {
+       .ops = &ops_2_1_0,
+};
+
+static const struct qcom_pcie_cfg msm8996_cfg = {
+       .ops = &ops_2_3_2,
+};
+
+static const struct qcom_pcie_cfg ipq8074_cfg = {
+       .ops = &ops_2_3_3,
+};
+
+static const struct qcom_pcie_cfg ipq4019_cfg = {
+       .ops = &ops_2_4_0,
+};
+
+static const struct qcom_pcie_cfg sdm845_cfg = {
+       .ops = &ops_2_7_0,
+};
+
+static const struct qcom_pcie_cfg sm8250_cfg = {
+       .ops = &ops_1_9_0,
+};
+
+static const struct qcom_pcie_cfg sc7280_cfg = {
+       .ops = &ops_1_9_0,
+       .pipe_clk_need_muxing = true,
+};
+
 static const struct dw_pcie_ops dw_pcie_ops = {
        .link_up = qcom_pcie_link_up,
        .start_link = qcom_pcie_start_link,
@@ -1467,6 +1531,7 @@ static int qcom_pcie_probe(struct platform_device *pdev)
        struct pcie_port *pp;
        struct dw_pcie *pci;
        struct qcom_pcie *pcie;
+       const struct qcom_pcie_cfg *pcie_cfg;
        int ret;
 
        pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
@@ -1488,7 +1553,14 @@ static int qcom_pcie_probe(struct platform_device *pdev)
 
        pcie->pci = pci;
 
-       pcie->ops = of_device_get_match_data(dev);
+       pcie_cfg = of_device_get_match_data(dev);
+       if (!pcie_cfg || !pcie_cfg->ops) {
+               dev_err(dev, "Invalid platform data\n");
+               return -EINVAL;
+       }
+
+       pcie->ops = pcie_cfg->ops;
+       pcie->pipe_clk_need_muxing = pcie_cfg->pipe_clk_need_muxing;
 
        pcie->reset = devm_gpiod_get_optional(dev, "perst", GPIOD_OUT_HIGH);
        if (IS_ERR(pcie->reset)) {
@@ -1545,16 +1617,18 @@ err_pm_runtime_put:
 }
 
 static const struct of_device_id qcom_pcie_match[] = {
-       { .compatible = "qcom,pcie-apq8084", .data = &ops_1_0_0 },
-       { .compatible = "qcom,pcie-ipq8064", .data = &ops_2_1_0 },
-       { .compatible = "qcom,pcie-ipq8064-v2", .data = &ops_2_1_0 },
-       { .compatible = "qcom,pcie-apq8064", .data = &ops_2_1_0 },
-       { .compatible = "qcom,pcie-msm8996", .data = &ops_2_3_2 },
-       { .compatible = "qcom,pcie-ipq8074", .data = &ops_2_3_3 },
-       { .compatible = "qcom,pcie-ipq4019", .data = &ops_2_4_0 },
-       { .compatible = "qcom,pcie-qcs404", .data = &ops_2_4_0 },
-       { .compatible = "qcom,pcie-sdm845", .data = &ops_2_7_0 },
-       { .compatible = "qcom,pcie-sm8250", .data = &ops_1_9_0 },
+       { .compatible = "qcom,pcie-apq8084", .data = &apq8084_cfg },
+       { .compatible = "qcom,pcie-ipq8064", .data = &ipq8064_cfg },
+       { .compatible = "qcom,pcie-ipq8064-v2", .data = &ipq8064_cfg },
+       { .compatible = "qcom,pcie-apq8064", .data = &ipq8064_cfg },
+       { .compatible = "qcom,pcie-msm8996", .data = &msm8996_cfg },
+       { .compatible = "qcom,pcie-ipq8074", .data = &ipq8074_cfg },
+       { .compatible = "qcom,pcie-ipq4019", .data = &ipq4019_cfg },
+       { .compatible = "qcom,pcie-qcs404", .data = &ipq4019_cfg },
+       { .compatible = "qcom,pcie-sdm845", .data = &sdm845_cfg },
+       { .compatible = "qcom,pcie-sm8250", .data = &sm8250_cfg },
+       { .compatible = "qcom,pcie-sc8180x", .data = &sm8250_cfg },
+       { .compatible = "qcom,pcie-sc7280", .data = &sc7280_cfg },
        { }
 };
 
index d842fd0181299bbc0f759963fc9b82d845425bee..d05be942956e297073a6e457d19f6a5d1c45af9e 100644 (file)
@@ -168,30 +168,21 @@ static void uniphier_pcie_irq_enable(struct uniphier_pcie_priv *priv)
        writel(PCL_RCV_INTX_ALL_ENABLE, priv->base + PCL_RCV_INTX);
 }
 
-static void uniphier_pcie_irq_ack(struct irq_data *d)
-{
-       struct pcie_port *pp = irq_data_get_irq_chip_data(d);
-       struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
-       struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci);
-       u32 val;
-
-       val = readl(priv->base + PCL_RCV_INTX);
-       val &= ~PCL_RCV_INTX_ALL_STATUS;
-       val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_STATUS_SHIFT);
-       writel(val, priv->base + PCL_RCV_INTX);
-}
-
 static void uniphier_pcie_irq_mask(struct irq_data *d)
 {
        struct pcie_port *pp = irq_data_get_irq_chip_data(d);
        struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
        struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci);
+       unsigned long flags;
        u32 val;
 
+       raw_spin_lock_irqsave(&pp->lock, flags);
+
        val = readl(priv->base + PCL_RCV_INTX);
-       val &= ~PCL_RCV_INTX_ALL_MASK;
        val |= BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT);
        writel(val, priv->base + PCL_RCV_INTX);
+
+       raw_spin_unlock_irqrestore(&pp->lock, flags);
 }
 
 static void uniphier_pcie_irq_unmask(struct irq_data *d)
@@ -199,17 +190,20 @@ static void uniphier_pcie_irq_unmask(struct irq_data *d)
        struct pcie_port *pp = irq_data_get_irq_chip_data(d);
        struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
        struct uniphier_pcie_priv *priv = to_uniphier_pcie(pci);
+       unsigned long flags;
        u32 val;
 
+       raw_spin_lock_irqsave(&pp->lock, flags);
+
        val = readl(priv->base + PCL_RCV_INTX);
-       val &= ~PCL_RCV_INTX_ALL_MASK;
        val &= ~BIT(irqd_to_hwirq(d) + PCL_RCV_INTX_MASK_SHIFT);
        writel(val, priv->base + PCL_RCV_INTX);
+
+       raw_spin_unlock_irqrestore(&pp->lock, flags);
 }
 
 static struct irq_chip uniphier_pcie_irq_chip = {
        .name = "PCI",
-       .irq_ack = uniphier_pcie_irq_ack,
        .irq_mask = uniphier_pcie_irq_mask,
        .irq_unmask = uniphier_pcie_irq_unmask,
 };
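The hunks above turn mask/unmask into a locked read-modify-write on the shared PCL_RCV_INTX register (and stop clobbering the other mask bits on each call). A generic userspace sketch of the locked RMW pattern; the mask-bit offset is assumed, and a pthread mutex plays the role of pp->lock:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint32_t intx_reg;       /* stands in for PCL_RCV_INTX */

#define MASK_SHIFT 16           /* PCL_RCV_INTX_MASK_SHIFT (value assumed) */

static void irq_mask(unsigned int hwirq)
{
        pthread_mutex_lock(&lock);      /* plays the role of pp->lock */
        intx_reg |= 1u << (hwirq + MASK_SHIFT);
        pthread_mutex_unlock(&lock);
}

static void irq_unmask(unsigned int hwirq)
{
        pthread_mutex_lock(&lock);
        intx_reg &= ~(1u << (hwirq + MASK_SHIFT));
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        irq_mask(1);
        printf("after mask:   %#x\n", intx_reg);
        irq_unmask(1);
        printf("after unmask: %#x\n", intx_reg);
        return 0;
}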
index a88eab6829bb2196f9432fdeae13e281a931b804..50f80f07e4db32bbf63fa6f24e4590c03b1990fa 100644 (file)
@@ -279,13 +279,10 @@ static int visconti_add_pcie_port(struct visconti_pcie *pcie,
 {
        struct dw_pcie *pci = &pcie->pci;
        struct pcie_port *pp = &pci->pp;
-       struct device *dev = &pdev->dev;
 
        pp->irq = platform_get_irq_byname(pdev, "intr");
-       if (pp->irq < 0) {
-               dev_err(dev, "Interrupt intr is missing");
+       if (pp->irq < 0)
                return pp->irq;
-       }
 
        pp->ops = &visconti_pcie_host_ops;
 
index 596ebcfcc82dc3746cb71f22be2a843a30ee8541..c5300d49807a23109bbcf75792ba1f97e7f48c83 100644 (file)
 /* PCIe core registers */
 #define PCIE_CORE_DEV_ID_REG                                   0x0
 #define PCIE_CORE_CMD_STATUS_REG                               0x4
-#define     PCIE_CORE_CMD_IO_ACCESS_EN                         BIT(0)
-#define     PCIE_CORE_CMD_MEM_ACCESS_EN                                BIT(1)
-#define     PCIE_CORE_CMD_MEM_IO_REQ_EN                                BIT(2)
 #define PCIE_CORE_DEV_REV_REG                                  0x8
+#define PCIE_CORE_EXP_ROM_BAR_REG                              0x30
 #define PCIE_CORE_PCIEXP_CAP                                   0xc0
 #define PCIE_CORE_ERR_CAPCTL_REG                               0x118
 #define     PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX                   BIT(5)
@@ -99,6 +97,7 @@
 #define     PCIE_CORE_CTRL2_MSI_ENABLE         BIT(10)
 #define PCIE_CORE_REF_CLK_REG                  (CONTROL_BASE_ADDR + 0x14)
 #define     PCIE_CORE_REF_CLK_TX_ENABLE                BIT(1)
+#define     PCIE_CORE_REF_CLK_RX_ENABLE                BIT(2)
 #define PCIE_MSG_LOG_REG                       (CONTROL_BASE_ADDR + 0x30)
 #define PCIE_ISR0_REG                          (CONTROL_BASE_ADDR + 0x40)
 #define PCIE_MSG_PM_PME_MASK                   BIT(7)
 #define     PCIE_ISR0_MSI_INT_PENDING          BIT(24)
 #define     PCIE_ISR0_INTX_ASSERT(val)         BIT(16 + (val))
 #define     PCIE_ISR0_INTX_DEASSERT(val)       BIT(20 + (val))
-#define            PCIE_ISR0_ALL_MASK                  GENMASK(26, 0)
+#define     PCIE_ISR0_ALL_MASK                 GENMASK(31, 0)
 #define PCIE_ISR1_REG                          (CONTROL_BASE_ADDR + 0x48)
 #define PCIE_ISR1_MASK_REG                     (CONTROL_BASE_ADDR + 0x4C)
 #define     PCIE_ISR1_POWER_STATE_CHANGE       BIT(4)
 #define     PCIE_ISR1_FLUSH                    BIT(5)
 #define     PCIE_ISR1_INTX_ASSERT(val)         BIT(8 + (val))
-#define     PCIE_ISR1_ALL_MASK                 GENMASK(11, 4)
+#define     PCIE_ISR1_ALL_MASK                 GENMASK(31, 0)
 #define PCIE_MSI_ADDR_LOW_REG                  (CONTROL_BASE_ADDR + 0x50)
 #define PCIE_MSI_ADDR_HIGH_REG                 (CONTROL_BASE_ADDR + 0x54)
 #define PCIE_MSI_STATUS_REG                    (CONTROL_BASE_ADDR + 0x58)
 #define PCIE_MSI_MASK_REG                      (CONTROL_BASE_ADDR + 0x5C)
 #define PCIE_MSI_PAYLOAD_REG                   (CONTROL_BASE_ADDR + 0x9C)
+#define     PCIE_MSI_DATA_MASK                 GENMASK(15, 0)
 
 /* PCIe window configuration */
 #define OB_WIN_BASE_ADDR                       0x4c00
 #define CFG_REG                                        (LMI_BASE_ADDR + 0x0)
 #define     LTSSM_SHIFT                                24
 #define     LTSSM_MASK                         0x3f
-#define     LTSSM_L0                           0x10
 #define     RC_BAR_CONFIG                      0x300
+
+/* LTSSM values in CFG_REG */
+enum {
+       LTSSM_DETECT_QUIET                      = 0x0,
+       LTSSM_DETECT_ACTIVE                     = 0x1,
+       LTSSM_POLLING_ACTIVE                    = 0x2,
+       LTSSM_POLLING_COMPLIANCE                = 0x3,
+       LTSSM_POLLING_CONFIGURATION             = 0x4,
+       LTSSM_CONFIG_LINKWIDTH_START            = 0x5,
+       LTSSM_CONFIG_LINKWIDTH_ACCEPT           = 0x6,
+       LTSSM_CONFIG_LANENUM_ACCEPT             = 0x7,
+       LTSSM_CONFIG_LANENUM_WAIT               = 0x8,
+       LTSSM_CONFIG_COMPLETE                   = 0x9,
+       LTSSM_CONFIG_IDLE                       = 0xa,
+       LTSSM_RECOVERY_RCVR_LOCK                = 0xb,
+       LTSSM_RECOVERY_SPEED                    = 0xc,
+       LTSSM_RECOVERY_RCVR_CFG                 = 0xd,
+       LTSSM_RECOVERY_IDLE                     = 0xe,
+       LTSSM_L0                                = 0x10,
+       LTSSM_RX_L0S_ENTRY                      = 0x11,
+       LTSSM_RX_L0S_IDLE                       = 0x12,
+       LTSSM_RX_L0S_FTS                        = 0x13,
+       LTSSM_TX_L0S_ENTRY                      = 0x14,
+       LTSSM_TX_L0S_IDLE                       = 0x15,
+       LTSSM_TX_L0S_FTS                        = 0x16,
+       LTSSM_L1_ENTRY                          = 0x17,
+       LTSSM_L1_IDLE                           = 0x18,
+       LTSSM_L2_IDLE                           = 0x19,
+       LTSSM_L2_TRANSMIT_WAKE                  = 0x1a,
+       LTSSM_DISABLED                          = 0x20,
+       LTSSM_LOOPBACK_ENTRY_MASTER             = 0x21,
+       LTSSM_LOOPBACK_ACTIVE_MASTER            = 0x22,
+       LTSSM_LOOPBACK_EXIT_MASTER              = 0x23,
+       LTSSM_LOOPBACK_ENTRY_SLAVE              = 0x24,
+       LTSSM_LOOPBACK_ACTIVE_SLAVE             = 0x25,
+       LTSSM_LOOPBACK_EXIT_SLAVE               = 0x26,
+       LTSSM_HOT_RESET                         = 0x27,
+       LTSSM_RECOVERY_EQUALIZATION_PHASE0      = 0x28,
+       LTSSM_RECOVERY_EQUALIZATION_PHASE1      = 0x29,
+       LTSSM_RECOVERY_EQUALIZATION_PHASE2      = 0x2a,
+       LTSSM_RECOVERY_EQUALIZATION_PHASE3      = 0x2b,
+};
+
 #define VENDOR_ID_REG                          (LMI_BASE_ADDR + 0x44)
 
 /* PCIe core controller registers */
 #define     PCIE_IRQ_MSI_INT2_DET              BIT(21)
 #define     PCIE_IRQ_RC_DBELL_DET              BIT(22)
 #define     PCIE_IRQ_EP_STATUS                 BIT(23)
-#define     PCIE_IRQ_ALL_MASK                  0xfff0fb
+#define     PCIE_IRQ_ALL_MASK                  GENMASK(31, 0)
 #define     PCIE_IRQ_ENABLE_INTS_MASK          PCIE_IRQ_CORE_INT
 
 /* Transaction types */
@@ -257,18 +299,49 @@ static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg)
        return readl(pcie->base + reg);
 }
 
-static inline u16 advk_read16(struct advk_pcie *pcie, u64 reg)
+static u8 advk_pcie_ltssm_state(struct advk_pcie *pcie)
 {
-       return advk_readl(pcie, (reg & ~0x3)) >> ((reg & 0x3) * 8);
+       u32 val;
+       u8 ltssm_state;
+
+       val = advk_readl(pcie, CFG_REG);
+       ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK;
+       return ltssm_state;
 }
 
-static int advk_pcie_link_up(struct advk_pcie *pcie)
+static inline bool advk_pcie_link_up(struct advk_pcie *pcie)
 {
-       u32 val, ltssm_state;
+       /* Check whether the LTSSM is in normal operation - some L* state */
+       u8 ltssm_state = advk_pcie_ltssm_state(pcie);
+       return ltssm_state >= LTSSM_L0 && ltssm_state < LTSSM_DISABLED;
+}
 
-       val = advk_readl(pcie, CFG_REG);
-       ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK;
-       return ltssm_state >= LTSSM_L0;
+static inline bool advk_pcie_link_active(struct advk_pcie *pcie)
+{
+       /*
+        * According to PCIe Base Specification 3.0, Table 4-14 (Link
+        * Status Mapped to the LTSSM) and section 4.2.6.3.6
+        * (Configuration.Idle), Link Up maps to the LTSSM
+        * Configuration.Idle, Recovery, L0, L0s, L1 and L2 states.
+        * And per section 3.2.1 (Data Link Control and Management
+        * State Machine Rules), DL Up status is reported in the DL
+        * Active state.
+        */
+       u8 ltssm_state = advk_pcie_ltssm_state(pcie);
+       return ltssm_state >= LTSSM_CONFIG_IDLE && ltssm_state < LTSSM_DISABLED;
+}
+
+static inline bool advk_pcie_link_training(struct advk_pcie *pcie)
+{
+       /*
+        * According to PCIe Base Specification 3.0, Table 4-14 (Link
+        * Status Mapped to the LTSSM), Link Training maps to the LTSSM
+        * Configuration and Recovery states.
+        */
+       u8 ltssm_state = advk_pcie_ltssm_state(pcie);
+       return ((ltssm_state >= LTSSM_CONFIG_LINKWIDTH_START &&
+                ltssm_state < LTSSM_L0) ||
+               (ltssm_state >= LTSSM_RECOVERY_EQUALIZATION_PHASE0 &&
+                ltssm_state <= LTSSM_RECOVERY_EQUALIZATION_PHASE3));
 }
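Taken together, the three predicates above carve the LTSSM range into bands: training covers Configuration plus the Recovery equalization phases, active covers everything from Configuration.Idle up to Disabled, and up covers only the L* states. A runnable sketch using the enum values defined earlier (the RECOVERY_EQUALIZATION_* names are shortened here):

#include <stdbool.h>
#include <stdio.h>

#define LTSSM_CONFIG_LINKWIDTH_START    0x5
#define LTSSM_CONFIG_IDLE               0xa
#define LTSSM_L0                        0x10
#define LTSSM_DISABLED                  0x20
#define LTSSM_RECOVERY_EQ_PHASE0        0x28
#define LTSSM_RECOVERY_EQ_PHASE3        0x2b

static bool link_up(unsigned int s)
{
        return s >= LTSSM_L0 && s < LTSSM_DISABLED;
}

static bool link_active(unsigned int s)
{
        return s >= LTSSM_CONFIG_IDLE && s < LTSSM_DISABLED;
}

static bool link_training(unsigned int s)
{
        return (s >= LTSSM_CONFIG_LINKWIDTH_START && s < LTSSM_L0) ||
               (s >= LTSSM_RECOVERY_EQ_PHASE0 && s <= LTSSM_RECOVERY_EQ_PHASE3);
}

int main(void)
{
        for (unsigned int s = 0; s <= LTSSM_RECOVERY_EQ_PHASE3; s++)
                printf("%#04x: up=%d active=%d training=%d\n", s,
                       link_up(s), link_active(s), link_training(s));
        return 0;
}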
 
 static int advk_pcie_wait_for_link(struct advk_pcie *pcie)
@@ -291,7 +364,7 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie)
        size_t retries;
 
        for (retries = 0; retries < RETRAIN_WAIT_MAX_RETRIES; ++retries) {
-               if (!advk_pcie_link_up(pcie))
+               if (advk_pcie_link_training(pcie))
                        break;
                udelay(RETRAIN_WAIT_USLEEP_US);
        }
@@ -299,23 +372,9 @@ static void advk_pcie_wait_for_retrain(struct advk_pcie *pcie)
 
 static void advk_pcie_issue_perst(struct advk_pcie *pcie)
 {
-       u32 reg;
-
        if (!pcie->reset_gpio)
                return;
 
-       /*
-        * As required by PCI Express spec (PCI Express Base Specification, REV.
-        * 4.0 PCI Express, February 19 2014, 6.6.1 Conventional Reset) a delay
-        * for at least 100ms after de-asserting PERST# signal is needed before
-        * link training is enabled. So ensure that link training is disabled
-        * prior de-asserting PERST# signal to fulfill that PCI Express spec
-        * requirement.
-        */
-       reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
-       reg &= ~LINK_TRAINING_EN;
-       advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
-
        /* 10ms delay is needed for some cards */
        dev_info(&pcie->pdev->dev, "issuing PERST via reset GPIO for 10ms\n");
        gpiod_set_value_cansleep(pcie->reset_gpio, 1);
@@ -323,53 +382,46 @@ static void advk_pcie_issue_perst(struct advk_pcie *pcie)
        gpiod_set_value_cansleep(pcie->reset_gpio, 0);
 }
 
-static int advk_pcie_train_at_gen(struct advk_pcie *pcie, int gen)
+static void advk_pcie_train_link(struct advk_pcie *pcie)
 {
-       int ret, neg_gen;
+       struct device *dev = &pcie->pdev->dev;
        u32 reg;
+       int ret;
 
-       /* Setup link speed */
+       /*
+        * Set up PCIe rev / gen compliance based on the device tree property
+        * 'max-link-speed', which also forces the maximal link speed.
+        */
        reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
        reg &= ~PCIE_GEN_SEL_MSK;
-       if (gen == 3)
+       if (pcie->link_gen == 3)
                reg |= SPEED_GEN_3;
-       else if (gen == 2)
+       else if (pcie->link_gen == 2)
                reg |= SPEED_GEN_2;
        else
                reg |= SPEED_GEN_1;
        advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
 
        /*
-        * Enable link training. This is not needed in every call to this
-        * function, just once suffices, but it does not break anything either.
+        * Also set the maximal link speed value in the PCIe Link Control 2
+        * register. The Armada 3700 Functional Specification says the default
+        * value is based on SPEED_GEN, but tests showed it is always 8.0 GT/s.
         */
+       reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2);
+       reg &= ~PCI_EXP_LNKCTL2_TLS;
+       if (pcie->link_gen == 3)
+               reg |= PCI_EXP_LNKCTL2_TLS_8_0GT;
+       else if (pcie->link_gen == 2)
+               reg |= PCI_EXP_LNKCTL2_TLS_5_0GT;
+       else
+               reg |= PCI_EXP_LNKCTL2_TLS_2_5GT;
+       advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL2);
+
+       /* Enable link training after selecting PCIe generation */
        reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
        reg |= LINK_TRAINING_EN;
        advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
 
-       /*
-        * Start link training immediately after enabling it.
-        * This solves problems for some buggy cards.
-        */
-       reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL);
-       reg |= PCI_EXP_LNKCTL_RL;
-       advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKCTL);
-
-       ret = advk_pcie_wait_for_link(pcie);
-       if (ret)
-               return ret;
-
-       reg = advk_read16(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_LNKSTA);
-       neg_gen = reg & PCI_EXP_LNKSTA_CLS;
-
-       return neg_gen;
-}
-
-static void advk_pcie_train_link(struct advk_pcie *pcie)
-{
-       struct device *dev = &pcie->pdev->dev;
-       int neg_gen = -1, gen;
-
        /*
         * Reset PCIe card via PERST# signal. Some cards are not detected
         * during link training when they are in some non-initial state.
@@ -380,41 +432,18 @@ static void advk_pcie_train_link(struct advk_pcie *pcie)
         * PERST# signal could have been asserted by pinctrl subsystem before
         * probe() callback has been called or issued explicitly by reset gpio
         * function advk_pcie_issue_perst(), making the endpoint going into
-        * fundamental reset. As required by PCI Express spec a delay for at
-        * least 100ms after such a reset before link training is needed.
+        * fundamental reset. As required by PCI Express spec (PCI Express
+        * Base Specification, REV. 4.0 PCI Express, February 19 2014, 6.6.1
+        * Conventional Reset) a delay for at least 100ms after such a reset
+        * before sending a Configuration Request to the device is needed.
+        * So wait until PCIe link is up. Function advk_pcie_wait_for_link()
+        * waits for link at least 900ms.
         */
-       msleep(PCI_PM_D3COLD_WAIT);
-
-       /*
-        * Try link training at link gen specified by device tree property
-        * 'max-link-speed'. If this fails, iteratively train at lower gen.
-        */
-       for (gen = pcie->link_gen; gen > 0; --gen) {
-               neg_gen = advk_pcie_train_at_gen(pcie, gen);
-               if (neg_gen > 0)
-                       break;
-       }
-
-       if (neg_gen < 0)
-               goto err;
-
-       /*
-        * After successful training if negotiated gen is lower than requested,
-        * train again on negotiated gen. This solves some stability issues for
-        * some buggy gen1 cards.
-        */
-       if (neg_gen < gen) {
-               gen = neg_gen;
-               neg_gen = advk_pcie_train_at_gen(pcie, gen);
-       }
-
-       if (neg_gen == gen) {
-               dev_info(dev, "link up at gen %i\n", gen);
-               return;
-       }
-
-err:
-       dev_err(dev, "link never came up\n");
+       ret = advk_pcie_wait_for_link(pcie);
+       if (ret < 0)
+               dev_err(dev, "link never came up\n");
+       else
+               dev_info(dev, "link up\n");
 }
 
 /*
@@ -451,9 +480,15 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
        u32 reg;
        int i;
 
-       /* Enable TX */
+       /*
+        * Configure the PCIe Reference clock. The direction is from the
+        * PCIe controller to the endpoint card, so enable transmitting the
+        * Reference clock differential signal off-chip and disable
+        * receiving the off-chip differential signal.
+        */
        reg = advk_readl(pcie, PCIE_CORE_REF_CLK_REG);
        reg |= PCIE_CORE_REF_CLK_TX_ENABLE;
+       reg &= ~PCIE_CORE_REF_CLK_RX_ENABLE;
        advk_writel(pcie, reg, PCIE_CORE_REF_CLK_REG);
 
        /* Set to Direct mode */
@@ -477,6 +512,31 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
        reg = (PCI_VENDOR_ID_MARVELL << 16) | PCI_VENDOR_ID_MARVELL;
        advk_writel(pcie, reg, VENDOR_ID_REG);
 
+       /*
+        * Change Class Code of PCI Bridge device to PCI Bridge (0x600400),
+        * because the default value is Mass storage controller (0x010400).
+        *
+        * Note that this Aardvark PCI Bridge does not have a compliant Type 1
+        * Configuration Space and cannot even be accessed via Aardvark's PCI
+        * config space access method. Something like a config space is
+        * available in internal Aardvark registers starting at offset 0x0,
+        * and it is reported as Type 0. In the range 0x10 - 0x34 it has
+        * totally different registers.
+        *
+        * Therefore the driver uses a PCI Bridge emulation which handles
+        * access to the configuration space via internal Aardvark registers
+        * or an emulated configuration buffer.
+        */
+       reg = advk_readl(pcie, PCIE_CORE_DEV_REV_REG);
+       reg &= ~0xffffff00;
+       reg |= (PCI_CLASS_BRIDGE_PCI << 8) << 8;
+       advk_writel(pcie, reg, PCIE_CORE_DEV_REV_REG);
+
+       /* Disable Root Bridge I/O space, memory space and bus mastering */
+       reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG);
+       reg &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+       advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG);
+
        /* Set Advanced Error Capabilities and Control PF0 register */
        reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX |
                PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN |
@@ -488,8 +548,9 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
        reg = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL);
        reg &= ~PCI_EXP_DEVCTL_RELAX_EN;
        reg &= ~PCI_EXP_DEVCTL_NOSNOOP_EN;
+       reg &= ~PCI_EXP_DEVCTL_PAYLOAD;
        reg &= ~PCI_EXP_DEVCTL_READRQ;
-       reg |= PCI_EXP_DEVCTL_PAYLOAD; /* Set max payload size */
+       reg |= PCI_EXP_DEVCTL_PAYLOAD_512B;
        reg |= PCI_EXP_DEVCTL_READRQ_512B;
        advk_writel(pcie, reg, PCIE_CORE_PCIEXP_CAP + PCI_EXP_DEVCTL);
 
@@ -574,19 +635,6 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
                advk_pcie_disable_ob_win(pcie, i);
 
        advk_pcie_train_link(pcie);
-
-       /*
-        * FIXME: The following register update is suspicious. This register is
-        * applicable only when the PCI controller is configured for Endpoint
-        * mode, not as a Root Complex. But apparently when this code is
-        * removed, some cards stop working. This should be investigated and
-        * a comment explaining this should be put here.
-        */
-       reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG);
-       reg |= PCIE_CORE_CMD_MEM_ACCESS_EN |
-               PCIE_CORE_CMD_IO_ACCESS_EN |
-               PCIE_CORE_CMD_MEM_IO_REQ_EN;
-       advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG);
 }
 
 static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u32 *val)
@@ -595,6 +643,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3
        u32 reg;
        unsigned int status;
        char *strcomp_status, *str_posted;
+       int ret;
 
        reg = advk_readl(pcie, PIO_STAT);
        status = (reg & PIO_COMPLETION_STATUS_MASK) >>
@@ -619,6 +668,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3
        case PIO_COMPLETION_STATUS_OK:
                if (reg & PIO_ERR_STATUS) {
                        strcomp_status = "COMP_ERR";
+                       ret = -EFAULT;
                        break;
                }
                /* Get the read result */
@@ -626,9 +676,11 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3
                        *val = advk_readl(pcie, PIO_RD_DATA);
                /* No error */
                strcomp_status = NULL;
+               ret = 0;
                break;
        case PIO_COMPLETION_STATUS_UR:
                strcomp_status = "UR";
+               ret = -EOPNOTSUPP;
                break;
        case PIO_COMPLETION_STATUS_CRS:
                if (allow_crs && val) {
@@ -646,6 +698,7 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3
                         */
                        *val = CFG_RD_CRS_VAL;
                        strcomp_status = NULL;
+                       ret = 0;
                        break;
                }
                /* PCIe r4.0, sec 2.3.2, says:
@@ -661,31 +714,34 @@ static int advk_pcie_check_pio_status(struct advk_pcie *pcie, bool allow_crs, u3
                 * Request and taking appropriate action, e.g., complete the
                 * Request to the host as a failed transaction.
                 *
-                * To simplify implementation do not re-issue the Configuration
-                * Request and complete the Request as a failed transaction.
+                * So return -EAGAIN and the caller (the pci-aardvark.c driver)
+                * will re-issue the request again, up to PIO_RETRY_CNT retries.
                 */
                strcomp_status = "CRS";
+               ret = -EAGAIN;
                break;
        case PIO_COMPLETION_STATUS_CA:
                strcomp_status = "CA";
+               ret = -ECANCELED;
                break;
        default:
                strcomp_status = "Unknown";
+               ret = -EINVAL;
                break;
        }
 
        if (!strcomp_status)
-               return 0;
+               return ret;
 
        if (reg & PIO_NON_POSTED_REQ)
                str_posted = "Non-posted";
        else
                str_posted = "Posted";
 
-       dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n",
+       dev_dbg(dev, "%s PIO Response Status: %s, %#x @ %#x\n",
                str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS));
 
-       return -EFAULT;
+       return ret;
 }
 
 static int advk_pcie_wait_pio(struct advk_pcie *pcie)
@@ -693,13 +749,13 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie)
        struct device *dev = &pcie->pdev->dev;
        int i;
 
-       for (i = 0; i < PIO_RETRY_CNT; i++) {
+       for (i = 1; i <= PIO_RETRY_CNT; i++) {
                u32 start, isr;
 
                start = advk_readl(pcie, PIO_START);
                isr = advk_readl(pcie, PIO_ISR);
                if (!start && isr)
-                       return 0;
+                       return i;
                udelay(PIO_RETRY_DELAY);
        }
 
@@ -707,6 +763,72 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie)
        return -ETIMEDOUT;
 }
 
+static pci_bridge_emul_read_status_t
+advk_pci_bridge_emul_base_conf_read(struct pci_bridge_emul *bridge,
+                                   int reg, u32 *value)
+{
+       struct advk_pcie *pcie = bridge->data;
+
+       switch (reg) {
+       case PCI_COMMAND:
+               *value = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG);
+               return PCI_BRIDGE_EMUL_HANDLED;
+
+       case PCI_ROM_ADDRESS1:
+               *value = advk_readl(pcie, PCIE_CORE_EXP_ROM_BAR_REG);
+               return PCI_BRIDGE_EMUL_HANDLED;
+
+       case PCI_INTERRUPT_LINE: {
+               /*
+                * Of the whole 32-bit register, only one bit is read from HW:
+                * PCI_BRIDGE_CTL_BUS_RESET. The other bits are retrieved only
+                * from the emulated config buffer.
+                */
+               __le32 *cfgspace = (__le32 *)&bridge->conf;
+               u32 val = le32_to_cpu(cfgspace[PCI_INTERRUPT_LINE / 4]);
+               if (advk_readl(pcie, PCIE_CORE_CTRL1_REG) & HOT_RESET_GEN)
+                       val |= PCI_BRIDGE_CTL_BUS_RESET << 16;
+               else
+                       val &= ~(PCI_BRIDGE_CTL_BUS_RESET << 16);
+               *value = val;
+               return PCI_BRIDGE_EMUL_HANDLED;
+       }
+
+       default:
+               return PCI_BRIDGE_EMUL_NOT_HANDLED;
+       }
+}
+
+static void
+advk_pci_bridge_emul_base_conf_write(struct pci_bridge_emul *bridge,
+                                    int reg, u32 old, u32 new, u32 mask)
+{
+       struct advk_pcie *pcie = bridge->data;
+
+       switch (reg) {
+       case PCI_COMMAND:
+               advk_writel(pcie, new, PCIE_CORE_CMD_STATUS_REG);
+               break;
+
+       case PCI_ROM_ADDRESS1:
+               advk_writel(pcie, new, PCIE_CORE_EXP_ROM_BAR_REG);
+               break;
+
+       case PCI_INTERRUPT_LINE:
+               if (mask & (PCI_BRIDGE_CTL_BUS_RESET << 16)) {
+                       u32 val = advk_readl(pcie, PCIE_CORE_CTRL1_REG);
+                       if (new & (PCI_BRIDGE_CTL_BUS_RESET << 16))
+                               val |= HOT_RESET_GEN;
+                       else
+                               val &= ~HOT_RESET_GEN;
+                       advk_writel(pcie, val, PCIE_CORE_CTRL1_REG);
+               }
+               break;
+
+       default:
+               break;
+       }
+}
 
 static pci_bridge_emul_read_status_t
 advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge,
@@ -723,6 +845,7 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge,
        case PCI_EXP_RTCTL: {
                u32 val = advk_readl(pcie, PCIE_ISR0_MASK_REG);
                *value = (val & PCIE_MSG_PM_PME_MASK) ? 0 : PCI_EXP_RTCTL_PMEIE;
+               *value |= le16_to_cpu(bridge->pcie_conf.rootctl) & PCI_EXP_RTCTL_CRSSVE;
                *value |= PCI_EXP_RTCAP_CRSVIS << 16;
                return PCI_BRIDGE_EMUL_HANDLED;
        }
@@ -734,12 +857,26 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge,
                return PCI_BRIDGE_EMUL_HANDLED;
        }
 
+       case PCI_EXP_LNKCAP: {
+               u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg);
+               /*
+                * The PCI_EXP_LNKCAP_DLLLARC bit is hardwired to 0 in aardvark
+                * HW. But support for PCI_EXP_LNKSTA_DLLLA is emulated via the
+                * LTSSM state, so explicitly enable the PCI_EXP_LNKCAP_DLLLARC
+                * flag.
+                */
+               val |= PCI_EXP_LNKCAP_DLLLARC;
+               *value = val;
+               return PCI_BRIDGE_EMUL_HANDLED;
+       }
+
        case PCI_EXP_LNKCTL: {
                /* u32 contains both PCI_EXP_LNKCTL and PCI_EXP_LNKSTA */
                u32 val = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg) &
                        ~(PCI_EXP_LNKSTA_LT << 16);
-               if (!advk_pcie_link_up(pcie))
+               if (advk_pcie_link_training(pcie))
                        val |= (PCI_EXP_LNKSTA_LT << 16);
+               if (advk_pcie_link_active(pcie))
+                       val |= (PCI_EXP_LNKSTA_DLLLA << 16);
                *value = val;
                return PCI_BRIDGE_EMUL_HANDLED;
        }
@@ -747,7 +884,6 @@ advk_pci_bridge_emul_pcie_conf_read(struct pci_bridge_emul *bridge,
        case PCI_CAP_LIST_ID:
        case PCI_EXP_DEVCAP:
        case PCI_EXP_DEVCTL:
-       case PCI_EXP_LNKCAP:
                *value = advk_readl(pcie, PCIE_CORE_PCIEXP_CAP + reg);
                return PCI_BRIDGE_EMUL_HANDLED;
        default:
@@ -794,6 +930,8 @@ advk_pci_bridge_emul_pcie_conf_write(struct pci_bridge_emul *bridge,
 }
 
 static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = {
+       .read_base = advk_pci_bridge_emul_base_conf_read,
+       .write_base = advk_pci_bridge_emul_base_conf_write,
        .read_pcie = advk_pci_bridge_emul_pcie_conf_read,
        .write_pcie = advk_pci_bridge_emul_pcie_conf_write,
 };
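The base-conf read/write pair registered above illustrates the emulation split: most of the PCI_INTERRUPT_LINE word lives in the emulated config buffer, while the PCI_BRIDGE_CTL_BUS_RESET bit is sampled from and pushed to hardware on every access. A standalone sketch of that split (register layout simplified to the one live bit):

#include <stdint.h>
#include <stdio.h>

#define BUS_RESET_BIT  (1u << 22)   /* PCI_BRIDGE_CTL_BUS_RESET << 16 */

static uint32_t shadow;             /* emulated config buffer word */
static int hw_hot_reset;            /* stands in for HOT_RESET_GEN in HW */

static uint32_t conf_read(void)
{
        uint32_t val = shadow & ~BUS_RESET_BIT;

        if (hw_hot_reset)            /* only this bit is sampled from HW */
                val |= BUS_RESET_BIT;
        return val;
}

static void conf_write(uint32_t new_val, uint32_t mask)
{
        shadow = (shadow & ~mask) | (new_val & mask);
        if (mask & BUS_RESET_BIT)    /* ... and only this bit reaches HW */
                hw_hot_reset = !!(new_val & BUS_RESET_BIT);
}

int main(void)
{
        conf_write(BUS_RESET_BIT, BUS_RESET_BIT);
        printf("read back %#x (hw bit %d)\n", conf_read(), hw_hot_reset);
        return 0;
}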
@@ -805,7 +943,6 @@ static struct pci_bridge_emul_ops advk_pci_bridge_emul_ops = {
 static int advk_sw_pci_bridge_init(struct advk_pcie *pcie)
 {
        struct pci_bridge_emul *bridge = &pcie->bridge;
-       int ret;
 
        bridge->conf.vendor =
                cpu_to_le16(advk_readl(pcie, PCIE_CORE_DEV_ID_REG) & 0xffff);
@@ -825,19 +962,14 @@ static int advk_sw_pci_bridge_init(struct advk_pcie *pcie)
        /* Support interrupt A for MSI feature */
        bridge->conf.intpin = PCIE_CORE_INT_A_ASSERT_ENABLE;
 
+       /* Indicate support for Completion Retry Status */
+       bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS);
+
        bridge->has_pcie = true;
        bridge->data = pcie;
        bridge->ops = &advk_pci_bridge_emul_ops;
 
-       /* PCIe config space can be initialized after pci_bridge_emul_init() */
-       ret = pci_bridge_emul_init(bridge, 0);
-       if (ret < 0)
-               return ret;
-
-       /* Indicates supports for Completion Retry Status */
-       bridge->pcie_conf.rootcap = cpu_to_le16(PCI_EXP_RTCAP_CRSVIS);
-
-       return 0;
+       return pci_bridge_emul_init(bridge, 0);
 }
 
 static bool advk_pcie_valid_device(struct advk_pcie *pcie, struct pci_bus *bus,
@@ -889,6 +1021,7 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn,
                             int where, int size, u32 *val)
 {
        struct advk_pcie *pcie = bus->sysdata;
+       int retry_count;
        bool allow_crs;
        u32 reg;
        int ret;
@@ -911,18 +1044,8 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn,
                    (le16_to_cpu(pcie->bridge.pcie_conf.rootctl) &
                     PCI_EXP_RTCTL_CRSSVE);
 
-       if (advk_pcie_pio_is_running(pcie)) {
-               /*
-                * If it is possible return Completion Retry Status so caller
-                * tries to issue the request again instead of failing.
-                */
-               if (allow_crs) {
-                       *val = CFG_RD_CRS_VAL;
-                       return PCIBIOS_SUCCESSFUL;
-               }
-               *val = 0xffffffff;
-               return PCIBIOS_SET_FAILED;
-       }
+       if (advk_pcie_pio_is_running(pcie))
+               goto try_crs;
 
        /* Program the control register */
        reg = advk_readl(pcie, PIO_CTRL);
@@ -941,30 +1064,24 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn,
        /* Program the data strobe */
        advk_writel(pcie, 0xf, PIO_WR_DATA_STRB);
 
-       /* Clear PIO DONE ISR and start the transfer */
-       advk_writel(pcie, 1, PIO_ISR);
-       advk_writel(pcie, 1, PIO_START);
+       retry_count = 0;
+       do {
+               /* Clear PIO DONE ISR and start the transfer */
+               advk_writel(pcie, 1, PIO_ISR);
+               advk_writel(pcie, 1, PIO_START);
 
-       ret = advk_pcie_wait_pio(pcie);
-       if (ret < 0) {
-               /*
-                * If it is possible return Completion Retry Status so caller
-                * tries to issue the request again instead of failing.
-                */
-               if (allow_crs) {
-                       *val = CFG_RD_CRS_VAL;
-                       return PCIBIOS_SUCCESSFUL;
-               }
-               *val = 0xffffffff;
-               return PCIBIOS_SET_FAILED;
-       }
+               ret = advk_pcie_wait_pio(pcie);
+               if (ret < 0)
+                       goto try_crs;
 
-       /* Check PIO status and get the read result */
-       ret = advk_pcie_check_pio_status(pcie, allow_crs, val);
-       if (ret < 0) {
-               *val = 0xffffffff;
-               return PCIBIOS_SET_FAILED;
-       }
+               retry_count += ret;
+
+               /* Check PIO status and get the read result */
+               ret = advk_pcie_check_pio_status(pcie, allow_crs, val);
+       } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT);
+
+       if (ret < 0)
+               goto fail;
 
        if (size == 1)
                *val = (*val >> (8 * (where & 3))) & 0xff;
@@ -972,6 +1089,20 @@ static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn,
                *val = (*val >> (8 * (where & 3))) & 0xffff;
 
        return PCIBIOS_SUCCESSFUL;
+
+try_crs:
+       /*
+        * If it is possible, return Completion Retry Status so that the caller
+        * tries to issue the request again instead of failing.
+        */
+       if (allow_crs) {
+               *val = CFG_RD_CRS_VAL;
+               return PCIBIOS_SUCCESSFUL;
+       }
+
+fail:
+       *val = 0xffffffff;
+       return PCIBIOS_SET_FAILED;
 }
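Because advk_pcie_wait_pio() now returns the number of polling iterations it consumed, the do/while loop above shares a single PIO_RETRY_CNT budget between polling and CRS re-issues, so a config read cannot spin indefinitely. A sketch of that shared-budget idiom; the budget value here is assumed, not taken from the driver:

#include <errno.h>
#include <stdio.h>

#define RETRY_BUDGET 750        /* stands in for PIO_RETRY_CNT (value assumed) */

/* pretend transfer: reports polls consumed, succeeds on the third attempt */
static int do_transfer(int attempt, int *polls_used)
{
        *polls_used = 3;
        return attempt < 2 ? -EAGAIN : 0;
}

int main(void)
{
        int spent = 0, attempt = 0, polls, ret;

        do {
                ret = do_transfer(attempt++, &polls);
                spent += polls;
        } while (ret == -EAGAIN && spent < RETRY_BUDGET);

        printf("ret=%d after %d attempts, %d/%d budget used\n",
               ret, attempt, spent, RETRY_BUDGET);
        return 0;
}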
 
 static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
@@ -980,6 +1111,7 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
        struct advk_pcie *pcie = bus->sysdata;
        u32 reg;
        u32 data_strobe = 0x0;
+       int retry_count;
        int offset;
        int ret;
 
@@ -1021,19 +1153,22 @@ static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
        /* Program the data strobe */
        advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB);
 
-       /* Clear PIO DONE ISR and start the transfer */
-       advk_writel(pcie, 1, PIO_ISR);
-       advk_writel(pcie, 1, PIO_START);
+       retry_count = 0;
+       do {
+               /* Clear PIO DONE ISR and start the transfer */
+               advk_writel(pcie, 1, PIO_ISR);
+               advk_writel(pcie, 1, PIO_START);
 
-       ret = advk_pcie_wait_pio(pcie);
-       if (ret < 0)
-               return PCIBIOS_SET_FAILED;
+               ret = advk_pcie_wait_pio(pcie);
+               if (ret < 0)
+                       return PCIBIOS_SET_FAILED;
 
-       ret = advk_pcie_check_pio_status(pcie, false, NULL);
-       if (ret < 0)
-               return PCIBIOS_SET_FAILED;
+               retry_count += ret;
 
-       return PCIBIOS_SUCCESSFUL;
+               ret = advk_pcie_check_pio_status(pcie, false, NULL);
+       } while (ret == -EAGAIN && retry_count < PIO_RETRY_CNT);
+
+       return ret < 0 ? PCIBIOS_SET_FAILED : PCIBIOS_SUCCESSFUL;
 }
 
 static struct pci_ops advk_pcie_ops = {
@@ -1082,7 +1217,7 @@ static int advk_msi_irq_domain_alloc(struct irq_domain *domain,
                                    domain->host_data, handle_simple_irq,
                                    NULL, NULL);
 
-       return hwirq;
+       return 0;
 }
 
 static void advk_msi_irq_domain_free(struct irq_domain *domain,
@@ -1263,8 +1398,12 @@ static void advk_pcie_handle_msi(struct advk_pcie *pcie)
                if (!(BIT(msi_idx) & msi_status))
                        continue;
 
+               /*
+                * msi_idx contains bits [4:0] of the msi_data, and msi_data
+                * contains the 16-bit MSI interrupt number
+                */
                advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG);
-               msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF;
+               msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & PCIE_MSI_DATA_MASK;
                generic_handle_irq(msi_data);
        }
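The PCIE_MSI_DATA_MASK change above matters because the payload register carries a 16-bit MSI interrupt number: the old 0xFF mask aliased vectors 256 apart. A two-line demonstration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t payload = 0x12a;   /* example: MSI interrupt number 298 */

        printf("masked with 0xff:   %u\n", payload & 0xffu);    /* 42 - wrong  */
        printf("masked with 0xffff: %u\n", payload & 0xffffu);  /* 298 - right */
        return 0;
}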
 
@@ -1286,12 +1425,6 @@ static void advk_pcie_handle_int(struct advk_pcie *pcie)
        isr1_mask = advk_readl(pcie, PCIE_ISR1_MASK_REG);
        isr1_status = isr1_val & ((~isr1_mask) & PCIE_ISR1_ALL_MASK);
 
-       if (!isr0_status && !isr1_status) {
-               advk_writel(pcie, isr0_val, PCIE_ISR0_REG);
-               advk_writel(pcie, isr1_val, PCIE_ISR1_REG);
-               return;
-       }
-
        /* Process MSI interrupts */
        if (isr0_status & PCIE_ISR0_MSI_INT_PENDING)
                advk_pcie_handle_msi(pcie);
index 67c46e52c0dc3812f4d7e65b14d661b62d824eef..6733cb14e77534ef195474f3a6a8d61bd33794cc 100644 (file)
@@ -3126,14 +3126,14 @@ static int hv_pci_probe(struct hv_device *hdev,
 
        if (dom == HVPCI_DOM_INVALID) {
                dev_err(&hdev->device,
-                       "Unable to use dom# 0x%hx or other numbers", dom_req);
+                       "Unable to use dom# 0x%x or other numbers", dom_req);
                ret = -EINVAL;
                goto free_bus;
        }
 
        if (dom != dom_req)
                dev_info(&hdev->device,
-                        "PCI dom# 0x%hx has collision, using 0x%hx",
+                        "PCI dom# 0x%x has collision, using 0x%x",
                         dom_req, dom);
 
        hbus->bridge->domain_nr = dom;
index ffd84656544f2e2768bf0d91063b6b18b9fc73fe..e9d5ca245f5e08a6cb83248968ba80711e70176d 100644 (file)
@@ -17,7 +17,7 @@ static void set_val(u32 v, int where, int size, u32 *val)
 {
        int shift = (where & 3) * 8;
 
-       pr_debug("set_val %04x: %08x\n", (unsigned)(where & ~3), v);
+       pr_debug("set_val %04x: %08x\n", (unsigned int)(where & ~3), v);
        v >>= shift;
        if (size == 1)
                v &= 0xff;
@@ -187,7 +187,7 @@ static int thunder_ecam_config_read(struct pci_bus *bus, unsigned int devfn,
 
        pr_debug("%04x:%04x - Fix pass#: %08x, where: %03x, devfn: %03x\n",
                 vendor_device & 0xffff, vendor_device >> 16, class_rev,
-                (unsigned)where, devfn);
+                (unsigned int)where, devfn);
 
        /* Check for non type-00 header */
        if (cfg_type == 0) {
index b7a8e062fcc5262d8e70998155c058f8a5623035..c50ff279903c2a5a216bd40798d5fdd532f0a8b7 100644 (file)
@@ -302,7 +302,7 @@ static void xgene_msi_isr(struct irq_desc *desc)
 
        /*
         * MSIINTn (n is 0..F) indicates if there is a pending MSI interrupt
-        * If bit x of this register is set (x is 0..7), one or more interupts
+        * If bit x of this register is set (x is 0..7), one or more interrupts
         * corresponding to MSInIRx is set.
         */
        grp_select = xgene_msi_int_read(xgene_msi, msi_grp);
index e64536047b651f0101efc65f57ea34e638c18bc8..56d0d50338c89982423dead79933c17fcb386c43 100644 (file)
@@ -48,7 +48,6 @@
 #define EN_COHERENCY                   0xF0000000
 #define EN_REG                         0x00000001
 #define OB_LO_IO                       0x00000002
-#define XGENE_PCIE_VENDORID            0x10E8
 #define XGENE_PCIE_DEVICEID            0xE004
 #define SZ_1T                          (SZ_1G*1024ULL)
 #define PIPE_PHY_RATE_RD(src)          ((0xc000 & (u32)(src)) >> 0xe)
@@ -560,7 +559,7 @@ static int xgene_pcie_setup(struct xgene_pcie_port *port)
        xgene_pcie_clear_config(port);
 
        /* setup the vendor and device IDs correctly */
-       val = (XGENE_PCIE_DEVICEID << 16) | XGENE_PCIE_VENDORID;
+       val = (XGENE_PCIE_DEVICEID << 16) | PCI_VENDOR_ID_AMCC;
        xgene_pcie_writel(port, BRIDGE_CFG_0, val);
 
        ret = xgene_pcie_map_ranges(port);
diff --git a/drivers/pci/controller/pcie-apple.c b/drivers/pci/controller/pcie-apple.c
new file mode 100644 (file)
index 0000000..1bf4d75
--- /dev/null
@@ -0,0 +1,824 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe host bridge driver for Apple system-on-chips.
+ *
+ * The HW is ECAM compliant, so once the controller is initialized,
+ * the driver mostly deals with MSI mapping and handling of per-port
+ * interrupts (INTx, management and error signals).
+ *
+ * Initialization requires enabling power and clocks, along with a
+ * number of register pokes.
+ *
+ * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2021 Corellium LLC
+ * Copyright (C) 2021 Mark Kettenis <kettenis@openbsd.org>
+ *
+ * Author: Alyssa Rosenzweig <alyssa@rosenzweig.io>
+ * Author: Marc Zyngier <maz@kernel.org>
+ */
+
+#include <linux/gpio/consumer.h>
+#include <linux/kernel.h>
+#include <linux/iopoll.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/notifier.h>
+#include <linux/of_irq.h>
+#include <linux/pci-ecam.h>
+
+#define CORE_RC_PHYIF_CTL              0x00024
+#define   CORE_RC_PHYIF_CTL_RUN                BIT(0)
+#define CORE_RC_PHYIF_STAT             0x00028
+#define   CORE_RC_PHYIF_STAT_REFCLK    BIT(4)
+#define CORE_RC_CTL                    0x00050
+#define   CORE_RC_CTL_RUN              BIT(0)
+#define CORE_RC_STAT                   0x00058
+#define   CORE_RC_STAT_READY           BIT(0)
+#define CORE_FABRIC_STAT               0x04000
+#define   CORE_FABRIC_STAT_MASK                0x001F001F
+#define CORE_LANE_CFG(port)            (0x84000 + 0x4000 * (port))
+#define   CORE_LANE_CFG_REFCLK0REQ     BIT(0)
+#define   CORE_LANE_CFG_REFCLK1                BIT(1)
+#define   CORE_LANE_CFG_REFCLK0ACK     BIT(2)
+#define   CORE_LANE_CFG_REFCLKEN       (BIT(9) | BIT(10))
+#define CORE_LANE_CTL(port)            (0x84004 + 0x4000 * (port))
+#define   CORE_LANE_CTL_CFGACC         BIT(15)
+
+#define PORT_LTSSMCTL                  0x00080
+#define   PORT_LTSSMCTL_START          BIT(0)
+#define PORT_INTSTAT                   0x00100
+#define   PORT_INT_TUNNEL_ERR          31
+#define   PORT_INT_CPL_TIMEOUT         23
+#define   PORT_INT_RID2SID_MAPERR      22
+#define   PORT_INT_CPL_ABORT           21
+#define   PORT_INT_MSI_BAD_DATA                19
+#define   PORT_INT_MSI_ERR             18
+#define   PORT_INT_REQADDR_GT32                17
+#define   PORT_INT_AF_TIMEOUT          15
+#define   PORT_INT_LINK_DOWN           14
+#define   PORT_INT_LINK_UP             12
+#define   PORT_INT_LINK_BWMGMT         11
+#define   PORT_INT_AER_MASK            (15 << 4)
+#define   PORT_INT_PORT_ERR            4
+#define   PORT_INT_INTx(i)             i
+#define   PORT_INT_INTx_MASK           15
+#define PORT_INTMSK                    0x00104
+#define PORT_INTMSKSET                 0x00108
+#define PORT_INTMSKCLR                 0x0010c
+#define PORT_MSICFG                    0x00124
+#define   PORT_MSICFG_EN               BIT(0)
+#define   PORT_MSICFG_L2MSINUM_SHIFT   4
+#define PORT_MSIBASE                   0x00128
+#define   PORT_MSIBASE_1_SHIFT         16
+#define PORT_MSIADDR                   0x00168
+#define PORT_LINKSTS                   0x00208
+#define   PORT_LINKSTS_UP              BIT(0)
+#define   PORT_LINKSTS_BUSY            BIT(2)
+#define PORT_LINKCMDSTS                        0x00210
+#define PORT_OUTS_NPREQS               0x00284
+#define   PORT_OUTS_NPREQS_REQ         BIT(24)
+#define   PORT_OUTS_NPREQS_CPL         BIT(16)
+#define PORT_RXWR_FIFO                 0x00288
+#define   PORT_RXWR_FIFO_HDR           GENMASK(15, 10)
+#define   PORT_RXWR_FIFO_DATA          GENMASK(9, 0)
+#define PORT_RXRD_FIFO                 0x0028C
+#define   PORT_RXRD_FIFO_REQ           GENMASK(6, 0)
+#define PORT_OUTS_CPLS                 0x00290
+#define   PORT_OUTS_CPLS_SHRD          GENMASK(14, 8)
+#define   PORT_OUTS_CPLS_WAIT          GENMASK(6, 0)
+#define PORT_APPCLK                    0x00800
+#define   PORT_APPCLK_EN               BIT(0)
+#define   PORT_APPCLK_CGDIS            BIT(8)
+#define PORT_STATUS                    0x00804
+#define   PORT_STATUS_READY            BIT(0)
+#define PORT_REFCLK                    0x00810
+#define   PORT_REFCLK_EN               BIT(0)
+#define   PORT_REFCLK_CGDIS            BIT(8)
+#define PORT_PERST                     0x00814
+#define   PORT_PERST_OFF               BIT(0)
+#define PORT_RID2SID(i16)              (0x00828 + 4 * (i16))
+#define   PORT_RID2SID_VALID           BIT(31)
+#define   PORT_RID2SID_SID_SHIFT       16
+#define   PORT_RID2SID_BUS_SHIFT       8
+#define   PORT_RID2SID_DEV_SHIFT       3
+#define   PORT_RID2SID_FUNC_SHIFT      0
+#define PORT_OUTS_PREQS_HDR            0x00980
+#define   PORT_OUTS_PREQS_HDR_MASK     GENMASK(9, 0)
+#define PORT_OUTS_PREQS_DATA           0x00984
+#define   PORT_OUTS_PREQS_DATA_MASK    GENMASK(15, 0)
+#define PORT_TUNCTRL                   0x00988
+#define   PORT_TUNCTRL_PERST_ON                BIT(0)
+#define   PORT_TUNCTRL_PERST_ACK_REQ   BIT(1)
+#define PORT_TUNSTAT                   0x0098c
+#define   PORT_TUNSTAT_PERST_ON                BIT(0)
+#define   PORT_TUNSTAT_PERST_ACK_PEND  BIT(1)
+#define PORT_PREFMEM_ENABLE            0x00994
+
+#define MAX_RID2SID                    64
+
+/*
+ * The doorbell address is set to 0xfffff000, which by convention
+ * matches what macOS does, and it is possible to use any other
+ * address (in the bottom 4GB, as the base register is only 32bit).
+ * However, it has to be excluded from the IOVA range, and the DART
+ * driver has to know about it.
+ */
+#define DOORBELL_ADDR          CONFIG_PCIE_APPLE_MSI_DOORBELL_ADDR
+
+struct apple_pcie {
+       struct mutex            lock;
+       struct device           *dev;
+       void __iomem            *base;
+       struct irq_domain       *domain;
+       unsigned long           *bitmap;
+       struct list_head        ports;
+       struct completion       event;
+       struct irq_fwspec       fwspec;
+       u32                     nvecs;
+};
+
+struct apple_pcie_port {
+       struct apple_pcie       *pcie;
+       struct device_node      *np;
+       void __iomem            *base;
+       struct irq_domain       *domain;
+       struct list_head        entry;
+       DECLARE_BITMAP(sid_map, MAX_RID2SID);
+       int                     sid_map_sz;
+       int                     idx;
+};
+
+static void rmw_set(u32 set, void __iomem *addr)
+{
+       writel_relaxed(readl_relaxed(addr) | set, addr);
+}
+
+static void rmw_clear(u32 clr, void __iomem *addr)
+{
+       writel_relaxed(readl_relaxed(addr) & ~clr, addr);
+}
+
+static void apple_msi_top_irq_mask(struct irq_data *d)
+{
+       pci_msi_mask_irq(d);
+       irq_chip_mask_parent(d);
+}
+
+static void apple_msi_top_irq_unmask(struct irq_data *d)
+{
+       pci_msi_unmask_irq(d);
+       irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip apple_msi_top_chip = {
+       .name                   = "PCIe MSI",
+       .irq_mask               = apple_msi_top_irq_mask,
+       .irq_unmask             = apple_msi_top_irq_unmask,
+       .irq_eoi                = irq_chip_eoi_parent,
+       .irq_set_affinity       = irq_chip_set_affinity_parent,
+       .irq_set_type           = irq_chip_set_type_parent,
+};
+
+static void apple_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+       msg->address_hi = upper_32_bits(DOORBELL_ADDR);
+       msg->address_lo = lower_32_bits(DOORBELL_ADDR);
+       msg->data = data->hwirq;
+}
+
+static struct irq_chip apple_msi_bottom_chip = {
+       .name                   = "MSI",
+       .irq_mask               = irq_chip_mask_parent,
+       .irq_unmask             = irq_chip_unmask_parent,
+       .irq_eoi                = irq_chip_eoi_parent,
+       .irq_set_affinity       = irq_chip_set_affinity_parent,
+       .irq_set_type           = irq_chip_set_type_parent,
+       .irq_compose_msi_msg    = apple_msi_compose_msg,
+};
+
+static int apple_msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs, void *args)
+{
+       struct apple_pcie *pcie = domain->host_data;
+       struct irq_fwspec fwspec = pcie->fwspec;
+       unsigned int i;
+       int ret, hwirq;
+
+       mutex_lock(&pcie->lock);
+
+       hwirq = bitmap_find_free_region(pcie->bitmap, pcie->nvecs,
+                                       order_base_2(nr_irqs));
+
+       mutex_unlock(&pcie->lock);
+
+       if (hwirq < 0)
+               return -ENOSPC;
+
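+       /*
+        * pcie->fwspec was seeded from the "msi-ranges" parent interrupt
+        * specifier; offset it by the allocated vector to select the
+        * matching parent interrupt.
+        */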
+       fwspec.param[1] += hwirq;
+
+       ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, &fwspec);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < nr_irqs; i++) {
+               irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+                                             &apple_msi_bottom_chip,
+                                             domain->host_data);
+       }
+
+       return 0;
+}
+
+static void apple_msi_domain_free(struct irq_domain *domain, unsigned int virq,
+                                 unsigned int nr_irqs)
+{
+       struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+       struct apple_pcie *pcie = domain->host_data;
+
+       mutex_lock(&pcie->lock);
+
+       bitmap_release_region(pcie->bitmap, d->hwirq, order_base_2(nr_irqs));
+
+       mutex_unlock(&pcie->lock);
+}
+
+static const struct irq_domain_ops apple_msi_domain_ops = {
+       .alloc  = apple_msi_domain_alloc,
+       .free   = apple_msi_domain_free,
+};
+
+static struct msi_domain_info apple_msi_info = {
+       .flags  = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+                  MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
+       .chip   = &apple_msi_top_chip,
+};
+
+static void apple_port_irq_mask(struct irq_data *data)
+{
+       struct apple_pcie_port *port = irq_data_get_irq_chip_data(data);
+
+       writel_relaxed(BIT(data->hwirq), port->base + PORT_INTMSKSET);
+}
+
+static void apple_port_irq_unmask(struct irq_data *data)
+{
+       struct apple_pcie_port *port = irq_data_get_irq_chip_data(data);
+
+       writel_relaxed(BIT(data->hwirq), port->base + PORT_INTMSKCLR);
+}
+
+static bool hwirq_is_intx(unsigned int hwirq)
+{
+       return BIT(hwirq) & PORT_INT_INTx_MASK;
+}
+
+static void apple_port_irq_ack(struct irq_data *data)
+{
+       struct apple_pcie_port *port = irq_data_get_irq_chip_data(data);
+
+       if (!hwirq_is_intx(data->hwirq))
+               writel_relaxed(BIT(data->hwirq), port->base + PORT_INTSTAT);
+}
+
+static int apple_port_irq_set_type(struct irq_data *data, unsigned int type)
+{
+       /*
+        * It doesn't seem that there is any way to configure the
+        * trigger, so assume INTx have to be level (as per the spec),
+        * and the rest is edge (which looks likely).
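+        * The XOR below therefore rejects a level trigger on anything
+        * but INTx, and an edge trigger on INTx.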
+        */
+       if (hwirq_is_intx(data->hwirq) ^ !!(type & IRQ_TYPE_LEVEL_MASK))
+               return -EINVAL;
+
+       irqd_set_trigger_type(data, type);
+       return 0;
+}
+
+static struct irq_chip apple_port_irqchip = {
+       .name           = "PCIe",
+       .irq_ack        = apple_port_irq_ack,
+       .irq_mask       = apple_port_irq_mask,
+       .irq_unmask     = apple_port_irq_unmask,
+       .irq_set_type   = apple_port_irq_set_type,
+};
+
+static int apple_port_irq_domain_alloc(struct irq_domain *domain,
+                                      unsigned int virq, unsigned int nr_irqs,
+                                      void *args)
+{
+       struct apple_pcie_port *port = domain->host_data;
+       struct irq_fwspec *fwspec = args;
+       int i;
+
+       for (i = 0; i < nr_irqs; i++) {
+               irq_flow_handler_t flow = handle_edge_irq;
+               unsigned int type = IRQ_TYPE_EDGE_RISING;
+
+               if (hwirq_is_intx(fwspec->param[0] + i)) {
+                       flow = handle_level_irq;
+                       type = IRQ_TYPE_LEVEL_HIGH;
+               }
+
+               irq_domain_set_info(domain, virq + i, fwspec->param[0] + i,
+                                   &apple_port_irqchip, port, flow,
+                                   NULL, NULL);
+
+               irq_set_irq_type(virq + i, type);
+       }
+
+       return 0;
+}
+
+static void apple_port_irq_domain_free(struct irq_domain *domain,
+                                      unsigned int virq, unsigned int nr_irqs)
+{
+       int i;
+
+       for (i = 0; i < nr_irqs; i++) {
+               struct irq_data *d = irq_domain_get_irq_data(domain, virq + i);
+
+               irq_set_handler(virq + i, NULL);
+               irq_domain_reset_irq_data(d);
+       }
+}
+
+static const struct irq_domain_ops apple_port_irq_domain_ops = {
+       .translate      = irq_domain_translate_onecell,
+       .alloc          = apple_port_irq_domain_alloc,
+       .free           = apple_port_irq_domain_free,
+};
+
+static void apple_port_irq_handler(struct irq_desc *desc)
+{
+       struct apple_pcie_port *port = irq_desc_get_handler_data(desc);
+       struct irq_chip *chip = irq_desc_get_chip(desc);
+       unsigned long stat;
+       int i;
+
+       chained_irq_enter(chip, desc);
+
+       stat = readl_relaxed(port->base + PORT_INTSTAT);
+
+       for_each_set_bit(i, &stat, 32)
+               generic_handle_domain_irq(port->domain, i);
+
+       chained_irq_exit(chip, desc);
+}
+
+static int apple_pcie_port_setup_irq(struct apple_pcie_port *port)
+{
+       struct fwnode_handle *fwnode = &port->np->fwnode;
+       unsigned int irq;
+
+       /* FIXME: consider moving each interrupt under each port */
+       irq = irq_of_parse_and_map(to_of_node(dev_fwnode(port->pcie->dev)),
+                                  port->idx);
+       if (!irq)
+               return -ENXIO;
+
+       port->domain = irq_domain_create_linear(fwnode, 32,
+                                               &apple_port_irq_domain_ops,
+                                               port);
+       if (!port->domain)
+               return -ENOMEM;
+
+       /* Disable all interrupts */
+       writel_relaxed(~0, port->base + PORT_INTMSKSET);
+       writel_relaxed(~0, port->base + PORT_INTSTAT);
+
+       irq_set_chained_handler_and_data(irq, apple_port_irq_handler, port);
+
+       /* Configure MSI base address */
+       BUILD_BUG_ON(upper_32_bits(DOORBELL_ADDR));
+       writel_relaxed(lower_32_bits(DOORBELL_ADDR), port->base + PORT_MSIADDR);
+
+       /* Enable MSIs, shared between all ports */
+       writel_relaxed(0, port->base + PORT_MSIBASE);
+       writel_relaxed((ilog2(port->pcie->nvecs) << PORT_MSICFG_L2MSINUM_SHIFT) |
+                      PORT_MSICFG_EN, port->base + PORT_MSICFG);
+
+       return 0;
+}
+
+static irqreturn_t apple_pcie_port_irq(int irq, void *data)
+{
+       struct apple_pcie_port *port = data;
+       unsigned int hwirq = irq_domain_get_irq_data(port->domain, irq)->hwirq;
+
+       switch (hwirq) {
+       case PORT_INT_LINK_UP:
+               dev_info_ratelimited(port->pcie->dev, "Link up on %pOF\n",
+                                    port->np);
+               complete_all(&port->pcie->event);
+               break;
+       case PORT_INT_LINK_DOWN:
+               dev_info_ratelimited(port->pcie->dev, "Link down on %pOF\n",
+                                    port->np);
+               break;
+       default:
+               return IRQ_NONE;
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int apple_pcie_port_register_irqs(struct apple_pcie_port *port)
+{
+       static struct {
+               unsigned int    hwirq;
+               const char      *name;
+       } port_irqs[] = {
+               { PORT_INT_LINK_UP,     "Link up",      },
+               { PORT_INT_LINK_DOWN,   "Link down",    },
+       };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(port_irqs); i++) {
+               struct irq_fwspec fwspec = {
+                       .fwnode         = &port->np->fwnode,
+                       .param_count    = 1,
+                       .param          = {
+                               [0]     = port_irqs[i].hwirq,
+                       },
+               };
+               unsigned int irq;
+               int ret;
+
+               irq = irq_domain_alloc_irqs(port->domain, 1, NUMA_NO_NODE,
+                                           &fwspec);
+               if (WARN_ON(!irq))
+                       continue;
+
+               ret = request_irq(irq, apple_pcie_port_irq, 0,
+                                 port_irqs[i].name, port);
+               WARN_ON(ret);
+       }
+
+       return 0;
+}
+
+static int apple_pcie_setup_refclk(struct apple_pcie *pcie,
+                                  struct apple_pcie_port *port)
+{
+       u32 stat;
+       int res;
+
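+       /*
+        * Bring-up handshake: wait for the core reference clock, then
+        * request refclk0 behind the CFGACC gate and wait for its ack,
+        * start refclk1, and finally enable the lane and port refclks.
+        */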
+       res = readl_relaxed_poll_timeout(pcie->base + CORE_RC_PHYIF_STAT, stat,
+                                        stat & CORE_RC_PHYIF_STAT_REFCLK,
+                                        100, 50000);
+       if (res < 0)
+               return res;
+
+       rmw_set(CORE_LANE_CTL_CFGACC, pcie->base + CORE_LANE_CTL(port->idx));
+       rmw_set(CORE_LANE_CFG_REFCLK0REQ, pcie->base + CORE_LANE_CFG(port->idx));
+
+       res = readl_relaxed_poll_timeout(pcie->base + CORE_LANE_CFG(port->idx),
+                                        stat, stat & CORE_LANE_CFG_REFCLK0ACK,
+                                        100, 50000);
+       if (res < 0)
+               return res;
+
+       rmw_set(CORE_LANE_CFG_REFCLK1, pcie->base + CORE_LANE_CFG(port->idx));
+       res = readl_relaxed_poll_timeout(pcie->base + CORE_LANE_CFG(port->idx),
+                                        stat, stat & CORE_LANE_CFG_REFCLK1,
+                                        100, 50000);
+
+       if (res < 0)
+               return res;
+
+       rmw_clear(CORE_LANE_CTL_CFGACC, pcie->base + CORE_LANE_CTL(port->idx));
+
+       rmw_set(CORE_LANE_CFG_REFCLKEN, pcie->base + CORE_LANE_CFG(port->idx));
+       rmw_set(PORT_REFCLK_EN, port->base + PORT_REFCLK);
+
+       return 0;
+}
+
+static u32 apple_pcie_rid2sid_write(struct apple_pcie_port *port,
+                                   int idx, u32 val)
+{
+       writel_relaxed(val, port->base + PORT_RID2SID(idx));
+       /* Read back to ensure completion of the write */
+       return readl_relaxed(port->base + PORT_RID2SID(idx));
+}
+
+static int apple_pcie_setup_port(struct apple_pcie *pcie,
+                                struct device_node *np)
+{
+       struct platform_device *platform = to_platform_device(pcie->dev);
+       struct apple_pcie_port *port;
+       struct gpio_desc *reset;
+       u32 stat, idx;
+       int ret, i;
+
+       reset = gpiod_get_from_of_node(np, "reset-gpios", 0,
+                                      GPIOD_OUT_LOW, "#PERST");
+       if (IS_ERR(reset))
+               return PTR_ERR(reset);
+
+       port = devm_kzalloc(pcie->dev, sizeof(*port), GFP_KERNEL);
+       if (!port)
+               return -ENOMEM;
+
+       ret = of_property_read_u32_index(np, "reg", 0, &idx);
+       if (ret)
+               return ret;
+
+       /* Use the first reg entry to work out the port index */
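+       /* (the device number lives in bits 15:11 of the first reg cell) */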
+       port->idx = idx >> 11;
+       port->pcie = pcie;
+       port->np = np;
+
+       port->base = devm_platform_ioremap_resource(platform, port->idx + 2);
+       if (IS_ERR(port->base))
+               return PTR_ERR(port->base);
+
+       rmw_set(PORT_APPCLK_EN, port->base + PORT_APPCLK);
+
+       ret = apple_pcie_setup_refclk(pcie, port);
+       if (ret < 0)
+               return ret;
+
+       rmw_set(PORT_PERST_OFF, port->base + PORT_PERST);
+       gpiod_set_value(reset, 1);
+
+       ret = readl_relaxed_poll_timeout(port->base + PORT_STATUS, stat,
+                                        stat & PORT_STATUS_READY, 100, 250000);
+       if (ret < 0) {
+               dev_err(pcie->dev, "port %pOF ready wait timeout\n", np);
+               return ret;
+       }
+
+       ret = apple_pcie_port_setup_irq(port);
+       if (ret)
+               return ret;
+
+       /* Reset all RID/SID mappings, and check for RAZ/WI registers */
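+       /*
+        * An entry that does not read back the scratch value just
+        * written is read-as-zero/write-ignored, which marks the end
+        * of the usable table.
+        */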
+       for (i = 0; i < MAX_RID2SID; i++) {
+               if (apple_pcie_rid2sid_write(port, i, 0xbad1d) != 0xbad1d)
+                       break;
+               apple_pcie_rid2sid_write(port, i, 0);
+       }
+
+       dev_dbg(pcie->dev, "%pOF: %d RID/SID mapping entries\n", np, i);
+
+       port->sid_map_sz = i;
+
+       list_add_tail(&port->entry, &pcie->ports);
+       init_completion(&pcie->event);
+
+       ret = apple_pcie_port_register_irqs(port);
+       WARN_ON(ret);
+
+       writel_relaxed(PORT_LTSSMCTL_START, port->base + PORT_LTSSMCTL);
+
+       if (!wait_for_completion_timeout(&pcie->event, HZ / 10))
+               dev_warn(pcie->dev, "%pOF link didn't come up\n", np);
+
+       return 0;
+}
+
+static int apple_msi_init(struct apple_pcie *pcie)
+{
+       struct fwnode_handle *fwnode = dev_fwnode(pcie->dev);
+       struct of_phandle_args args = {};
+       struct irq_domain *parent;
+       int ret;
+
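+       /*
+        * "msi-ranges" is a list of (parent phandle, parent interrupt
+        * specifier, number of MSIs) tuples; the specifier seeds the
+        * fwspec below and the trailing cell gives the vector count.
+        */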
+       ret = of_parse_phandle_with_args(to_of_node(fwnode), "msi-ranges",
+                                        "#interrupt-cells", 0, &args);
+       if (ret)
+               return ret;
+
+       ret = of_property_read_u32_index(to_of_node(fwnode), "msi-ranges",
+                                        args.args_count + 1, &pcie->nvecs);
+       if (ret)
+               return ret;
+
+       of_phandle_args_to_fwspec(args.np, args.args, args.args_count,
+                                 &pcie->fwspec);
+
+       pcie->bitmap = devm_bitmap_zalloc(pcie->dev, pcie->nvecs, GFP_KERNEL);
+       if (!pcie->bitmap)
+               return -ENOMEM;
+
+       parent = irq_find_matching_fwspec(&pcie->fwspec, DOMAIN_BUS_WIRED);
+       if (!parent) {
+               dev_err(pcie->dev, "failed to find parent domain\n");
+               return -ENXIO;
+       }
+
+       parent = irq_domain_create_hierarchy(parent, 0, pcie->nvecs, fwnode,
+                                            &apple_msi_domain_ops, pcie);
+       if (!parent) {
+               dev_err(pcie->dev, "failed to create IRQ domain\n");
+               return -ENOMEM;
+       }
+       irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS);
+
+       pcie->domain = pci_msi_create_irq_domain(fwnode, &apple_msi_info,
+                                                parent);
+       if (!pcie->domain) {
+               dev_err(pcie->dev, "failed to create MSI domain\n");
+               irq_domain_remove(parent);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static struct apple_pcie_port *apple_pcie_get_port(struct pci_dev *pdev)
+{
+       struct pci_config_window *cfg = pdev->sysdata;
+       struct apple_pcie *pcie = cfg->priv;
+       struct pci_dev *port_pdev;
+       struct apple_pcie_port *port;
+
+       /* Find the root port this device is on */
+       port_pdev = pcie_find_root_port(pdev);
+
+       /* If finding the port itself, nothing to do */
+       if (WARN_ON(!port_pdev) || pdev == port_pdev)
+               return NULL;
+
+       list_for_each_entry(port, &pcie->ports, entry) {
+               if (port->idx == PCI_SLOT(port_pdev->devfn))
+                       return port;
+       }
+
+       return NULL;
+}
+
+static int apple_pcie_add_device(struct apple_pcie_port *port,
+                                struct pci_dev *pdev)
+{
+       u32 sid, rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
+       int idx, err;
+
+       dev_dbg(&pdev->dev, "added to bus %s, index %d\n",
+               pci_name(pdev->bus->self), port->idx);
+
+       err = of_map_id(port->pcie->dev->of_node, rid, "iommu-map",
+                       "iommu-map-mask", NULL, &sid);
+       if (err)
+               return err;
+
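+       /*
+        * Program a free RID2SID entry so the port translates this
+        * device's requester ID to the stream ID obtained from the
+        * "iommu-map" lookup above.
+        */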
+       mutex_lock(&port->pcie->lock);
+
+       idx = bitmap_find_free_region(port->sid_map, port->sid_map_sz, 0);
+       if (idx >= 0) {
+               apple_pcie_rid2sid_write(port, idx,
+                                        PORT_RID2SID_VALID |
+                                        (sid << PORT_RID2SID_SID_SHIFT) | rid);
+
+               dev_dbg(&pdev->dev, "mapping RID%x to SID%x (index %d)\n",
+                       rid, sid, idx);
+       }
+
+       mutex_unlock(&port->pcie->lock);
+
+       return idx >= 0 ? 0 : -ENOSPC;
+}
+
+static void apple_pcie_release_device(struct apple_pcie_port *port,
+                                     struct pci_dev *pdev)
+{
+       u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
+       int idx;
+
+       mutex_lock(&port->pcie->lock);
+
+       for_each_set_bit(idx, port->sid_map, port->sid_map_sz) {
+               u32 val;
+
+               val = readl_relaxed(port->base + PORT_RID2SID(idx));
+               if ((val & 0xffff) == rid) {
+                       apple_pcie_rid2sid_write(port, idx, 0);
+                       bitmap_release_region(port->sid_map, idx, 0);
+                       dev_dbg(&pdev->dev, "Released %x (%d)\n", val, idx);
+                       break;
+               }
+       }
+
+       mutex_unlock(&port->pcie->lock);
+}
+
+static int apple_pcie_bus_notifier(struct notifier_block *nb,
+                                  unsigned long action,
+                                  void *data)
+{
+       struct device *dev = data;
+       struct pci_dev *pdev = to_pci_dev(dev);
+       struct apple_pcie_port *port;
+       int err;
+
+       /*
+        * This is a bit ugly. We assume that if we get notified for
+        * any PCI device, we must be in charge of it, and that there
+        * is no other PCI controller in the whole system. It probably
+        * holds for now, but who knows for how long?
+        */
+       port = apple_pcie_get_port(pdev);
+       if (!port)
+               return NOTIFY_DONE;
+
+       switch (action) {
+       case BUS_NOTIFY_ADD_DEVICE:
+               err = apple_pcie_add_device(port, pdev);
+               if (err)
+                       return notifier_from_errno(err);
+               break;
+       case BUS_NOTIFY_DEL_DEVICE:
+               apple_pcie_release_device(port, pdev);
+               break;
+       default:
+               return NOTIFY_DONE;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block apple_pcie_nb = {
+       .notifier_call = apple_pcie_bus_notifier,
+};
+
+static int apple_pcie_init(struct pci_config_window *cfg)
+{
+       struct device *dev = cfg->parent;
+       struct platform_device *platform = to_platform_device(dev);
+       struct device_node *of_port;
+       struct apple_pcie *pcie;
+       int ret;
+
+       pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
+       if (!pcie)
+               return -ENOMEM;
+
+       pcie->dev = dev;
+
+       mutex_init(&pcie->lock);
+
+       pcie->base = devm_platform_ioremap_resource(platform, 1);
+       if (IS_ERR(pcie->base))
+               return PTR_ERR(pcie->base);
+
+       cfg->priv = pcie;
+       INIT_LIST_HEAD(&pcie->ports);
+
+       for_each_child_of_node(dev->of_node, of_port) {
+               ret = apple_pcie_setup_port(pcie, of_port);
+               if (ret) {
+                       dev_err(pcie->dev, "Port %pOF setup failed: %d\n", of_port, ret);
+                       of_node_put(of_port);
+                       return ret;
+               }
+       }
+
+       return apple_msi_init(pcie);
+}
+
+static int apple_pcie_probe(struct platform_device *pdev)
+{
+       int ret;
+
+       ret = bus_register_notifier(&pci_bus_type, &apple_pcie_nb);
+       if (ret)
+               return ret;
+
+       ret = pci_host_common_probe(pdev);
+       if (ret)
+               bus_unregister_notifier(&pci_bus_type, &apple_pcie_nb);
+
+       return ret;
+}
+
+static const struct pci_ecam_ops apple_pcie_cfg_ecam_ops = {
+       .init           = apple_pcie_init,
+       .pci_ops        = {
+               .map_bus        = pci_ecam_map_bus,
+               .read           = pci_generic_config_read,
+               .write          = pci_generic_config_write,
+       }
+};
+
+static const struct of_device_id apple_pcie_of_match[] = {
+       { .compatible = "apple,pcie", .data = &apple_pcie_cfg_ecam_ops },
+       { }
+};
+MODULE_DEVICE_TABLE(of, apple_pcie_of_match);
+
+static struct platform_driver apple_pcie_driver = {
+       .probe  = apple_pcie_probe,
+       .driver = {
+               .name                   = "pcie-apple",
+               .of_match_table         = apple_pcie_of_match,
+               .suppress_bind_attrs    = true,
+       },
+};
+module_platform_driver(apple_pcie_driver);
+
+MODULE_LICENSE("GPL v2");
index cc30215f5a433697700ffbe24a5c6ab3f341a8c1..1fc7bd49a7ad344029ef882d3fdc889397265f48 100644 (file)
 #define BRCM_INT_PCI_MSI_LEGACY_NR     8
 #define BRCM_INT_PCI_MSI_SHIFT         0
 
-/* MSI target adresses */
+/* MSI target addresses */
 #define BRCM_MSI_TARGET_ADDR_LT_4GB    0x0fffffffcULL
 #define BRCM_MSI_TARGET_ADDR_GT_4GB    0xffffffffcULL
 
index 30ac5fbefbbff5cac388a08e2e1288cbafa05cc8..36b9d2c46cfa0a27fdd34a3fe38a982207cebf50 100644 (file)
@@ -249,7 +249,7 @@ enum iproc_pcie_reg {
 
        /*
         * To hold the address of the register where the MSI writes are
-        * programed.  When ARM GICv3 ITS is used, this should be programmed
+        * programmed.  When ARM GICv3 ITS is used, this should be programmed
         * with the address of the GITS_TRANSLATER register.
         */
        IPROC_PCIE_MSI_ADDR_LO,
diff --git a/drivers/pci/controller/pcie-mt7621.c b/drivers/pci/controller/pcie-mt7621.c
new file mode 100644 (file)
index 0000000..b60dfb4
--- /dev/null
@@ -0,0 +1,600 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * BRIEF MODULE DESCRIPTION
+ *     PCI init for Ralink RT2880 solution
+ *
+ * Copyright 2007 Ralink Inc. (bruce_chang@ralinktech.com.tw)
+ *
+ * May 2007 Bruce Chang
+ * Initial Release
+ *
+ * May 2009 Bruce Chang
+ * support RT2880/RT3883 PCIe
+ *
+ * May 2011 Bruce Chang
+ * support RT6855/MT7620 PCIe
+ */
+
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/gpio/consumer.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/phy/phy.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/sys_soc.h>
+
+/* MediaTek-specific configuration registers */
+#define PCIE_FTS_NUM                   0x70c
+#define PCIE_FTS_NUM_MASK              GENMASK(15, 8)
+#define PCIE_FTS_NUM_L0(x)             (((x) & 0xff) << 8)
+
+/* Host-PCI bridge registers */
+#define RALINK_PCI_PCICFG_ADDR         0x0000
+#define RALINK_PCI_PCIMSK_ADDR         0x000c
+#define RALINK_PCI_CONFIG_ADDR         0x0020
+#define RALINK_PCI_CONFIG_DATA         0x0024
+#define RALINK_PCI_MEMBASE             0x0028
+#define RALINK_PCI_IOBASE              0x002c
+
+/* PCIe RC control registers */
+#define RALINK_PCI_ID                  0x0030
+#define RALINK_PCI_CLASS               0x0034
+#define RALINK_PCI_SUBID               0x0038
+#define RALINK_PCI_STATUS              0x0050
+
+/* Some definition values */
+#define PCIE_REVISION_ID               BIT(0)
+#define PCIE_CLASS_CODE                        (0x60400 << 8)
+#define PCIE_BAR_MAP_MAX               GENMASK(30, 16)
+#define PCIE_BAR_ENABLE                        BIT(0)
+#define PCIE_PORT_INT_EN(x)            BIT(20 + (x))
+#define PCIE_PORT_LINKUP               BIT(0)
+#define PCIE_PORT_CNT                  3
+
+#define PERST_DELAY_MS                 100
+
+/**
+ * struct mt7621_pcie_port - PCIe port information
+ * @base: I/O mapped register base
+ * @list: port list
+ * @pcie: pointer to PCIe host info
+ * @clk: pointer to the port clock gate
+ * @phy: pointer to PHY control block
+ * @pcie_rst: pointer to port reset control
+ * @gpio_rst: gpio reset
+ * @slot: port slot
+ * @enabled: indicates if port is enabled
+ */
+struct mt7621_pcie_port {
+       void __iomem *base;
+       struct list_head list;
+       struct mt7621_pcie *pcie;
+       struct clk *clk;
+       struct phy *phy;
+       struct reset_control *pcie_rst;
+       struct gpio_desc *gpio_rst;
+       u32 slot;
+       bool enabled;
+};
+
+/**
+ * struct mt7621_pcie - PCIe host information
+ * @base: I/O mapped register base
+ * @dev: pointer to PCIe device
+ * @ports: pointer to PCIe port information
+ * @resets_inverted: reset lines are inverted depending on chip revision
+ */
+struct mt7621_pcie {
+       void __iomem *base;
+       struct device *dev;
+       struct list_head ports;
+       bool resets_inverted;
+};
+
+static inline u32 pcie_read(struct mt7621_pcie *pcie, u32 reg)
+{
+       return readl_relaxed(pcie->base + reg);
+}
+
+static inline void pcie_write(struct mt7621_pcie *pcie, u32 val, u32 reg)
+{
+       writel_relaxed(val, pcie->base + reg);
+}
+
+static inline void pcie_rmw(struct mt7621_pcie *pcie, u32 reg, u32 clr, u32 set)
+{
+       u32 val = readl_relaxed(pcie->base + reg);
+
+       val &= ~clr;
+       val |= set;
+       writel_relaxed(val, pcie->base + reg);
+}
+
+static inline u32 pcie_port_read(struct mt7621_pcie_port *port, u32 reg)
+{
+       return readl_relaxed(port->base + reg);
+}
+
+static inline void pcie_port_write(struct mt7621_pcie_port *port,
+                                  u32 val, u32 reg)
+{
+       writel_relaxed(val, port->base + reg);
+}
+
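+/*
+ * Encode a configuration-space address: bit 31 enables the cycle,
+ * bits 27:24 hold the extended register number, bits 23:16 the bus,
+ * bits 15:11 the slot, bits 10:8 the function and bits 7:2 the
+ * register offset.
+ */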
+static inline u32 mt7621_pci_get_cfgaddr(unsigned int bus, unsigned int slot,
+                                        unsigned int func, unsigned int where)
+{
+       return (((where & 0xf00) >> 8) << 24) | (bus << 16) | (slot << 11) |
+               (func << 8) | (where & 0xfc) | 0x80000000;
+}
+
+static void __iomem *mt7621_pcie_map_bus(struct pci_bus *bus,
+                                        unsigned int devfn, int where)
+{
+       struct mt7621_pcie *pcie = bus->sysdata;
+       u32 address = mt7621_pci_get_cfgaddr(bus->number, PCI_SLOT(devfn),
+                                            PCI_FUNC(devfn), where);
+
+       writel_relaxed(address, pcie->base + RALINK_PCI_CONFIG_ADDR);
+
+       return pcie->base + RALINK_PCI_CONFIG_DATA + (where & 3);
+}
+
+static struct pci_ops mt7621_pci_ops = {
+       .map_bus        = mt7621_pcie_map_bus,
+       .read           = pci_generic_config_read,
+       .write          = pci_generic_config_write,
+};
+
+static u32 read_config(struct mt7621_pcie *pcie, unsigned int dev, u32 reg)
+{
+       u32 address = mt7621_pci_get_cfgaddr(0, dev, 0, reg);
+
+       pcie_write(pcie, address, RALINK_PCI_CONFIG_ADDR);
+       return pcie_read(pcie, RALINK_PCI_CONFIG_DATA);
+}
+
+static void write_config(struct mt7621_pcie *pcie, unsigned int dev,
+                        u32 reg, u32 val)
+{
+       u32 address = mt7621_pci_get_cfgaddr(0, dev, 0, reg);
+
+       pcie_write(pcie, address, RALINK_PCI_CONFIG_ADDR);
+       pcie_write(pcie, val, RALINK_PCI_CONFIG_DATA);
+}
+
+static inline void mt7621_rst_gpio_pcie_assert(struct mt7621_pcie_port *port)
+{
+       if (port->gpio_rst)
+               gpiod_set_value(port->gpio_rst, 1);
+}
+
+static inline void mt7621_rst_gpio_pcie_deassert(struct mt7621_pcie_port *port)
+{
+       if (port->gpio_rst)
+               gpiod_set_value(port->gpio_rst, 0);
+}
+
+static inline bool mt7621_pcie_port_is_linkup(struct mt7621_pcie_port *port)
+{
+       return (pcie_port_read(port, RALINK_PCI_STATUS) & PCIE_PORT_LINKUP) != 0;
+}
+
+static inline void mt7621_control_assert(struct mt7621_pcie_port *port)
+{
+       struct mt7621_pcie *pcie = port->pcie;
+
+       if (pcie->resets_inverted)
+               reset_control_assert(port->pcie_rst);
+       else
+               reset_control_deassert(port->pcie_rst);
+}
+
+static inline void mt7621_control_deassert(struct mt7621_pcie_port *port)
+{
+       struct mt7621_pcie *pcie = port->pcie;
+
+       if (pcie->resets_inverted)
+               reset_control_deassert(port->pcie_rst);
+       else
+               reset_control_assert(port->pcie_rst);
+}
+
+static int setup_cm_memory_region(struct pci_host_bridge *host)
+{
+       struct mt7621_pcie *pcie = pci_host_bridge_priv(host);
+       struct device *dev = pcie->dev;
+       struct resource_entry *entry;
+       resource_size_t mask;
+
+       entry = resource_list_first_type(&host->windows, IORESOURCE_MEM);
+       if (!entry) {
+               dev_err(dev, "cannot get memory resource\n");
+               return -EINVAL;
+       }
+
+       if (mips_cps_numiocu(0)) {
+               /*
+                * FIXME: hardware doesn't accept mask values with 1s after
+                * 0s (e.g. 0xffef), so it would be great to warn if that's
+                * about to happen
+                */
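+               /*
+                * A power-of-two window size always yields a well-formed
+                * mask: e.g. a 256 MiB window gives a mask of 0xf0000000.
+                */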
+               mask = ~(entry->res->end - entry->res->start);
+
+               write_gcr_reg1_base(entry->res->start);
+               write_gcr_reg1_mask(mask | CM_GCR_REGn_MASK_CMTGT_IOCU0);
+               dev_info(dev, "PCI coherence region base: 0x%08llx, mask/settings: 0x%08llx\n",
+                        (unsigned long long)read_gcr_reg1_base(),
+                        (unsigned long long)read_gcr_reg1_mask());
+       }
+
+       return 0;
+}
+
+static int mt7621_pcie_parse_port(struct mt7621_pcie *pcie,
+                                 struct device_node *node,
+                                 int slot)
+{
+       struct mt7621_pcie_port *port;
+       struct device *dev = pcie->dev;
+       struct platform_device *pdev = to_platform_device(dev);
+       char name[10];
+       int err;
+
+       port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
+       if (!port)
+               return -ENOMEM;
+
+       port->base = devm_platform_ioremap_resource(pdev, slot + 1);
+       if (IS_ERR(port->base))
+               return PTR_ERR(port->base);
+
+       port->clk = devm_get_clk_from_child(dev, node, NULL);
+       if (IS_ERR(port->clk)) {
+               dev_err(dev, "failed to get pcie%d clock\n", slot);
+               return PTR_ERR(port->clk);
+       }
+
+       port->pcie_rst = of_reset_control_get_exclusive(node, NULL);
+       if (PTR_ERR(port->pcie_rst) == -EPROBE_DEFER) {
+               dev_err(dev, "failed to get pcie%d reset control\n", slot);
+               return PTR_ERR(port->pcie_rst);
+       }
+
+       snprintf(name, sizeof(name), "pcie-phy%d", slot);
+       port->phy = devm_of_phy_get(dev, node, name);
+       if (IS_ERR(port->phy)) {
+               dev_err(dev, "failed to get pcie-phy%d\n", slot);
+               err = PTR_ERR(port->phy);
+               goto remove_reset;
+       }
+
+       port->gpio_rst = devm_gpiod_get_index_optional(dev, "reset", slot,
+                                                      GPIOD_OUT_LOW);
+       if (IS_ERR(port->gpio_rst)) {
+               dev_err(dev, "failed to get GPIO for PCIe%d\n", slot);
+               err = PTR_ERR(port->gpio_rst);
+               goto remove_reset;
+       }
+
+       port->slot = slot;
+       port->pcie = pcie;
+
+       INIT_LIST_HEAD(&port->list);
+       list_add_tail(&port->list, &pcie->ports);
+
+       return 0;
+
+remove_reset:
+       reset_control_put(port->pcie_rst);
+       return err;
+}
+
+static int mt7621_pcie_parse_dt(struct mt7621_pcie *pcie)
+{
+       struct device *dev = pcie->dev;
+       struct platform_device *pdev = to_platform_device(dev);
+       struct device_node *node = dev->of_node, *child;
+       int err;
+
+       pcie->base = devm_platform_ioremap_resource(pdev, 0);
+       if (IS_ERR(pcie->base))
+               return PTR_ERR(pcie->base);
+
+       for_each_available_child_of_node(node, child) {
+               int slot;
+
+               err = of_pci_get_devfn(child);
+               if (err < 0) {
+                       of_node_put(child);
+                       dev_err(dev, "failed to parse devfn: %d\n", err);
+                       return err;
+               }
+
+               slot = PCI_SLOT(err);
+
+               err = mt7621_pcie_parse_port(pcie, child, slot);
+               if (err) {
+                       of_node_put(child);
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
+static int mt7621_pcie_init_port(struct mt7621_pcie_port *port)
+{
+       struct mt7621_pcie *pcie = port->pcie;
+       struct device *dev = pcie->dev;
+       u32 slot = port->slot;
+       int err;
+
+       err = phy_init(port->phy);
+       if (err) {
+               dev_err(dev, "failed to initialize port%d phy\n", slot);
+               return err;
+       }
+
+       err = phy_power_on(port->phy);
+       if (err) {
+               dev_err(dev, "failed to power on port%d phy\n", slot);
+               phy_exit(port->phy);
+               return err;
+       }
+
+       port->enabled = true;
+
+       return 0;
+}
+
+static void mt7621_pcie_reset_assert(struct mt7621_pcie *pcie)
+{
+       struct mt7621_pcie_port *port;
+
+       list_for_each_entry(port, &pcie->ports, list) {
+               /* PCIe RC reset assert */
+               mt7621_control_assert(port);
+
+               /* PCIe EP reset assert */
+               mt7621_rst_gpio_pcie_assert(port);
+       }
+
+       msleep(PERST_DELAY_MS);
+}
+
+static void mt7621_pcie_reset_rc_deassert(struct mt7621_pcie *pcie)
+{
+       struct mt7621_pcie_port *port;
+
+       list_for_each_entry(port, &pcie->ports, list)
+               mt7621_control_deassert(port);
+}
+
+static void mt7621_pcie_reset_ep_deassert(struct mt7621_pcie *pcie)
+{
+       struct mt7621_pcie_port *port;
+
+       list_for_each_entry(port, &pcie->ports, list)
+               mt7621_rst_gpio_pcie_deassert(port);
+
+       msleep(PERST_DELAY_MS);
+}
+
+static int mt7621_pcie_init_ports(struct mt7621_pcie *pcie)
+{
+       struct device *dev = pcie->dev;
+       struct mt7621_pcie_port *port, *tmp;
+       u8 num_disabled = 0;
+       int err;
+
+       mt7621_pcie_reset_assert(pcie);
+       mt7621_pcie_reset_rc_deassert(pcie);
+
+       list_for_each_entry_safe(port, tmp, &pcie->ports, list) {
+               u32 slot = port->slot;
+
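+               /*
+                * Slot 1 is brought up without touching its PHY here:
+                * it shares the slot 0 PHY, which is initialized below.
+                */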
+               if (slot == 1) {
+                       port->enabled = true;
+                       continue;
+               }
+
+               err = mt7621_pcie_init_port(port);
+               if (err) {
+                       dev_err(dev, "initializing port %d failed\n", slot);
+                       list_del(&port->list);
+               }
+       }
+
+       mt7621_pcie_reset_ep_deassert(pcie);
+
+       tmp = NULL;
+       list_for_each_entry(port, &pcie->ports, list) {
+               u32 slot = port->slot;
+
+               if (!mt7621_pcie_port_is_linkup(port)) {
+                       dev_err(dev, "pcie%d no card, disable it (RST & CLK)\n",
+                               slot);
+                       mt7621_control_assert(port);
+                       port->enabled = false;
+                       num_disabled++;
+
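+                       /*
+                        * The PHY shared by slots 0 and 1 may only be
+                        * powered off once both are known to be down:
+                        * remember slot 0 here and decide when slot 1
+                        * is processed.
+                        */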
+                       if (slot == 0) {
+                               tmp = port;
+                               continue;
+                       }
+
+                       if (slot == 1 && tmp && !tmp->enabled)
+                               phy_power_off(tmp->phy);
+               }
+       }
+
+       return (num_disabled != PCIE_PORT_CNT) ? 0 : -ENODEV;
+}
+
+static void mt7621_pcie_enable_port(struct mt7621_pcie_port *port)
+{
+       struct mt7621_pcie *pcie = port->pcie;
+       u32 slot = port->slot;
+       u32 val;
+
+       /* enable pcie interrupt */
+       val = pcie_read(pcie, RALINK_PCI_PCIMSK_ADDR);
+       val |= PCIE_PORT_INT_EN(slot);
+       pcie_write(pcie, val, RALINK_PCI_PCIMSK_ADDR);
+
+       /* map 2G DDR region */
+       pcie_port_write(port, PCIE_BAR_MAP_MAX | PCIE_BAR_ENABLE,
+                       PCI_BASE_ADDRESS_0);
+
+       /* configure class code and revision ID */
+       pcie_port_write(port, PCIE_CLASS_CODE | PCIE_REVISION_ID,
+                       RALINK_PCI_CLASS);
+
+       /* configure RC FTS number when it leaves L0s */
+       val = read_config(pcie, slot, PCIE_FTS_NUM);
+       val &= ~PCIE_FTS_NUM_MASK;
+       val |= PCIE_FTS_NUM_L0(0x50);
+       write_config(pcie, slot, PCIE_FTS_NUM, val);
+}
+
+static int mt7621_pcie_enable_ports(struct pci_host_bridge *host)
+{
+       struct mt7621_pcie *pcie = pci_host_bridge_priv(host);
+       struct device *dev = pcie->dev;
+       struct mt7621_pcie_port *port;
+       struct resource_entry *entry;
+       int err;
+
+       entry = resource_list_first_type(&host->windows, IORESOURCE_IO);
+       if (!entry) {
+               dev_err(dev, "cannot get io resource\n");
+               return -EINVAL;
+       }
+
+       /* Setup MEMWIN and IOWIN */
+       pcie_write(pcie, 0xffffffff, RALINK_PCI_MEMBASE);
+       pcie_write(pcie, entry->res->start - entry->offset, RALINK_PCI_IOBASE);
+
+       list_for_each_entry(port, &pcie->ports, list) {
+               if (port->enabled) {
+                       err = clk_prepare_enable(port->clk);
+                       if (err) {
+                               dev_err(dev, "failed to enable pcie%d clock\n",
+                                       port->slot);
+                               return err;
+                       }
+
+                       mt7621_pcie_enable_port(port);
+                       dev_info(dev, "PCIE%d enabled\n", port->slot);
+               }
+       }
+
+       return 0;
+}
+
+static int mt7621_pcie_register_host(struct pci_host_bridge *host)
+{
+       struct mt7621_pcie *pcie = pci_host_bridge_priv(host);
+
+       host->ops = &mt7621_pci_ops;
+       host->sysdata = pcie;
+       return pci_host_probe(host);
+}
+
+static const struct soc_device_attribute mt7621_pci_quirks_match[] = {
+       { .soc_id = "mt7621", .revision = "E2" },
+       { /* sentinel */ }
+};
+
+static int mt7621_pci_probe(struct platform_device *pdev)
+{
+       struct device *dev = &pdev->dev;
+       const struct soc_device_attribute *attr;
+       struct mt7621_pcie_port *port;
+       struct mt7621_pcie *pcie;
+       struct pci_host_bridge *bridge;
+       int err;
+
+       if (!dev->of_node)
+               return -ENODEV;
+
+       bridge = devm_pci_alloc_host_bridge(dev, sizeof(*pcie));
+       if (!bridge)
+               return -ENOMEM;
+
+       pcie = pci_host_bridge_priv(bridge);
+       pcie->dev = dev;
+       platform_set_drvdata(pdev, pcie);
+       INIT_LIST_HEAD(&pcie->ports);
+
+       attr = soc_device_match(mt7621_pci_quirks_match);
+       if (attr)
+               pcie->resets_inverted = true;
+
+       err = mt7621_pcie_parse_dt(pcie);
+       if (err) {
+               dev_err(dev, "parsing DT failed\n");
+               return err;
+       }
+
+       err = mt7621_pcie_init_ports(pcie);
+       if (err) {
+               dev_err(dev, "nothing connected in virtual bridges\n");
+               return 0;
+       }
+
+       err = mt7621_pcie_enable_ports(bridge);
+       if (err) {
+               dev_err(dev, "error enabling pcie ports\n");
+               goto remove_resets;
+       }
+
+       err = setup_cm_memory_region(bridge);
+       if (err) {
+               dev_err(dev, "error setting up iocu mem regions\n");
+               goto remove_resets;
+       }
+
+       return mt7621_pcie_register_host(bridge);
+
+remove_resets:
+       list_for_each_entry(port, &pcie->ports, list)
+               reset_control_put(port->pcie_rst);
+
+       return err;
+}
+
+static int mt7621_pci_remove(struct platform_device *pdev)
+{
+       struct mt7621_pcie *pcie = platform_get_drvdata(pdev);
+       struct mt7621_pcie_port *port;
+
+       list_for_each_entry(port, &pcie->ports, list)
+               reset_control_put(port->pcie_rst);
+
+       return 0;
+}
+
+static const struct of_device_id mt7621_pci_ids[] = {
+       { .compatible = "mediatek,mt7621-pci" },
+       {},
+};
+MODULE_DEVICE_TABLE(of, mt7621_pci_ids);
+
+static struct platform_driver mt7621_pci_driver = {
+       .probe = mt7621_pci_probe,
+       .remove = mt7621_pci_remove,
+       .driver = {
+               .name = "mt7621-pci",
+               .of_match_table = of_match_ptr(mt7621_pci_ids),
+       },
+};
+builtin_platform_driver(mt7621_pci_driver);
index aa1cf24a5a723d5ff3d3aa627b7699b5be308ab0..f9682df1da61929b5be3b04716979830976214b1 100644 (file)
@@ -6,16 +6,13 @@
  * Author: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
  */
 
-#include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/of_pci.h>
 #include <linux/of_platform.h>
 #include <linux/pci.h>
 #include <linux/pci-epc.h>
-#include <linux/phy/phy.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 
 #include "pcie-rcar.h"
 
index 8f3131844e7778cf118c9ac0f6ac44bd0e154bda..e12c2d8be05a34e2b189e349934e3a4deb98c7bc 100644 (file)
 #include <linux/msi.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
-#include <linux/of_pci.h>
 #include <linux/of_platform.h>
 #include <linux/pci.h>
 #include <linux/phy/phy.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
-#include <linux/slab.h>
 
 #include "pcie-rcar.h"
 
index a5987e52700e3ad7f37422910e67a73253b4f7dd..a45e8e59d3d4861882a58d7e64167dbeba524c04 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <linux/device.h>
 #include <linux/interrupt.h>
+#include <linux/iommu.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -18,8 +19,6 @@
 #include <linux/rcupdate.h>
 
 #include <asm/irqdomain.h>
-#include <asm/device.h>
-#include <asm/msi.h>
 
 #define VMD_CFGBAR     0
 #define VMD_MEMBAR1    2
@@ -70,6 +69,8 @@ enum vmd_features {
        VMD_FEAT_CAN_BYPASS_MSI_REMAP           = (1 << 4),
 };
 
+static DEFINE_IDA(vmd_instance_ida);
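+/* Each VMD gets an instance number, used to build a unique "vmd%d" IRQ name */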
+
 /*
  * Lock for manipulating VMD IRQ lists.
  */
@@ -120,6 +121,8 @@ struct vmd_dev {
        struct pci_bus          *bus;
        u8                      busn_start;
        u8                      first_vec;
+       char                    *name;
+       int                     instance;
 };
 
 static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
@@ -650,7 +653,7 @@ static int vmd_alloc_irqs(struct vmd_dev *vmd)
                INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
                err = devm_request_irq(&dev->dev, pci_irq_vector(dev, i),
                                       vmd_irq, IRQF_NO_THREAD,
-                                      "vmd", &vmd->irqs[i]);
+                                      vmd->name, &vmd->irqs[i]);
                if (err)
                        return err;
        }
@@ -761,7 +764,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
         * acceptable because the guest is usually CPU-limited and MSI
         * remapping doesn't become a performance bottleneck.
         */
-       if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) ||
+       if (iommu_capable(vmd->dev->dev.bus, IOMMU_CAP_INTR_REMAP) ||
+           !(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) ||
            offset[0] || offset[1]) {
                ret = vmd_alloc_irqs(vmd);
                if (ret)
@@ -834,18 +838,32 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
                return -ENOMEM;
 
        vmd->dev = dev;
+       vmd->instance = ida_simple_get(&vmd_instance_ida, 0, 0, GFP_KERNEL);
+       if (vmd->instance < 0)
+               return vmd->instance;
+
+       vmd->name = kasprintf(GFP_KERNEL, "vmd%d", vmd->instance);
+       if (!vmd->name) {
+               err = -ENOMEM;
+               goto out_release_instance;
+       }
+
        err = pcim_enable_device(dev);
        if (err < 0)
-               return err;
+               goto out_release_instance;
 
        vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
-       if (!vmd->cfgbar)
-               return -ENOMEM;
+       if (!vmd->cfgbar) {
+               err = -ENOMEM;
+               goto out_release_instance;
+       }
 
        pci_set_master(dev);
        if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
-           dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)))
-               return -ENODEV;
+           dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) {
+               err = -ENODEV;
+               goto out_release_instance;
+       }
 
        if (features & VMD_FEAT_OFFSET_FIRST_VECTOR)
                vmd->first_vec = 1;
@@ -854,11 +872,16 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
        pci_set_drvdata(dev, vmd);
        err = vmd_enable_domain(vmd, features);
        if (err)
-               return err;
+               goto out_release_instance;
 
        dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
                 vmd->sysdata.domain);
        return 0;
+
+ out_release_instance:
+       ida_simple_remove(&vmd_instance_ida, vmd->instance);
+       kfree(vmd->name);
+       return err;
 }
 
 static void vmd_cleanup_srcu(struct vmd_dev *vmd)
@@ -879,6 +902,8 @@ static void vmd_remove(struct pci_dev *dev)
        vmd_cleanup_srcu(vmd);
        vmd_detach_resources(vmd);
        vmd_remove_irq_domain(vmd);
+       ida_simple_remove(&vmd_instance_ida, vmd->instance);
+       kfree(vmd->name);
 }
 
 #ifdef CONFIG_PM_SLEEP
@@ -903,7 +928,7 @@ static int vmd_resume(struct device *dev)
        for (i = 0; i < vmd->msix_count; i++) {
                err = devm_request_irq(dev, pci_irq_vector(pdev, i),
                                       vmd_irq, IRQF_NO_THREAD,
-                                      "vmd", &vmd->irqs[i]);
+                                      vmd->name, &vmd->irqs[i]);
                if (err)
                        return err;
        }
index 8b4756159f1599d820a3095b5aa38175a3b01146..5a03401f45719221ca5dce64147c713665a9ab96 100644 (file)
@@ -1937,7 +1937,7 @@ static ssize_t epf_ntb_##_name##_show(struct config_item *item,           \
        struct config_group *group = to_config_group(item);             \
        struct epf_ntb *ntb = to_epf_ntb(group);                        \
                                                                        \
-       return sprintf(page, "%d\n", ntb->_name);                       \
+       return sysfs_emit(page, "%d\n", ntb->_name);                    \
 }
 
 #define EPF_NTB_W(_name)                                               \
@@ -1947,11 +1947,9 @@ static ssize_t epf_ntb_##_name##_store(struct config_item *item, \
        struct config_group *group = to_config_group(item);             \
        struct epf_ntb *ntb = to_epf_ntb(group);                        \
        u32 val;                                                        \
-       int ret;                                                        \
                                                                        \
-       ret = kstrtou32(page, 0, &val);                                 \
-       if (ret)                                                        \
-               return ret;                                             \
+       if (kstrtou32(page, 0, &val) < 0)                               \
+               return -EINVAL;                                         \
                                                                        \
        ntb->_name = val;                                               \
                                                                        \
@@ -1968,7 +1966,7 @@ static ssize_t epf_ntb_##_name##_show(struct config_item *item,           \
                                                                        \
        sscanf(#_name, "mw%d", &win_no);                                \
                                                                        \
-       return sprintf(page, "%lld\n", ntb->mws_size[win_no - 1]);      \
+       return sysfs_emit(page, "%lld\n", ntb->mws_size[win_no - 1]);   \
 }
 
 #define EPF_NTB_MW_W(_name)                                            \
@@ -1980,11 +1978,9 @@ static ssize_t epf_ntb_##_name##_store(struct config_item *item, \
        struct device *dev = &ntb->epf->dev;                            \
        int win_no;                                                     \
        u64 val;                                                        \
-       int ret;                                                        \
                                                                        \
-       ret = kstrtou64(page, 0, &val);                                 \
-       if (ret)                                                        \
-               return ret;                                             \
+       if (kstrtou64(page, 0, &val) < 0)                               \
+               return -EINVAL;                                         \
                                                                        \
        if (sscanf(#_name, "mw%d", &win_no) != 1)                       \
                return -EINVAL;                                         \
@@ -2005,11 +2001,9 @@ static ssize_t epf_ntb_num_mws_store(struct config_item *item,
        struct config_group *group = to_config_group(item);
        struct epf_ntb *ntb = to_epf_ntb(group);
        u32 val;
-       int ret;
 
-       ret = kstrtou32(page, 0, &val);
-       if (ret)
-               return ret;
+       if (kstrtou32(page, 0, &val) < 0)
+               return -EINVAL;
 
        if (val > MAX_MW)
                return -EINVAL;
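
The hunks above apply two recurring conversions: sprintf() into a configfs page buffer becomes sysfs_emit(), which bounds output to PAGE_SIZE, and kstrto*() failures are collapsed into a plain -EINVAL. A minimal kernel-style sketch of the resulting attribute pair; struct example, to_example() and the value field are illustrative, not from this tree:

static ssize_t example_show(struct config_item *item, char *page)
{
        struct example *ex = to_example(to_config_group(item));

        /* sysfs_emit() never writes past PAGE_SIZE, unlike raw sprintf() */
        return sysfs_emit(page, "%d\n", ex->value);
}

static ssize_t example_store(struct config_item *item, const char *page,
                             size_t len)
{
        struct example *ex = to_example(to_config_group(item));
        u32 val;

        /* any parse failure is reported uniformly as -EINVAL */
        if (kstrtou32(page, 0, &val) < 0)
                return -EINVAL;

        ex->value = val;
        return len;
}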
index 999911801877e20d432e5e5c28d347e8e0e4e711..d4850bdd837fa6dde7f6f5a9db1ee14f2c49a40e 100644 (file)
@@ -175,9 +175,8 @@ static ssize_t pci_epc_start_store(struct config_item *item, const char *page,
 
        epc = epc_group->epc;
 
-       ret = kstrtobool(page, &start);
-       if (ret)
-               return ret;
+       if (kstrtobool(page, &start) < 0)
+               return -EINVAL;
 
        if (!start) {
                pci_epc_stop(epc);
@@ -198,8 +197,7 @@ static ssize_t pci_epc_start_store(struct config_item *item, const char *page,
 
 static ssize_t pci_epc_start_show(struct config_item *item, char *page)
 {
-       return sprintf(page, "%d\n",
-                      to_pci_epc_group(item)->start);
+       return sysfs_emit(page, "%d\n", to_pci_epc_group(item)->start);
 }
 
 CONFIGFS_ATTR(pci_epc_, start);
@@ -321,7 +319,7 @@ static ssize_t pci_epf_##_name##_show(struct config_item *item,     char *page)    \
        struct pci_epf *epf = to_pci_epf_group(item)->epf;                     \
        if (WARN_ON_ONCE(!epf->header))                                        \
                return -EINVAL;                                                \
-       return sprintf(page, "0x%04x\n", epf->header->_name);                  \
+       return sysfs_emit(page, "0x%04x\n", epf->header->_name);               \
 }
 
 #define PCI_EPF_HEADER_W_u32(_name)                                           \
@@ -329,13 +327,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item,         \
                                       const char *page, size_t len)           \
 {                                                                             \
        u32 val;                                                               \
-       int ret;                                                               \
        struct pci_epf *epf = to_pci_epf_group(item)->epf;                     \
        if (WARN_ON_ONCE(!epf->header))                                        \
                return -EINVAL;                                                \
-       ret = kstrtou32(page, 0, &val);                                        \
-       if (ret)                                                               \
-               return ret;                                                    \
+       if (kstrtou32(page, 0, &val) < 0)                                      \
+               return -EINVAL;                                                \
        epf->header->_name = val;                                              \
        return len;                                                            \
 }
@@ -345,13 +341,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item,         \
                                       const char *page, size_t len)           \
 {                                                                             \
        u16 val;                                                               \
-       int ret;                                                               \
        struct pci_epf *epf = to_pci_epf_group(item)->epf;                     \
        if (WARN_ON_ONCE(!epf->header))                                        \
                return -EINVAL;                                                \
-       ret = kstrtou16(page, 0, &val);                                        \
-       if (ret)                                                               \
-               return ret;                                                    \
+       if (kstrtou16(page, 0, &val) < 0)                                      \
+               return -EINVAL;                                                \
        epf->header->_name = val;                                              \
        return len;                                                            \
 }
@@ -361,13 +355,11 @@ static ssize_t pci_epf_##_name##_store(struct config_item *item,         \
                                       const char *page, size_t len)           \
 {                                                                             \
        u8 val;                                                                \
-       int ret;                                                               \
        struct pci_epf *epf = to_pci_epf_group(item)->epf;                     \
        if (WARN_ON_ONCE(!epf->header))                                        \
                return -EINVAL;                                                \
-       ret = kstrtou8(page, 0, &val);                                         \
-       if (ret)                                                               \
-               return ret;                                                    \
+       if (kstrtou8(page, 0, &val) < 0)                                       \
+               return -EINVAL;                                                \
        epf->header->_name = val;                                              \
        return len;                                                            \
 }
@@ -376,11 +368,9 @@ static ssize_t pci_epf_msi_interrupts_store(struct config_item *item,
                                            const char *page, size_t len)
 {
        u8 val;
-       int ret;
 
-       ret = kstrtou8(page, 0, &val);
-       if (ret)
-               return ret;
+       if (kstrtou8(page, 0, &val) < 0)
+               return -EINVAL;
 
        to_pci_epf_group(item)->epf->msi_interrupts = val;
 
@@ -390,19 +380,17 @@ static ssize_t pci_epf_msi_interrupts_store(struct config_item *item,
 static ssize_t pci_epf_msi_interrupts_show(struct config_item *item,
                                           char *page)
 {
-       return sprintf(page, "%d\n",
-                      to_pci_epf_group(item)->epf->msi_interrupts);
+       return sysfs_emit(page, "%d\n",
+                         to_pci_epf_group(item)->epf->msi_interrupts);
 }
 
 static ssize_t pci_epf_msix_interrupts_store(struct config_item *item,
                                             const char *page, size_t len)
 {
        u16 val;
-       int ret;
 
-       ret = kstrtou16(page, 0, &val);
-       if (ret)
-               return ret;
+       if (kstrtou16(page, 0, &val) < 0)
+               return -EINVAL;
 
        to_pci_epf_group(item)->epf->msix_interrupts = val;
 
@@ -412,8 +400,8 @@ static ssize_t pci_epf_msix_interrupts_store(struct config_item *item,
 static ssize_t pci_epf_msix_interrupts_show(struct config_item *item,
                                            char *page)
 {
-       return sprintf(page, "%d\n",
-                      to_pci_epf_group(item)->epf->msix_interrupts);
+       return sysfs_emit(page, "%d\n",
+                         to_pci_epf_group(item)->epf->msix_interrupts);
 }
 
 PCI_EPF_HEADER_R(vendorid)
index ecbb0fb3b653c76e5f975ac0d56660bfdb8d7003..38621558d3975b0220e2bb9b250d0891267613d4 100644 (file)
@@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(pci_epc_linkup);
 /**
  * pci_epc_init_notify() - Notify the EPF device that EPC device's core
  *                        initialization is completed.
- * @epc: the EPC device whose core initialization is completeds
+ * @epc: the EPC device whose core initialization is completed
  *
  * Invoke to notify the EPF device that the EPC device's initialization
  * is completed.
index 8aea16380870056fd98d0e9fb4b7ea899d7cbc55..9ed556936f488a2a0e7dbc1c2869ce584dde6cea 100644 (file)
@@ -224,7 +224,7 @@ EXPORT_SYMBOL_GPL(pci_epf_add_vepf);
  *   be removed
  * @epf_vf: the virtual EP function to be removed
  *
- * Invoke to remove a virtual endpoint function from the physcial endpoint
+ * Invoke to remove a virtual endpoint function from the physical endpoint
  * function.
  */
 void pci_epf_remove_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf)
@@ -432,7 +432,7 @@ EXPORT_SYMBOL_GPL(pci_epf_destroy);
 /**
  * pci_epf_create() - create a new PCI EPF device
  * @name: the name of the PCI EPF device. This name will be used to bind the
- *       the EPF device to a EPF driver
+ *       EPF device to a EPF driver
  *
  * Invoke to create a new PCI EPF device by providing the name of the function
  * device.
index f031302ad4019b5c2d783cb713102e5214bcd578..12f4b351be67015edaebc7196f57d4d44c863de8 100644 (file)
@@ -22,7 +22,7 @@
  *    when the bridge is scanned and it loses a refcount when the bridge
  *    is removed.
  *  - When a P2P bridge is present, we elevate the refcount on the subordinate
- *    bus. It loses the refcount when the the driver unloads.
+ *    bus. It loses the refcount when the driver unloads.
  */
 
 #define pr_fmt(fmt) "acpiphp_glue: " fmt
index 77e4e0142fbc54d2994311ebe372af480b4c0779..2f7b49ea96e2643455c93fd89d5233e4e7776a33 100644 (file)
@@ -15,7 +15,7 @@
 #define _CPQPHP_H
 
 #include <linux/interrupt.h>
-#include <asm/io.h>            /* for read? and write? functions */
+#include <linux/io.h>          /* for read? and write? functions */
 #include <linux/delay.h>       /* for delays */
 #include <linux/mutex.h>
 #include <linux/sched/signal.h>        /* for signal_pending() */
index 1b26ca0b370120eecd3c69540f35b2635a7bb780..ed7b58eb64d26f3568939da72af75e73fa207d01 100644 (file)
@@ -519,7 +519,7 @@ error:
  * @head: list to search
  * @size: size of node to find, must be a power of two.
  *
- * Description: This function sorts the resource list by size and then returns
+ * Description: This function sorts the resource list by size and then
  * returns the first node of "size" length that is not in the ISA aliasing
  * window.  If it finds a node larger than "size" it will split it up.
  */
@@ -1202,7 +1202,7 @@ static u8 set_controller_speed(struct controller *ctrl, u8 adapter_speed, u8 hp_
 
        mdelay(5);
 
-       /* Reenable interrupts */
+       /* Re-enable interrupts */
        writel(0, ctrl->hpc_reg + INT_MASK);
 
        pci_write_config_byte(ctrl->pci_dev, 0x41, reg);
index 1b2b3f3b648bca17d637a7b265ad502b91134452..9038039ad6db5b5f30fa6cabe399814df0b74e19 100644 (file)
@@ -189,8 +189,10 @@ int cpqhp_set_irq(u8 bus_num, u8 dev_num, u8 int_pin, u8 irq_num)
                /* This should only be for x86 as it sets the Edge Level
                 * Control Register
                 */
-               outb((u8) (temp_word & 0xFF), 0x4d0); outb((u8) ((temp_word &
-               0xFF00) >> 8), 0x4d1); rc = 0; }
+               outb((u8)(temp_word & 0xFF), 0x4d0);
+               outb((u8)((temp_word & 0xFF00) >> 8), 0x4d1);
+               rc = 0;
+       }
 
        return rc;
 }
index e90a4ebf6550ab18e46ec7c26b8abbf652c43823..0399c60d2ec1ab27e941094c549c7449abc4008b 100644 (file)
@@ -352,7 +352,7 @@ struct resource_node {
        u32 len;
        int type;               /* MEM, IO, PFMEM */
        u8 fromMem;             /* this is to indicate that the range is from
-                                * from the Memory bucket rather than from PFMem */
+                                * the Memory bucket rather than from PFMem */
        struct resource_node *next;
        struct resource_node *nextRange;        /* for the other mem range on bus */
 };
@@ -736,7 +736,7 @@ struct controller {
 
 int ibmphp_init_devno(struct slot **); /* This function is called from EBDA, so we need it not be static */
 int ibmphp_do_disable_slot(struct slot *slot_cur);
-int ibmphp_update_slot_info(struct slot *);    /* This function is called from HPC, so we need it to not be be static */
+int ibmphp_update_slot_info(struct slot *);    /* This function is called from HPC, so we need it to not be static */
 int ibmphp_configure_card(struct pci_func *, u8);
 int ibmphp_unconfigure_card(struct slot **, int);
 extern const struct hotplug_slot_ops ibmphp_hotplug_slot_ops;
index 69fd401691be6e6a5ae4aa56f294ae591134039c..918dccbc74b6bc48638407eb6f825534a8c85d18 100644 (file)
@@ -189,6 +189,8 @@ int pciehp_get_attention_status(struct hotplug_slot *hotplug_slot, u8 *status);
 int pciehp_set_raw_indicator_status(struct hotplug_slot *h_slot, u8 status);
 int pciehp_get_raw_indicator_status(struct hotplug_slot *h_slot, u8 *status);
 
+int pciehp_slot_reset(struct pcie_device *dev);
+
 static inline const char *slot_name(struct controller *ctrl)
 {
        return hotplug_slot_name(&ctrl->hotplug_slot);
index ad3393930ecb4d91dc21ef7a5abf575662c76e7c..f34114d452599e23b69a89f790e2bf02867a898e 100644 (file)
@@ -351,6 +351,8 @@ static struct pcie_port_service_driver hpdriver_portdrv = {
        .runtime_suspend = pciehp_runtime_suspend,
        .runtime_resume = pciehp_runtime_resume,
 #endif /* PM */
+
+       .slot_reset     = pciehp_slot_reset,
 };
 
 int __init pcie_hp_init(void)
index 3024d7e85e6a70d53856c5a6a468c3c3e6d50a6e..83a0fa119cae823b82a17d36a8d9c6cfbb7d3719 100644 (file)
@@ -862,6 +862,32 @@ void pcie_disable_interrupt(struct controller *ctrl)
        pcie_write_cmd(ctrl, 0, mask);
 }
 
+/**
+ * pciehp_slot_reset() - ignore link event caused by error-induced hot reset
+ * @dev: PCI Express port service device
+ *
+ * Called from pcie_portdrv_slot_reset() after AER or DPC initiated a reset
+ * further up in the hierarchy to recover from an error.  The reset was
+ * propagated down to this hotplug port.  Ignore the resulting link flap.
+ * If the link failed to retrain successfully, synthesize the ignored event.
+ * Surprise removal during reset is detected through Presence Detect Changed.
+ */
+int pciehp_slot_reset(struct pcie_device *dev)
+{
+       struct controller *ctrl = get_service_data(dev);
+
+       if (ctrl->state != ON_STATE)
+               return 0;
+
+       pcie_capability_write_word(dev->port, PCI_EXP_SLTSTA,
+                                  PCI_EXP_SLTSTA_DLLSC);
+
+       if (!pciehp_check_link_active(ctrl))
+               pciehp_request(ctrl, PCI_EXP_SLTSTA_DLLSC);
+
+       return 0;
+}
+
 /*
  * pciehp has a 1:1 bus:slot relationship so we ultimately want a secondary
  * bus reset of the bridge, but at the same time we want to ensure that it is
index 9e3b27744305c71c3df6d416fec27e959460bfc4..bd7557ca49108de988fbfa3451ca85393cd03b42 100644 (file)
@@ -295,7 +295,7 @@ static int shpc_write_cmd(struct slot *slot, u8 t_slot, u8 cmd)
        mutex_lock(&slot->ctrl->cmd_lock);
 
        if (!shpc_poll_ctrl_busy(ctrl)) {
-               /* After 1 sec and and the controller is still busy */
+               /* After 1 sec and the controller is still busy */
                ctrl_err(ctrl, "Controller is still busy after 1 sec\n");
                retval = -EBUSY;
                goto out;
index dafdc652fcd06690a25581e224c4b8a318d90e5d..1d7a7c5b5307879977cb17d744526d6a6acb4136 100644 (file)
@@ -164,13 +164,15 @@ static ssize_t sriov_vf_total_msix_show(struct device *dev,
                                        char *buf)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
+       struct pci_driver *pdrv;
        u32 vf_total_msix = 0;
 
        device_lock(dev);
-       if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix)
+       pdrv = to_pci_driver(dev->driver);
+       if (!pdrv || !pdrv->sriov_get_vf_total_msix)
                goto unlock;
 
-       vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev);
+       vf_total_msix = pdrv->sriov_get_vf_total_msix(pdev);
 unlock:
        device_unlock(dev);
        return sysfs_emit(buf, "%u\n", vf_total_msix);
@@ -183,23 +185,24 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev,
 {
        struct pci_dev *vf_dev = to_pci_dev(dev);
        struct pci_dev *pdev = pci_physfn(vf_dev);
-       int val, ret;
+       struct pci_driver *pdrv;
+       int val, ret = 0;
 
-       ret = kstrtoint(buf, 0, &val);
-       if (ret)
-               return ret;
+       if (kstrtoint(buf, 0, &val) < 0)
+               return -EINVAL;
 
        if (val < 0)
                return -EINVAL;
 
        device_lock(&pdev->dev);
-       if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) {
+       pdrv = to_pci_driver(dev->driver);
+       if (!pdrv || !pdrv->sriov_set_msix_vec_count) {
                ret = -EOPNOTSUPP;
                goto err_pdev;
        }
 
        device_lock(&vf_dev->dev);
-       if (vf_dev->driver) {
+       if (to_pci_driver(vf_dev->dev.driver)) {
                /*
                 * A driver is already attached to this VF and has configured
                 * itself based on the current MSI-X vector count. Changing
@@ -209,7 +212,7 @@ static ssize_t sriov_vf_msix_count_store(struct device *dev,
                goto err_dev;
        }
 
-       ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val);
+       ret = pdrv->sriov_set_msix_vec_count(vf_dev, val);
 
 err_dev:
        device_unlock(&vf_dev->dev);
@@ -376,12 +379,12 @@ static ssize_t sriov_numvfs_store(struct device *dev,
                                  const char *buf, size_t count)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
-       int ret;
+       struct pci_driver *pdrv;
+       int ret = 0;
        u16 num_vfs;
 
-       ret = kstrtou16(buf, 0, &num_vfs);
-       if (ret < 0)
-               return ret;
+       if (kstrtou16(buf, 0, &num_vfs) < 0)
+               return -EINVAL;
 
        if (num_vfs > pci_sriov_get_totalvfs(pdev))
                return -ERANGE;
@@ -392,14 +395,15 @@ static ssize_t sriov_numvfs_store(struct device *dev,
                goto exit;
 
        /* is PF driver loaded */
-       if (!pdev->driver) {
+       pdrv = to_pci_driver(dev->driver);
+       if (!pdrv) {
                pci_info(pdev, "no driver bound to device; cannot configure SR-IOV\n");
                ret = -ENOENT;
                goto exit;
        }
 
        /* is PF driver loaded w/callback */
-       if (!pdev->driver->sriov_configure) {
+       if (!pdrv->sriov_configure) {
                pci_info(pdev, "driver does not support SR-IOV configuration via sysfs\n");
                ret = -ENOENT;
                goto exit;
@@ -407,7 +411,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
 
        if (num_vfs == 0) {
                /* disable VFs */
-               ret = pdev->driver->sriov_configure(pdev, 0);
+               ret = pdrv->sriov_configure(pdev, 0);
                goto exit;
        }
 
@@ -419,7 +423,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
                goto exit;
        }
 
-       ret = pdev->driver->sriov_configure(pdev, num_vfs);
+       ret = pdrv->sriov_configure(pdev, num_vfs);
        if (ret < 0)
                goto exit;
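
Every sriov_* change in this file swaps the cached pci_dev->driver pointer for a lookup through the driver core. Assuming the NULL-tolerant to_pci_driver() helper this series relies on (a sketch, not a verbatim copy from include/linux/pci.h):

static inline struct pci_driver *to_pci_driver(struct device_driver *drv)
{
        /* returning NULL for an unbound device is what lets the callers
         * above test the result directly instead of checking dev->driver */
        return drv ? container_of(drv, struct pci_driver, driver) : NULL;
}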
 
index 4b4792940e8691062affc2dd4868745c55b41fb8..12e296d634ebc5a9f8f69220cc975d756f1fcf8e 100644 (file)
@@ -582,7 +582,8 @@ err:
        return ret;
 }
 
-static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
+static void __iomem *msix_map_region(struct pci_dev *dev,
+                                    unsigned int nr_entries)
 {
        resource_size_t phys_addr;
        u32 table_offset;
index d84381ce82b528b3ff3f55e77d5fa11e23e015e2..0b1237cff239a6b455201a7c8f64d0dd837db5ab 100644 (file)
@@ -423,7 +423,7 @@ failed:
  */
 static int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args *out_irq)
 {
-       struct device_node *dn, *ppnode;
+       struct device_node *dn, *ppnode = NULL;
        struct pci_dev *ppdev;
        __be32 laddr[3];
        u8 pin;
@@ -452,8 +452,14 @@ static int of_irq_parse_pci(const struct pci_dev *pdev, struct of_phandle_args *
        if (pin == 0)
                return -ENODEV;
 
+       /* Local interrupt-map in the device node? Use it! */
+       if (of_get_property(dn, "interrupt-map", NULL)) {
+               pin = pci_swizzle_interrupt_pin(pdev, pin);
+               ppnode = dn;
+       }
+
        /* Now we walk up the PCI tree */
-       for (;;) {
+       while (!ppnode) {
                /* Get the pci_dev of our parent */
                ppdev = pdev->bus->self;
 
index 50cdde3e9a8b2b34bc9d62def0218da7c7ae368d..8d47cb7218d15993b9e274debb08d0f5f89e834e 100644 (file)
@@ -874,7 +874,7 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
        int i;
 
        for_each_sg(sg, s, nents, i) {
-               s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset;
+               s->dma_address = sg_phys(s) + p2p_pgmap->bus_offset;
                sg_dma_len(s) = s->length;
        }
 
@@ -943,7 +943,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
  *
  * Parses an attribute value to decide whether to enable p2pdma.
  * The value can select a PCI device (using its full BDF device
- * name) or a boolean (in any format strtobool() accepts). A false
+ * name) or a boolean (in any format kstrtobool() accepts). A false
  * value disables p2pdma, a true value expects the caller
  * to automatically find a compatible device and specifying a PCI device
  * expects the caller to use the specific provider.
@@ -975,11 +975,11 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
        } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) {
                /*
                 * If the user enters a PCI device that doesn't exist
-                * like "0000:01:00.1", we don't want strtobool to think
+                * like "0000:01:00.1", we don't want kstrtobool to think
                 * it's a '0' when it's clearly not what the user wanted.
                 * So we require 0's and 1's to be exactly one character.
                 */
-       } else if (!strtobool(page, use_p2pdma)) {
+       } else if (!kstrtobool(page, use_p2pdma)) {
                return 0;
        }
 
index fdaf86a888b73771eddd1704323a8fa9eb4a0399..db97cddfc85e1c7e731bbb0f667dc2427b323f6c 100644 (file)
@@ -431,8 +431,21 @@ int pci_bridge_emul_conf_write(struct pci_bridge_emul *bridge, int where,
        /* Clear the W1C bits */
        new &= ~((value << shift) & (behavior[reg / 4].w1c & mask));
 
+       /* Save the new value with the cleared W1C bits into the cfgspace */
        cfgspace[reg / 4] = cpu_to_le32(new);
 
+       /*
+        * Clear the W1C bits not specified by the write mask, so that the
+        * write_op() does not clear them.
+        */
+       new &= ~(behavior[reg / 4].w1c & ~mask);
+
+       /*
+        * Set the W1C bits specified by the write mask, so that write_op()
+        * knows about that they are to be cleared.
+        */
+       new |= (value << shift) & (behavior[reg / 4].w1c & mask);
+
        if (write_op)
                write_op(bridge, reg, old, new, mask);
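
The three steps above maintain two views of the write-1-to-clear (W1C) bits: cfgspace keeps them cleared, while the value passed to write_op() carries exactly the W1C bits the caller asked to clear. A standalone model of the masking (plain C, simplified: the real function also applies read-only/read-write behavior masks):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t w1c   = 0x0000ff00; /* bits that are write-1-to-clear */
        uint32_t mask  = 0x0000ffff; /* bytes covered by this write */
        uint32_t value = 0x00001200; /* caller writes 1s to clear bits */
        uint32_t old   = 0x0000ffff; /* current register content */
        int shift = 0;

        uint32_t new = (old & ~mask) | ((value << shift) & mask);
        new &= ~((value << shift) & (w1c & mask)); /* W1C bits written as 1 clear */

        uint32_t stored = new;                  /* what cfgspace retains */

        new &= ~(w1c & ~mask);                  /* don't clear unwritten W1C bits */
        new |= (value << shift) & (w1c & mask); /* tell write_op() what to clear */

        printf("stored=%#x handed to write_op=%#x\n", stored, new);
        return 0;
}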
 
index 2761ab86490d14bfa547180e36dbbc997e31f922..1d98c974381cfec850461eadc3e91c7086198381 100644 (file)
@@ -319,12 +319,10 @@ static long local_pci_probe(void *_ddi)
         * its remove routine.
         */
        pm_runtime_get_sync(dev);
-       pci_dev->driver = pci_drv;
        rc = pci_drv->probe(pci_dev, ddi->id);
        if (!rc)
                return rc;
        if (rc < 0) {
-               pci_dev->driver = NULL;
                pm_runtime_put_sync(dev);
                return rc;
        }
@@ -390,14 +388,13 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
  * @pci_dev: PCI device being probed
  *
  * returns 0 on success, else error.
- * side-effect: pci_dev->driver is set to drv when drv claims pci_dev.
  */
 static int __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev)
 {
        const struct pci_device_id *id;
        int error = 0;
 
-       if (!pci_dev->driver && drv->probe) {
+       if (drv->probe) {
                error = -ENODEV;
 
                id = pci_match_device(drv, pci_dev);
@@ -457,18 +454,15 @@ static int pci_device_probe(struct device *dev)
 static void pci_device_remove(struct device *dev)
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
-       struct pci_driver *drv = pci_dev->driver;
+       struct pci_driver *drv = to_pci_driver(dev->driver);
 
-       if (drv) {
-               if (drv->remove) {
-                       pm_runtime_get_sync(dev);
-                       drv->remove(pci_dev);
-                       pm_runtime_put_noidle(dev);
-               }
-               pcibios_free_irq(pci_dev);
-               pci_dev->driver = NULL;
-               pci_iov_remove(pci_dev);
+       if (drv->remove) {
+               pm_runtime_get_sync(dev);
+               drv->remove(pci_dev);
+               pm_runtime_put_noidle(dev);
        }
+       pcibios_free_irq(pci_dev);
+       pci_iov_remove(pci_dev);
 
        /* Undo the runtime PM settings in local_pci_probe() */
        pm_runtime_put_sync(dev);
@@ -495,7 +489,7 @@ static void pci_device_remove(struct device *dev)
 static void pci_device_shutdown(struct device *dev)
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
-       struct pci_driver *drv = pci_dev->driver;
+       struct pci_driver *drv = to_pci_driver(dev->driver);
 
        pm_runtime_resume(dev);
 
@@ -576,7 +570,7 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev)
 {
        int retval;
 
-       /* if the device was enabled before suspend, reenable */
+       /* if the device was enabled before suspend, re-enable */
        retval = pci_reenable_device(pci_dev);
        /*
         * if the device was busmaster before the suspend, make it busmaster
@@ -591,7 +585,7 @@ static int pci_pm_reenable_device(struct pci_dev *pci_dev)
 static int pci_legacy_suspend(struct device *dev, pm_message_t state)
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
-       struct pci_driver *drv = pci_dev->driver;
+       struct pci_driver *drv = to_pci_driver(dev->driver);
 
        if (drv && drv->suspend) {
                pci_power_t prev = pci_dev->current_state;
@@ -632,7 +626,7 @@ static int pci_legacy_suspend_late(struct device *dev, pm_message_t state)
 static int pci_legacy_resume(struct device *dev)
 {
        struct pci_dev *pci_dev = to_pci_dev(dev);
-       struct pci_driver *drv = pci_dev->driver;
+       struct pci_driver *drv = to_pci_driver(dev->driver);
 
        pci_fixup_device(pci_fixup_resume, pci_dev);
 
@@ -651,7 +645,7 @@ static void pci_pm_default_suspend(struct pci_dev *pci_dev)
 
 static bool pci_has_legacy_pm_support(struct pci_dev *pci_dev)
 {
-       struct pci_driver *drv = pci_dev->driver;
+       struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver);
        bool ret = drv && (drv->suspend || drv->resume);
 
        /*
@@ -1244,11 +1238,11 @@ static int pci_pm_runtime_suspend(struct device *dev)
        int error;
 
        /*
-        * If pci_dev->driver is not set (unbound), we leave the device in D0,
-        * but it may go to D3cold when the bridge above it runtime suspends.
-        * Save its config space in case that happens.
+        * If the device has no driver, we leave it in D0, but it may go to
+        * D3cold when the bridge above it runtime suspends.  Save its
+        * config space in case that happens.
         */
-       if (!pci_dev->driver) {
+       if (!to_pci_driver(dev->driver)) {
                pci_save_state(pci_dev);
                return 0;
        }
@@ -1305,7 +1299,7 @@ static int pci_pm_runtime_resume(struct device *dev)
         */
        pci_restore_standard_config(pci_dev);
 
-       if (!pci_dev->driver)
+       if (!to_pci_driver(dev->driver))
                return 0;
 
        pci_fixup_device(pci_fixup_resume_early, pci_dev);
@@ -1324,14 +1318,13 @@ static int pci_pm_runtime_resume(struct device *dev)
 
 static int pci_pm_runtime_idle(struct device *dev)
 {
-       struct pci_dev *pci_dev = to_pci_dev(dev);
        const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
 
        /*
-        * If pci_dev->driver is not set (unbound), the device should
-        * always remain in D0 regardless of the runtime PM status
+        * If the device has no driver, it should always remain in D0
+        * regardless of the runtime PM status
         */
-       if (!pci_dev->driver)
+       if (!to_pci_driver(dev->driver))
                return 0;
 
        if (!pm)
@@ -1438,8 +1431,10 @@ static struct pci_driver pci_compat_driver = {
  */
 struct pci_driver *pci_dev_driver(const struct pci_dev *dev)
 {
-       if (dev->driver)
-               return dev->driver;
+       struct pci_driver *drv = to_pci_driver(dev->dev.driver);
+
+       if (drv)
+               return drv;
        else {
                int i;
                for (i = 0; i <= PCI_ROM_RESOURCE; i++)
@@ -1542,7 +1537,7 @@ static int pci_uevent(struct device *dev, struct kobj_uevent_env *env)
        return 0;
 }
 
-#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH)
+#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH)
 /**
  * pci_uevent_ers - emit a uevent during recovery path of PCI device
  * @pdev: PCI device undergoing error recovery
index f807b92afa6c45e52d929487c8e84155dbc66244..cfe2f85af09e2e5bd6270bfbd3d45ffb9b3057c3 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/vgaarb.h>
 #include <linux/pm_runtime.h>
+#include <linux/msi.h>
 #include <linux/of.h>
 #include "pci.h"
 
@@ -49,7 +50,28 @@ pci_config_attr(subsystem_vendor, "0x%04x\n");
 pci_config_attr(subsystem_device, "0x%04x\n");
 pci_config_attr(revision, "0x%02x\n");
 pci_config_attr(class, "0x%06x\n");
-pci_config_attr(irq, "%u\n");
+
+static ssize_t irq_show(struct device *dev,
+                       struct device_attribute *attr,
+                       char *buf)
+{
+       struct pci_dev *pdev = to_pci_dev(dev);
+
+#ifdef CONFIG_PCI_MSI
+       /*
+        * For MSI, show the first MSI IRQ; for all other cases including
+        * MSI-X, show the legacy INTx IRQ.
+        */
+       if (pdev->msi_enabled) {
+               struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+               return sysfs_emit(buf, "%u\n", desc->irq);
+       }
+#endif
+
+       return sysfs_emit(buf, "%u\n", pdev->irq);
+}
+static DEVICE_ATTR_RO(irq);
 
 static ssize_t broken_parity_status_show(struct device *dev,
                                         struct device_attribute *attr,
@@ -275,15 +297,15 @@ static ssize_t enable_store(struct device *dev, struct device_attribute *attr,
 {
        struct pci_dev *pdev = to_pci_dev(dev);
        unsigned long val;
-       ssize_t result = kstrtoul(buf, 0, &val);
-
-       if (result < 0)
-               return result;
+       ssize_t result = 0;
 
        /* this can crash the machine when done on the "wrong" device */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       if (kstrtoul(buf, 0, &val) < 0)
+               return -EINVAL;
+
        device_lock(dev);
        if (dev->driver)
                result = -EBUSY;
@@ -314,14 +336,13 @@ static ssize_t numa_node_store(struct device *dev,
                               size_t count)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
-       int node, ret;
+       int node;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       ret = kstrtoint(buf, 0, &node);
-       if (ret)
-               return ret;
+       if (kstrtoint(buf, 0, &node) < 0)
+               return -EINVAL;
 
        if ((node < 0 && node != NUMA_NO_NODE) || node >= MAX_NUMNODES)
                return -EINVAL;
@@ -380,12 +401,12 @@ static ssize_t msi_bus_store(struct device *dev, struct device_attribute *attr,
        struct pci_bus *subordinate = pdev->subordinate;
        unsigned long val;
 
-       if (kstrtoul(buf, 0, &val) < 0)
-               return -EINVAL;
-
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       if (kstrtoul(buf, 0, &val) < 0)
+               return -EINVAL;
+
        /*
         * "no_msi" and "bus_flags" only affect what happens when a driver
         * requests MSI or MSI-X.  They don't affect any drivers that have
@@ -1341,10 +1362,10 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
 {
        struct pci_dev *pdev = to_pci_dev(dev);
        unsigned long val;
-       ssize_t result = kstrtoul(buf, 0, &val);
+       ssize_t result;
 
-       if (result < 0)
-               return result;
+       if (kstrtoul(buf, 0, &val) < 0)
+               return -EINVAL;
 
        if (val != 1)
                return -EINVAL;
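
Both store() reorderings in this file follow one rule: check CAP_SYS_ADMIN before parsing, so an unprivileged writer always gets -EPERM and untrusted input is never examined first. The resulting shape, as a hypothetical attribute (example_store and the acted-on value are illustrative):

static ssize_t example_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        unsigned long val;

        if (!capable(CAP_SYS_ADMIN))    /* permission first ... */
                return -EPERM;

        if (kstrtoul(buf, 0, &val) < 0) /* ... then input validation */
                return -EINVAL;

        /* act on val here */
        return count;
}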
index 39c65b5f92c15de4a4ded3b76b531388e28a03a5..da75c422ba857419d14b0b731c4434096c62f1de 100644 (file)
@@ -269,7 +269,7 @@ static int pci_dev_str_match_path(struct pci_dev *dev, const char *path,
                                  const char **endptr)
 {
        int ret;
-       int seg, bus, slot, func;
+       unsigned int seg, bus, slot, func;
        char *wpath, *p;
        char end;
 
@@ -1439,6 +1439,24 @@ static int pci_save_pcie_state(struct pci_dev *dev)
        return 0;
 }
 
+void pci_bridge_reconfigure_ltr(struct pci_dev *dev)
+{
+#ifdef CONFIG_PCIEASPM
+       struct pci_dev *bridge;
+       u32 ctl;
+
+       bridge = pci_upstream_bridge(dev);
+       if (bridge && bridge->ltr_path) {
+               pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl);
+               if (!(ctl & PCI_EXP_DEVCTL2_LTR_EN)) {
+                       pci_dbg(bridge, "re-enabling LTR\n");
+                       pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2,
+                                                PCI_EXP_DEVCTL2_LTR_EN);
+               }
+       }
+#endif
+}
+
 static void pci_restore_pcie_state(struct pci_dev *dev)
 {
        int i = 0;
@@ -1449,6 +1467,13 @@ static void pci_restore_pcie_state(struct pci_dev *dev)
        if (!save_state)
                return;
 
+       /*
+        * Downstream ports reset the LTR enable bit when link goes down.
+        * Check and re-configure the bit here before restoring device.
+        * PCIe r5.0, sec 7.5.3.16.
+        */
+       pci_bridge_reconfigure_ltr(dev);
+
        cap = (u16 *)&save_state->cap.data[0];
        pcie_capability_write_word(dev, PCI_EXP_DEVCTL, cap[i++]);
        pcie_capability_write_word(dev, PCI_EXP_LNKCTL, cap[i++]);
@@ -2053,14 +2078,14 @@ void pcim_pin_device(struct pci_dev *pdev)
 EXPORT_SYMBOL(pcim_pin_device);
 
 /*
- * pcibios_add_device - provide arch specific hooks when adding device dev
+ * pcibios_device_add - provide arch specific hooks when adding device dev
  * @dev: the PCI device being added
  *
  * Permits the platform to provide architecture specific functionality when
  * devices are added. This is the default implementation. Architecture
  * implementations can override this.
  */
-int __weak pcibios_add_device(struct pci_dev *dev)
+int __weak pcibios_device_add(struct pci_dev *dev)
 {
        return 0;
 }
@@ -2180,6 +2205,7 @@ int pci_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
 }
 EXPORT_SYMBOL_GPL(pci_set_pcie_reset_state);
 
+#ifdef CONFIG_PCIEAER
 void pcie_clear_device_status(struct pci_dev *dev)
 {
        u16 sta;
@@ -2187,6 +2213,7 @@ void pcie_clear_device_status(struct pci_dev *dev)
        pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &sta);
        pcie_capability_write_word(dev, PCI_EXP_DEVSTA, sta);
 }
+#endif
 
 /**
  * pcie_clear_root_pme_status - Clear root port PME interrupt status.
@@ -3697,6 +3724,14 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask)
        struct pci_dev *bridge;
        u32 cap, ctl2;
 
+       /*
+        * Per PCIe r5.0, sec 9.3.5.10, the AtomicOp Requester Enable bit
+        * in Device Control 2 is reserved in VFs and the PF value applies
+        * to all associated VFs.
+        */
+       if (dev->is_virtfn)
+               return -EINVAL;
+
        if (!pci_is_pcie(dev))
                return -EINVAL;
 
@@ -5068,13 +5103,14 @@ EXPORT_SYMBOL_GPL(pci_dev_unlock);
 
 static void pci_dev_save_and_disable(struct pci_dev *dev)
 {
+       struct pci_driver *drv = to_pci_driver(dev->dev.driver);
        const struct pci_error_handlers *err_handler =
-                       dev->driver ? dev->driver->err_handler : NULL;
+                       drv ? drv->err_handler : NULL;
 
        /*
-        * dev->driver->err_handler->reset_prepare() is protected against
-        * races with ->remove() by the device lock, which must be held by
-        * the caller.
+        * drv->err_handler->reset_prepare() is protected against races
+        * with ->remove() by the device lock, which must be held by the
+        * caller.
         */
        if (err_handler && err_handler->reset_prepare)
                err_handler->reset_prepare(dev);
@@ -5099,15 +5135,15 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
 
 static void pci_dev_restore(struct pci_dev *dev)
 {
+       struct pci_driver *drv = to_pci_driver(dev->dev.driver);
        const struct pci_error_handlers *err_handler =
-                       dev->driver ? dev->driver->err_handler : NULL;
+                       drv ? drv->err_handler : NULL;
 
        pci_restore_state(dev);
 
        /*
-        * dev->driver->err_handler->reset_done() is protected against
-        * races with ->remove() by the device lock, which must be held by
-        * the caller.
+        * drv->err_handler->reset_done() is protected against races with
+        * ->remove() by the device lock, which must be held by the caller.
         */
        if (err_handler && err_handler->reset_done)
                err_handler->reset_done(dev);
@@ -5268,7 +5304,7 @@ const struct attribute_group pci_dev_reset_method_attr_group = {
  */
 int __pci_reset_function_locked(struct pci_dev *dev)
 {
-       int i, m, rc = -ENOTTY;
+       int i, m, rc;
 
        might_sleep();
 
@@ -6304,11 +6340,12 @@ EXPORT_SYMBOL_GPL(pci_pr3_present);
  * cannot be left as a userspace activity).  DMA aliases should therefore
  * be configured via quirks, such as the PCI fixup header quirk.
  */
-void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from, unsigned nr_devfns)
+void pci_add_dma_alias(struct pci_dev *dev, u8 devfn_from,
+                      unsigned int nr_devfns)
 {
        int devfn_to;
 
-       nr_devfns = min(nr_devfns, (unsigned)MAX_NR_DEVFNS - devfn_from);
+       nr_devfns = min(nr_devfns, (unsigned int)MAX_NR_DEVFNS - devfn_from);
        devfn_to = devfn_from + nr_devfns - 1;
 
        if (!dev->dma_alias_mask)
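
The cast here matters because the kernel's min() refuses mismatched operand types at compile time: without it, MAX_NR_DEVFNS - devfn_from promotes differently than the unsigned int nr_devfns. A reduced userspace illustration of the clamp (MAX_NR_DEVFNS assumed to be 256, as one devfn byte implies):

#include <stdio.h>

#define MAX_NR_DEVFNS 256

static unsigned int clamp_devfns(unsigned int nr, unsigned char from)
{
        unsigned int room = (unsigned int)MAX_NR_DEVFNS - from;

        /* mirror of min(nr_devfns, (unsigned int)MAX_NR_DEVFNS - devfn_from) */
        return nr < room ? nr : room;
}

int main(void)
{
        printf("%u\n", clamp_devfns(300, 250)); /* prints 6 */
        return 0;
}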
index bd39098c57dad1b71eb8745331fa3a5c4ad9a1ff..3d60cabde1a15fc8a0fb7412ac54c6201502614f 100644 (file)
@@ -86,6 +86,7 @@ void pci_msix_init(struct pci_dev *dev);
 bool pci_bridge_d3_possible(struct pci_dev *dev);
 void pci_bridge_d3_update(struct pci_dev *dev);
 void pci_bridge_wait_for_secondary_bus(struct pci_dev *dev);
+void pci_bridge_reconfigure_ltr(struct pci_dev *dev);
 
 static inline void pci_wakeup_event(struct pci_dev *dev)
 {
index b2980db88cc0926648eec25e82063280b1575953..5783a2f79e6a2eda315003f6f5cfe1ba510b585f 100644 (file)
@@ -2,12 +2,12 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y                  := portdrv_core.o portdrv_pci.o err.o rcec.o
+pcieportdrv-y                  := portdrv_core.o portdrv_pci.o rcec.o
 
 obj-$(CONFIG_PCIEPORTBUS)      += pcieportdrv.o
 
 obj-$(CONFIG_PCIEASPM)         += aspm.o
-obj-$(CONFIG_PCIEAER)          += aer.o
+obj-$(CONFIG_PCIEAER)          += aer.o err.o
 obj-$(CONFIG_PCIEAER_INJECT)   += aer_inject.o
 obj-$(CONFIG_PCIE_PME)         += pme.o
 obj-$(CONFIG_PCIE_DPC)         += dpc.o
index 9784fdcf30061c7ffd407b73bf20f60a8c70ca62..9fa1f97e5b27069f8893e59e0c471f36949f65f3 100644 (file)
@@ -57,7 +57,7 @@ struct aer_stats {
         * "as seen by this device". Note that this may mean that if an
         * end point is causing problems, the AER counters may increment
         * at its link partner (e.g. root port) because the errors will be
-        * "seen" by the link partner and not the the problematic end point
+        * "seen" by the link partner and not the problematic end point
         * itself (which may report all counters as 0 as it never saw any
         * problems).
         */
index 013a47f587ceae2b9c87e623c6ac4223efa9f487..52c74682601a914e8fa6efd0ff0c0a1e958145ef 100644 (file)
@@ -1219,7 +1219,7 @@ static ssize_t aspm_attr_store_common(struct device *dev,
        struct pcie_link_state *link = pcie_aspm_get_link(pdev);
        bool state_enable;
 
-       if (strtobool(buf, &state_enable) < 0)
+       if (kstrtobool(buf, &state_enable) < 0)
                return -EINVAL;
 
        down_read(&pci_bus_sem);
@@ -1276,7 +1276,7 @@ static ssize_t clkpm_store(struct device *dev,
        struct pcie_link_state *link = pcie_aspm_get_link(pdev);
        bool state_enable;
 
-       if (strtobool(buf, &state_enable) < 0)
+       if (kstrtobool(buf, &state_enable) < 0)
                return -EINVAL;
 
        down_read(&pci_bus_sem);
index b576aa890c76ba2a486a809a39f5ed5c0a351f38..356b9317297e57a189871c5868cd574b17bd0004 100644 (file)
@@ -49,14 +49,16 @@ static int report_error_detected(struct pci_dev *dev,
                                 pci_channel_state_t state,
                                 enum pci_ers_result *result)
 {
+       struct pci_driver *pdrv;
        pci_ers_result_t vote;
        const struct pci_error_handlers *err_handler;
 
        device_lock(&dev->dev);
+       pdrv = to_pci_driver(dev->dev.driver);
        if (!pci_dev_set_io_state(dev, state) ||
-               !dev->driver ||
-               !dev->driver->err_handler ||
-               !dev->driver->err_handler->error_detected) {
+               !pdrv ||
+               !pdrv->err_handler ||
+               !pdrv->err_handler->error_detected) {
                /*
                 * If any device in the subtree does not have an error_detected
                 * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
@@ -70,7 +72,7 @@ static int report_error_detected(struct pci_dev *dev,
                        vote = PCI_ERS_RESULT_NONE;
                }
        } else {
-               err_handler = dev->driver->err_handler;
+               err_handler = pdrv->err_handler;
                vote = err_handler->error_detected(dev, state);
        }
        pci_uevent_ers(dev, vote);
@@ -91,16 +93,18 @@ static int report_normal_detected(struct pci_dev *dev, void *data)
 
 static int report_mmio_enabled(struct pci_dev *dev, void *data)
 {
+       struct pci_driver *pdrv;
        pci_ers_result_t vote, *result = data;
        const struct pci_error_handlers *err_handler;
 
        device_lock(&dev->dev);
-       if (!dev->driver ||
-               !dev->driver->err_handler ||
-               !dev->driver->err_handler->mmio_enabled)
+       pdrv = to_pci_driver(dev->dev.driver);
+       if (!pdrv ||
+               !pdrv->err_handler ||
+               !pdrv->err_handler->mmio_enabled)
                goto out;
 
-       err_handler = dev->driver->err_handler;
+       err_handler = pdrv->err_handler;
        vote = err_handler->mmio_enabled(dev);
        *result = merge_result(*result, vote);
 out:
@@ -110,16 +114,18 @@ out:
 
 static int report_slot_reset(struct pci_dev *dev, void *data)
 {
+       struct pci_driver *pdrv;
        pci_ers_result_t vote, *result = data;
        const struct pci_error_handlers *err_handler;
 
        device_lock(&dev->dev);
-       if (!dev->driver ||
-               !dev->driver->err_handler ||
-               !dev->driver->err_handler->slot_reset)
+       pdrv = to_pci_driver(dev->dev.driver);
+       if (!pdrv ||
+               !pdrv->err_handler ||
+               !pdrv->err_handler->slot_reset)
                goto out;
 
-       err_handler = dev->driver->err_handler;
+       err_handler = pdrv->err_handler;
        vote = err_handler->slot_reset(dev);
        *result = merge_result(*result, vote);
 out:
@@ -129,16 +135,18 @@ out:
 
 static int report_resume(struct pci_dev *dev, void *data)
 {
+       struct pci_driver *pdrv;
        const struct pci_error_handlers *err_handler;
 
        device_lock(&dev->dev);
+       pdrv = to_pci_driver(dev->dev.driver);
        if (!pci_dev_set_io_state(dev, pci_channel_io_normal) ||
-               !dev->driver ||
-               !dev->driver->err_handler ||
-               !dev->driver->err_handler->resume)
+               !pdrv ||
+               !pdrv->err_handler ||
+               !pdrv->err_handler->resume)
                goto out;
 
-       err_handler = dev->driver->err_handler;
+       err_handler = pdrv->err_handler;
        err_handler->resume(dev);
 out:
        pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
index 2ff5724b8f13f06aed1d879d6286944679d9a568..0ef4bf5f811d9d74ee134f1041fc1538fc0f838c 100644 (file)
@@ -85,8 +85,7 @@ struct pcie_port_service_driver {
        int (*runtime_suspend)(struct pcie_device *dev);
        int (*runtime_resume)(struct pcie_device *dev);
 
-       /* Device driver may resume normal operations */
-       void (*error_resume)(struct pci_dev *dev);
+       int (*slot_reset)(struct pcie_device *dev);
 
        int port_type;  /* Type of the port this driver can handle */
        u32 service;    /* Port service this device represents */
@@ -110,6 +109,7 @@ void pcie_port_service_unregister(struct pcie_port_service_driver *new);
 
 extern struct bus_type pcie_port_bus_type;
 int pcie_port_device_register(struct pci_dev *dev);
+int pcie_port_device_iter(struct device *dev, void *data);
 #ifdef CONFIG_PM
 int pcie_port_device_suspend(struct device *dev);
 int pcie_port_device_resume_noirq(struct device *dev);
@@ -118,8 +118,6 @@ int pcie_port_device_runtime_suspend(struct device *dev);
 int pcie_port_device_runtime_resume(struct device *dev);
 #endif
 void pcie_port_device_remove(struct pci_dev *dev);
-int __must_check pcie_port_bus_register(void);
-void pcie_port_bus_unregister(void);
 
 struct pci_dev;
 
index 3ee63968deaa5880dc700feb3d17b338a8557e68..bda630889f95550b111f2cbd2299f0be11970eee 100644 (file)
@@ -166,9 +166,6 @@ static int pcie_init_service_irqs(struct pci_dev *dev, int *irqs, int mask)
 {
        int ret, i;
 
-       for (i = 0; i < PCIE_PORT_DEVICE_MAXSERVICES; i++)
-               irqs[i] = -1;
-
        /*
         * If we support PME but can't use MSI/MSI-X for it, we have to
         * fall back to INTx or other interrupts, e.g., a system shared
@@ -317,8 +314,10 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq)
  */
 int pcie_port_device_register(struct pci_dev *dev)
 {
-       int status, capabilities, i, nr_service;
-       int irqs[PCIE_PORT_DEVICE_MAXSERVICES];
+       int status, capabilities, irq_services, i, nr_service;
+       int irqs[PCIE_PORT_DEVICE_MAXSERVICES] = {
+               [0 ... PCIE_PORT_DEVICE_MAXSERVICES-1] = -1
+       };
 
        /* Enable PCI Express port device */
        status = pci_enable_device(dev);
@@ -331,18 +330,32 @@ int pcie_port_device_register(struct pci_dev *dev)
                return 0;
 
        pci_set_master(dev);
-       /*
-        * Initialize service irqs. Don't use service devices that
-        * require interrupts if there is no way to generate them.
-        * However, some drivers may have a polling mode (e.g. pciehp_poll_mode)
-        * that can be used in the absence of irqs.  Allow them to determine
-        * if that is to be used.
-        */
-       status = pcie_init_service_irqs(dev, irqs, capabilities);
-       if (status) {
-               capabilities &= PCIE_PORT_SERVICE_HP;
-               if (!capabilities)
-                       goto error_disable;
+
+       irq_services = 0;
+       if (IS_ENABLED(CONFIG_PCIE_PME))
+               irq_services |= PCIE_PORT_SERVICE_PME;
+       if (IS_ENABLED(CONFIG_PCIEAER))
+               irq_services |= PCIE_PORT_SERVICE_AER;
+       if (IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
+               irq_services |= PCIE_PORT_SERVICE_HP;
+       if (IS_ENABLED(CONFIG_PCIE_DPC))
+               irq_services |= PCIE_PORT_SERVICE_DPC;
+       irq_services &= capabilities;
+
+       if (irq_services) {
+               /*
+                * Initialize service IRQs. Don't use service devices that
+                * require interrupts if there is no way to generate them.
+                * However, some drivers may have a polling mode (e.g.
+                * pciehp_poll_mode) that can be used in the absence of IRQs.
+                * Allow them to determine if that is to be used.
+                */
+               status = pcie_init_service_irqs(dev, irqs, irq_services);
+               if (status) {
+                       irq_services &= PCIE_PORT_SERVICE_HP;
+                       if (!irq_services)
+                               goto error_disable;
+               }
        }
 
        /* Allocate child services if any */
@@ -367,24 +380,24 @@ error_disable:
        return status;
 }
 
-#ifdef CONFIG_PM
-typedef int (*pcie_pm_callback_t)(struct pcie_device *);
+typedef int (*pcie_callback_t)(struct pcie_device *);
 
-static int pm_iter(struct device *dev, void *data)
+int pcie_port_device_iter(struct device *dev, void *data)
 {
        struct pcie_port_service_driver *service_driver;
        size_t offset = *(size_t *)data;
-       pcie_pm_callback_t cb;
+       pcie_callback_t cb;
 
        if ((dev->bus == &pcie_port_bus_type) && dev->driver) {
                service_driver = to_service_driver(dev->driver);
-               cb = *(pcie_pm_callback_t *)((void *)service_driver + offset);
+               cb = *(pcie_callback_t *)((void *)service_driver + offset);
                if (cb)
                        return cb(to_pcie_device(dev));
        }
        return 0;
 }
 
+#ifdef CONFIG_PM
 /**
  * pcie_port_device_suspend - suspend port services associated with a PCIe port
  * @dev: PCI Express port to handle
@@ -392,13 +405,13 @@ static int pm_iter(struct device *dev, void *data)
 int pcie_port_device_suspend(struct device *dev)
 {
        size_t off = offsetof(struct pcie_port_service_driver, suspend);
-       return device_for_each_child(dev, &off, pm_iter);
+       return device_for_each_child(dev, &off, pcie_port_device_iter);
 }
 
 int pcie_port_device_resume_noirq(struct device *dev)
 {
        size_t off = offsetof(struct pcie_port_service_driver, resume_noirq);
-       return device_for_each_child(dev, &off, pm_iter);
+       return device_for_each_child(dev, &off, pcie_port_device_iter);
 }
 
 /**
@@ -408,7 +421,7 @@ int pcie_port_device_resume_noirq(struct device *dev)
 int pcie_port_device_resume(struct device *dev)
 {
        size_t off = offsetof(struct pcie_port_service_driver, resume);
-       return device_for_each_child(dev, &off, pm_iter);
+       return device_for_each_child(dev, &off, pcie_port_device_iter);
 }
 
 /**
@@ -418,7 +431,7 @@ int pcie_port_device_resume(struct device *dev)
 int pcie_port_device_runtime_suspend(struct device *dev)
 {
        size_t off = offsetof(struct pcie_port_service_driver, runtime_suspend);
-       return device_for_each_child(dev, &off, pm_iter);
+       return device_for_each_child(dev, &off, pcie_port_device_iter);
 }
 
 /**
@@ -428,7 +441,7 @@ int pcie_port_device_runtime_suspend(struct device *dev)
 int pcie_port_device_runtime_resume(struct device *dev)
 {
        size_t off = offsetof(struct pcie_port_service_driver, runtime_resume);
-       return device_for_each_child(dev, &off, pm_iter);
+       return device_for_each_child(dev, &off, pcie_port_device_iter);
 }
 #endif /* PM */
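
Two techniques in this file are worth spelling out: the IRQ array is now pre-filled with -1 via a GNU range designator ([0 ... N-1] = -1) instead of a runtime loop, and the old pm_iter() is generalized into pcie_port_device_iter(), which receives the byte offset of the desired callback within struct pcie_port_service_driver and fetches the function pointer stored at that offset. A reduced model of that dispatch, with a hypothetical ops struct:

#include <stddef.h>
#include <stdio.h>

struct ops {
        int (*suspend)(int id);
        int (*resume)(int id);
};

static int do_suspend(int id)
{
        printf("suspend %d\n", id);
        return 0;
}

static int call_at_offset(struct ops *ops, size_t offset, int id)
{
        /* read the function pointer stored 'offset' bytes into *ops */
        int (*cb)(int) = *(int (**)(int))((char *)ops + offset);

        return cb ? cb(id) : 0;
}

int main(void)
{
        struct ops ops = { .suspend = do_suspend };

        call_at_offset(&ops, offsetof(struct ops, suspend), 1); /* suspend 1 */
        call_at_offset(&ops, offsetof(struct ops, resume), 2);  /* NULL: no-op */
        return 0;
}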
 
index c7ff1eea225abe8a53f8bdea6398ea715d8d187d..35eca6277a96ebf63f639a2427b0fa0f25c139be 100644 (file)
@@ -160,6 +160,9 @@ static pci_ers_result_t pcie_portdrv_error_detected(struct pci_dev *dev,
 
 static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
 {
+       size_t off = offsetof(struct pcie_port_service_driver, slot_reset);
+       device_for_each_child(&dev->dev, &off, pcie_port_device_iter);
+
        pci_restore_state(dev);
        pci_save_state(dev);
        return PCI_ERS_RESULT_RECOVERED;
@@ -170,29 +173,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev)
        return PCI_ERS_RESULT_RECOVERED;
 }
 
-static int resume_iter(struct device *device, void *data)
-{
-       struct pcie_device *pcie_device;
-       struct pcie_port_service_driver *driver;
-
-       if (device->bus == &pcie_port_bus_type && device->driver) {
-               driver = to_service_driver(device->driver);
-               if (driver && driver->error_resume) {
-                       pcie_device = to_pcie_device(device);
-
-                       /* Forward error message to service drivers */
-                       driver->error_resume(pcie_device->port);
-               }
-       }
-
-       return 0;
-}
-
-static void pcie_portdrv_err_resume(struct pci_dev *dev)
-{
-       device_for_each_child(&dev->dev, NULL, resume_iter);
-}
-
 /*
  * LINUX Device Driver Model
  */
@@ -210,7 +190,6 @@ static const struct pci_error_handlers pcie_portdrv_err_handler = {
        .error_detected = pcie_portdrv_error_detected,
        .slot_reset = pcie_portdrv_slot_reset,
        .mmio_enabled = pcie_portdrv_mmio_enabled,
-       .resume = pcie_portdrv_err_resume,
 };
 
 static struct pci_driver pcie_portdriver = {
index d9fc02a71baada590b451974de4791f47a771065..087d3658f75cef27021d10fc402896ff14b46a57 100644 (file)
@@ -883,11 +883,11 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus)
 static int pci_register_host_bridge(struct pci_host_bridge *bridge)
 {
        struct device *parent = bridge->dev.parent;
-       struct resource_entry *window, *n;
+       struct resource_entry *window, *next, *n;
        struct pci_bus *bus, *b;
-       resource_size_t offset;
+       resource_size_t offset, next_offset;
        LIST_HEAD(resources);
-       struct resource *res;
+       struct resource *res, *next_res;
        char addr[64], *fmt;
        const char *name;
        int err;
@@ -970,11 +970,34 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
        if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE)
                dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n");
 
+       /* Coalesce contiguous windows */
+       resource_list_for_each_entry_safe(window, n, &resources) {
+               if (list_is_last(&window->node, &resources))
+                       break;
+
+               next = list_next_entry(window, node);
+               offset = window->offset;
+               res = window->res;
+               next_offset = next->offset;
+               next_res = next->res;
+
+               if (res->flags != next_res->flags || offset != next_offset)
+                       continue;
+
+               if (res->end + 1 == next_res->start) {
+                       next_res->start = res->start;
+                       res->flags = res->start = res->end = 0;
+               }
+       }
+
        /* Add initial resources to the bus */
        resource_list_for_each_entry_safe(window, n, &resources) {
-               list_move_tail(&window->node, &bridge->windows);
                offset = window->offset;
                res = window->res;
+               if (!res->end)
+                       continue;
+
+               list_move_tail(&window->node, &bridge->windows);
 
                if (res->flags & IORESOURCE_BUS)
                        pci_bus_insert_busn_res(bus, bus->number, res->end);
@@ -2168,9 +2191,21 @@ static void pci_configure_ltr(struct pci_dev *dev)
         * Complex and all intermediate Switches indicate support for LTR.
         * PCIe r4.0, sec 6.18.
         */
-       if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
-           ((bridge = pci_upstream_bridge(dev)) &&
-             bridge->ltr_path)) {
+       if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) {
+               pcie_capability_set_word(dev, PCI_EXP_DEVCTL2,
+                                        PCI_EXP_DEVCTL2_LTR_EN);
+               dev->ltr_path = 1;
+               return;
+       }
+
+       /*
+        * If we're configuring a hot-added device, LTR was likely
+        * disabled in the upstream bridge, so re-enable it before enabling
+        * it in the new device.
+        */
+       bridge = pci_upstream_bridge(dev);
+       if (bridge && bridge->ltr_path) {
+               pci_bridge_reconfigure_ltr(dev);
                pcie_capability_set_word(dev, PCI_EXP_DEVCTL2,
                                         PCI_EXP_DEVCTL2_LTR_EN);
                dev->ltr_path = 1;
@@ -2450,7 +2485,7 @@ static struct irq_domain *pci_dev_msi_domain(struct pci_dev *dev)
        struct irq_domain *d;
 
        /*
-        * If a domain has been set through the pcibios_add_device()
+        * If a domain has been set through the pcibios_device_add()
         * callback, then this is the one (platform code knows best).
         */
        d = dev_get_msi_domain(&dev->dev);
@@ -2518,7 +2553,7 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
        list_add_tail(&dev->bus_list, &bus->devices);
        up_write(&pci_bus_sem);
 
-       ret = pcibios_add_device(dev);
+       ret = pcibios_device_add(dev);
        WARN_ON(ret < 0);
 
        /* Set up MSI IRQ domain */
@@ -2550,11 +2585,12 @@ struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
 }
 EXPORT_SYMBOL(pci_scan_single_device);
 
-static unsigned next_fn(struct pci_bus *bus, struct pci_dev *dev, unsigned fn)
+static unsigned int next_fn(struct pci_bus *bus, struct pci_dev *dev,
+                           unsigned int fn)
 {
        int pos;
        u16 cap = 0;
-       unsigned next_fn;
+       unsigned int next_fn;
 
        if (pci_ari_enabled(bus)) {
                if (!dev)
@@ -2613,7 +2649,7 @@ static int only_one_child(struct pci_bus *bus)
  */
 int pci_scan_slot(struct pci_bus *bus, int devfn)
 {
-       unsigned fn, nr = 0;
+       unsigned int fn, nr = 0;
        struct pci_dev *dev;
 
        if (only_one_child(bus) && (devfn > 0))
index 4537d1ea14fdc034a4a0c72789ca8b745315c60b..aedb78c86ddcfa8c5669ed34e0d680787e7e86c5 100644 (file)
@@ -501,7 +501,7 @@ static void quirk_s3_64M(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3,     PCI_DEVICE_ID_S3_868,           quirk_s3_64M);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_S3,     PCI_DEVICE_ID_S3_968,           quirk_s3_64M);
 
-static void quirk_io(struct pci_dev *dev, int pos, unsigned size,
+static void quirk_io(struct pci_dev *dev, int pos, unsigned int size,
                     const char *name)
 {
        u32 region;
@@ -552,7 +552,7 @@ static void quirk_cs5536_vsa(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, quirk_cs5536_vsa);
 
 static void quirk_io_region(struct pci_dev *dev, int port,
-                               unsigned size, int nr, const char *name)
+                           unsigned int size, int nr, const char *name)
 {
        u16 region;
        struct pci_bus_region bus_region;
@@ -666,7 +666,7 @@ static void piix4_io_quirk(struct pci_dev *dev, const char *name, unsigned int p
        base = devres & 0xffff;
        size = 16;
        for (;;) {
-               unsigned bit = size >> 1;
+               unsigned int bit = size >> 1;
                if ((bit & mask) == bit)
                        break;
                size = bit;
@@ -692,7 +692,7 @@ static void piix4_mem_quirk(struct pci_dev *dev, const char *name, unsigned int
        mask = (devres & 0x3f) << 16;
        size = 128 << 16;
        for (;;) {
-               unsigned bit = size >> 1;
+               unsigned int bit = size >> 1;
                if ((bit & mask) == bit)
                        break;
                size = bit;
@@ -806,7 +806,7 @@ static void ich6_lpc_acpi_gpio(struct pci_dev *dev)
                                "ICH6 GPIO");
 }
 
-static void ich6_lpc_generic_decode(struct pci_dev *dev, unsigned reg,
+static void ich6_lpc_generic_decode(struct pci_dev *dev, unsigned int reg,
                                    const char *name, int dynsize)
 {
        u32 val;
@@ -850,7 +850,7 @@ static void quirk_ich6_lpc(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,  PCI_DEVICE_ID_INTEL_ICH6_0, quirk_ich6_lpc);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL,  PCI_DEVICE_ID_INTEL_ICH6_1, quirk_ich6_lpc);
 
-static void ich7_lpc_generic_decode(struct pci_dev *dev, unsigned reg,
+static void ich7_lpc_generic_decode(struct pci_dev *dev, unsigned int reg,
                                    const char *name)
 {
        u32 val;
@@ -2700,7 +2700,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA,
  * then the device can't use INTx interrupts. Tegra's PCIe root ports don't
  * generate MSI interrupts for PME and AER events; instead, only INTx interrupts
  * are generated. Though Tegra's PCIe root ports can generate MSI interrupts
- * for other events, since PCIe specificiation doesn't support using a mix of
+ * for other events, since PCIe specification doesn't support using a mix of
  * INTx and MSI/MSI-X, it is required to disable MSI interrupts to avoid port
  * service drivers registering their respective ISRs for MSIs.
  */
@@ -3612,6 +3612,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0033, quirk_no_bus_reset);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0034, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003e, quirk_no_bus_reset);
 
 /*
  * Root port on some Cavium CN8xxx chips do not successfully complete a bus
@@ -5795,3 +5796,58 @@ static void apex_pci_fixup_class(struct pci_dev *pdev)
 }
 DECLARE_PCI_FIXUP_CLASS_HEADER(0x1ac1, 0x089a,
                               PCI_CLASS_NOT_DEFINED, 8, apex_pci_fixup_class);
+
+/*
+ * Pericom PI7C9X2G404/PI7C9X2G304/PI7C9X2G303 switch erratum E5 -
+ * ACS P2P Request Redirect is not functional
+ *
+ * When ACS P2P Request Redirect is enabled and bandwidth is not balanced
+ * between upstream and downstream ports, packets are queued in an internal
+ * buffer until CPLD packet. The workaround is to use the switch in store and
+ * forward mode.
+ */
+#define PI7C9X2Gxxx_MODE_REG           0x74
+#define PI7C9X2Gxxx_STORE_FORWARD_MODE BIT(0)
+static void pci_fixup_pericom_acs_store_forward(struct pci_dev *pdev)
+{
+       struct pci_dev *upstream;
+       u16 val;
+
+       /* Downstream ports only */
+       if (pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM)
+               return;
+
+       /* Check for ACS P2P Request Redirect use */
+       if (!pdev->acs_cap)
+               return;
+       pci_read_config_word(pdev, pdev->acs_cap + PCI_ACS_CTRL, &val);
+       if (!(val & PCI_ACS_RR))
+               return;
+
+       upstream = pci_upstream_bridge(pdev);
+       if (!upstream)
+               return;
+
+       pci_read_config_word(upstream, PI7C9X2Gxxx_MODE_REG, &val);
+       if (!(val & PI7C9X2Gxxx_STORE_FORWARD_MODE)) {
+               pci_info(upstream, "Setting PI7C9X2Gxxx store-forward mode to avoid ACS erratum\n");
+               pci_write_config_word(upstream, PI7C9X2Gxxx_MODE_REG, val |
+                                     PI7C9X2Gxxx_STORE_FORWARD_MODE);
+       }
+}
+/*
+ * Apply the fixup on enable and on resume, so that it is reapplied whenever
+ * the ACS configuration changes or the switch mode is reset.
+ */
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2404,
+                        pci_fixup_pericom_acs_store_forward);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2404,
+                        pci_fixup_pericom_acs_store_forward);
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2304,
+                        pci_fixup_pericom_acs_store_forward);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2304,
+                        pci_fixup_pericom_acs_store_forward);
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_PERICOM, 0x2303,
+                        pci_fixup_pericom_acs_store_forward);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_PERICOM, 0x2303,
+                        pci_fixup_pericom_acs_store_forward);
index 8fc9a4e911e3a2d6419ba82964560a2396780b3d..e18d3a4383ba6b0071fe946b798f013b9abbe638 100644 (file)
@@ -85,7 +85,7 @@ static size_t pci_get_rom_size(struct pci_dev *pdev, void __iomem *rom,
 {
        void __iomem *image;
        int last_image;
-       unsigned length;
+       unsigned int length;
 
        image = rom;
        do {
index 2ce636937c6eaf5b2c23045a1de00d368fb8cb01..547396ec50b59e3f25d35c6396d80b086fa3adef 100644 (file)
@@ -1525,7 +1525,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus,
 {
        struct pci_dev *dev = bus->self;
        struct resource *r;
-       unsigned old_flags = 0;
+       unsigned int old_flags = 0;
        struct resource *b_res;
        int idx = 1;
 
index 7129494754dd7396000789f0e2e297365d639f79..cc7d26b015f328ec4026703bb0c7886d80b635b9 100644 (file)
@@ -8,7 +8,6 @@
  *     David Miller (davem@redhat.com)
  */
 
-
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/errno.h>
@@ -28,25 +27,26 @@ void pci_assign_irq(struct pci_dev *dev)
                return;
        }
 
-       /* If this device is not on the primary bus, we need to figure out
-          which interrupt pin it will come in on.   We know which slot it
-          will come in on 'cos that slot is where the bridge is.   Each
-          time the interrupt line passes through a PCI-PCI bridge we must
-          apply the swizzle function.  */
-
+       /*
+        * If this device is not on the primary bus, we need to figure out
+        * which interrupt pin it will come in on. We know which slot it
+        * will come in on because that slot is where the bridge is. Each
+        * time the interrupt line passes through a PCI-PCI bridge we must
+        * apply the swizzle function.
+        */
        pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
        /* Cope with illegal values. */
        if (pin > 4)
                pin = 1;
 
        if (pin) {
-               /* Follow the chain of bridges, swizzling as we go.  */
+               /* Follow the chain of bridges, swizzling as we go. */
                if (hbrg->swizzle_irq)
                        slot = (*(hbrg->swizzle_irq))(dev, &pin);
 
                /*
-                * If a swizzling function is not used map_irq must
-                * ignore slot
+                * If a swizzling function is not used, map_irq() must
+                * ignore slot.
                 */
                irq = (*(hbrg->map_irq))(dev, slot, pin);
                if (irq == -1)
@@ -56,7 +56,9 @@ void pci_assign_irq(struct pci_dev *dev)
 
        pci_dbg(dev, "assign IRQ: got %d\n", dev->irq);
 
-       /* Always tell the device, so the driver knows what is
-          the real IRQ to use; the device does not use it. */
+       /*
+        * Always tell the device, so the driver knows what is the real IRQ
+        * to use; the device does not use it.
+        */
        pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
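
As background for the "swizzle function" the rewritten comments refer to: the conventional PCI-PCI bridge mapping rotates the INTx pin by the device (slot) number at each bridge crossing, which is what the kernel's pci_swizzle_interrupt_pin() computes. A standalone sketch of that arithmetic (runnable plain C; names are illustrative):

#include <stdio.h>

/* Standard INTx swizzle applied once per PCI-PCI bridge crossed:
 * pins are numbered 1..4 for INTA..INTD and rotate by the slot number. */
static unsigned char swizzle(unsigned char pin, unsigned int slot)
{
	return (pin - 1 + slot) % 4 + 1;
}

int main(void)
{
	/* INTB (2) from a device in slot 3, one bridge up: becomes INTA (1) */
	printf("INT%c\n", 'A' + swizzle(2, 3) - 1);
	return 0;
}
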
index 0b301f8be9ed566600626f651fc785e962a5100f..38c2b036fb8e737cb114306b3a51c39f78cab330 100644 (file)
@@ -45,6 +45,7 @@ enum mrpc_state {
        MRPC_QUEUED,
        MRPC_RUNNING,
        MRPC_DONE,
+       MRPC_IO_ERROR,
 };
 
 struct switchtec_user {
@@ -66,6 +67,19 @@ struct switchtec_user {
        int event_cnt;
 };
 
+/*
+ * The MMIO reads to the device_id register should always return the device ID
+ * of the device; otherwise, the firmware is probably stuck or unreachable
+ * due to a firmware reset, which clears PCI state including the BARs and
+ * Memory Space Enable bits.
+ */
+static int is_firmware_running(struct switchtec_dev *stdev)
+{
+       u32 device = ioread32(&stdev->mmio_sys_info->device_id);
+
+       return stdev->pdev->device == device;
+}
+
 static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
 {
        struct switchtec_user *stuser;
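
One fact the new check leans on implicitly: once a firmware reset clears the BARs and Memory Space Enable, MMIO reads do not fault; they complete with all-ones (0xffffffff) or garbage. Reading a register with a known constant value, such as device_id, and comparing it against the expected ID is therefore a cheap liveness probe, as sketched below (an illustrative condensation of is_firmware_running() above, not driver API):

/* Any mismatch - typically 0xffffffff once the device stops decoding
 * memory space - means the firmware is no longer reachable. */
static bool mmio_alive(struct switchtec_dev *stdev)
{
	return ioread32(&stdev->mmio_sys_info->device_id) == stdev->pdev->device;
}
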
@@ -113,6 +127,7 @@ static void stuser_set_state(struct switchtec_user *stuser,
                [MRPC_QUEUED] = "QUEUED",
                [MRPC_RUNNING] = "RUNNING",
                [MRPC_DONE] = "DONE",
+               [MRPC_IO_ERROR] = "IO_ERROR",
        };
 
        stuser->state = state;
@@ -184,9 +199,26 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser)
        return 0;
 }
 
+static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
+{
+       /* requires the mrpc_mutex to already be held when called */
+
+       struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next,
+                                                  struct switchtec_user, list);
+
+       stuser->cmd_done = true;
+       wake_up_interruptible(&stuser->cmd_comp);
+       list_del_init(&stuser->list);
+       stuser_put(stuser);
+       stdev->mrpc_busy = 0;
+
+       mrpc_cmd_submit(stdev);
+}
+
 static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 {
        /* requires the mrpc_mutex to already be held when called */
+
        struct switchtec_user *stuser;
 
        if (list_empty(&stdev->mrpc_queue))
@@ -206,7 +238,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
        stuser_set_state(stuser, MRPC_DONE);
        stuser->return_code = 0;
 
-       if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE)
+       if (stuser->status != SWITCHTEC_MRPC_STATUS_DONE &&
+           stuser->status != SWITCHTEC_MRPC_STATUS_ERROR)
                goto out;
 
        if (stdev->dma_mrpc)
@@ -223,13 +256,7 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev)
                memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
                              stuser->read_len);
 out:
-       stuser->cmd_done = true;
-       wake_up_interruptible(&stuser->cmd_comp);
-       list_del_init(&stuser->list);
-       stuser_put(stuser);
-       stdev->mrpc_busy = 0;
-
-       mrpc_cmd_submit(stdev);
+       mrpc_cleanup_cmd(stdev);
 }
 
 static void mrpc_event_work(struct work_struct *work)
@@ -246,6 +273,23 @@ static void mrpc_event_work(struct work_struct *work)
        mutex_unlock(&stdev->mrpc_mutex);
 }
 
+static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
+{
+       /* requires the mrpc_mutex to already be held when called */
+
+       struct switchtec_user *stuser;
+
+       if (list_empty(&stdev->mrpc_queue))
+               return;
+
+       stuser = list_entry(stdev->mrpc_queue.next,
+                           struct switchtec_user, list);
+
+       stuser_set_state(stuser, MRPC_IO_ERROR);
+
+       mrpc_cleanup_cmd(stdev);
+}
+
 static void mrpc_timeout_work(struct work_struct *work)
 {
        struct switchtec_dev *stdev;
@@ -257,6 +301,11 @@ static void mrpc_timeout_work(struct work_struct *work)
 
        mutex_lock(&stdev->mrpc_mutex);
 
+       if (!is_firmware_running(stdev)) {
+               mrpc_error_complete_cmd(stdev);
+               goto out;
+       }
+
        if (stdev->dma_mrpc)
                status = stdev->dma_mrpc->status;
        else
@@ -327,7 +376,7 @@ static ssize_t field ## _show(struct device *dev, \
                return io_string_show(buf, &si->gen4.field, \
                                      sizeof(si->gen4.field)); \
        else \
-               return -ENOTSUPP; \
+               return -EOPNOTSUPP; \
 } \
 \
 static DEVICE_ATTR_RO(field)
@@ -544,6 +593,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
        if (rc)
                return rc;
 
+       if (stuser->state == MRPC_IO_ERROR) {
+               mutex_unlock(&stdev->mrpc_mutex);
+               return -EIO;
+       }
+
        if (stuser->state != MRPC_DONE) {
                mutex_unlock(&stdev->mrpc_mutex);
                return -EBADE;
@@ -569,7 +623,8 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
 out:
        mutex_unlock(&stdev->mrpc_mutex);
 
-       if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE)
+       if (stuser->status == SWITCHTEC_MRPC_STATUS_DONE ||
+           stuser->status == SWITCHTEC_MRPC_STATUS_ERROR)
                return size;
        else if (stuser->status == SWITCHTEC_MRPC_STATUS_INTERRUPTED)
                return -ENXIO;
@@ -613,7 +668,7 @@ static int ioctl_flash_info(struct switchtec_dev *stdev,
                info.flash_length = ioread32(&fi->gen4.flash_length);
                info.num_partitions = SWITCHTEC_NUM_PARTITIONS_GEN4;
        } else {
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
        }
 
        if (copy_to_user(uinfo, &info, sizeof(info)))
@@ -821,7 +876,7 @@ static int ioctl_flash_part_info(struct switchtec_dev *stdev,
                if (ret)
                        return ret;
        } else {
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
        }
 
        if (copy_to_user(uinfo, &info, sizeof(info)))
@@ -969,6 +1024,9 @@ static int event_ctl(struct switchtec_dev *stdev,
                return PTR_ERR(reg);
 
        hdr = ioread32(reg);
+       if (hdr & SWITCHTEC_EVENT_NOT_SUPP)
+               return -EOPNOTSUPP;
+
        for (i = 0; i < ARRAY_SIZE(ctl->data); i++)
                ctl->data[i] = ioread32(&reg[i + 1]);
 
@@ -1041,7 +1099,7 @@ static int ioctl_event_ctl(struct switchtec_dev *stdev,
                for (ctl.index = 0; ctl.index < nr_idxs; ctl.index++) {
                        ctl.flags = event_flags;
                        ret = event_ctl(stdev, &ctl);
-                       if (ret < 0)
+                       if (ret < 0 && ret != -EOPNOTSUPP)
                                return ret;
                }
        } else {
@@ -1078,7 +1136,7 @@ static int ioctl_pff_to_port(struct switchtec_dev *stdev,
                        break;
                }
 
-               reg = ioread32(&pcfg->vep_pff_inst_id);
+               reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF;
                if (reg == p.pff) {
                        p.port = SWITCHTEC_IOCTL_PFF_VEP;
                        break;
@@ -1124,7 +1182,7 @@ static int ioctl_port_to_pff(struct switchtec_dev *stdev,
                p.pff = ioread32(&pcfg->usp_pff_inst_id);
                break;
        case SWITCHTEC_IOCTL_PFF_VEP:
-               p.pff = ioread32(&pcfg->vep_pff_inst_id);
+               p.pff = ioread32(&pcfg->vep_pff_inst_id) & 0xFF;
                break;
        default:
                if (p.port > ARRAY_SIZE(pcfg->dsp_pff_inst_id))
@@ -1348,6 +1406,9 @@ static int mask_event(struct switchtec_dev *stdev, int eid, int idx)
        hdr_reg = event_regs[eid].map_reg(stdev, off, idx);
        hdr = ioread32(hdr_reg);
 
+       if (hdr & SWITCHTEC_EVENT_NOT_SUPP)
+               return 0;
+
        if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ))
                return 0;
 
@@ -1498,7 +1559,7 @@ static void init_pff(struct switchtec_dev *stdev)
        if (reg < stdev->pff_csr_count)
                stdev->pff_local[reg] = 1;
 
-       reg = ioread32(&pcfg->vep_pff_inst_id);
+       reg = ioread32(&pcfg->vep_pff_inst_id) & 0xFF;
        if (reg < stdev->pff_csr_count)
                stdev->pff_local[reg] = 1;
 
@@ -1556,7 +1617,7 @@ static int switchtec_init_pci(struct switchtec_dev *stdev,
        else if (stdev->gen == SWITCHTEC_GEN4)
                part_id = &stdev->mmio_sys_info->gen4.partition_id;
        else
-               return -ENOTSUPP;
+               return -EOPNOTSUPP;
 
        stdev->partition = ioread8(part_id);
        stdev->partition_count = ioread8(&stdev->mmio_ntb->partition_count);
index 4be24890132ede060e4c12f3601864db4af867cc..a4fc4d0690fe27e9af6b938f0270561ade0862fd 100644 (file)
@@ -57,10 +57,7 @@ static size_t pci_vpd_size(struct pci_dev *dev)
        size_t off = 0, size;
        unsigned char tag, header[1+2]; /* 1 byte tag, 2 bytes length */
 
-       /* Otherwise the following reads would fail. */
-       dev->vpd.len = PCI_VPD_MAX_SIZE;
-
-       while (pci_read_vpd(dev, off, 1, header) == 1) {
+       while (pci_read_vpd_any(dev, off, 1, header) == 1) {
                size = 0;
 
                if (off == 0 && (header[0] == 0x00 || header[0] == 0xff))
@@ -68,7 +65,7 @@ static size_t pci_vpd_size(struct pci_dev *dev)
 
                if (header[0] & PCI_VPD_LRDT) {
                        /* Large Resource Data Type Tag */
-                       if (pci_read_vpd(dev, off + 1, 2, &header[1]) != 2) {
+                       if (pci_read_vpd_any(dev, off + 1, 2, &header[1]) != 2) {
                                pci_warn(dev, "failed VPD read at offset %zu\n",
                                         off + 1);
                                return off ?: PCI_VPD_SZ_INVALID;
@@ -99,14 +96,14 @@ error:
        return off ?: PCI_VPD_SZ_INVALID;
 }
 
-static bool pci_vpd_available(struct pci_dev *dev)
+static bool pci_vpd_available(struct pci_dev *dev, bool check_size)
 {
        struct pci_vpd *vpd = &dev->vpd;
 
        if (!vpd->cap)
                return false;
 
-       if (vpd->len == 0) {
+       if (vpd->len == 0 && check_size) {
                vpd->len = pci_vpd_size(dev);
                if (vpd->len == PCI_VPD_SZ_INVALID) {
                        vpd->cap = 0;
@@ -156,24 +153,27 @@ static int pci_vpd_wait(struct pci_dev *dev, bool set)
 }
 
 static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count,
-                           void *arg)
+                           void *arg, bool check_size)
 {
        struct pci_vpd *vpd = &dev->vpd;
+       unsigned int max_len;
        int ret = 0;
        loff_t end = pos + count;
        u8 *buf = arg;
 
-       if (!pci_vpd_available(dev))
+       if (!pci_vpd_available(dev, check_size))
                return -ENODEV;
 
        if (pos < 0)
                return -EINVAL;
 
-       if (pos > vpd->len)
+       max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE;
+
+       if (pos >= max_len)
                return 0;
 
-       if (end > vpd->len) {
-               end = vpd->len;
+       if (end > max_len) {
+               end = max_len;
                count = end - pos;
        }
 
@@ -217,20 +217,23 @@ static ssize_t pci_vpd_read(struct pci_dev *dev, loff_t pos, size_t count,
 }
 
 static ssize_t pci_vpd_write(struct pci_dev *dev, loff_t pos, size_t count,
-                            const void *arg)
+                            const void *arg, bool check_size)
 {
        struct pci_vpd *vpd = &dev->vpd;
+       unsigned int max_len;
        const u8 *buf = arg;
        loff_t end = pos + count;
        int ret = 0;
 
-       if (!pci_vpd_available(dev))
+       if (!pci_vpd_available(dev, check_size))
                return -ENODEV;
 
        if (pos < 0 || (pos & 3) || (count & 3))
                return -EINVAL;
 
-       if (end > vpd->len)
+       max_len = check_size ? vpd->len : PCI_VPD_MAX_SIZE;
+
+       if (end > max_len)
                return -EINVAL;
 
        if (mutex_lock_killable(&vpd->lock))
@@ -313,7 +316,7 @@ void *pci_vpd_alloc(struct pci_dev *dev, unsigned int *size)
        void *buf;
        int cnt;
 
-       if (!pci_vpd_available(dev))
+       if (!pci_vpd_available(dev, true))
                return ERR_PTR(-ENODEV);
 
        len = dev->vpd.len;
@@ -381,6 +384,24 @@ static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
        return -ENOENT;
 }
 
+static ssize_t __pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf,
+                             bool check_size)
+{
+       ssize_t ret;
+
+       if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) {
+               dev = pci_get_func0_dev(dev);
+               if (!dev)
+                       return -ENODEV;
+
+               ret = pci_vpd_read(dev, pos, count, buf, check_size);
+               pci_dev_put(dev);
+               return ret;
+       }
+
+       return pci_vpd_read(dev, pos, count, buf, check_size);
+}
+
 /**
  * pci_read_vpd - Read one entry from Vital Product Data
  * @dev:       PCI device struct
@@ -389,6 +410,20 @@ static int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off,
  * @buf:       pointer to where to store result
  */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf)
+{
+       return __pci_read_vpd(dev, pos, count, buf, true);
+}
+EXPORT_SYMBOL(pci_read_vpd);
+
+/* Same, but allow access to any address */
+ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf)
+{
+       return __pci_read_vpd(dev, pos, count, buf, false);
+}
+EXPORT_SYMBOL(pci_read_vpd_any);
+
+static ssize_t __pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count,
+                              const void *buf, bool check_size)
 {
        ssize_t ret;
 
@@ -397,14 +432,13 @@ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf)
                if (!dev)
                        return -ENODEV;
 
-               ret = pci_vpd_read(dev, pos, count, buf);
+               ret = pci_vpd_write(dev, pos, count, buf, check_size);
                pci_dev_put(dev);
                return ret;
        }
 
-       return pci_vpd_read(dev, pos, count, buf);
+       return pci_vpd_write(dev, pos, count, buf, check_size);
 }
-EXPORT_SYMBOL(pci_read_vpd);
 
 /**
  * pci_write_vpd - Write entry to Vital Product Data
@@ -415,22 +449,17 @@ EXPORT_SYMBOL(pci_read_vpd);
  */
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf)
 {
-       ssize_t ret;
-
-       if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) {
-               dev = pci_get_func0_dev(dev);
-               if (!dev)
-                       return -ENODEV;
-
-               ret = pci_vpd_write(dev, pos, count, buf);
-               pci_dev_put(dev);
-               return ret;
-       }
-
-       return pci_vpd_write(dev, pos, count, buf);
+       return __pci_write_vpd(dev, pos, count, buf, true);
 }
 EXPORT_SYMBOL(pci_write_vpd);
 
+/* Same, but allow access to any address */
+ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf)
+{
+       return __pci_write_vpd(dev, pos, count, buf, false);
+}
+EXPORT_SYMBOL(pci_write_vpd_any);
+
 int pci_vpd_find_ro_info_keyword(const void *buf, unsigned int len,
                                 const char *kw, unsigned int *size)
 {
index 2156c632524d7a41685bf7aa6b70625505ca30dd..d858d25b6cababc714c86e371c7c1f8136bdd48e 100644 (file)
@@ -588,61 +588,43 @@ static pci_ers_result_t pcifront_common_process(int cmd,
                                                struct pcifront_device *pdev,
                                                pci_channel_state_t state)
 {
-       pci_ers_result_t result;
        struct pci_driver *pdrv;
        int bus = pdev->sh_info->aer_op.bus;
        int devfn = pdev->sh_info->aer_op.devfn;
        int domain = pdev->sh_info->aer_op.domain;
        struct pci_dev *pcidev;
-       int flag = 0;
 
        dev_dbg(&pdev->xdev->dev,
                "pcifront AER process: cmd %x (bus:%x, devfn%x)",
                cmd, bus, devfn);
-       result = PCI_ERS_RESULT_NONE;
 
        pcidev = pci_get_domain_bus_and_slot(domain, bus, devfn);
-       if (!pcidev || !pcidev->driver) {
+       if (!pcidev || !pcidev->dev.driver) {
                dev_err(&pdev->xdev->dev, "device or AER driver is NULL\n");
                pci_dev_put(pcidev);
-               return result;
+               return PCI_ERS_RESULT_NONE;
        }
-       pdrv = pcidev->driver;
-
-       if (pdrv) {
-               if (pdrv->err_handler && pdrv->err_handler->error_detected) {
-                       pci_dbg(pcidev, "trying to call AER service\n");
-                       if (pcidev) {
-                               flag = 1;
-                               switch (cmd) {
-                               case XEN_PCI_OP_aer_detected:
-                                       result = pdrv->err_handler->
-                                                error_detected(pcidev, state);
-                                       break;
-                               case XEN_PCI_OP_aer_mmio:
-                                       result = pdrv->err_handler->
-                                                mmio_enabled(pcidev);
-                                       break;
-                               case XEN_PCI_OP_aer_slotreset:
-                                       result = pdrv->err_handler->
-                                                slot_reset(pcidev);
-                                       break;
-                               case XEN_PCI_OP_aer_resume:
-                                       pdrv->err_handler->resume(pcidev);
-                                       break;
-                               default:
-                                       dev_err(&pdev->xdev->dev,
-                                               "bad request in aer recovery "
-                                               "operation!\n");
-
-                               }
-                       }
+       pdrv = to_pci_driver(pcidev->dev.driver);
+
+       if (pdrv->err_handler && pdrv->err_handler->error_detected) {
+               pci_dbg(pcidev, "trying to call AER service\n");
+               switch (cmd) {
+               case XEN_PCI_OP_aer_detected:
+                       return pdrv->err_handler->error_detected(pcidev, state);
+               case XEN_PCI_OP_aer_mmio:
+                       return pdrv->err_handler->mmio_enabled(pcidev);
+               case XEN_PCI_OP_aer_slotreset:
+                       return pdrv->err_handler->slot_reset(pcidev);
+               case XEN_PCI_OP_aer_resume:
+                       pdrv->err_handler->resume(pcidev);
+                       return PCI_ERS_RESULT_NONE;
+               default:
+                       dev_err(&pdev->xdev->dev,
+                               "bad request in aer recovery operation!\n");
                }
        }
-       if (!flag)
-               result = PCI_ERS_RESULT_NONE;
 
-       return result;
+       return PCI_ERS_RESULT_NONE;
 }
 
 
index 94331d999d273ee9ad61d0b932f11b1eb48b40c1..7df466e222820398383a87fc054211781f9431fb 100644 (file)
@@ -965,6 +965,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
        struct rio_transfer_io *transfer;
        enum dma_data_direction dir;
        int i, ret = 0;
+       size_t size;
 
        if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction))))
                return -EFAULT;
@@ -976,13 +977,14 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
             priv->md->properties.transfer_mode) == 0)
                return -ENODEV;
 
-       transfer = vmalloc(array_size(sizeof(*transfer), transaction.count));
+       size = array_size(sizeof(*transfer), transaction.count);
+       transfer = vmalloc(size);
        if (!transfer)
                return -ENOMEM;
 
        if (unlikely(copy_from_user(transfer,
                                    (void __user *)(uintptr_t)transaction.block,
-                                   array_size(sizeof(*transfer), transaction.count)))) {
+                                   size))) {
                ret = -EFAULT;
                goto out_free;
        }
@@ -994,8 +996,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
                        transaction.sync, dir, &transfer[i]);
 
        if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block,
-                                 transfer,
-                                 array_size(sizeof(*transfer), transaction.count))))
+                                 transfer, size)))
                ret = -EFAULT;
 
 out_free:
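
Beyond removing repetition, computing array_size() once matters because the helper (from <linux/overflow.h>) saturates to SIZE_MAX when the multiplication overflows, so vmalloc() fails cleanly rather than returning an undersized buffer that the later copies would overrun. A runnable userspace analog of that saturation (this re-implementation is illustrative only, not the kernel source):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for the kernel's array_size(): saturate to
 * SIZE_MAX on overflow so the subsequent allocation fails loudly. */
static size_t array_size_sat(size_t n, size_t elem)
{
	size_t bytes;

	if (__builtin_mul_overflow(n, elem, &bytes))	/* GCC/Clang builtin */
		return SIZE_MAX;
	return bytes;
}

int main(void)
{
	size_t ok = array_size_sat(1000, sizeof(uint64_t));
	size_t bad = array_size_sat(SIZE_MAX / 4, sizeof(uint64_t));

	printf("ok=%zu, overflow saturated: %s\n", ok,
	       bad == SIZE_MAX ? "yes" : "no");
	return malloc(bad) ? 1 : 0;	/* a SIZE_MAX allocation must fail */
}
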
index 3a6f3af240fa7383171a64fefe012c1e92a75657..a7a33ebf4bbe93c3e878b3ca8eb9b8bc9f468cd7 100644 (file)
@@ -34,7 +34,7 @@ int dasd_gendisk_alloc(struct dasd_block *block)
 {
        struct gendisk *gdp;
        struct dasd_device *base;
-       int len;
+       int len, rc;
 
        /* Make sure the minor for this device exists. */
        base = block->base;
@@ -80,7 +80,13 @@ int dasd_gendisk_alloc(struct dasd_block *block)
        dasd_add_link_to_gendisk(gdp, base);
        block->gdp = gdp;
        set_capacity(block->gdp, 0);
-       device_add_disk(&base->cdev->dev, block->gdp, NULL);
+
+       rc = device_add_disk(&base->cdev->dev, block->gdp, NULL);
+       if (rc) {
+               dasd_gendisk_free(block);
+               return rc;
+       }
+
        return 0;
 }
 
index 59e513d34b0f2a1f426a5c0f41a9b6de74fc03c0..27ab888b44d0a81797236487f547261c4997a051 100644 (file)
@@ -696,7 +696,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
        }
 
        get_device(&dev_info->dev);
-       device_add_disk(&dev_info->dev, dev_info->gd, NULL);
+       rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL);
+       if (rc)
+               goto out_dax;
 
        switch (dev_info->segment_type) {
                case SEG_TYPE_SR:
@@ -712,6 +714,10 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
        rc = count;
        goto out;
 
+out_dax:
+       put_device(&dev_info->dev);
+       kill_dax(dev_info->dax_dev);
+       put_dax(dev_info->dax_dev);
 put_dev:
        list_del(&dev_info->lh);
        blk_cleanup_disk(dev_info->gd);
index 88cba6212ee26fdac7ccb2fe9d85412e43a2a5c0..61ecdcb2cc6afa23ddca0631b17541ff886f03a5 100644 (file)
@@ -495,9 +495,14 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
 
        /* 512 byte sectors */
        set_capacity(bdev->gendisk, scmdev->size >> 9);
-       device_add_disk(&scmdev->dev, bdev->gendisk, NULL);
+       ret = device_add_disk(&scmdev->dev, bdev->gendisk, NULL);
+       if (ret)
+               goto out_cleanup_disk;
+
        return 0;
 
+out_cleanup_disk:
+       blk_cleanup_disk(bdev->gendisk);
 out_tag:
        blk_mq_free_tag_set(&bdev->tag_set);
 out:
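
This is the third s390 block driver in this series receiving the same shape of fix: device_add_disk() now returns an error that must be checked, and the gendisk has to be torn down on failure instead of leaked. Roughly the shared pattern, distilled as a sketch rather than any one driver's code (the wrapper name is hypothetical):

/* Sketch only: check device_add_disk() and unwind the disk on failure. */
static int add_disk_checked(struct device *parent, struct gendisk *disk)
{
	int rc;

	rc = device_add_disk(parent, disk, NULL);
	if (rc)
		blk_cleanup_disk(disk);	/* release what allocating the disk set up */
	return rc;
}
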
index 2cf7fe131ecec412ff4f82c5d85df7beb264a389..f0763e36b86168d6f79663303dd281fd0ec12ad3 100644 (file)
@@ -163,7 +163,7 @@ static inline void sclp_trace_req(int prio, char *id, struct sclp_req *req,
        summary.timeout = (u16)req->queue_timeout;
        summary.start_count = (u16)req->start_count;
 
-       sclp_trace(prio, id, (u32)(addr_t)sccb, summary.b, err);
+       sclp_trace(prio, id, __pa(sccb), summary.b, err);
 }
 
 static inline void sclp_trace_register(int prio, char *id, u32 a, u64 b,
@@ -502,7 +502,7 @@ sclp_add_request(struct sclp_req *req)
        }
 
        /* RQAD: Request was added (a=sccb, b=caller) */
-       sclp_trace(2, "RQAD", (u32)(addr_t)req->sccb, _RET_IP_, false);
+       sclp_trace(2, "RQAD", __pa(req->sccb), _RET_IP_, false);
 
        req->status = SCLP_REQ_QUEUED;
        req->start_count = 0;
@@ -617,15 +617,15 @@ __sclp_find_req(u32 sccb)
 
        list_for_each(l, &sclp_req_queue) {
                req = list_entry(l, struct sclp_req, list);
-               if (sccb == (u32) (addr_t) req->sccb)
-                               return req;
+               if (sccb == __pa(req->sccb))
+                       return req;
        }
        return NULL;
 }
 
 static bool ok_response(u32 sccb_int, sclp_cmdw_t cmd)
 {
-       struct sccb_header *sccb = (struct sccb_header *)(addr_t)sccb_int;
+       struct sccb_header *sccb = (struct sccb_header *)__va(sccb_int);
        struct evbuf_header *evbuf;
        u16 response;
 
@@ -664,7 +664,7 @@ static void sclp_interrupt_handler(struct ext_code ext_code,
 
        /* INT: Interrupt received (a=intparm, b=cmd) */
        sclp_trace_sccb(0, "INT", param32, active_cmd, active_cmd,
-                       (struct sccb_header *)(addr_t)finished_sccb,
+                       (struct sccb_header *)__va(finished_sccb),
                        !ok_response(finished_sccb, active_cmd));
 
        if (finished_sccb) {
@@ -1110,7 +1110,7 @@ static void sclp_check_handler(struct ext_code ext_code,
        /* Is this the interrupt we are waiting for? */
        if (finished_sccb == 0)
                return;
-       if (finished_sccb != (u32) (addr_t) sclp_init_sccb)
+       if (finished_sccb != __pa(sclp_init_sccb))
                panic("sclp: unsolicited interrupt for buffer at 0x%x\n",
                      finished_sccb);
        spin_lock(&sclp_lock);
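
The conversions in these hunks all follow from one fact: the SCLP interface exchanges SCCB addresses as 32-bit physical addresses, while the kernel holds virtual pointers. Writing the translation as __pa()/__va() instead of bare casts stays correct even when virtual and physical addresses differ. A kernel-style sketch of the invariant relied on (not standalone code; assumes SCCBs live in the direct mapping):

/* Illustrative helpers only. For a pointer p into the direct mapping,
 * __va(__pa(p)) == p, and __pa(p) fits in the 32-bit address field the
 * SCLP firmware interface uses to identify an SCCB. */
static u32 sccb_to_hw(void *sccb)
{
	return (u32)__pa(sccb);
}

static void *hw_to_sccb(u32 sccb_int)
{
	return __va(sccb_int);
}
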
index 5e434108aae6aa3d74e2ab2becf2830875e8ad29..8a30e77db46988eb6cfa42dcd6e4abd3b5e076f3 100644 (file)
@@ -333,7 +333,7 @@ static inline int sclp_service_call(sclp_cmdw_t command, void *sccb)
                "2:\n"
                EX_TABLE(0b, 2b)
                EX_TABLE(1b, 2b)
-               : "+&d" (cc) : "d" (command), "a" ((unsigned long)sccb)
+               : "+&d" (cc) : "d" (command), "a" (__pa(sccb))
                : "cc", "memory");
        if (cc == 4)
                return -EINVAL;
index f3d5c7f4c13d29aa48be077cc9e03456316cdb1a..b64feab62caa8b4179718bcd94f9c07c67e7580c 100644 (file)
@@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info)
        }
        sclp_fill_core_info(info, sccb);
 out:
-       memblock_free_early((unsigned long)sccb, length);
+       memblock_phys_free((unsigned long)sccb, length);
        return rc;
 }
 
@@ -155,6 +155,11 @@ static void __init sclp_early_console_detect(struct init_sccb *sccb)
                sclp.has_linemode = 1;
 }
 
+void __init sclp_early_adjust_va(void)
+{
+       sclp_early_sccb = __va((unsigned long)sclp_early_sccb);
+}
+
 void __init sclp_early_detect(void)
 {
        void *sccb = sclp_early_sccb;
index 1e9de99dcd023c0264406cba50ed6a6a3c8c3fdf..ec5a0e2b9255461c9e86dd4482f2c8cdda7876d0 100644 (file)
@@ -31,6 +31,8 @@ static u64 sclp_ftp_length;
 
 /**
  * sclp_ftp_txcb() - Diagnostic Test FTP services SCLP command callback
+ * @req: sclp request
+ * @data: pointer to struct completion
  */
 static void sclp_ftp_txcb(struct sclp_req *req, void *data)
 {
@@ -45,6 +47,7 @@ static void sclp_ftp_txcb(struct sclp_req *req, void *data)
 
 /**
  * sclp_ftp_rxcb() - Diagnostic Test FTP services receiver event callback
+ * @evbuf: pointer to Diagnostic Test (ET7) event buffer
  */
 static void sclp_ftp_rxcb(struct evbuf_header *evbuf)
 {
index 1e244f78f1929c871f7dbdcc23d8bd59d6df0b80..25c2d760f6e68d55461d811a1f04ab902c65a3b0 100644 (file)
@@ -122,6 +122,7 @@ static void sclp_sd_listener_remove(struct sclp_sd_listener *listener)
 
 /**
  * sclp_sd_listener_init() - Initialize a Store Data response listener
+ * @listener: Response listener to initialize
  * @id: Event ID to listen for
  *
  * Initialize a listener for asynchronous Store Data responses. This listener
@@ -193,7 +194,7 @@ static int sclp_sd_sync(unsigned long page, u8 eq, u8 di, u64 sat, u64 sa,
        struct sclp_sd_evbuf *evbuf;
        int rc;
 
-       sclp_sd_listener_init(&listener, (u32) (addr_t) sccb);
+       sclp_sd_listener_init(&listener, __pa(sccb));
        sclp_sd_listener_add(&listener);
 
        /* Prepare SCCB */
@@ -403,6 +404,7 @@ static int sclp_sd_file_update(struct sclp_sd_file *sd_file)
 /**
  * sclp_sd_file_update_async() - Wrapper for asynchronous update call
  * @data: Object to update
+ * @cookie: Unused
  */
 static void sclp_sd_file_update_async(void *data, async_cookie_t cookie)
 {
@@ -414,6 +416,9 @@ static void sclp_sd_file_update_async(void *data, async_cookie_t cookie)
 /**
  * reload_store() - Store function for "reload" sysfs attribute
  * @kobj: Kobject of sclp_sd_file object
+ * @attr: Reload attribute
+ * @buf: Data written to sysfs attribute
+ * @count: Count of bytes written
  *
  * Initiate a reload of the data associated with an sclp_sd_file object.
  */
@@ -441,8 +446,10 @@ static struct kobj_type sclp_sd_file_ktype = {
 };
 
 /**
- * data_read() - Read function for "read" sysfs attribute
+ * data_read() - Read function for "data" sysfs attribute
+ * @file: Open file pointer
  * @kobj: Kobject of sclp_sd_file object
+ * @attr: Data attribute
  * @buffer: Target buffer
  * @off: Requested file offset
  * @size: Requested number of bytes
index 29a6a0099f831726c515dd92fd62101f42a3c5c1..7bc4e4a109372451e52782a124787c5165fb468f 100644 (file)
@@ -768,6 +768,8 @@ out_driver:
 }
 __initcall(sclp_vt220_tty_init);
 
+#ifdef CONFIG_SCLP_VT220_CONSOLE
+
 static void __sclp_vt220_flush_buffer(void)
 {
        unsigned long flags;
@@ -784,8 +786,6 @@ static void __sclp_vt220_flush_buffer(void)
        spin_unlock_irqrestore(&sclp_vt220_lock, flags);
 }
 
-#ifdef CONFIG_SCLP_VT220_CONSOLE
-
 static void
 sclp_vt220_con_write(struct console *con, const char *buf, unsigned int count)
 {
index 44461928aab8afd3b3042a31c2640d39dec931ee..2bc55ccf3f23729b64b4253b5d5103fd1b9b9d74 100644 (file)
@@ -792,10 +792,13 @@ static int __unset_online(struct device *dev, void *data)
 {
        struct idset *set = data;
        struct subchannel *sch = to_subchannel(dev);
-       struct ccw_device *cdev = sch_get_cdev(sch);
+       struct ccw_device *cdev;
 
-       if (cdev && cdev->online)
-               idset_sch_del(set, sch->schid);
+       if (sch->st == SUBCHANNEL_TYPE_IO) {
+               cdev = sch_get_cdev(sch);
+               if (cdev && cdev->online)
+                       idset_sch_del(set, sch->schid);
+       }
 
        return 0;
 }
index 8d14569823d73fce10aec218ab15d2f9bae6adce..07a17613fab5878af0b016dc148cb62615a7a086 100644 (file)
@@ -1322,6 +1322,7 @@ static int purge_fn(struct device *dev, void *data)
 {
        struct ccw_device *cdev = to_ccwdev(dev);
        struct ccw_dev_id *id = &cdev->private->dev_id;
+       struct subchannel *sch = to_subchannel(cdev->dev.parent);
 
        spin_lock_irq(cdev->ccwlock);
        if (is_blacklisted(id->ssid, id->devno) &&
@@ -1330,6 +1331,7 @@ static int purge_fn(struct device *dev, void *data)
                CIO_MSG_EVENT(3, "ccw: purging 0.%x.%04x\n", id->ssid,
                              id->devno);
                ccw_device_sched_todo(cdev, CDEV_TODO_UNREG);
+               css_sched_sch_todo(sch, SCH_TODO_UNREG);
                atomic_set(&cdev->private->onoff, 0);
        }
        spin_unlock_irq(cdev->ccwlock);
index 0fe7b2f2e7f5239ee660c7779fe461ac8598fb6b..c533d1dadc6bbb0f3f388ac62b01049ac99a72c5 100644 (file)
@@ -825,13 +825,23 @@ EXPORT_SYMBOL_GPL(ccw_device_get_chid);
  */
 void *ccw_device_dma_zalloc(struct ccw_device *cdev, size_t size)
 {
-       return cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size);
+       void *addr;
+
+       if (!get_device(&cdev->dev))
+               return NULL;
+       addr = cio_gp_dma_zalloc(cdev->private->dma_pool, &cdev->dev, size);
+       if (IS_ERR_OR_NULL(addr))
+               put_device(&cdev->dev);
+       return addr;
 }
 EXPORT_SYMBOL(ccw_device_dma_zalloc);
 
 void ccw_device_dma_free(struct ccw_device *cdev, void *cpu_addr, size_t size)
 {
+       if (!cpu_addr)
+               return;
        cio_gp_dma_free(cdev->private->dma_pool, cpu_addr, size);
+       put_device(&cdev->dev);
 }
 EXPORT_SYMBOL(ccw_device_dma_free);
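
The fix above pairs one get_device() per successful DMA allocation with one put_device() per free (a NULL free is now a safe no-op), so the ccw device and its dma_pool cannot disappear while allocations are outstanding. A runnable toy model of that pairing (all names hypothetical):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy refcounted pool: each live allocation pins the pool, mirroring the
 * get_device()/put_device() pairing in ccw_device_dma_zalloc()/_free(). */
struct pool { int refs; };

static void *pool_zalloc(struct pool *p, size_t size)
{
	void *addr = calloc(1, size);

	if (addr)
		p->refs++;		/* pin the pool for this allocation */
	return addr;
}

static void pool_free(struct pool *p, void *addr)
{
	if (!addr)
		return;			/* NULL free is a no-op, as in the fix */
	free(addr);
	p->refs--;			/* matching unpin */
}

int main(void)
{
	struct pool p = { .refs = 1 };	/* creator's reference */
	void *a = pool_zalloc(&p, 64);
	void *b = pool_zalloc(&p, 64);

	pool_free(&p, b);
	pool_free(&p, NULL);		/* safe no-op */
	pool_free(&p, a);
	assert(p.refs == 1);		/* only the creator's reference remains */
	printf("refs = %d\n", p.refs);
	return 0;
}
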
 
index d9b804943d1923a60d187f3bb733c6405d7652b4..1986243f9cd392d1941e5635170960c45b060135 100644 (file)
@@ -61,6 +61,10 @@ static char *aqm_str;
 module_param_named(aqmask, aqm_str, charp, 0440);
 MODULE_PARM_DESC(aqmask, "AP bus domain mask.");
 
+static int ap_useirq = 1;
+module_param_named(useirq, ap_useirq, int, 0440);
+MODULE_PARM_DESC(useirq, "Use interrupt if available, default is 1 (on).");
+
 atomic_t ap_max_msg_size = ATOMIC_INIT(AP_DEFAULT_MAX_MSG_SIZE);
 EXPORT_SYMBOL(ap_max_msg_size);
 
@@ -725,7 +729,7 @@ static void ap_check_bindings_complete(void)
                if (bound == apqns) {
                        if (!completion_done(&ap_init_apqn_bindings_complete)) {
                                complete_all(&ap_init_apqn_bindings_complete);
-                               AP_DBF(DBF_INFO, "%s complete\n", __func__);
+                               AP_DBF_INFO("%s complete\n", __func__);
                        }
                        ap_send_bindings_complete_uevent();
                }
@@ -786,9 +790,12 @@ static int __ap_revise_reserved(struct device *dev, void *dummy)
                drvres = to_ap_drv(dev->driver)->flags
                        & AP_DRIVER_FLAG_DEFAULT;
                if (!!devres != !!drvres) {
-                       AP_DBF_DBG("reprobing queue=%02x.%04x\n",
-                                  card, queue);
+                       AP_DBF_DBG("%s reprobing queue=%02x.%04x\n",
+                                  __func__, card, queue);
                        rc = device_reprobe(dev);
+                       if (rc)
+                               AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n",
+                                           __func__, card, queue);
                }
        }
 
@@ -1118,7 +1125,8 @@ static ssize_t ap_domain_store(struct bus_type *bus,
        ap_domain_index = domain;
        spin_unlock_bh(&ap_domain_lock);
 
-       AP_DBF_INFO("stored new default domain=%d\n", domain);
+       AP_DBF_INFO("%s stored new default domain=%d\n",
+                   __func__, domain);
 
        return count;
 }
@@ -1433,8 +1441,9 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func)
 
        /* < CEX2A is not supported */
        if (rawtype < AP_DEVICE_TYPE_CEX2A) {
-               AP_DBF_WARN("get_comp_type queue=%02x.%04x unsupported type %d\n",
-                           AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype);
+               AP_DBF_WARN("%s queue=%02x.%04x unsupported type %d\n",
+                           __func__, AP_QID_CARD(qid),
+                           AP_QID_QUEUE(qid), rawtype);
                return 0;
        }
        /* up to CEX7 known and fully supported */
@@ -1458,11 +1467,12 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func)
                        comp_type = apinfo.cat;
        }
        if (!comp_type)
-               AP_DBF_WARN("get_comp_type queue=%02x.%04x unable to map type %d\n",
-                           AP_QID_CARD(qid), AP_QID_QUEUE(qid), rawtype);
+               AP_DBF_WARN("%s queue=%02x.%04x unable to map type %d\n",
+                           __func__, AP_QID_CARD(qid),
+                           AP_QID_QUEUE(qid), rawtype);
        else if (comp_type != rawtype)
-               AP_DBF_INFO("get_comp_type queue=%02x.%04x map type %d to %d\n",
-                           AP_QID_CARD(qid), AP_QID_QUEUE(qid),
+               AP_DBF_INFO("%s queue=%02x.%04x map type %d to %d\n",
+                           __func__, AP_QID_CARD(qid), AP_QID_QUEUE(qid),
                            rawtype, comp_type);
        return comp_type;
 }
@@ -1535,7 +1545,7 @@ static inline void ap_scan_domains(struct ap_card *ac)
                aq = dev ? to_ap_queue(dev) : NULL;
                if (!ap_test_config_usage_domain(dom)) {
                        if (dev) {
-                               AP_DBF_INFO("%s(%d,%d) not in config any more, rm queue device\n",
+                               AP_DBF_INFO("%s(%d,%d) not in config anymore, rm queue dev\n",
                                            __func__, ac->id, dom);
                                device_unregister(dev);
                                put_device(dev);
@@ -1545,9 +1555,8 @@ static inline void ap_scan_domains(struct ap_card *ac)
                /* domain is valid, get info from this APQN */
                if (!ap_queue_info(qid, &type, &func, &depth, &ml, &decfg)) {
                        if (aq) {
-                               AP_DBF_INFO(
-                                       "%s(%d,%d) ap_queue_info() not successful, rm queue device\n",
-                                       __func__, ac->id, dom);
+                               AP_DBF_INFO("%s(%d,%d) queue_info() failed, rm queue dev\n",
+                                           __func__, ac->id, dom);
                                device_unregister(dev);
                                put_device(dev);
                        }
@@ -1577,10 +1586,10 @@ static inline void ap_scan_domains(struct ap_card *ac)
                        /* get it and thus adjust reference counter */
                        get_device(dev);
                        if (decfg)
-                               AP_DBF_INFO("%s(%d,%d) new (decfg) queue device created\n",
+                               AP_DBF_INFO("%s(%d,%d) new (decfg) queue dev created\n",
                                            __func__, ac->id, dom);
                        else
-                               AP_DBF_INFO("%s(%d,%d) new queue device created\n",
+                               AP_DBF_INFO("%s(%d,%d) new queue dev created\n",
                                            __func__, ac->id, dom);
                        goto put_dev_and_continue;
                }
@@ -1594,7 +1603,7 @@ static inline void ap_scan_domains(struct ap_card *ac)
                                aq->last_err_rc = AP_RESPONSE_DECONFIGURED;
                        }
                        spin_unlock_bh(&aq->lock);
-                       AP_DBF_INFO("%s(%d,%d) queue device config off\n",
+                       AP_DBF_INFO("%s(%d,%d) queue dev config off\n",
                                    __func__, ac->id, dom);
                        ap_send_config_uevent(&aq->ap_dev, aq->config);
                        /* 'receive' pending messages with -EAGAIN */
@@ -1609,7 +1618,7 @@ static inline void ap_scan_domains(struct ap_card *ac)
                                aq->sm_state = AP_SM_STATE_RESET_START;
                        }
                        spin_unlock_bh(&aq->lock);
-                       AP_DBF_INFO("%s(%d,%d) queue device config on\n",
+                       AP_DBF_INFO("%s(%d,%d) queue dev config on\n",
                                    __func__, ac->id, dom);
                        ap_send_config_uevent(&aq->ap_dev, aq->config);
                        goto put_dev_and_continue;
@@ -1621,7 +1630,7 @@ static inline void ap_scan_domains(struct ap_card *ac)
                        ap_flush_queue(aq);
                        /* re-init (with reset) the queue device */
                        ap_queue_init_state(aq);
-                       AP_DBF_INFO("%s(%d,%d) queue device reinit enforced\n",
+                       AP_DBF_INFO("%s(%d,%d) queue dev reinit enforced\n",
                                    __func__, ac->id, dom);
                        goto put_dev_and_continue;
                }
@@ -1653,7 +1662,7 @@ static inline void ap_scan_adapter(int ap)
        /* Adapter not in configuration ? */
        if (!ap_test_config_card_id(ap)) {
                if (ac) {
-                       AP_DBF_INFO("%s(%d) ap not in config any more, rm card and queue devices\n",
+                       AP_DBF_INFO("%s(%d) ap not in config any more, rm card and queue devs\n",
                                    __func__, ap);
                        ap_scan_rm_card_dev_and_queue_devs(ac);
                        put_device(dev);
@@ -1678,9 +1687,8 @@ static inline void ap_scan_adapter(int ap)
        if (dom > ap_max_domain_id) {
                /* Could not find a valid APQN for this adapter */
                if (ac) {
-                       AP_DBF_INFO(
-                               "%s(%d) no type info (no APQN found), rm card and queue devices\n",
-                               __func__, ap);
+                       AP_DBF_INFO("%s(%d) no type info (no APQN found), rm card and queue devs\n",
+                                   __func__, ap);
                        ap_scan_rm_card_dev_and_queue_devs(ac);
                        put_device(dev);
                } else {
@@ -1692,7 +1700,7 @@ static inline void ap_scan_adapter(int ap)
        if (!type) {
                /* No adapter type info available, an unusable adapter */
                if (ac) {
-                       AP_DBF_INFO("%s(%d) no valid type (0) info, rm card and queue devices\n",
+                       AP_DBF_INFO("%s(%d) no valid type (0) info, rm card and queue devs\n",
                                    __func__, ap);
                        ap_scan_rm_card_dev_and_queue_devs(ac);
                        put_device(dev);
@@ -1706,13 +1714,13 @@ static inline void ap_scan_adapter(int ap)
        if (ac) {
                /* Check APQN against existing card device for changes */
                if (ac->raw_hwtype != type) {
-                       AP_DBF_INFO("%s(%d) hwtype %d changed, rm card and queue devices\n",
+                       AP_DBF_INFO("%s(%d) hwtype %d changed, rm card and queue devs\n",
                                    __func__, ap, type);
                        ap_scan_rm_card_dev_and_queue_devs(ac);
                        put_device(dev);
                        ac = NULL;
                } else if (ac->functions != func) {
-                       AP_DBF_INFO("%s(%d) functions 0x%08x changed, rm card and queue devices\n",
+                       AP_DBF_INFO("%s(%d) functions 0x%08x changed, rm card and queue devs\n",
                                    __func__, ap, func);
                        ap_scan_rm_card_dev_and_queue_devs(ac);
                        put_device(dev);
@@ -1720,13 +1728,13 @@ static inline void ap_scan_adapter(int ap)
                } else {
                        if (decfg && ac->config) {
                                ac->config = false;
-                               AP_DBF_INFO("%s(%d) card device config off\n",
+                               AP_DBF_INFO("%s(%d) card dev config off\n",
                                            __func__, ap);
                                ap_send_config_uevent(&ac->ap_dev, ac->config);
                        }
                        if (!decfg && !ac->config) {
                                ac->config = true;
-                               AP_DBF_INFO("%s(%d) card device config on\n",
+                               AP_DBF_INFO("%s(%d) card dev config on\n",
                                            __func__, ap);
                                ap_send_config_uevent(&ac->ap_dev, ac->config);
                        }
@@ -1756,7 +1764,8 @@ static inline void ap_scan_adapter(int ap)
                if (ac->maxmsgsize > atomic_read(&ap_max_msg_size)) {
                        atomic_set(&ap_max_msg_size, ac->maxmsgsize);
                        AP_DBF_INFO("%s(%d) ap_max_msg_size update to %d byte\n",
-                                   __func__, ap, atomic_read(&ap_max_msg_size));
+                                   __func__, ap,
+                                   atomic_read(&ap_max_msg_size));
                }
                /* Register the new card device with AP bus */
                rc = device_register(dev);
@@ -1769,10 +1778,10 @@ static inline void ap_scan_adapter(int ap)
                /* get it and thus adjust reference counter */
                get_device(dev);
                if (decfg)
-                       AP_DBF_INFO("%s(%d) new (decfg) card device type=%d func=0x%08x created\n",
+                       AP_DBF_INFO("%s(%d) new (decfg) card dev type=%d func=0x%08x created\n",
                                    __func__, ap, type, func);
                else
-                       AP_DBF_INFO("%s(%d) new card device type=%d func=0x%08x created\n",
+                       AP_DBF_INFO("%s(%d) new card dev type=%d func=0x%08x created\n",
                                    __func__, ap, type, func);
        }
 
@@ -1810,12 +1819,12 @@ static void ap_scan_bus(struct work_struct *unused)
                if (dev)
                        put_device(dev);
                else
-                       AP_DBF_INFO("no queue device with default domain %d available\n",
-                                   ap_domain_index);
+                       AP_DBF_INFO("%s no queue device with default domain %d available\n",
+                                   __func__, ap_domain_index);
        }
 
        if (atomic64_inc_return(&ap_scan_bus_count) == 1) {
-               AP_DBF(DBF_DEBUG, "%s init scan complete\n", __func__);
+               AP_DBF_DBG("%s init scan complete\n", __func__);
                ap_send_init_scan_done_uevent();
                ap_check_bindings_complete();
        }
@@ -1830,7 +1839,7 @@ static void ap_config_timeout(struct timer_list *unused)
 
 static int __init ap_debug_init(void)
 {
-       ap_dbf_info = debug_register("ap", 1, 1,
+       ap_dbf_info = debug_register("ap", 2, 1,
                                     DBF_MAX_SPRINTF_ARGS * sizeof(long));
        debug_register_view(ap_dbf_info, &debug_sprintf_view);
        debug_set_level(ap_dbf_info, DBF_ERR);
@@ -1897,7 +1906,7 @@ static int __init ap_module_init(void)
        }
 
        /* enable interrupts if available */
-       if (ap_interrupts_available()) {
+       if (ap_interrupts_available() && ap_useirq) {
                rc = register_adapter_interrupt(&ap_airq);
                ap_irq_flag = (rc == 0);
        }
index 34b0350d0b1ae37353165f8d0fd933c55fdf59f7..c083ce88a9a61f1b24022827420900e6d521f166 100644 (file)
@@ -16,7 +16,7 @@
 #define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO)
 #define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO)
 
-#define DBF_MAX_SPRINTF_ARGS 5
+#define DBF_MAX_SPRINTF_ARGS 6
 
 #define AP_DBF(...)                                    \
        debug_sprintf_event(ap_dbf_info, ##__VA_ARGS__)
index 9ea48bf0ee40d18b33df24c451333b561c738434..1901449768ddd45cf63f73c39f804336e18a6104 100644 (file)
@@ -157,6 +157,8 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
        switch (status.response_code) {
        case AP_RESPONSE_NORMAL:
                aq->queue_count = max_t(int, 0, aq->queue_count - 1);
+               if (!status.queue_empty && !aq->queue_count)
+                       aq->queue_count++;
                if (aq->queue_count > 0)
                        mod_timer(&aq->timeout,
                                  jiffies + aq->request_timeout);
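The two added lines keep the driver's reply counter from under-running the hardware state. A commented restatement of the patched logic (code from the hunk, comments editorial):

```c
case AP_RESPONSE_NORMAL:
        /* One reply was dequeued; never let the counter go negative. */
        aq->queue_count = max_t(int, 0, aq->queue_count - 1);
        /* The status word still reports pending replies although the
         * counter hit zero: clamp it back to 1 so the timeout stays
         * armed and polling continues until the queue really drains. */
        if (!status.queue_empty && !aq->queue_count)
                aq->queue_count++;
        if (aq->queue_count > 0)
                mod_timer(&aq->timeout, jiffies + aq->request_timeout);
```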
@@ -246,6 +248,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq)
 
        if (aq->requestq_count <= 0)
                return AP_SM_WAIT_NONE;
+
        /* Start the next request on the queue. */
        ap_msg = list_entry(aq->requestq.next, struct ap_message, list);
 #ifdef CONFIG_ZCRYPT_DEBUG
@@ -279,7 +282,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq)
                aq->sm_state = AP_SM_STATE_RESET_WAIT;
                return AP_SM_WAIT_TIMEOUT;
        case AP_RESPONSE_INVALID_DOMAIN:
-               AP_DBF(DBF_WARN, "AP_RESPONSE_INVALID_DOMAIN on NQAP\n");
+               AP_DBF_WARN("%s RESPONSE_INVALID_DOMAIN on NQAP\n", __func__);
                fallthrough;
        case AP_RESPONSE_MESSAGE_TOO_BIG:
        case AP_RESPONSE_REQ_FAC_NOT_INST:
@@ -571,8 +574,8 @@ static ssize_t reset_store(struct device *dev,
        ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL));
        spin_unlock_bh(&aq->lock);
 
-       AP_DBF(DBF_INFO, "reset queue=%02x.%04x triggered by user\n",
-              AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
+       AP_DBF_INFO("%s reset queue=%02x.%04x triggered by user\n",
+                   __func__, AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
 
        return count;
 }
index 4d2556bc7fe5857396ded11a3c673b36bc6dbb85..03311a476366b5e9f6b936411418de7e080f4932 100644
@@ -42,10 +42,13 @@ static struct ap_device_id ap_queue_ids[] = {
 MODULE_DEVICE_TABLE(vfio_ap, ap_queue_ids);
 
 /**
- * vfio_ap_queue_dev_probe:
+ * vfio_ap_queue_dev_probe: Allocate a vfio_ap_queue structure and associate it
+ *                         with the device as driver_data.
  *
- * Allocate a vfio_ap_queue structure and associate it
- * with the device as driver_data.
+ * @apdev: the AP device being probed
+ *
+ * Return: 0 if the probe succeeded; otherwise, -ENOMEM if storage could
+ *        not be allocated for a vfio_ap_queue object.
  */
 static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
 {
@@ -61,10 +64,11 @@ static int vfio_ap_queue_dev_probe(struct ap_device *apdev)
 }
 
 /**
- * vfio_ap_queue_dev_remove:
+ * vfio_ap_queue_dev_remove: Free the associated vfio_ap_queue structure.
+ *
+ * @apdev: the AP device being removed
  *
- * Takes the matrix lock to avoid actions on this device while removing
- * Free the associated vfio_ap_queue structure
+ * Takes the matrix lock to avoid actions on this device while the remove is in progress.
  */
 static void vfio_ap_queue_dev_remove(struct ap_device *apdev)
 {
index 2341425f69675a293c66f9768f1e3dc4a4e7e42b..abc0b9b883865bce96e6d8247ee5b6528fabe2f0 100644
@@ -187,6 +187,8 @@ end_free:
  * vfio_ap_irq_enable - Enable Interruption for a APQN
  *
  * @q:  the vfio_ap_queue holding AQIC parameters
+ * @isc: the guest ISC to register with the GIB interface
+ * @nib: the notification indicator byte to pin.
  *
  * Pin the NIB saved in *q
  * Register the guest ISC to GIB interface and retrieve the
@@ -738,7 +740,6 @@ vfio_ap_mdev_verify_queues_reserved_for_apqi(struct ap_matrix_mdev *matrix_mdev,
  * assign_domain_store - parses the APQI from @buf and sets the
  * corresponding bit in the mediated matrix device's AQM
  *
- *
  * @dev:       the matrix device
  * @attr:      the mediated matrix device's assign_domain attribute
  * @buf:       a buffer containing the AP queue index (APQI) of the domain to
@@ -866,7 +867,6 @@ static DEVICE_ATTR_WO(unassign_domain);
  * assign_control_domain_store - parses the domain ID from @buf and sets
  * the corresponding bit in the mediated matrix device's ADM
  *
- *
  * @dev:       the matrix device
  * @attr:      the mediated matrix device's assign_control_domain attribute
  * @buf:       a buffer containing the domain ID to be assigned
@@ -1142,6 +1142,7 @@ static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb,
  * by @matrix_mdev.
  *
  * @matrix_mdev: a matrix mediated device
+ * @kvm: the pointer to the kvm structure being unset.
  *
  * Note: The matrix_dev->lock must be taken prior to calling
  * this function; however, the lock will be temporarily released while the
index 77760e2b546fe6c59d95d1dd4e18fade0f01ea68..648fcaf8104abb7e2a0cd342fc041b564c61b8d9 100644
 #define VFIO_AP_DRV_NAME "vfio_ap"
 
 /**
- * ap_matrix_dev - the AP matrix device structure
+ * struct ap_matrix_dev - Contains the data for the matrix device.
+ *
  * @device:    generic device structure associated with the AP matrix device
  * @available_instances: number of mediated matrix devices that can be created
  * @info:      the struct containing the output from the PQAP(QCI) instruction
- * mdev_list:  the list of mediated matrix devices created
- * lock:       mutex for locking the AP matrix device. This lock will be
+ * @mdev_list: the list of mediated matrix devices created
+ * @lock:      mutex for locking the AP matrix device. This lock will be
  *             taken every time we fiddle with state managed by the vfio_ap
  *             driver, be it using @mdev_list or writing the state of a
  *             single ap_matrix_mdev device. It's quite coarse but we don't
  *             expect much contention.
+ * @vfio_ap_drv: the vfio_ap device driver
  */
 struct ap_matrix_dev {
        struct device device;
@@ -49,17 +51,19 @@ struct ap_matrix_dev {
 extern struct ap_matrix_dev *matrix_dev;
 
 /**
- * The AP matrix is comprised of three bit masks identifying the adapters,
- * queues (domains) and control domains that belong to an AP matrix. The bits i
- * each mask, from least significant to most significant bit, correspond to IDs
- * 0 to 255. When a bit is set, the corresponding ID belongs to the matrix.
+ * struct ap_matrix - matrix of adapters, domains and control domains
  *
  * @apm_max: max adapter number in @apm
- * @apm identifies the AP adapters in the matrix
+ * @apm: identifies the AP adapters in the matrix
  * @aqm_max: max domain number in @aqm
- * @aqm identifies the AP queues (domains) in the matrix
+ * @aqm: identifies the AP queues (domains) in the matrix
  * @adm_max: max domain number in @adm
- * @adm identifies the AP control domains in the matrix
+ * @adm: identifies the AP control domains in the matrix
+ *
+ * The AP matrix is composed of three bit masks identifying the adapters,
+ * queues (domains) and control domains that belong to an AP matrix. The bits in
+ * each mask, from left to right, correspond to IDs 0 to 255. When a bit is set,
+ * the corresponding ID belongs to the matrix.
  */
 struct ap_matrix {
        unsigned long apm_max;
@@ -71,13 +75,20 @@ struct ap_matrix {
 };
 
 /**
- * struct ap_matrix_mdev - the mediated matrix device structure
- * @list:      allows the ap_matrix_mdev struct to be added to a list
+ * struct ap_matrix_mdev - Contains the data associated with a matrix mediated
+ *                        device.
+ * @vdev:      the vfio device
+ * @node:      allows the ap_matrix_mdev struct to be added to a list
  * @matrix:    the adapters, usage domains and control domains assigned to the
  *             mediated matrix device.
  * @group_notifier: notifier block used for specifying callback function for
  *                 handling the VFIO_GROUP_NOTIFY_SET_KVM event
+ * @iommu_notifier: notifier block used for specifying callback function for
+ *                 handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP event
  * @kvm:       the struct holding guest's state
+ * @pqap_hook: the function pointer to the interception handler for the
+ *             PQAP(AQIC) instruction.
+ * @mdev:      the mediated device
  */
 struct ap_matrix_mdev {
        struct vfio_device vdev;
@@ -90,6 +101,14 @@ struct ap_matrix_mdev {
        struct mdev_device *mdev;
 };
 
+/**
+ * struct vfio_ap_queue - contains the data associated with a queue bound to the
+ *                       vfio_ap device driver
+ * @matrix_mdev: the matrix mediated device
+ * @saved_pfn: the guest PFN pinned for the guest
+ * @apqn: the APQN of the AP queue device
+ * @saved_isc: the guest ISC registered with the GIB interface
+ */
 struct vfio_ap_queue {
        struct ap_matrix_mdev *matrix_mdev;
        unsigned long saved_pfn;
index 356318746dd169bfb3b2d563cb29d70d0008265b..4c3dcc435e83d38ba217dd3fe21a14cb3159c781 100644
@@ -82,8 +82,8 @@ static inline int zcrypt_process_rescan(void)
                atomic_set(&zcrypt_rescan_req, 0);
                atomic_inc(&zcrypt_rescan_count);
                ap_bus_force_rescan();
-               ZCRYPT_DBF(DBF_INFO, "rescan count=%07d\n",
-                          atomic_inc_return(&zcrypt_rescan_count));
+               ZCRYPT_DBF_INFO("%s rescan count=%07d\n", __func__,
+                               atomic_inc_return(&zcrypt_rescan_count));
                return 1;
        }
        return 0;
@@ -341,8 +341,8 @@ static void zcdn_device_release(struct device *dev)
 {
        struct zcdn_device *zcdndev = to_zcdn_dev(dev);
 
-       ZCRYPT_DBF(DBF_INFO, "releasing zcdn device %d:%d\n",
-                  MAJOR(dev->devt), MINOR(dev->devt));
+       ZCRYPT_DBF_INFO("%s releasing zcdn device %d:%d\n",
+                       __func__, MAJOR(dev->devt), MINOR(dev->devt));
 
        kfree(zcdndev);
 }
@@ -407,8 +407,8 @@ static int zcdn_create(const char *name)
                goto unlockout;
        }
 
-       ZCRYPT_DBF(DBF_INFO, "created zcdn device %d:%d\n",
-                  MAJOR(devt), MINOR(devt));
+       ZCRYPT_DBF_INFO("%s created zcdn device %d:%d\n",
+                       __func__, MAJOR(devt), MINOR(devt));
 
 unlockout:
        mutex_unlock(&ap_perms_mutex);
@@ -550,9 +550,8 @@ static inline int zcrypt_check_ioctl(struct ap_perms *perms,
        }
 
        if (rc)
-               ZCRYPT_DBF(DBF_WARN,
-                          "ioctl check failed: ioctlnr=0x%04x rc=%d\n",
-                          ioctlnr, rc);
+               ZCRYPT_DBF_WARN("%s ioctl check failed: ioctlnr=0x%04x rc=%d\n",
+                               __func__, ioctlnr, rc);
 
        return rc;
 }
@@ -1446,7 +1445,7 @@ static int icarsamodexpo_ioctl(struct ap_perms *perms, unsigned long arg)
        if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
                rc = -EIO;
        if (rc) {
-               ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc);
+               ZCRYPT_DBF_DBG("ioctl ICARSAMODEXPO rc=%d\n", rc);
                return rc;
        }
        return put_user(mex.outputdatalength, &umex->outputdatalength);
@@ -1491,7 +1490,7 @@ static int icarsacrt_ioctl(struct ap_perms *perms, unsigned long arg)
        if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
                rc = -EIO;
        if (rc) {
-               ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc);
+               ZCRYPT_DBF_DBG("ioctl ICARSACRT rc=%d\n", rc);
                return rc;
        }
        return put_user(crt.outputdatalength, &ucrt->outputdatalength);
@@ -1509,12 +1508,12 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg)
                return -EFAULT;
 
 #ifdef CONFIG_ZCRYPT_DEBUG
-       if (xcRB.status & (1U << 31)) {
+       if ((xcRB.status & 0x8000FFFF) == 0x80004649 /* 'FI' */) {
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                tr.fi.cmd = (u16)(xcRB.status >> 16);
        }
-       xcRB.status &= 0x0000FFFF;
+       xcRB.status = 0;
 #endif
 
        do {
@@ -1536,8 +1535,8 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg)
        if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
                rc = -EIO;
        if (rc)
-               ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n",
-                          rc, xcRB.status);
+               ZCRYPT_DBF_DBG("ioctl ZSENDCPRB rc=%d status=0x%x\n",
+                              rc, xcRB.status);
        if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB)))
                return -EFAULT;
        return rc;
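The fault-injection hunk above narrows the trigger from "status has bit 31 set" to an explicit magic: bit 31 set and the low 16 bits spelling 'FI' (ASCII 0x4649), which makes accidental activation by garbage in the status word far less likely. A hypothetical userspace sketch of tagging a CPRB accordingly (CONFIG_ZCRYPT_DEBUG builds only; the helper and any command value are illustrative, not part of the UAPI):

```c
#include <stdint.h>

/* Hypothetical sketch: mark an xcRB status word for fault injection as
 * the kernel check above expects.  fi_cmd ends up in bits 16..31, so
 * bit 15 of fi_cmd must be set for status bit 31 to come out as 1. */
static void xcrb_request_fault_injection(uint32_t *status, uint16_t fi_cmd)
{
        *status = ((uint32_t)fi_cmd << 16) | 0x4649u; /* 'FI' magic */
}
```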
@@ -1582,7 +1581,7 @@ static int zsendep11cprb_ioctl(struct ap_perms *perms, unsigned long arg)
        if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
                rc = -EIO;
        if (rc)
-               ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc);
+               ZCRYPT_DBF_DBG("ioctl ZSENDEP11CPRB rc=%d\n", rc);
        if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb)))
                return -EFAULT;
        return rc;
@@ -1709,7 +1708,7 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd,
        }
        /* unknown ioctl number */
        default:
-               ZCRYPT_DBF(DBF_DEBUG, "unknown ioctl 0x%08x\n", cmd);
+               ZCRYPT_DBF_DBG("unknown ioctl 0x%08x\n", cmd);
                return -ENOIOCTLCMD;
        }
 }
@@ -2048,16 +2047,14 @@ int zcrypt_wait_api_operational(void)
                        break;
                case -ETIME:
                        /* timeout */
-                       ZCRYPT_DBF(DBF_WARN,
-                                  "%s ap_wait_init_apqn_bindings_complete() returned with ETIME\n",
-                                  __func__);
+                       ZCRYPT_DBF_WARN("%s ap_wait_init_apqn_bindings_complete()=ETIME\n",
+                                       __func__);
                        zcrypt_wait_api_state = -ETIME;
                        break;
                default:
                        /* other failure */
-                       ZCRYPT_DBF(DBF_DEBUG,
-                                  "%s ap_wait_init_apqn_bindings_complete() failure rc=%d\n",
-                                  __func__, rc);
+                       ZCRYPT_DBF_DBG("%s ap_wait_init_apqn_bindings_complete()=%d\n",
+                                      __func__, rc);
                        break;
                }
                break;
@@ -2079,7 +2076,7 @@ EXPORT_SYMBOL(zcrypt_wait_api_operational);
 
 int __init zcrypt_debug_init(void)
 {
-       zcrypt_dbf_info = debug_register("zcrypt", 1, 1,
+       zcrypt_dbf_info = debug_register("zcrypt", 2, 1,
                                         DBF_MAX_SPRINTF_ARGS * sizeof(long));
        debug_register_view(zcrypt_dbf_info, &debug_sprintf_view);
        debug_set_level(zcrypt_dbf_info, DBF_ERR);
index ef11d2a0ca6c591e6f6726f52ca61be7f031ab04..3e259befd30addb51154b53c89da64200dcbc757 100644
@@ -76,7 +76,7 @@ static ssize_t online_store(struct device *dev,
        zc->online = online;
        id = zc->card->id;
 
-       ZCRYPT_DBF(DBF_INFO, "card=%02x online=%d\n", id, online);
+       ZCRYPT_DBF_INFO("%s card=%02x online=%d\n", __func__, id, online);
 
        ap_send_online_uevent(&ac->ap_dev, online);
 
@@ -189,7 +189,8 @@ int zcrypt_card_register(struct zcrypt_card *zc)
 
        zc->online = 1;
 
-       ZCRYPT_DBF(DBF_INFO, "card=%02x register online=1\n", zc->card->id);
+       ZCRYPT_DBF_INFO("%s card=%02x register online=1\n",
+                       __func__, zc->card->id);
 
        rc = sysfs_create_group(&zc->card->ap_dev.device.kobj,
                                &zcrypt_card_attr_group);
@@ -211,7 +212,8 @@ EXPORT_SYMBOL(zcrypt_card_register);
  */
 void zcrypt_card_unregister(struct zcrypt_card *zc)
 {
-       ZCRYPT_DBF(DBF_INFO, "card=%02x unregister\n", zc->card->id);
+       ZCRYPT_DBF_INFO("%s card=%02x unregister\n",
+                       __func__, zc->card->id);
 
        spin_lock(&zcrypt_list_lock);
        list_del_init(&zc->list);
index 3225489a1c411ffa7802896457059db95eb40b91..5cf88aabd64b9f9048cc8374b664847f87b286a0 100644
@@ -17,7 +17,7 @@
 #define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO)
 #define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO)
 
-#define DBF_MAX_SPRINTF_ARGS 5
+#define DBF_MAX_SPRINTF_ARGS 6
 
 #define ZCRYPT_DBF(...)                                        \
        debug_sprintf_event(zcrypt_dbf_info, ##__VA_ARGS__)
index 39e626e3a37940675a504745a8187822234a612b..8b0ce600b749e0999333a56866ec285754a02a27 100644
@@ -98,9 +98,8 @@ static inline int convert_error(struct zcrypt_queue *zq,
        case REP88_ERROR_MESSAGE_MALFORMD:       /* 0x22 */
        case REP88_ERROR_KEY_TYPE:               /* 0x34 */
                /* RY indicates malformed request */
-               ZCRYPT_DBF(DBF_WARN,
-                          "dev=%02x.%04x RY=0x%02x => rc=EINVAL\n",
-                          card, queue, ehdr->reply_code);
+               ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EINVAL\n",
+                               __func__, card, queue, ehdr->reply_code);
                return -EINVAL;
        case REP82_ERROR_MACHINE_FAILURE:        /* 0x10 */
        case REP82_ERROR_MESSAGE_TYPE:           /* 0x20 */
@@ -119,19 +118,18 @@ static inline int convert_error(struct zcrypt_queue *zq,
                        } __packed * head = reply->msg;
                        unsigned int apfs = *((u32 *)head->fmt2.apfs);
 
-                       ZCRYPT_DBF(DBF_WARN,
-                                  "dev=%02x.%04x RY=0x%02x apfs=0x%x => bus rescan, rc=EAGAIN\n",
-                                  card, queue, ehdr->reply_code, apfs);
+                       ZCRYPT_DBF_WARN(
+                               "%s dev=%02x.%04x RY=0x%02x apfs=0x%x => bus rescan, rc=EAGAIN\n",
+                               __func__, card, queue, ehdr->reply_code, apfs);
                } else
-                       ZCRYPT_DBF(DBF_WARN,
-                                  "dev=%02x.%04x RY=0x%02x => bus rescan, rc=EAGAIN\n",
-                                  card, queue, ehdr->reply_code);
+                       ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => bus rescan, rc=EAGAIN\n",
+                                       __func__, card, queue,
+                                       ehdr->reply_code);
                return -EAGAIN;
        default:
                /* Assume request is valid and a retry will be worth it */
-               ZCRYPT_DBF(DBF_WARN,
-                          "dev=%02x.%04x RY=0x%02x => rc=EAGAIN\n",
-                          card, queue, ehdr->reply_code);
+               ZCRYPT_DBF_WARN("%s dev=%02x.%04x RY=0x%02x => rc=EAGAIN\n",
+                               __func__, card, queue, ehdr->reply_code);
                return -EAGAIN;
        }
 }
index 99937f3e1d49bb2529b962ec197d7c07e89dfd46..f42e8c511184f480c3b554bb10a84f87f411e681 100644
@@ -369,12 +369,10 @@ static int convert_type80(struct zcrypt_queue *zq,
                zq->online = 0;
                pr_err("Crypto dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n",
                       AP_QID_CARD(zq->queue->qid),
-                      AP_QID_QUEUE(zq->queue->qid),
-                      t80h->code);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
-                              AP_QID_QUEUE(zq->queue->qid),
-                              t80h->code);
+                      AP_QID_QUEUE(zq->queue->qid), t80h->code);
+               ZCRYPT_DBF_ERR("%s dev=%02x.%04x code=0x%02x => online=0 rc=EAGAIN\n",
+                              __func__, AP_QID_CARD(zq->queue->qid),
+                              AP_QID_QUEUE(zq->queue->qid), t80h->code);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
                return -EAGAIN;
        }
@@ -409,10 +407,10 @@ static int convert_response_cex2a(struct zcrypt_queue *zq,
                       AP_QID_CARD(zq->queue->qid),
                       AP_QID_QUEUE(zq->queue->qid),
                       (int) rtype);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
-                              AP_QID_QUEUE(zq->queue->qid),
-                              (int) rtype);
+               ZCRYPT_DBF_ERR(
+                       "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid), (int) rtype);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
                return -EAGAIN;
        }
index bc5a8c31ba73bcdcf7d7c3b986992d7cfa58ac4c..8582dd0d6969b2da8f604da0b5e973f0b2019257 100644
@@ -649,8 +649,8 @@ static int convert_type86_ica(struct zcrypt_queue *zq,
                    (service_rc == 8 && service_rs == 72) ||
                    (service_rc == 8 && service_rs == 770) ||
                    (service_rc == 12 && service_rs == 769)) {
-                       ZCRYPT_DBF_WARN("dev=%02x.%04x rc/rs=%d/%d => rc=EINVAL\n",
-                                       AP_QID_CARD(zq->queue->qid),
+                       ZCRYPT_DBF_WARN("%s dev=%02x.%04x rc/rs=%d/%d => rc=EINVAL\n",
+                                       __func__, AP_QID_CARD(zq->queue->qid),
                                        AP_QID_QUEUE(zq->queue->qid),
                                        (int) service_rc, (int) service_rs);
                        return -EINVAL;
@@ -660,8 +660,8 @@ static int convert_type86_ica(struct zcrypt_queue *zq,
                       AP_QID_CARD(zq->queue->qid),
                       AP_QID_QUEUE(zq->queue->qid),
                       (int) service_rc, (int) service_rs);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x rc/rs=%d/%d => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
+               ZCRYPT_DBF_ERR("%s dev=%02x.%04x rc/rs=%d/%d => online=0 rc=EAGAIN\n",
+                              __func__, AP_QID_CARD(zq->queue->qid),
                               AP_QID_QUEUE(zq->queue->qid),
                               (int) service_rc, (int) service_rs);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
@@ -806,10 +806,10 @@ static int convert_response_ica(struct zcrypt_queue *zq,
                       AP_QID_CARD(zq->queue->qid),
                       AP_QID_QUEUE(zq->queue->qid),
                       (int) msg->hdr.type);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
-                              AP_QID_QUEUE(zq->queue->qid),
-                              (int) msg->hdr.type);
+               ZCRYPT_DBF_ERR(
+                       "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
                return -EAGAIN;
        }
@@ -841,10 +841,10 @@ static int convert_response_xcrb(bool userspace, struct zcrypt_queue *zq,
                       AP_QID_CARD(zq->queue->qid),
                       AP_QID_QUEUE(zq->queue->qid),
                       (int) msg->hdr.type);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
-                              AP_QID_QUEUE(zq->queue->qid),
-                              (int) msg->hdr.type);
+               ZCRYPT_DBF_ERR(
+                       "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
                return -EAGAIN;
        }
@@ -871,10 +871,10 @@ static int convert_response_ep11_xcrb(bool userspace, struct zcrypt_queue *zq,
                       AP_QID_CARD(zq->queue->qid),
                       AP_QID_QUEUE(zq->queue->qid),
                       (int) msg->hdr.type);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
-                              AP_QID_QUEUE(zq->queue->qid),
-                              (int) msg->hdr.type);
+               ZCRYPT_DBF_ERR(
+                       "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
                return -EAGAIN;
        }
@@ -902,10 +902,10 @@ static int convert_response_rng(struct zcrypt_queue *zq,
                       AP_QID_CARD(zq->queue->qid),
                       AP_QID_QUEUE(zq->queue->qid),
                       (int) msg->hdr.type);
-               ZCRYPT_DBF_ERR("dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
-                              AP_QID_CARD(zq->queue->qid),
-                              AP_QID_QUEUE(zq->queue->qid),
-                              (int) msg->hdr.type);
+               ZCRYPT_DBF_ERR(
+                       "%s dev=%02x.%04x unknown response type 0x%02x => online=0 rc=EAGAIN\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid), (int) msg->hdr.type);
                ap_send_online_uevent(&zq->queue->ap_dev, zq->online);
                return -EAGAIN;
        }
index 398bde237e376ece811addbb60b81353ab13becf..1552a850a52ede68e854550b8902fc5fdd45ddd1 100644
@@ -65,10 +65,9 @@ static ssize_t online_store(struct device *dev,
                return -EINVAL;
        zq->online = online;
 
-       ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x online=%d\n",
-                  AP_QID_CARD(zq->queue->qid),
-                  AP_QID_QUEUE(zq->queue->qid),
-                  online);
+       ZCRYPT_DBF_INFO("%s queue=%02x.%04x online=%d\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid), online);
 
        ap_send_online_uevent(&aq->ap_dev, online);
 
@@ -175,8 +174,9 @@ int zcrypt_queue_register(struct zcrypt_queue *zq)
        zq->zcard = zc;
        zq->online = 1; /* New devices are online by default. */
 
-       ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x register online=1\n",
-                  AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid));
+       ZCRYPT_DBF_INFO("%s queue=%02x.%04x register online=1\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid));
 
        list_add_tail(&zq->list, &zc->zqueues);
        spin_unlock(&zcrypt_list_lock);
@@ -215,8 +215,9 @@ void zcrypt_queue_unregister(struct zcrypt_queue *zq)
 {
        struct zcrypt_card *zc;
 
-       ZCRYPT_DBF(DBF_INFO, "queue=%02x.%04x unregister\n",
-                  AP_QID_CARD(zq->queue->qid), AP_QID_QUEUE(zq->queue->qid));
+       ZCRYPT_DBF_INFO("%s queue=%02x.%04x unregister\n",
+                       __func__, AP_QID_CARD(zq->queue->qid),
+                       AP_QID_QUEUE(zq->queue->qid));
 
        zc = zq->zcard;
        spin_lock(&zcrypt_list_lock);
index 410215c169208f1735ed3df0e76189bab8c980c8..dd70fd41c77d06957513eda59b41e9f671f02ca4 100644
@@ -69,7 +69,6 @@ static int ssb_pcihost_probe(struct pci_dev *dev,
 {
        struct ssb_bus *ssb;
        int err = -ENOMEM;
-       const char *name;
        u32 val;
 
        ssb = kzalloc(sizeof(*ssb), GFP_KERNEL);
@@ -78,10 +77,7 @@ static int ssb_pcihost_probe(struct pci_dev *dev,
        err = pci_enable_device(dev);
        if (err)
                goto err_kfree_ssb;
-       name = dev_name(&dev->dev);
-       if (dev->driver && dev->driver->name)
-               name = dev->driver->name;
-       err = pci_request_regions(dev, name);
+       err = pci_request_regions(dev, dev_driver_string(&dev->dev));
        if (err)
                goto err_pci_disable;
        pci_set_master(dev);
index e03627ad4460c1cf4ea5f332ce6e26b67b5055e7..59af251e75769cb34ccd45927e627264468bcad2 100644
@@ -86,8 +86,6 @@ source "drivers/staging/vc04_services/Kconfig"
 
 source "drivers/staging/pi433/Kconfig"
 
-source "drivers/staging/mt7621-pci/Kconfig"
-
 source "drivers/staging/mt7621-dma/Kconfig"
 
 source "drivers/staging/ralink-gdma/Kconfig"
index c7f8d8d8dd111d3b18851ed32e47216654634796..76f413470bc8fe8990756b7b091df0d8cd0e267e 100644
@@ -33,7 +33,6 @@ obj-$(CONFIG_KS7010)          += ks7010/
 obj-$(CONFIG_GREYBUS)          += greybus/
 obj-$(CONFIG_BCM2835_VCHIQ)    += vc04_services/
 obj-$(CONFIG_PI433)            += pi433/
-obj-$(CONFIG_PCI_MT7621)       += mt7621-pci/
 obj-$(CONFIG_SOC_MT7621)       += mt7621-dma/
 obj-$(CONFIG_DMA_RALINK)       += ralink-gdma/
 obj-$(CONFIG_SOC_MT7621)       += mt7621-dts/
diff --git a/drivers/staging/mt7621-pci/Kconfig b/drivers/staging/mt7621-pci/Kconfig
deleted file mode 100644
index ce58042..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-config PCI_MT7621
-       tristate "MediaTek MT7621 PCI Controller"
-       depends on RALINK
-       select PCI_DRIVERS_GENERIC
-       help
-         This selects a driver for the MediaTek MT7621 PCI Controller.
-
diff --git a/drivers/staging/mt7621-pci/Makefile b/drivers/staging/mt7621-pci/Makefile
deleted file mode 100644
index f4e651c..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_PCI_MT7621)       += pci-mt7621.o
diff --git a/drivers/staging/mt7621-pci/TODO b/drivers/staging/mt7621-pci/TODO
deleted file mode 100644
index d674a9a..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-
-- general code review and cleanup
-
-Cc: NeilBrown <neil@brown.name>
diff --git a/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt b/drivers/staging/mt7621-pci/mediatek,mt7621-pci.txt
deleted file mode 100644
index 327a682..0000000
+++ /dev/null
@@ -1,104 +0,0 @@
-MediaTek MT7621 PCIe controller
-
-Required properties:
-- compatible: "mediatek,mt7621-pci"
-- device_type: Must be "pci"
-- reg: Base addresses and lengths of the PCIe subsys and root ports.
-- bus-range: Range of bus numbers associated with this controller.
-- #address-cells: Address representation for root ports (must be 3)
-- pinctrl-names : The pin control state names.
-- pinctrl-0: The "default" pinctrl state.
-- #size-cells: Size representation for root ports (must be 2)
-- ranges: Ranges for the PCI memory and I/O regions.
-- #interrupt-cells: Must be 1
-- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties.
-  Please refer to the standard PCI bus binding document for a more detailed
-  explanation.
-- status: either "disabled" or "okay".
-- resets: Must contain an entry for each entry in reset-names.
-  See ../reset/reset.txt for details.
-- reset-names: Must be "pcie0", "pcie1", "pcieN"... based on the number of
-  root ports.
-- clocks: Must contain an entry for each entry in clock-names.
-  See ../clocks/clock-bindings.txt for details.
-- clock-names: Must be "pcie0", "pcie1", "pcieN"... based on the number of
-  root ports.
-- reset-gpios: GPIO specs for the reset pins.
-
-In addition, the device tree node must have sub-nodes describing each PCIe port
-interface, having the following mandatory properties:
-
-Required properties:
-- reg: Only the first four bytes are used to refer to the correct bus number
-      and device number.
-- #address-cells: Must be 3
-- #size-cells: Must be 2
-- ranges: Sub-ranges distributed from the PCIe controller node. An empty
-  property is sufficient.
-- bus-range: Range of bus numbers associated with this port.
-
-Example for MT7621:
-
-       pcie: pcie@1e140000 {
-               compatible = "mediatek,mt7621-pci";
-        reg = <0x1e140000 0x100    /* host-pci bridge registers */
-               0x1e142000 0x100    /* pcie port 0 RC control registers */
-               0x1e143000 0x100    /* pcie port 1 RC control registers */
-               0x1e144000 0x100>;  /* pcie port 2 RC control registers */
-
-               #address-cells = <3>;
-               #size-cells = <2>;
-
-               pinctrl-names = "default";
-               pinctrl-0 = <&pcie_pins>;
-
-               device_type = "pci";
-
-               bus-range = <0 255>;
-               ranges = <
-                       0x02000000 0 0x00000000 0x60000000 0 0x10000000 /* pci memory */
-                       0x01000000 0 0x00000000 0x1e160000 0 0x00010000 /* io space */
-               >;
-
-               #interrupt-cells = <1>;
-               interrupt-map-mask = <0xF0000 0 0 1>;
-               interrupt-map = <0x10000 0 0 1 &gic GIC_SHARED 4 IRQ_TYPE_LEVEL_HIGH>,
-                               <0x20000 0 0 1 &gic GIC_SHARED 24 IRQ_TYPE_LEVEL_HIGH>,
-                               <0x30000 0 0 1 &gic GIC_SHARED 25 IRQ_TYPE_LEVEL_HIGH>;
-
-               status = "disabled";
-
-               resets = <&rstctrl 24 &rstctrl 25 &rstctrl 26>;
-               reset-names = "pcie0", "pcie1", "pcie2";
-               clocks = <&clkctrl 24 &clkctrl 25 &clkctrl 26>;
-               clock-names = "pcie0", "pcie1", "pcie2";
-
-               reset-gpios = <&gpio 19 GPIO_ACTIVE_LOW>,
-                               <&gpio 8 GPIO_ACTIVE_LOW>,
-                               <&gpio 7 GPIO_ACTIVE_LOW>;
-
-               pcie@0,0 {
-                       reg = <0x0000 0 0 0 0>;
-                       #address-cells = <3>;
-                       #size-cells = <2>;
-                       ranges;
-                       bus-range = <0x00 0xff>;
-               };
-
-               pcie@1,0 {
-                       reg = <0x0800 0 0 0 0>;
-                       #address-cells = <3>;
-                       #size-cells = <2>;
-                       ranges;
-                       bus-range = <0x00 0xff>;
-               };
-
-               pcie@2,0 {
-                       reg = <0x1000 0 0 0 0>;
-                       #address-cells = <3>;
-                       #size-cells = <2>;
-                       ranges;
-                       bus-range = <0x00 0xff>;
-               };
-       };
-
diff --git a/drivers/staging/mt7621-pci/pci-mt7621.c b/drivers/staging/mt7621-pci/pci-mt7621.c
deleted file mode 100644
index 503cb1f..0000000
+++ /dev/null
@@ -1,600 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * BRIEF MODULE DESCRIPTION
- *     PCI init for Ralink RT2880 solution
- *
- * Copyright 2007 Ralink Inc. (bruce_chang@ralinktech.com.tw)
- *
- * May 2007 Bruce Chang
- * Initial Release
- *
- * May 2009 Bruce Chang
- * support RT2880/RT3883 PCIe
- *
- * May 2011 Bruce Chang
- * support RT6855/MT7620 PCIe
- */
-
-#include <linux/bitops.h>
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/gpio/consumer.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_pci.h>
-#include <linux/of_platform.h>
-#include <linux/pci.h>
-#include <linux/phy/phy.h>
-#include <linux/platform_device.h>
-#include <linux/reset.h>
-#include <linux/sys_soc.h>
-
-/* MediaTek specific configuration registers */
-#define PCIE_FTS_NUM                   0x70c
-#define PCIE_FTS_NUM_MASK              GENMASK(15, 8)
-#define PCIE_FTS_NUM_L0(x)             (((x) & 0xff) << 8)
-
-/* Host-PCI bridge registers */
-#define RALINK_PCI_PCICFG_ADDR         0x0000
-#define RALINK_PCI_PCIMSK_ADDR         0x000C
-#define RALINK_PCI_CONFIG_ADDR         0x0020
-#define RALINK_PCI_CONFIG_DATA         0x0024
-#define RALINK_PCI_MEMBASE             0x0028
-#define RALINK_PCI_IOBASE              0x002C
-
-/* PCIe RC control registers */
-#define RALINK_PCI_ID                  0x0030
-#define RALINK_PCI_CLASS               0x0034
-#define RALINK_PCI_SUBID               0x0038
-#define RALINK_PCI_STATUS              0x0050
-
-/* Some definition values */
-#define PCIE_REVISION_ID               BIT(0)
-#define PCIE_CLASS_CODE                        (0x60400 << 8)
-#define PCIE_BAR_MAP_MAX               GENMASK(30, 16)
-#define PCIE_BAR_ENABLE                        BIT(0)
-#define PCIE_PORT_INT_EN(x)            BIT(20 + (x))
-#define PCIE_PORT_LINKUP               BIT(0)
-#define PCIE_PORT_CNT                  3
-
-#define PERST_DELAY_MS                 100
-
-/**
- * struct mt7621_pcie_port - PCIe port information
- * @base: I/O mapped register base
- * @list: port list
- * @pcie: pointer to PCIe host info
- * @clk: pointer to the port clock gate
- * @phy: pointer to PHY control block
- * @pcie_rst: pointer to port reset control
- * @gpio_rst: gpio reset
- * @slot: port slot
- * @enabled: indicates if port is enabled
- */
-struct mt7621_pcie_port {
-       void __iomem *base;
-       struct list_head list;
-       struct mt7621_pcie *pcie;
-       struct clk *clk;
-       struct phy *phy;
-       struct reset_control *pcie_rst;
-       struct gpio_desc *gpio_rst;
-       u32 slot;
-       bool enabled;
-};
-
-/**
- * struct mt7621_pcie - PCIe host information
- * @base: IO Mapped Register Base
- * @dev: Pointer to PCIe device
- * @ports: pointer to PCIe port information
- * @resets_inverted: depends on chip revision
- * reset lines are inverted.
- */
-struct mt7621_pcie {
-       void __iomem *base;
-       struct device *dev;
-       struct list_head ports;
-       bool resets_inverted;
-};
-
-static inline u32 pcie_read(struct mt7621_pcie *pcie, u32 reg)
-{
-       return readl_relaxed(pcie->base + reg);
-}
-
-static inline void pcie_write(struct mt7621_pcie *pcie, u32 val, u32 reg)
-{
-       writel_relaxed(val, pcie->base + reg);
-}
-
-static inline void pcie_rmw(struct mt7621_pcie *pcie, u32 reg, u32 clr, u32 set)
-{
-       u32 val = readl_relaxed(pcie->base + reg);
-
-       val &= ~clr;
-       val |= set;
-       writel_relaxed(val, pcie->base + reg);
-}
-
-static inline u32 pcie_port_read(struct mt7621_pcie_port *port, u32 reg)
-{
-       return readl_relaxed(port->base + reg);
-}
-
-static inline void pcie_port_write(struct mt7621_pcie_port *port,
-                                  u32 val, u32 reg)
-{
-       writel_relaxed(val, port->base + reg);
-}
-
-static inline u32 mt7621_pci_get_cfgaddr(unsigned int bus, unsigned int slot,
-                                        unsigned int func, unsigned int where)
-{
-       return (((where & 0xF00) >> 8) << 24) | (bus << 16) | (slot << 11) |
-               (func << 8) | (where & 0xfc) | 0x80000000;
-}
-
-static void __iomem *mt7621_pcie_map_bus(struct pci_bus *bus,
-                                        unsigned int devfn, int where)
-{
-       struct mt7621_pcie *pcie = bus->sysdata;
-       u32 address = mt7621_pci_get_cfgaddr(bus->number, PCI_SLOT(devfn),
-                                            PCI_FUNC(devfn), where);
-
-       writel_relaxed(address, pcie->base + RALINK_PCI_CONFIG_ADDR);
-
-       return pcie->base + RALINK_PCI_CONFIG_DATA + (where & 3);
-}
-
-struct pci_ops mt7621_pci_ops = {
-       .map_bus        = mt7621_pcie_map_bus,
-       .read           = pci_generic_config_read,
-       .write          = pci_generic_config_write,
-};
-
-static u32 read_config(struct mt7621_pcie *pcie, unsigned int dev, u32 reg)
-{
-       u32 address = mt7621_pci_get_cfgaddr(0, dev, 0, reg);
-
-       pcie_write(pcie, address, RALINK_PCI_CONFIG_ADDR);
-       return pcie_read(pcie, RALINK_PCI_CONFIG_DATA);
-}
-
-static void write_config(struct mt7621_pcie *pcie, unsigned int dev,
-                        u32 reg, u32 val)
-{
-       u32 address = mt7621_pci_get_cfgaddr(0, dev, 0, reg);
-
-       pcie_write(pcie, address, RALINK_PCI_CONFIG_ADDR);
-       pcie_write(pcie, val, RALINK_PCI_CONFIG_DATA);
-}
-
-static inline void mt7621_rst_gpio_pcie_assert(struct mt7621_pcie_port *port)
-{
-       if (port->gpio_rst)
-               gpiod_set_value(port->gpio_rst, 1);
-}
-
-static inline void mt7621_rst_gpio_pcie_deassert(struct mt7621_pcie_port *port)
-{
-       if (port->gpio_rst)
-               gpiod_set_value(port->gpio_rst, 0);
-}
-
-static inline bool mt7621_pcie_port_is_linkup(struct mt7621_pcie_port *port)
-{
-       return (pcie_port_read(port, RALINK_PCI_STATUS) & PCIE_PORT_LINKUP) != 0;
-}
-
-static inline void mt7621_control_assert(struct mt7621_pcie_port *port)
-{
-       struct mt7621_pcie *pcie = port->pcie;
-
-       if (pcie->resets_inverted)
-               reset_control_assert(port->pcie_rst);
-       else
-               reset_control_deassert(port->pcie_rst);
-}
-
-static inline void mt7621_control_deassert(struct mt7621_pcie_port *port)
-{
-       struct mt7621_pcie *pcie = port->pcie;
-
-       if (pcie->resets_inverted)
-               reset_control_deassert(port->pcie_rst);
-       else
-               reset_control_assert(port->pcie_rst);
-}
-
-static int setup_cm_memory_region(struct pci_host_bridge *host)
-{
-       struct mt7621_pcie *pcie = pci_host_bridge_priv(host);
-       struct device *dev = pcie->dev;
-       struct resource_entry *entry;
-       resource_size_t mask;
-
-       entry = resource_list_first_type(&host->windows, IORESOURCE_MEM);
-       if (!entry) {
-               dev_err(dev, "Cannot get memory resource\n");
-               return -EINVAL;
-       }
-
-       if (mips_cps_numiocu(0)) {
-               /*
-                * FIXME: hardware doesn't accept mask values with 1s after
-                * 0s (e.g. 0xffef), so it would be great to warn if that's
-                * about to happen
-                */
-               mask = ~(entry->res->end - entry->res->start);
-
-               write_gcr_reg1_base(entry->res->start);
-               write_gcr_reg1_mask(mask | CM_GCR_REGn_MASK_CMTGT_IOCU0);
-               dev_info(dev, "PCI coherence region base: 0x%08llx, mask/settings: 0x%08llx\n",
-                        (unsigned long long)read_gcr_reg1_base(),
-                        (unsigned long long)read_gcr_reg1_mask());
-       }
-
-       return 0;
-}
-
-static int mt7621_pcie_parse_port(struct mt7621_pcie *pcie,
-                                 struct device_node *node,
-                                 int slot)
-{
-       struct mt7621_pcie_port *port;
-       struct device *dev = pcie->dev;
-       struct platform_device *pdev = to_platform_device(dev);
-       char name[10];
-       int err;
-
-       port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
-       if (!port)
-               return -ENOMEM;
-
-       port->base = devm_platform_ioremap_resource(pdev, slot + 1);
-       if (IS_ERR(port->base))
-               return PTR_ERR(port->base);
-
-       port->clk = devm_get_clk_from_child(dev, node, NULL);
-       if (IS_ERR(port->clk)) {
-               dev_err(dev, "failed to get pcie%d clock\n", slot);
-               return PTR_ERR(port->clk);
-       }
-
-       port->pcie_rst = of_reset_control_get_exclusive(node, NULL);
-       if (PTR_ERR(port->pcie_rst) == -EPROBE_DEFER) {
-               dev_err(dev, "failed to get pcie%d reset control\n", slot);
-               return PTR_ERR(port->pcie_rst);
-       }
-
-       snprintf(name, sizeof(name), "pcie-phy%d", slot);
-       port->phy = devm_of_phy_get(dev, node, name);
-       if (IS_ERR(port->phy)) {
-               dev_err(dev, "failed to get pcie-phy%d\n", slot);
-               err = PTR_ERR(port->phy);
-               goto remove_reset;
-       }
-
-       port->gpio_rst = devm_gpiod_get_index_optional(dev, "reset", slot,
-                                                      GPIOD_OUT_LOW);
-       if (IS_ERR(port->gpio_rst)) {
-               dev_err(dev, "Failed to get GPIO for PCIe%d\n", slot);
-               err = PTR_ERR(port->gpio_rst);
-               goto remove_reset;
-       }
-
-       port->slot = slot;
-       port->pcie = pcie;
-
-       INIT_LIST_HEAD(&port->list);
-       list_add_tail(&port->list, &pcie->ports);
-
-       return 0;
-
-remove_reset:
-       reset_control_put(port->pcie_rst);
-       return err;
-}
-
-static int mt7621_pcie_parse_dt(struct mt7621_pcie *pcie)
-{
-       struct device *dev = pcie->dev;
-       struct platform_device *pdev = to_platform_device(dev);
-       struct device_node *node = dev->of_node, *child;
-       int err;
-
-       pcie->base = devm_platform_ioremap_resource(pdev, 0);
-       if (IS_ERR(pcie->base))
-               return PTR_ERR(pcie->base);
-
-       for_each_available_child_of_node(node, child) {
-               int slot;
-
-               err = of_pci_get_devfn(child);
-               if (err < 0) {
-                       of_node_put(child);
-                       dev_err(dev, "failed to parse devfn: %d\n", err);
-                       return err;
-               }
-
-               slot = PCI_SLOT(err);
-
-               err = mt7621_pcie_parse_port(pcie, child, slot);
-               if (err) {
-                       of_node_put(child);
-                       return err;
-               }
-       }
-
-       return 0;
-}
-
-static int mt7621_pcie_init_port(struct mt7621_pcie_port *port)
-{
-       struct mt7621_pcie *pcie = port->pcie;
-       struct device *dev = pcie->dev;
-       u32 slot = port->slot;
-       int err;
-
-       err = phy_init(port->phy);
-       if (err) {
-               dev_err(dev, "failed to initialize port%d phy\n", slot);
-               return err;
-       }
-
-       err = phy_power_on(port->phy);
-       if (err) {
-               dev_err(dev, "failed to power on port%d phy\n", slot);
-               phy_exit(port->phy);
-               return err;
-       }
-
-       port->enabled = true;
-
-       return 0;
-}
-
-static void mt7621_pcie_reset_assert(struct mt7621_pcie *pcie)
-{
-       struct mt7621_pcie_port *port;
-
-       list_for_each_entry(port, &pcie->ports, list) {
-               /* PCIe RC reset assert */
-               mt7621_control_assert(port);
-
-               /* PCIe EP reset assert */
-               mt7621_rst_gpio_pcie_assert(port);
-       }
-
-       msleep(PERST_DELAY_MS);
-}
-
-static void mt7621_pcie_reset_rc_deassert(struct mt7621_pcie *pcie)
-{
-       struct mt7621_pcie_port *port;
-
-       list_for_each_entry(port, &pcie->ports, list)
-               mt7621_control_deassert(port);
-}
-
-static void mt7621_pcie_reset_ep_deassert(struct mt7621_pcie *pcie)
-{
-       struct mt7621_pcie_port *port;
-
-       list_for_each_entry(port, &pcie->ports, list)
-               mt7621_rst_gpio_pcie_deassert(port);
-
-       msleep(PERST_DELAY_MS);
-}
-
-static int mt7621_pcie_init_ports(struct mt7621_pcie *pcie)
-{
-       struct device *dev = pcie->dev;
-       struct mt7621_pcie_port *port, *tmp;
-       u8 num_disabled = 0;
-       int err;
-
-       mt7621_pcie_reset_assert(pcie);
-       mt7621_pcie_reset_rc_deassert(pcie);
-
-       list_for_each_entry_safe(port, tmp, &pcie->ports, list) {
-               u32 slot = port->slot;
-
-               if (slot == 1) {
-                       port->enabled = true;
-                       continue;
-               }
-
-               err = mt7621_pcie_init_port(port);
-               if (err) {
-                       dev_err(dev, "Initiating port %d failed\n", slot);
-                       list_del(&port->list);
-               }
-       }
-
-       mt7621_pcie_reset_ep_deassert(pcie);
-
-       tmp = NULL;
-       list_for_each_entry(port, &pcie->ports, list) {
-               u32 slot = port->slot;
-
-               if (!mt7621_pcie_port_is_linkup(port)) {
-                       dev_err(dev, "pcie%d no card, disable it (RST & CLK)\n",
-                               slot);
-                       mt7621_control_assert(port);
-                       port->enabled = false;
-                       num_disabled++;
-
-                       if (slot == 0) {
-                               tmp = port;
-                               continue;
-                       }
-
-                       if (slot == 1 && tmp && !tmp->enabled)
-                               phy_power_off(tmp->phy);
-               }
-       }
-
-       return (num_disabled != PCIE_PORT_CNT) ? 0 : -ENODEV;
-}
-
-static void mt7621_pcie_enable_port(struct mt7621_pcie_port *port)
-{
-       struct mt7621_pcie *pcie = port->pcie;
-       u32 slot = port->slot;
-       u32 val;
-
-       /* enable pcie interrupt */
-       val = pcie_read(pcie, RALINK_PCI_PCIMSK_ADDR);
-       val |= PCIE_PORT_INT_EN(slot);
-       pcie_write(pcie, val, RALINK_PCI_PCIMSK_ADDR);
-
-       /* map 2G DDR region */
-       pcie_port_write(port, PCIE_BAR_MAP_MAX | PCIE_BAR_ENABLE,
-                       PCI_BASE_ADDRESS_0);
-
-       /* configure class code and revision ID */
-       pcie_port_write(port, PCIE_CLASS_CODE | PCIE_REVISION_ID,
-                       RALINK_PCI_CLASS);
-
-       /* configure RC FTS number to 250 when it leaves L0s */
-       val = read_config(pcie, slot, PCIE_FTS_NUM);
-       val &= ~PCIE_FTS_NUM_MASK;
-       val |= PCIE_FTS_NUM_L0(0x50);
-       write_config(pcie, slot, PCIE_FTS_NUM, val);
-}
-
-static int mt7621_pcie_enable_ports(struct pci_host_bridge *host)
-{
-       struct mt7621_pcie *pcie = pci_host_bridge_priv(host);
-       struct device *dev = pcie->dev;
-       struct mt7621_pcie_port *port;
-       struct resource_entry *entry;
-       int err;
-
-       entry = resource_list_first_type(&host->windows, IORESOURCE_IO);
-       if (!entry) {
-               dev_err(dev, "Cannot get io resource\n");
-               return -EINVAL;
-       }
-
-       /* Setup MEMWIN and IOWIN */
-       pcie_write(pcie, 0xffffffff, RALINK_PCI_MEMBASE);
-       pcie_write(pcie, entry->res->start - entry->offset, RALINK_PCI_IOBASE);
-
-       list_for_each_entry(port, &pcie->ports, list) {
-               if (port->enabled) {
-                       err = clk_prepare_enable(port->clk);
-                       if (err) {
-                               dev_err(dev, "enabling clk pcie%d\n",
-                                       port->slot);
-                               return err;
-                       }
-
-                       mt7621_pcie_enable_port(port);
-                       dev_info(dev, "PCIE%d enabled\n", port->slot);
-               }
-       }
-
-       return 0;
-}
-
-static int mt7621_pcie_register_host(struct pci_host_bridge *host)
-{
-       struct mt7621_pcie *pcie = pci_host_bridge_priv(host);
-
-       host->ops = &mt7621_pci_ops;
-       host->sysdata = pcie;
-       return pci_host_probe(host);
-}
-
-static const struct soc_device_attribute mt7621_pci_quirks_match[] = {
-       { .soc_id = "mt7621", .revision = "E2" }
-};
-
-static int mt7621_pci_probe(struct platform_device *pdev)
-{
-       struct device *dev = &pdev->dev;
-       const struct soc_device_attribute *attr;
-       struct mt7621_pcie_port *port;
-       struct mt7621_pcie *pcie;
-       struct pci_host_bridge *bridge;
-       int err;
-
-       if (!dev->of_node)
-               return -ENODEV;
-
-       bridge = devm_pci_alloc_host_bridge(dev, sizeof(*pcie));
-       if (!bridge)
-               return -ENOMEM;
-
-       pcie = pci_host_bridge_priv(bridge);
-       pcie->dev = dev;
-       platform_set_drvdata(pdev, pcie);
-       INIT_LIST_HEAD(&pcie->ports);
-
-       attr = soc_device_match(mt7621_pci_quirks_match);
-       if (attr)
-               pcie->resets_inverted = true;
-
-       err = mt7621_pcie_parse_dt(pcie);
-       if (err) {
-               dev_err(dev, "Parsing DT failed\n");
-               return err;
-       }
-
-       err = mt7621_pcie_init_ports(pcie);
-       if (err) {
-               dev_err(dev, "Nothing connected in virtual bridges\n");
-               return 0;
-       }
-
-       err = mt7621_pcie_enable_ports(bridge);
-       if (err) {
-               dev_err(dev, "Error enabling pcie ports\n");
-               goto remove_resets;
-       }
-
-       err = setup_cm_memory_region(bridge);
-       if (err) {
-               dev_err(dev, "Error setting up iocu mem regions\n");
-               goto remove_resets;
-       }
-
-       return mt7621_pcie_register_host(bridge);
-
-remove_resets:
-       list_for_each_entry(port, &pcie->ports, list)
-               reset_control_put(port->pcie_rst);
-
-       return err;
-}
-
-static int mt7621_pci_remove(struct platform_device *pdev)
-{
-       struct mt7621_pcie *pcie = platform_get_drvdata(pdev);
-       struct mt7621_pcie_port *port;
-
-       list_for_each_entry(port, &pcie->ports, list)
-               reset_control_put(port->pcie_rst);
-
-       return 0;
-}
-
-static const struct of_device_id mt7621_pci_ids[] = {
-       { .compatible = "mediatek,mt7621-pci" },
-       {},
-};
-MODULE_DEVICE_TABLE(of, mt7621_pci_ids);
-
-static struct platform_driver mt7621_pci_driver = {
-       .probe = mt7621_pci_probe,
-       .remove = mt7621_pci_remove,
-       .driver = {
-               .name = "mt7621-pci",
-               .of_match_table = of_match_ptr(mt7621_pci_ids),
-       },
-};
-builtin_platform_driver(mt7621_pci_driver);
index be4ecbabdd58694aa7fee6d395bd31fe377b6fc9..933d77ad0a64212843a1b13ae80ae4fb75f87bbd 100644
@@ -185,7 +185,7 @@ static void __init xdbc_free_ring(struct xdbc_ring *ring)
        if (!seg)
                return;
 
-       memblock_free(seg->dma, PAGE_SIZE);
+       memblock_phys_free(seg->dma, PAGE_SIZE);
        ring->segment = NULL;
 }
 
@@ -665,10 +665,10 @@ int __init early_xdbc_setup_hardware(void)
                xdbc_free_ring(&xdbc.in_ring);
 
                if (xdbc.table_dma)
-                       memblock_free(xdbc.table_dma, PAGE_SIZE);
+                       memblock_phys_free(xdbc.table_dma, PAGE_SIZE);
 
                if (xdbc.out_dma)
-                       memblock_free(xdbc.out_dma, PAGE_SIZE);
+                       memblock_phys_free(xdbc.out_dma, PAGE_SIZE);
 
                xdbc.table_base = NULL;
                xdbc.out_buf = NULL;
@@ -987,8 +987,8 @@ free_and_quit:
        xdbc_free_ring(&xdbc.evt_ring);
        xdbc_free_ring(&xdbc.out_ring);
        xdbc_free_ring(&xdbc.in_ring);
-       memblock_free(xdbc.table_dma, PAGE_SIZE);
-       memblock_free(xdbc.out_dma, PAGE_SIZE);
+       memblock_phys_free(xdbc.table_dma, PAGE_SIZE);
+       memblock_phys_free(xdbc.out_dma, PAGE_SIZE);
        writel(0, &xdbc.xdbc_reg->control);
        early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
 
index 1d8a4c089a8585dfcd67d523f34a48da9724687f..92adf61078644fa0d36fdf19d58f0159face2372 100644
@@ -111,7 +111,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
        struct xhci_driver_data         *driver_data;
        const struct pci_device_id      *id;
 
-       id = pci_match_id(pdev->driver->id_table, pdev);
+       id = pci_match_id(to_pci_driver(pdev->dev.driver)->id_table, pdev);
 
        if (id && id->driver_data) {
                driver_data = (struct xhci_driver_data *)id->driver_data;
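The struct pci_dev::driver backpointer is being phased out in this cycle, so the pci_driver is recovered from the generic struct device instead. A minimal sketch of the replacement pattern used above (the helper name is illustrative; to_pci_driver() is the container_of() wrapper from <linux/pci.h>):

```c
/* Illustrative helper: look up this device's own match entry without
 * the retired pdev->driver shortcut. */
static const struct pci_device_id *xhci_pci_self_match(struct pci_dev *pdev)
{
        return pci_match_id(to_pci_driver(pdev->dev.driver)->id_table, pdev);
}
```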
index 8fcf94cd2c965cad297beb89ec2261720a3f8732..10607be76a88b8e78c139d5173fa899c8272fc3c 100644
@@ -108,7 +108,7 @@ config VIRTIO_MEM
        default m
        depends on X86_64
        depends on VIRTIO
-       depends on MEMORY_HOTPLUG_SPARSE
+       depends on MEMORY_HOTPLUG
        depends on MEMORY_HOTREMOVE
        depends on CONTIG_ALLOC
        help
index cbdff89799807085ebfac1810d132a8aa55719cb..47aebd98f52f551718d9f66656ee57861604343a 100644
@@ -241,7 +241,7 @@ retry:
         */
        rc = xen_swiotlb_fixup(start, nslabs);
        if (rc) {
-               memblock_free(__pa(start), PAGE_ALIGN(bytes));
+               memblock_free(start, PAGE_ALIGN(bytes));
                if (nslabs > 1024 && repeat--) {
                        /* Min is 2MB */
                        nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
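This hunk and the xhci-dbc ones further above track the memblock API split in this cycle: memblock_free() now frees by virtual pointer, while the old physical-address behaviour lives on as memblock_phys_free(). Callers holding physical/DMA addresses (xdbc's seg->dma, table_dma, out_dma) moved to the _phys_ variant, whereas here the __pa() conversion is simply dropped because memblock_free() now takes the virtual start pointer directly. As a sketch, the signatures after the split:

```c
/* memblock API after the rename (per this series): */
void memblock_phys_free(phys_addr_t base, phys_addr_t size); /* physical */
void memblock_free(void *ptr, size_t size);                  /* virtual  */
```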
index 9fa930dfd78d60acb2f384124eb973657b016f78..dca42aa87d305e9d175b1344145ee6ddd2eae300 100644
@@ -38,7 +38,6 @@
 #include <linux/key-type.h>
 #include "cifs_spnego.h"
 #include "fscache.h"
-#include "smb2pdu.h"
 #ifdef CONFIG_CIFS_DFS_UPCALL
 #include "dfs_cache.h"
 #endif
index e916470468ea93f9a62804a1b8252c361b2a7e66..abff31dcd0050cda8020f913dc4eba5f88bbadc7 100644
@@ -20,6 +20,7 @@
 #include <crypto/internal/hash.h>
 #include <linux/scatterlist.h>
 #include <uapi/linux/cifs/cifs_mount.h>
+#include "../smbfs_common/smb2pdu.h"
 #include "smb2pdu.h"
 
 #define CIFS_MAGIC_NUMBER 0xFF534D42      /* the first four bytes of SMB PDUs */
@@ -776,7 +777,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
 
 static inline void
 revert_current_mid_from_hdr(struct TCP_Server_Info *server,
-                           const struct smb2_sync_hdr *shdr)
+                           const struct smb2_hdr *shdr)
 {
        unsigned int num = le16_to_cpu(shdr->CreditCharge);
 
index c3b94c1e4591338b58304b088b2f07541dcfd741..0abbff4e4135c63eaf96e85d06cc2438e7c6000e 100644
@@ -677,7 +677,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
 static unsigned int
 smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
 
        /*
         * SMB1 does not use credits.
@@ -794,7 +794,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
                 */
        }
 
-       kfree(server->hostname);
        kfree(server);
 
        length = atomic_dec_return(&tcpSesAllocCount);
@@ -878,7 +877,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 static void
 smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buffer;
        int scredits, in_flight;
 
        /*
@@ -1235,6 +1234,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
        if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
                return 0;
 
+       if (strcasecmp(server->hostname, ctx->server_hostname))
+               return 0;
+
        if (!match_address(server, addr,
                           (struct sockaddr *)&ctx->srcaddr))
                return 0;
@@ -1336,6 +1338,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
        kfree(server->session_key.response);
        server->session_key.response = NULL;
        server->session_key.len = 0;
+       kfree(server->hostname);
 
        task = xchg(&server->tsk, NULL);
        if (task)
@@ -1361,14 +1364,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx)
                goto out_err;
        }
 
+       tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL);
+       if (!tcp_ses->hostname) {
+               rc = -ENOMEM;
+               goto out_err;
+       }
+
        tcp_ses->ops = ctx->ops;
        tcp_ses->vals = ctx->vals;
        cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
-       tcp_ses->hostname = extract_hostname(ctx->UNC);
-       if (IS_ERR(tcp_ses->hostname)) {
-               rc = PTR_ERR(tcp_ses->hostname);
-               goto out_err_crypto_release;
-       }
 
        tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
        tcp_ses->noblockcnt = ctx->rootfs;
@@ -1497,8 +1501,7 @@ out_err_crypto_release:
 
 out_err:
        if (tcp_ses) {
-               if (!IS_ERR(tcp_ses->hostname))
-                       kfree(tcp_ses->hostname);
+               kfree(tcp_ses->hostname);
                if (tcp_ses->ssocket)
                        sock_release(tcp_ses->ssocket);
                kfree(tcp_ses);
@@ -2646,11 +2649,12 @@ generic_ip_connect(struct TCP_Server_Info *server)
                rc = 0;
        if (rc < 0) {
                cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+               trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
                sock_release(socket);
                server->ssocket = NULL;
                return rc;
        }
-
+       trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
        if (sport == htons(RFC1001_PORT))
                rc = ip_rfc1001_connect(server);
 
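Taken together, the connect.c hunks change who owns server->hostname and when it matters: the name is duplicated from the parsed mount context before anything else can fail, match_server() now refuses to share a TCP session across different hostnames even when they resolve to the same address, and the string is freed in cifs_put_tcp_session() rather than in clean_demultiplex_info(), where reconnect could still be using it. A reduced sketch of the allocation pattern; tcp_ses_set_hostname is a hypothetical helper, with the types taken from cifsglob.h and fs_context.h:

    #include <linux/slab.h>
    #include "cifsglob.h"
    #include "fs_context.h"

    /* Duplicate the hostname first, so every later error path can
     * unconditionally kfree(tcp_ses->hostname). */
    static int tcp_ses_set_hostname(struct TCP_Server_Info *tcp_ses,
                                    const struct smb3_fs_context *ctx)
    {
            tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL);
            if (!tcp_ses->hostname)
                    return -ENOMEM;
            return 0;
    }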
index 3109def8e19989ac6ad78768eac6bca4eabf4912..38d96a4807452714ce6eb842ae181c3ad263d6ca 100644 (file)
@@ -116,6 +116,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
        fsparam_flag("nosharesock", Opt_nosharesock),
        fsparam_flag_no("persistenthandles", Opt_persistent),
        fsparam_flag_no("resilienthandles", Opt_resilient),
+       fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay),
        fsparam_flag("domainauto", Opt_domainauto),
        fsparam_flag("rdma", Opt_rdma),
        fsparam_flag("modesid", Opt_modesid),
@@ -318,6 +319,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
        DUP_CTX_STR(mount_options);
        DUP_CTX_STR(username);
        DUP_CTX_STR(password);
+       DUP_CTX_STR(server_hostname);
        DUP_CTX_STR(UNC);
        DUP_CTX_STR(source);
        DUP_CTX_STR(domainname);
@@ -456,6 +458,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx)
        if (!pos)
                return -EINVAL;
 
+       /* record the server hostname */
+       ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL);
+       if (!ctx->server_hostname)
+               return -ENOMEM;
+
        /* skip past delimiter */
        ++pos;
 
@@ -1383,6 +1390,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
                        }
                }
                break;
+       case Opt_tcp_nodelay:
+               /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */
+               if (result.negated)
+                       ctx->sockopt_tcp_nodelay = false;
+               else
+                       ctx->sockopt_tcp_nodelay = true;
+               break;
        case Opt_domainauto:
                ctx->domainauto = true;
                break;
@@ -1496,6 +1510,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
        ctx->username = NULL;
        kfree_sensitive(ctx->password);
        ctx->password = NULL;
+       kfree(ctx->server_hostname);
+       ctx->server_hostname = NULL;
        kfree(ctx->UNC);
        ctx->UNC = NULL;
        kfree(ctx->source);
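fsparam_flag_no() registers both spellings of the new mount flag, "tcpnodelay" and "notcpnodelay", and fs_parse() reports which one was seen through result.negated. A condensed sketch of the pattern, with hypothetical my_* names standing in for the cifs boilerplate:

    #include <linux/fs_context.h>
    #include <linux/fs_parser.h>

    enum { Opt_tcp_nodelay };
    struct my_ctx { bool sockopt_tcp_nodelay; };

    static const struct fs_parameter_spec my_params[] = {
            fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), /* also "notcpnodelay" */
            {}
    };

    static int my_parse_param(struct fs_context *fc, struct fs_parameter *param)
    {
            struct my_ctx *ctx = fc->fs_private;
            struct fs_parse_result result;
            int opt = fs_parse(fc, my_params, param, &result);

            if (opt < 0)
                    return opt;

            switch (opt) {
            case Opt_tcp_nodelay:
                    /* negated is true when the "no" spelling was used */
                    ctx->sockopt_tcp_nodelay = !result.negated;
                    break;
            }
            return 0;
    }

The explicit if/else in the diff is equivalent to the single assignment above.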
index a42ba71d7a81fb0181788ab4ffd1d90ba671435a..b2d22cf9cb181b05fb15acfd9a09acf55e4e4253 100644 (file)
@@ -98,6 +98,7 @@ enum cifs_param {
        Opt_nosharesock,
        Opt_persistent,
        Opt_resilient,
+       Opt_tcp_nodelay,
        Opt_domainauto,
        Opt_rdma,
        Opt_modesid,
@@ -166,6 +167,7 @@ struct smb3_fs_context {
        char *password;
        char *domainname;
        char *source;
+       char *server_hostname;
        char *UNC;
        char *nodename;
        char *iocharset;  /* local code page for mapping to and from Unicode */
index bb1185fff8cc4d6b760cf01c22a72e50f575c591..ba2c3e897b291a95a2380a940c058edeea8c81f8 100644 (file)
@@ -152,7 +152,7 @@ cifs_buf_get(void)
         * The SMB2 header is bigger than the CIFS one - no problem to clean some
         * more bytes for CIFS.
         */
-       size_t buf_size = sizeof(struct smb2_sync_hdr);
+       size_t buf_size = sizeof(struct smb2_hdr);
 
        /*
         * We could use negotiated size instead of max_msgsize -
index 181514b8770ddeef3bb9ea76e1240477312bbbd0..194799ddd38288847dec2b2a1b65f604ec4be322 100644 (file)
@@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status)
 int
 map_smb2_to_linux_error(char *buf, bool log_err)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
        unsigned int i;
        int rc = -EIO;
        __le32 smb2err = shdr->Status;
 
        if (smb2err == 0) {
-               trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId,
-                       le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId));
+               trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId),
+                             le64_to_cpu(shdr->SessionId),
+                             le16_to_cpu(shdr->Command),
+                             le64_to_cpu(shdr->MessageId));
                return 0;
        }
 
@@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err)
        cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n",
                 __le32_to_cpu(smb2err), rc);
 
-       trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId,
-                       le16_to_cpu(shdr->Command),
-                       le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc);
+       trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId),
+                          le64_to_cpu(shdr->SessionId),
+                          le16_to_cpu(shdr->Command),
+                          le64_to_cpu(shdr->MessageId),
+                          le32_to_cpu(smb2err), rc);
        return rc;
 }
index 29b5554f6263fd72c291413ccfaa57b97fc71a1c..cdcdef32759e4230815cf8c9c646d35d05295a58 100644 (file)
@@ -8,7 +8,6 @@
  *
  */
 #include <linux/ctype.h>
-#include "smb2pdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "smb2proto.h"
@@ -19,7 +18,7 @@
 #include "nterr.h"
 
 static int
-check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
+check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid)
 {
        __u64 wire_mid = le64_to_cpu(shdr->MessageId);
 
@@ -81,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
        /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24)
 };
 
-#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp))
+#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp))
 
-static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
+static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
                              __u32 non_ctxlen)
 {
        __u16 neg_count;
@@ -135,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len,
 int
 smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
-       struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
+       struct smb2_pdu *pdu = (struct smb2_pdu *)shdr;
        __u64 mid;
        __u32 clc_len;  /* calculated length */
        int command;
-       int pdu_size = sizeof(struct smb2_sync_pdu);
-       int hdr_size = sizeof(struct smb2_sync_hdr);
+       int pdu_size = sizeof(struct smb2_pdu);
+       int hdr_size = sizeof(struct smb2_hdr);
 
        /*
         * Add function to do table lookup of StructureSize by command
@@ -155,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
                /* decrypt frame now that it is completely read in */
                spin_lock(&cifs_tcp_ses_lock);
                list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
-                       if (ses->Suid == thdr->SessionId)
+                       if (ses->Suid == le64_to_cpu(thdr->SessionId))
                                break;
                }
                spin_unlock(&cifs_tcp_ses_lock);
@@ -296,7 +295,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
  * area and the offset to it (from the beginning of the SMB) are also returned.
  */
 char *
-smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
+smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr)
 {
        *off = 0;
        *len = 0;
@@ -401,8 +400,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
 unsigned int
 smb2_calc_size(void *buf, struct TCP_Server_Info *srvr)
 {
-       struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf;
-       struct smb2_sync_hdr *shdr = &pdu->sync_hdr;
+       struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+       struct smb2_hdr *shdr = &pdu->hdr;
        int offset; /* the offset from the beginning of SMB to data area */
        int data_length; /* the length of the variable length data area */
        /* Structure Size has already been checked to make sure it is 64 */
@@ -669,7 +668,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 
        cifs_dbg(FYI, "Checking for oplock break\n");
 
-       if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
+       if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
                return false;
 
        if (rsp->StructureSize !=
@@ -816,25 +815,25 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
 int
 smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 {
-       struct smb2_sync_hdr *sync_hdr = mid->resp_buf;
+       struct smb2_hdr *hdr = mid->resp_buf;
        struct smb2_create_rsp *rsp = mid->resp_buf;
        struct cifs_tcon *tcon;
        int rc;
 
-       if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE ||
-           sync_hdr->Status != STATUS_SUCCESS)
+       if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE ||
+           hdr->Status != STATUS_SUCCESS)
                return 0;
 
-       tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
-                                 sync_hdr->TreeId);
+       tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId),
+                                 le32_to_cpu(hdr->Id.SyncId.TreeId));
        if (!tcon)
                return -ENOENT;
 
        rc = __smb2_handle_cancelled_cmd(tcon,
-                                        le16_to_cpu(sync_hdr->Command),
-                                        le64_to_cpu(sync_hdr->MessageId),
-                                        rsp->PersistentFileId,
-                                        rsp->VolatileFileId);
+                                        le16_to_cpu(hdr->Command),
+                                        le64_to_cpu(hdr->MessageId),
+                                        le64_to_cpu(rsp->PersistentFileId),
+                                        le64_to_cpu(rsp->VolatileFileId));
        if (rc)
                cifs_put_tcon(tcon);
 
@@ -856,10 +855,10 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
 {
        int i, rc;
        struct sdesc *d;
-       struct smb2_sync_hdr *hdr;
+       struct smb2_hdr *hdr;
        struct TCP_Server_Info *server = cifs_ses_server(ses);
 
-       hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+       hdr = (struct smb2_hdr *)iov[0].iov_base;
        /* neg prot are always taken */
        if (hdr->Command == SMB2_NEGOTIATE)
                goto ok;
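A practical payoff of typing every wire field __le: comparisons like the session lookup above must now go through le64_to_cpu(), and sparse (make C=1) flags any site that forgets, which would silently break on big-endian machines. A minimal sketch of the checked pattern; wire_hdr and session_matches are illustrative reductions, not from this diff:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    struct wire_hdr { __le64 SessionId; };

    static bool session_matches(u64 suid, const struct wire_hdr *hdr)
    {
            /* hdr->SessionId is little-endian on the wire; suid is host-endian.
             * The __le64 annotation turns a direct comparison into a sparse
             * warning, forcing the conversion below. */
            return suid == le64_to_cpu(hdr->SessionId);
    }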
index bda606dc72b1f4dac44de87961c211fe9c4845fe..7acf71defea79ea58d659ea9191dd4f10b3b6eb5 100644 (file)
@@ -325,7 +325,7 @@ static struct mid_q_entry *
 __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
 {
        struct mid_q_entry *mid;
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
        __u64 wire_mid = le64_to_cpu(shdr->MessageId);
 
        if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
@@ -367,11 +367,11 @@ static void
 smb2_dump_detail(void *buf, struct TCP_Server_Info *server)
 {
 #ifdef CONFIG_CIFS_DEBUG2
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
 
        cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
                 shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
-                shdr->ProcessId);
+                shdr->Id.SyncId.ProcessId);
        cifs_server_dbg(VFS, "smb buf %p len %u\n", buf,
                 server->ops->calc_smb_size(buf, server));
 #endif
@@ -885,10 +885,10 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
        atomic_inc(&tcon->num_remote_opens);
 
        o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
-       oparms.fid->persistent_fid = o_rsp->PersistentFileId;
-       oparms.fid->volatile_fid = o_rsp->VolatileFileId;
+       oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId);
+       oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId);
 #ifdef CONFIG_CIFS_DEBUG2
-       oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId);
+       oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId);
 #endif /* CIFS_DEBUG2 */
 
        tcon->crfid.tcon = tcon;
@@ -2391,12 +2391,12 @@ again:
 
        /* If the open failed there is nothing to do */
        op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
-       if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) {
+       if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
                cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc);
                goto qdf_free;
        }
-       fid->persistent_fid = op_rsp->PersistentFileId;
-       fid->volatile_fid = op_rsp->VolatileFileId;
+       fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId);
+       fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId);
 
        /* Anything else than ENODATA means a genuine error */
        if (rc && rc != -ENODATA) {
@@ -2410,7 +2410,7 @@ again:
        atomic_inc(&tcon->num_remote_opens);
 
        qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
-       if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+       if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) {
                trace_smb3_query_dir_done(xid, fid->persistent_fid,
                                          tcon->tid, tcon->ses->Suid, 0, 0);
                srch_inf->endOfSearch = true;
@@ -2462,7 +2462,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
 static bool
 smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
        int scredits, in_flight;
 
        if (shdr->Status != STATUS_PENDING)
@@ -2489,13 +2489,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
 static bool
 smb2_is_session_expired(char *buf)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
 
        if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED &&
            shdr->Status != STATUS_USER_SESSION_DELETED)
                return false;
 
-       trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId,
+       trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId),
+                              le64_to_cpu(shdr->SessionId),
                               le16_to_cpu(shdr->Command),
                               le64_to_cpu(shdr->MessageId));
        cifs_dbg(FYI, "Session expired or deleted\n");
@@ -2506,7 +2507,7 @@ smb2_is_session_expired(char *buf)
 static bool
 smb2_is_status_io_timeout(char *buf)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
 
        if (shdr->Status == STATUS_IO_TIMEOUT)
                return true;
@@ -2517,7 +2518,7 @@ smb2_is_status_io_timeout(char *buf)
 static void
 smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
 {
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
        struct list_head *tmp, *tmp1;
        struct cifs_ses *ses;
        struct cifs_tcon *tcon;
@@ -2530,7 +2531,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
                ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
                list_for_each(tmp1, &ses->tcon_list) {
                        tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
-                       if (tcon->tid == shdr->TreeId) {
+                       if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
                                tcon->need_reconnect = true;
                                spin_unlock(&cifs_tcp_ses_lock);
                                pr_warn_once("Server share %s deleted.\n",
@@ -2558,9 +2559,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
 void
 smb2_set_related(struct smb_rqst *rqst)
 {
-       struct smb2_sync_hdr *shdr;
+       struct smb2_hdr *shdr;
 
-       shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+       shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
        if (shdr == NULL) {
                cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
                return;
@@ -2573,13 +2574,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0};
 void
 smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
 {
-       struct smb2_sync_hdr *shdr;
+       struct smb2_hdr *shdr;
        struct cifs_ses *ses = tcon->ses;
        struct TCP_Server_Info *server = ses->server;
        unsigned long len = smb_rqst_len(server, rqst);
        int i, num_padding;
 
-       shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base);
+       shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
        if (shdr == NULL) {
                cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n");
                return;
@@ -3124,7 +3125,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
                                resp_buftype, rsp_iov);
 
        create_rsp = rsp_iov[0].iov_base;
-       if (create_rsp && create_rsp->sync_hdr.Status)
+       if (create_rsp && create_rsp->hdr.Status)
                err_iov = rsp_iov[0];
        ioctl_rsp = rsp_iov[1].iov_base;
 
@@ -4369,8 +4370,8 @@ static void
 fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
                   struct smb_rqst *old_rq, __le16 cipher_type)
 {
-       struct smb2_sync_hdr *shdr =
-                       (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base;
+       struct smb2_hdr *shdr =
+                       (struct smb2_hdr *)old_rq->rq_iov[0].iov_base;
 
        memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
        tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
@@ -4496,7 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
        struct crypto_aead *tfm;
        unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
 
-       rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
+       rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
        if (rc) {
                cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__,
                         enc ? "en" : "de");
@@ -4788,7 +4789,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
        unsigned int cur_page_idx;
        unsigned int pad_len;
        struct cifs_readdata *rdata = mid->callback_data;
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)buf;
        struct bio_vec *bvec = NULL;
        struct iov_iter iter;
        struct kvec iov;
@@ -5117,7 +5118,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
 {
        int ret, length;
        char *buf = server->smallbuf;
-       struct smb2_sync_hdr *shdr;
+       struct smb2_hdr *shdr;
        unsigned int pdu_length = server->pdu_size;
        unsigned int buf_size;
        struct mid_q_entry *mid_entry;
@@ -5147,7 +5148,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server,
 
        next_is_large = server->large_buf;
 one_more:
-       shdr = (struct smb2_sync_hdr *)buf;
+       shdr = (struct smb2_hdr *)buf;
        if (shdr->NextCommand) {
                if (next_is_large)
                        next_buffer = (char *)cifs_buf_get();
@@ -5213,7 +5214,7 @@ smb3_receive_transform(struct TCP_Server_Info *server,
        unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
 
        if (pdu_length < sizeof(struct smb2_transform_hdr) +
-                                               sizeof(struct smb2_sync_hdr)) {
+                                               sizeof(struct smb2_hdr)) {
                cifs_server_dbg(VFS, "Transform message is too small (%u)\n",
                         pdu_length);
                cifs_reconnect(server);
@@ -5246,7 +5247,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 static int
 smb2_next_header(char *buf)
 {
-       struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf;
+       struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
        struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf;
 
        if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM)
@@ -5788,7 +5789,7 @@ struct smb_version_values smb20_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5809,7 +5810,7 @@ struct smb_version_values smb21_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5830,7 +5831,7 @@ struct smb_version_values smb3any_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5851,7 +5852,7 @@ struct smb_version_values smbdefault_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5872,7 +5873,7 @@ struct smb_version_values smb30_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5893,7 +5894,7 @@ struct smb_version_values smb302_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
@@ -5914,7 +5915,7 @@ struct smb_version_values smb311_values = {
        .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
        .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
        .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
-       .header_size = sizeof(struct smb2_sync_hdr),
+       .header_size = sizeof(struct smb2_hdr),
        .header_preamble_size = 0,
        .max_header_size = MAX_SMB2_HDR_SIZE,
        .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
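The FileId conversions scattered through this file follow from PersistentFileId/VolatileFileId being declared __le64 in the shared request/response structures, while cifs keeps struct cifs_fid in host byte order, so every copy in either direction now takes an explicit conversion. A sketch of the two directions, with reduced hypothetical struct names:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    struct wire_fid { __le64 PersistentFileId, VolatileFileId; }; /* on the wire */
    struct host_fid { u64 persistent_fid, volatile_fid; };        /* in memory  */

    static void fid_from_wire(struct host_fid *fid, const struct wire_fid *rsp)
    {
            fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId);
            fid->volatile_fid   = le64_to_cpu(rsp->VolatileFileId);
    }

    static void fid_to_wire(struct wire_fid *req, const struct host_fid *fid)
    {
            req->PersistentFileId = cpu_to_le64(fid->persistent_fid);
            req->VolatileFileId   = cpu_to_le64(fid->volatile_fid);
    }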
index 7829c590eeac64478a7855537e5c3fef90f92133..d2ecb2ea37c0df085966be3a9d536e8460aab86d 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/uuid.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
-#include "smb2pdu.h"
 #include "cifsglob.h"
 #include "cifsacl.h"
 #include "cifsproto.h"
@@ -84,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon)
 }
 
 static void
-smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
+smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
                  const struct cifs_tcon *tcon,
                  struct TCP_Server_Info *server)
 {
@@ -104,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
        } else {
                shdr->CreditRequest = cpu_to_le16(2);
        }
-       shdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+       shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid);
 
        if (!tcon)
                goto out;
@@ -115,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
                shdr->CreditCharge = cpu_to_le16(1);
        /* else CreditCharge MBZ */
 
-       shdr->TreeId = tcon->tid;
+       shdr->Id.SyncId.TreeId = cpu_to_le32(tcon->tid);
        /* Uid is not converted */
        if (tcon->ses)
-               shdr->SessionId = tcon->ses->Suid;
+               shdr->SessionId = cpu_to_le64(tcon->ses->Suid);
 
        /*
         * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
@@ -331,7 +330,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
               void *buf,
               unsigned int *total_len)
 {
-       struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf;
+       struct smb2_pdu *spdu = (struct smb2_pdu *)buf;
        /* lookup word count ie StructureSize from table */
        __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
 
@@ -341,10 +340,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon,
         */
        memset(buf, 0, 256);
 
-       smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server);
+       smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server);
        spdu->StructureSize2 = cpu_to_le16(parmsize);
 
-       *total_len = parmsize + sizeof(struct smb2_sync_hdr);
+       *total_len = parmsize + sizeof(struct smb2_hdr);
 }
 
 /*
@@ -367,7 +366,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
        }
 
        fill_small_buf(smb2_command, tcon, server,
-                      (struct smb2_sync_hdr *)(*request_buf),
+                      (struct smb2_hdr *)(*request_buf),
                       total_len);
 
        if (tcon != NULL) {
@@ -414,8 +413,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
        pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
        pneg_ctxt->DataLength = cpu_to_le16(38);
        pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
-       pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE);
-       get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE);
+       pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+       get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
        pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
 }
 
@@ -857,7 +856,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
        if (rc)
                return rc;
 
-       req->sync_hdr.SessionId = 0;
+       req->hdr.SessionId = 0;
 
        memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
        memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
@@ -1018,7 +1017,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
                server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
 
        security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
-                                              (struct smb2_sync_hdr *)rsp);
+                                              (struct smb2_hdr *)rsp);
        /*
         * See MS-SMB2 section 2.2.4: if no blob, client picks default which
         * for us will be
@@ -1250,23 +1249,23 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
                return rc;
 
        if (sess_data->ses->binding) {
-               req->sync_hdr.SessionId = sess_data->ses->Suid;
-               req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+               req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid);
+               req->hdr.Flags |= SMB2_FLAGS_SIGNED;
                req->PreviousSessionId = 0;
                req->Flags = SMB2_SESSION_REQ_FLAG_BINDING;
        } else {
                /* First session, not a reauthenticate */
-               req->sync_hdr.SessionId = 0;
+               req->hdr.SessionId = 0;
                /*
                 * if reconnect, we need to send previous sess id
                 * otherwise it is 0
                 */
-               req->PreviousSessionId = sess_data->previous_session;
+               req->PreviousSessionId = cpu_to_le64(sess_data->previous_session);
                req->Flags = 0; /* MBZ */
        }
 
        /* enough to enable echos and oplocks and one max size write */
-       req->sync_hdr.CreditRequest = cpu_to_le16(130);
+       req->hdr.CreditRequest = cpu_to_le16(130);
 
        /* only one of SMB2 signing flags may be set in SMB2 request */
        if (server->sign)
@@ -1425,7 +1424,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
        rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
        /* keep session id and flags if binding */
        if (!ses->binding) {
-               ses->Suid = rsp->sync_hdr.SessionId;
+               ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
                ses->session_flags = le16_to_cpu(rsp->SessionFlags);
        }
 
@@ -1501,7 +1500,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
 
        /* If true, rc here is expected and not an error */
        if (sess_data->buf0_type != CIFS_NO_BUFFER &&
-               rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+               rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
                rc = 0;
 
        if (rc)
@@ -1523,7 +1522,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
 
        /* keep existing ses id and flags if binding */
        if (!ses->binding) {
-               ses->Suid = rsp->sync_hdr.SessionId;
+               ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
                ses->session_flags = le16_to_cpu(rsp->SessionFlags);
        }
 
@@ -1558,7 +1557,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
                goto out;
 
        req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
-       req->sync_hdr.SessionId = ses->Suid;
+       req->hdr.SessionId = cpu_to_le64(ses->Suid);
 
        rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
                                        sess_data->nls_cp);
@@ -1584,7 +1583,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
 
        /* keep existing ses id and flags if binding */
        if (!ses->binding) {
-               ses->Suid = rsp->sync_hdr.SessionId;
+               ses->Suid = le64_to_cpu(rsp->hdr.SessionId);
                ses->session_flags = le16_to_cpu(rsp->SessionFlags);
        }
 
@@ -1715,12 +1714,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
                return rc;
 
         /* since no tcon, smb2_init cannot do this, so do it here */
-       req->sync_hdr.SessionId = ses->Suid;
+       req->hdr.SessionId = cpu_to_le64(ses->Suid);
 
        if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
                flags |= CIFS_TRANSFORM_REQ;
        else if (server->sign)
-               req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+               req->hdr.Flags |= SMB2_FLAGS_SIGNED;
 
        flags |= CIFS_NO_RSP_BUF;
 
@@ -1828,14 +1827,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
            !(ses->session_flags &
                    (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
            ((ses->user_name != NULL) || (ses->sectype == Kerberos)))
-               req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+               req->hdr.Flags |= SMB2_FLAGS_SIGNED;
 
        memset(&rqst, 0, sizeof(struct smb_rqst));
        rqst.rq_iov = iov;
        rqst.rq_nvec = 2;
 
        /* Need 64 for max size write so ask for more in case not there yet */
-       req->sync_hdr.CreditRequest = cpu_to_le16(64);
+       req->hdr.CreditRequest = cpu_to_le16(64);
 
        rc = cifs_send_recv(xid, ses, server,
                            &rqst, &resp_buftype, flags, &rsp_iov);
@@ -1871,7 +1870,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
        tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
        tcon->tidStatus = CifsGood;
        tcon->need_reconnect = false;
-       tcon->tid = rsp->sync_hdr.TreeId;
+       tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId);
        strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
        if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
@@ -1892,9 +1891,8 @@ tcon_exit:
        return rc;
 
 tcon_error_exit:
-       if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
+       if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
                cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
-       }
        goto tcon_exit;
 }
 
@@ -2608,7 +2606,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
        if (tcon->share_flags & SHI1005_FLAGS_DFS) {
                int name_len;
 
-               req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+               req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
                rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
                                                 &name_len,
                                                 tcon->treeName, utf16_path);
@@ -2672,11 +2670,13 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
        }
 
        rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
-       trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid,
+       trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId),
+                                   tcon->tid,
                                    ses->Suid, CREATE_NOT_FILE,
                                    FILE_WRITE_ATTRIBUTES);
 
-       SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId);
+       SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId),
+                  le64_to_cpu(rsp->VolatileFileId));
 
        /* Eventually save off posix specific response info and timestamps */
 
@@ -2740,7 +2740,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
        if (tcon->share_flags & SHI1005_FLAGS_DFS) {
                int name_len;
 
-               req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
+               req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS;
                rc = alloc_path_with_tree_prefix(&copy_path, &copy_size,
                                                 &name_len,
                                                 tcon->treeName, path);
@@ -2943,16 +2943,17 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
                }
                goto creat_exit;
        } else
-               trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid,
+               trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId),
+                                    tcon->tid,
                                     ses->Suid, oparms->create_options,
                                     oparms->desired_access);
 
        atomic_inc(&tcon->num_remote_opens);
-       oparms->fid->persistent_fid = rsp->PersistentFileId;
-       oparms->fid->volatile_fid = rsp->VolatileFileId;
+       oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId);
+       oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId);
        oparms->fid->access = oparms->desired_access;
 #ifdef CONFIG_CIFS_DEBUG2
-       oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
+       oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId);
 #endif /* CIFS_DEBUG2 */
 
        if (buf) {
@@ -3052,7 +3053,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
         * response size smaller.
         */
        req->MaxOutputResponse = cpu_to_le32(max_response_size);
-       req->sync_hdr.CreditCharge =
+       req->hdr.CreditCharge =
                cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
                                         SMB2_MAX_BUFFER_SIZE));
        if (is_fsctl)
@@ -3062,7 +3063,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 
        /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
        if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
-               req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+               req->hdr.Flags |= SMB2_FLAGS_SIGNED;
 
        return 0;
 }
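The CreditCharge line kept in context above implements the MS-SMB2 rule that one credit covers up to 64KiB of payload: the charge is the larger of the request and maximum response size, divided by 64KiB and rounded up. As plain arithmetic (SMB2_MAX_BUFFER_SIZE is 65536 in cifsglob.h; ioctl_credit_charge is an illustrative reduction):

    #include <linux/kernel.h>   /* DIV_ROUND_UP(), max() */
    #include <linux/types.h>

    #define SMB2_MAX_BUFFER_SIZE 65536

    static u16 ioctl_credit_charge(u32 indatalen, u32 max_response_size)
    {
            /* e.g. indatalen=24, max_response=1MiB: DIV_ROUND_UP(1MiB,64KiB)=16 */
            return DIV_ROUND_UP(max(indatalen, max_response_size),
                                SMB2_MAX_BUFFER_SIZE);
    }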
@@ -3236,8 +3237,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
        if (rc)
                return rc;
 
-       req->PersistentFileId = persistent_fid;
-       req->VolatileFileId = volatile_fid;
+       req->PersistentFileId = cpu_to_le64(persistent_fid);
+       req->VolatileFileId = cpu_to_le64(volatile_fid);
        if (query_attrs)
                req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
        else
@@ -3600,8 +3601,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst,
        if (rc)
                return rc;
 
-       req->PersistentFileId = persistent_fid;
-       req->VolatileFileId = volatile_fid;
+       req->PersistentFileId = cpu_to_le64(persistent_fid);
+       req->VolatileFileId = cpu_to_le64(volatile_fid);
        /* See note 354 of MS-SMB2, 64K max */
        req->OutputBufferLength =
                cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE);
@@ -3687,7 +3688,7 @@ smb2_echo_callback(struct mid_q_entry *mid)
 
        if (mid->mid_state == MID_RESPONSE_RECEIVED
            || mid->mid_state == MID_RESPONSE_MALFORMED) {
-               credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+               credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
                credits.instance = server->reconnect_instance;
        }
 
@@ -3787,7 +3788,7 @@ SMB2_echo(struct TCP_Server_Info *server)
        if (rc)
                return rc;
 
-       req->sync_hdr.CreditRequest = cpu_to_le16(1);
+       req->hdr.CreditRequest = cpu_to_le16(1);
 
        iov[0].iov_len = total_len;
        iov[0].iov_base = (char *)req;
@@ -3823,8 +3824,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst,
        if (rc)
                return rc;
 
-       req->PersistentFileId = persistent_fid;
-       req->VolatileFileId = volatile_fid;
+       req->PersistentFileId = cpu_to_le64(persistent_fid);
+       req->VolatileFileId = cpu_to_le64(volatile_fid);
 
        iov[0].iov_base = (char *)req;
        iov[0].iov_len = total_len;
@@ -3890,8 +3891,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
        unsigned int remaining_bytes, int request_type)
 {
        int rc = -EACCES;
-       struct smb2_read_plain_req *req = NULL;
-       struct smb2_sync_hdr *shdr;
+       struct smb2_read_req *req = NULL;
+       struct smb2_hdr *shdr;
        struct TCP_Server_Info *server = io_parms->server;
 
        rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server,
@@ -3902,11 +3903,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
        if (server == NULL)
                return -ECONNABORTED;
 
-       shdr = &req->sync_hdr;
-       shdr->ProcessId = cpu_to_le32(io_parms->pid);
+       shdr = &req->hdr;
+       shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
 
-       req->PersistentFileId = io_parms->persistent_fid;
-       req->VolatileFileId = io_parms->volatile_fid;
+       req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
+       req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
        req->ReadChannelInfoOffset = 0; /* reserved */
        req->ReadChannelInfoLength = 0; /* reserved */
        req->Channel = 0; /* reserved */
@@ -3940,7 +3941,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
                if (need_invalidate)
                        req->Channel = SMB2_CHANNEL_RDMA_V1;
                req->ReadChannelInfoOffset =
-                       cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer));
+                       cpu_to_le16(offsetof(struct smb2_read_req, Buffer));
                req->ReadChannelInfoLength =
                        cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
                v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
@@ -3964,10 +3965,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
                         * Related requests use info from previous read request
                         * in chain.
                         */
-                       shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
-                       shdr->TreeId = 0xFFFFFFFF;
-                       req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
-                       req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
+                       shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+                       shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF);
+                       req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
+                       req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF);
                }
        }
        if (remaining_bytes > io_parms->length)
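In a related compound request (SMB2_FLAGS_RELATED_OPERATIONS, set via smb2_set_related()), MS-SMB2 has the client fill SessionId, TreeId and both FileIds with all-ones sentinels meaning "inherit from the previous message in the chain". All-ones is the same bit pattern in either byte order, so the cpu_to_le* wrappers added above change no bytes on the wire; they only keep the now-typed fields consistent for sparse. A condensed sketch, using a reduced header with just the fields touched here:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    /* Reduced stand-in for smb2_hdr (full layout shown earlier). */
    struct hdr_ids {
            __le64 SessionId;
            __le32 TreeId;
    };

    static void mark_related(struct hdr_ids *h, __le64 *pfid, __le64 *vfid)
    {
            /* "reuse the previous message's ids" sentinels */
            h->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFFULL);
            h->TreeId    = cpu_to_le32(0xFFFFFFFF);
            *pfid = cpu_to_le64(0xFFFFFFFFFFFFFFFFULL);
            *vfid = cpu_to_le64(0xFFFFFFFFFFFFFFFFULL);
    }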
@@ -3985,8 +3986,8 @@ smb2_readv_callback(struct mid_q_entry *mid)
        struct cifs_readdata *rdata = mid->callback_data;
        struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
        struct TCP_Server_Info *server = rdata->server;
-       struct smb2_sync_hdr *shdr =
-                               (struct smb2_sync_hdr *)rdata->iov[0].iov_base;
+       struct smb2_hdr *shdr =
+                               (struct smb2_hdr *)rdata->iov[0].iov_base;
        struct cifs_credits credits = { .value = 0, .instance = 0 };
        struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
                                 .rq_nvec = 1,
@@ -4072,7 +4073,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
 {
        int rc, flags = 0;
        char *buf;
-       struct smb2_sync_hdr *shdr;
+       struct smb2_hdr *shdr;
        struct cifs_io_parms io_parms;
        struct smb_rqst rqst = { .rq_iov = rdata->iov,
                                 .rq_nvec = 1 };
@@ -4105,7 +4106,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
        rdata->iov[0].iov_base = buf;
        rdata->iov[0].iov_len = total_len;
 
-       shdr = (struct smb2_sync_hdr *)buf;
+       shdr = (struct smb2_hdr *)buf;
 
        if (rdata->credits.value > 0) {
                shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
@@ -4144,7 +4145,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 {
        struct smb_rqst rqst;
        int resp_buftype, rc;
-       struct smb2_read_plain_req *req = NULL;
+       struct smb2_read_req *req = NULL;
        struct smb2_read_rsp *rsp = NULL;
        struct kvec iov[1];
        struct kvec rsp_iov;
@@ -4178,19 +4179,22 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
                if (rc != -ENODATA) {
                        cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
                        cifs_dbg(VFS, "Send error in read = %d\n", rc);
-                       trace_smb3_read_err(xid, req->PersistentFileId,
+                       trace_smb3_read_err(xid,
+                                           le64_to_cpu(req->PersistentFileId),
                                            io_parms->tcon->tid, ses->Suid,
                                            io_parms->offset, io_parms->length,
                                            rc);
                } else
-                       trace_smb3_read_done(xid, req->PersistentFileId,
-                                   io_parms->tcon->tid, ses->Suid,
-                                   io_parms->offset, 0);
+                       trace_smb3_read_done(xid,
+                                            le64_to_cpu(req->PersistentFileId),
+                                            io_parms->tcon->tid, ses->Suid,
+                                            io_parms->offset, 0);
                free_rsp_buf(resp_buftype, rsp_iov.iov_base);
                cifs_small_buf_release(req);
                return rc == -ENODATA ? 0 : rc;
        } else
-               trace_smb3_read_done(xid, req->PersistentFileId,
+               trace_smb3_read_done(xid,
+                                    le64_to_cpu(req->PersistentFileId),
                                    io_parms->tcon->tid, ses->Suid,
                                    io_parms->offset, io_parms->length);
 
@@ -4238,7 +4242,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
 
        switch (mid->mid_state) {
        case MID_RESPONSE_RECEIVED:
-               credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+               credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
                credits.instance = server->reconnect_instance;
                wdata->result = smb2_check_receive(mid, server, 0);
                if (wdata->result != 0)
@@ -4264,7 +4268,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
                wdata->result = -EAGAIN;
                break;
        case MID_RESPONSE_MALFORMED:
-               credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+               credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
                credits.instance = server->reconnect_instance;
                fallthrough;
        default:
@@ -4311,7 +4315,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
 {
        int rc = -EACCES, flags = 0;
        struct smb2_write_req *req = NULL;
-       struct smb2_sync_hdr *shdr;
+       struct smb2_hdr *shdr;
        struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
        struct TCP_Server_Info *server = wdata->server;
        struct kvec iov[1];
@@ -4329,11 +4333,11 @@ smb2_async_writev(struct cifs_writedata *wdata,
        if (smb3_encryption_required(tcon))
                flags |= CIFS_TRANSFORM_REQ;
 
-       shdr = (struct smb2_sync_hdr *)req;
-       shdr->ProcessId = cpu_to_le32(wdata->cfile->pid);
+       shdr = (struct smb2_hdr *)req;
+       shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid);
 
-       req->PersistentFileId = wdata->cfile->fid.persistent_fid;
-       req->VolatileFileId = wdata->cfile->fid.volatile_fid;
+       req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid);
+       req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid);
        req->WriteChannelInfoOffset = 0;
        req->WriteChannelInfoLength = 0;
        req->Channel = 0;
@@ -4430,7 +4434,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
                             wdata, flags, &wdata->credits);
 
        if (rc) {
-               trace_smb3_write_err(0 /* no xid */, req->PersistentFileId,
+               trace_smb3_write_err(0 /* no xid */,
+                                    le64_to_cpu(req->PersistentFileId),
                                     tcon->tid, tcon->ses->Suid, wdata->offset,
                                     wdata->bytes, rc);
                kref_put(&wdata->refcount, release);
@@ -4481,10 +4486,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
        if (smb3_encryption_required(io_parms->tcon))
                flags |= CIFS_TRANSFORM_REQ;
 
-       req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
+       req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid);
 
-       req->PersistentFileId = io_parms->persistent_fid;
-       req->VolatileFileId = io_parms->volatile_fid;
+       req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid);
+       req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid);
        req->WriteChannelInfoOffset = 0;
        req->WriteChannelInfoLength = 0;
        req->Channel = 0;
@@ -4512,7 +4517,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
        rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
 
        if (rc) {
-               trace_smb3_write_err(xid, req->PersistentFileId,
+               trace_smb3_write_err(xid,
+                                    le64_to_cpu(req->PersistentFileId),
                                     io_parms->tcon->tid,
                                     io_parms->tcon->ses->Suid,
                                     io_parms->offset, io_parms->length, rc);
@@ -4520,10 +4526,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
                cifs_dbg(VFS, "Send error in write = %d\n", rc);
        } else {
                *nbytes = le32_to_cpu(rsp->DataLength);
-               trace_smb3_write_done(xid, req->PersistentFileId,
-                                    io_parms->tcon->tid,
-                                    io_parms->tcon->ses->Suid,
-                                    io_parms->offset, *nbytes);
+               trace_smb3_write_done(xid,
+                                     le64_to_cpu(req->PersistentFileId),
+                                     io_parms->tcon->tid,
+                                     io_parms->tcon->ses->Suid,
+                                     io_parms->offset, *nbytes);
        }
 
        cifs_small_buf_release(req);
@@ -4866,7 +4873,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 
        if (rc) {
                if (rc == -ENODATA &&
-                   rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
+                   rsp->hdr.Status == STATUS_NO_MORE_FILES) {
                        trace_smb3_query_dir_done(xid, persistent_fid,
                                tcon->tid, tcon->ses->Suid, index, 0);
                        srch_inf->endOfSearch = true;
@@ -4914,7 +4921,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
        if (rc)
                return rc;
 
-       req->sync_hdr.ProcessId = cpu_to_le32(pid);
+       req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
        req->InfoType = info_type;
        req->FileInfoClass = info_class;
        req->PersistentFileId = persistent_fid;
@@ -5074,7 +5081,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
        req->VolatileFid = volatile_fid;
        req->PersistentFid = persistent_fid;
        req->OplockLevel = oplock_level;
-       req->sync_hdr.CreditRequest = cpu_to_le16(1);
+       req->hdr.CreditRequest = cpu_to_le16(1);
 
        flags |= CIFS_NO_RSP_BUF;
 
@@ -5376,7 +5383,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
        if (smb3_encryption_required(tcon))
                flags |= CIFS_TRANSFORM_REQ;
 
-       req->sync_hdr.ProcessId = cpu_to_le32(pid);
+       req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid);
        req->LockCount = cpu_to_le16(num_lock);
 
        req->PersistentFileId = persist_fid;
@@ -5452,7 +5459,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
        if (smb3_encryption_required(tcon))
                flags |= CIFS_TRANSFORM_REQ;
 
-       req->sync_hdr.CreditRequest = cpu_to_le16(1);
+       req->hdr.CreditRequest = cpu_to_le16(1);
        req->StructureSize = cpu_to_le16(36);
        total_len += 12;
 
index f32c99c9ba13179bd40b232bb696fc51495e0c47..33cfd0a1adf12ca43bde5261184f1956d417b0bc 100644 (file)
 #include <net/sock.h>
 #include "cifsacl.h"
 
-/*
- * Note that, due to trying to use names similar to the protocol specifications,
- * there are many mixed case field names in the structures below.  Although
- * this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specification.
- *
- * SMB2 commands
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
- * (ie no useful data other than the SMB error code itself) and are marked such.
- * Knowing this helps avoid response buffer allocations and copy in some cases.
- */
-
-/* List of commands in host endian */
-#define SMB2_NEGOTIATE_HE      0x0000
-#define SMB2_SESSION_SETUP_HE  0x0001
-#define SMB2_LOGOFF_HE         0x0002 /* trivial request/resp */
-#define SMB2_TREE_CONNECT_HE   0x0003
-#define SMB2_TREE_DISCONNECT_HE        0x0004 /* trivial req/resp */
-#define SMB2_CREATE_HE         0x0005
-#define SMB2_CLOSE_HE          0x0006
-#define SMB2_FLUSH_HE          0x0007 /* trivial resp */
-#define SMB2_READ_HE           0x0008
-#define SMB2_WRITE_HE          0x0009
-#define SMB2_LOCK_HE           0x000A
-#define SMB2_IOCTL_HE          0x000B
-#define SMB2_CANCEL_HE         0x000C
-#define SMB2_ECHO_HE           0x000D
-#define SMB2_QUERY_DIRECTORY_HE        0x000E
-#define SMB2_CHANGE_NOTIFY_HE  0x000F
-#define SMB2_QUERY_INFO_HE     0x0010
-#define SMB2_SET_INFO_HE       0x0011
-#define SMB2_OPLOCK_BREAK_HE   0x0012
-
-/* The same list in little endian */
-#define SMB2_NEGOTIATE         cpu_to_le16(SMB2_NEGOTIATE_HE)
-#define SMB2_SESSION_SETUP     cpu_to_le16(SMB2_SESSION_SETUP_HE)
-#define SMB2_LOGOFF            cpu_to_le16(SMB2_LOGOFF_HE)
-#define SMB2_TREE_CONNECT      cpu_to_le16(SMB2_TREE_CONNECT_HE)
-#define SMB2_TREE_DISCONNECT   cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
-#define SMB2_CREATE            cpu_to_le16(SMB2_CREATE_HE)
-#define SMB2_CLOSE             cpu_to_le16(SMB2_CLOSE_HE)
-#define SMB2_FLUSH             cpu_to_le16(SMB2_FLUSH_HE)
-#define SMB2_READ              cpu_to_le16(SMB2_READ_HE)
-#define SMB2_WRITE             cpu_to_le16(SMB2_WRITE_HE)
-#define SMB2_LOCK              cpu_to_le16(SMB2_LOCK_HE)
-#define SMB2_IOCTL             cpu_to_le16(SMB2_IOCTL_HE)
-#define SMB2_CANCEL            cpu_to_le16(SMB2_CANCEL_HE)
-#define SMB2_ECHO              cpu_to_le16(SMB2_ECHO_HE)
-#define SMB2_QUERY_DIRECTORY   cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
-#define SMB2_CHANGE_NOTIFY     cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
-#define SMB2_QUERY_INFO                cpu_to_le16(SMB2_QUERY_INFO_HE)
-#define SMB2_SET_INFO          cpu_to_le16(SMB2_SET_INFO_HE)
-#define SMB2_OPLOCK_BREAK      cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
-
-#define SMB2_INTERNAL_CMD      cpu_to_le16(0xFFFF)
-
-#define NUMBER_OF_SMB2_COMMANDS        0x0013
-
 /* 52 transform hdr + 64 hdr + 88 create rsp */
 #define SMB2_TRANSFORM_HEADER_SIZE 52
 #define MAX_SMB2_HDR_SIZE 204
 
-#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
-#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
-#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
-
-/*
- * SMB2 Header Definition
- *
- * "MBZ" :  Must be Zero
- * "BB"  :  BugBug, Something to check/review/analyze later
- * "PDU" :  "Protocol Data Unit" (ie a network "frame")
- *
- */
-
-#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
-
-struct smb2_sync_hdr {
-       __le32 ProtocolId;      /* 0xFE 'S' 'M' 'B' */
-       __le16 StructureSize;   /* 64 */
-       __le16 CreditCharge;    /* MBZ */
-       __le32 Status;          /* Error from server */
-       __le16 Command;
-       __le16 CreditRequest;  /* CreditResponse */
-       __le32 Flags;
-       __le32 NextCommand;
-       __le64 MessageId;
-       __le32 ProcessId;
-       __u32  TreeId;          /* opaque - so do not make little endian */
-       __u64  SessionId;       /* opaque - so do not make little endian */
-       __u8   Signature[16];
-} __packed;
-
 /* The total header size for SMB2 read and write */
-#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr))
-
-struct smb2_sync_pdu {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize2; /* size of wct area (varies, request specific) */
-} __packed;
-
-#define SMB3_AES_CCM_NONCE 11
-#define SMB3_AES_GCM_NONCE 12
-
-/* Transform flags (for 3.0 dialect this flag indicates CCM) */
-#define TRANSFORM_FLAG_ENCRYPTED       0x0001
-struct smb2_transform_hdr {
-       __le32 ProtocolId;      /* 0xFD 'S' 'M' 'B' */
-       __u8   Signature[16];
-       __u8   Nonce[16];
-       __le32 OriginalMessageSize;
-       __u16  Reserved1;
-       __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
-       __u64  SessionId;
-} __packed;
-
-/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr_unchained {
-       __le32 ProtocolId;      /* 0xFC 'S' 'M' 'B' */
-       __le32 OriginalCompressedSegmentSize;
-       __le16 CompressionAlgorithm;
-       __le16 Flags;
-       __le16 Length; /* if chained it is length, else offset */
-} __packed;
-
-/* See MS-SMB2 2.2.42.1 */
-#define SMB2_COMPRESSION_FLAG_NONE     0x0000
-#define SMB2_COMPRESSION_FLAG_CHAINED  0x0001
-
-struct compression_payload_header {
-       __le16  CompressionAlgorithm;
-       __le16  Flags;
-       __le32  Length; /* length of compressed payload including field below if present */
-       /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2 */
-struct smb2_compression_transform_hdr_chained {
-       __le32 ProtocolId;      /* 0xFC 'S' 'M' 'B' */
-       __le32 OriginalCompressedSegmentSize;
-       /* struct compression_payload_header[] */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2.2 */
-struct compression_pattern_payload_v1 {
-       __le16  Pattern;
-       __le16  Reserved1;
-       __le16  Reserved2;
-       __le32  Repetitions;
-} __packed;
+#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr))
 
 /* See MS-SMB2 2.2.43 */
 struct smb2_rdma_transform {
@@ -189,17 +45,6 @@ struct smb2_rdma_crypto_transform {
        /* followed by padding */
 } __packed;
 
-/*
- *     SMB2 flag definitions
- */
-#define SMB2_FLAGS_SERVER_TO_REDIR     cpu_to_le32(0x00000001)
-#define SMB2_FLAGS_ASYNC_COMMAND       cpu_to_le32(0x00000002)
-#define SMB2_FLAGS_RELATED_OPERATIONS  cpu_to_le32(0x00000004)
-#define SMB2_FLAGS_SIGNED              cpu_to_le32(0x00000008)
-#define SMB2_FLAGS_PRIORITY_MASK       cpu_to_le32(0x00000070) /* SMB3.1.1 */
-#define SMB2_FLAGS_DFS_OPERATIONS      cpu_to_le32(0x10000000)
-#define SMB2_FLAGS_REPLAY_OPERATION    cpu_to_le32(0x20000000) /* SMB3 & up */
-
 /*
  *     Definitions for SMB2 Protocol Data Units (network frames)
  *
@@ -214,7 +59,7 @@ struct smb2_rdma_crypto_transform {
 #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9)
 
 struct smb2_err_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize;
        __le16 Reserved; /* MBZ */
        __le32 ByteCount;  /* even if zero, at least one byte follows */
@@ -270,530 +115,6 @@ struct share_redirect_error_context_rsp {
        /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
 } __packed;
 
-#define SMB2_CLIENT_GUID_SIZE 16
-
-struct smb2_negotiate_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 36 */
-       __le16 DialectCount;
-       __le16 SecurityMode;
-       __le16 Reserved;        /* MBZ */
-       __le32 Capabilities;
-       __u8   ClientGUID[SMB2_CLIENT_GUID_SIZE];
-       /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
-       __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
-       __le16 NegotiateContextCount;  /* SMB3.1.1 only. MBZ earlier */
-       __le16 Reserved2;
-       __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */
-} __packed;
-
-/* Dialects */
-#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */
-#define SMB20_PROT_ID 0x0202
-#define SMB21_PROT_ID 0x0210
-#define SMB30_PROT_ID 0x0300
-#define SMB302_PROT_ID 0x0302
-#define SMB311_PROT_ID 0x0311
-#define BAD_PROT_ID   0xFFFF
-
-/* SecurityMode flags */
-#define        SMB2_NEGOTIATE_SIGNING_ENABLED  0x0001
-#define SMB2_NEGOTIATE_SIGNING_REQUIRED        0x0002
-#define SMB2_SEC_MODE_FLAGS_ALL                0x0003
-
-/* Capabilities flags */
-#define SMB2_GLOBAL_CAP_DFS            0x00000001
-#define SMB2_GLOBAL_CAP_LEASING                0x00000002 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_LARGE_MTU      0X00000004 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_MULTI_CHANNEL  0x00000008 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING  0x00000020 /* New to SMB3 */
-#define SMB2_GLOBAL_CAP_ENCRYPTION     0x00000040 /* New to SMB3 */
-/* Internal types */
-#define SMB2_NT_FIND                   0x00100000
-#define SMB2_LARGE_FILES               0x00200000
-
-
-/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
-#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES    cpu_to_le16(1)
-#define SMB2_ENCRYPTION_CAPABILITIES           cpu_to_le16(2)
-#define SMB2_COMPRESSION_CAPABILITIES          cpu_to_le16(3)
-#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID      cpu_to_le16(5)
-#define SMB2_TRANSPORT_CAPABILITIES            cpu_to_le16(6)
-#define SMB2_RDMA_TRANSFORM_CAPABILITIES       cpu_to_le16(7)
-#define SMB2_SIGNING_CAPABILITIES              cpu_to_le16(8)
-#define SMB2_POSIX_EXTENSIONS_AVAILABLE                cpu_to_le16(0x100)
-
-struct smb2_neg_context {
-       __le16  ContextType;
-       __le16  DataLength;
-       __le32  Reserved;
-       /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
-} __packed;
-
-#define SMB311_LINUX_CLIENT_SALT_SIZE                  32
-/* Hash Algorithm Types */
-#define SMB2_PREAUTH_INTEGRITY_SHA512  cpu_to_le16(0x0001)
-#define SMB2_PREAUTH_HASH_SIZE 64
-
-/*
- * SaltLength that the server sends can be zero, so the only three required
- * fields (all __le16) end up six bytes total; thus the minimum context data
- * len in the response is six bytes, which accounts for
- *
- *      HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
- */
-#define MIN_PREAUTH_CTXT_DATA_LEN 6
-
-struct smb2_preauth_neg_context {
-       __le16  ContextType; /* 1 */
-       __le16  DataLength;
-       __le32  Reserved;
-       __le16  HashAlgorithmCount; /* 1 */
-       __le16  SaltLength;
-       __le16  HashAlgorithms; /* HashAlgorithms[0] since only one defined */
-       __u8    Salt[SMB311_LINUX_CLIENT_SALT_SIZE];
-} __packed;
-
-/* Encryption Algorithms Ciphers */
-#define SMB2_ENCRYPTION_AES128_CCM     cpu_to_le16(0x0001)
-#define SMB2_ENCRYPTION_AES128_GCM     cpu_to_le16(0x0002)
-/* we currently do not request AES256_CCM since GCM is presumably faster */
-#define SMB2_ENCRYPTION_AES256_CCM      cpu_to_le16(0x0003)
-#define SMB2_ENCRYPTION_AES256_GCM      cpu_to_le16(0x0004)
-
-/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
-#define MIN_ENCRYPT_CTXT_DATA_LEN      4
-struct smb2_encryption_neg_context {
-       __le16  ContextType; /* 2 */
-       __le16  DataLength;
-       __le32  Reserved;
-       /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
-       __le16  CipherCount; /* AES128-GCM and AES128-CCM by default */
-       __le16  Ciphers[3];
-} __packed;
-
-/* See MS-SMB2 2.2.3.1.3 */
-#define SMB3_COMPRESS_NONE     cpu_to_le16(0x0000)
-#define SMB3_COMPRESS_LZNT1    cpu_to_le16(0x0001)
-#define SMB3_COMPRESS_LZ77     cpu_to_le16(0x0002)
-#define SMB3_COMPRESS_LZ77_HUFF        cpu_to_le16(0x0003)
-/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
-#define SMB3_COMPRESS_PATTERN  cpu_to_le16(0x0004) /* Pattern_V1 */
-
-/* Compression Flags */
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE                cpu_to_le32(0x00000000)
-#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED     cpu_to_le32(0x00000001)
-
-struct smb2_compression_capabilities_context {
-       __le16  ContextType; /* 3 */
-       __le16  DataLength;
-       __u32   Reserved;
-       __le16  CompressionAlgorithmCount;
-       __u16   Padding;
-       __u32   Flags;
-       __le16  CompressionAlgorithms[3];
-       __u16   Pad;  /* Some servers require pad to DataLen multiple of 8 */
-       /* Check if pad needed */
-} __packed;
-
-/*
- * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
- * Its struct simply contains NetName, an array of Unicode characters
- */
-struct smb2_netname_neg_context {
-       __le16  ContextType; /* 5 */
-       __le16  DataLength;
-       __le32  Reserved;
-       __le16  NetName[]; /* hostname of target converted to UCS-2 */
-} __packed;
-
-/*
- * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
- * and 2.2.4.1.5
- */
-
-/* Flags */
-#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY   0x00000001
-
-struct smb2_transport_capabilities_context {
-       __le16  ContextType; /* 6 */
-       __le16  DataLength;
-       __u32   Reserved;
-       __le32  Flags;
-       __u32   Pad;
-} __packed;
-
-/*
- * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
- * and 2.2.4.1.6
- */
-
-/* RDMA Transform IDs */
-#define SMB2_RDMA_TRANSFORM_NONE       0x0000
-#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
-#define SMB2_RDMA_TRANSFORM_SIGNING    0x0002
-
-struct smb2_rdma_transform_capabilities_context {
-       __le16  ContextType; /* 7 */
-       __le16  DataLength;
-       __u32   Reserved;
-       __le16  TransformCount;
-       __u16   Reserved1;
-       __u32   Reserved2;
-       __le16  RDMATransformIds[];
-} __packed;
-
-/*
- * For signing capabilities context see MS-SMB2 2.2.3.1.7
- * and 2.2.4.1.7
- */
-
-/* Signing algorithms */
-#define SIGNING_ALG_HMAC_SHA256        0
-#define SIGNING_ALG_AES_CMAC   1
-#define SIGNING_ALG_AES_GMAC   2
-
-struct smb2_signing_capabilities {
-       __le16  ContextType; /* 8 */
-       __le16  DataLength;
-       __u32   Reserved;
-       __le16  SigningAlgorithmCount;
-       __le16  SigningAlgorithms[];
-       /*  Followed by padding to 8 byte boundary (required by some servers) */
-} __packed;
-
-#define POSIX_CTXT_DATA_LEN    16
-struct smb2_posix_neg_context {
-       __le16  ContextType; /* 0x100 */
-       __le16  DataLength;
-       __le32  Reserved;
-       __u8    Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
-} __packed;
-
-struct smb2_negotiate_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 65 */
-       __le16 SecurityMode;
-       __le16 DialectRevision;
-       __le16 NegotiateContextCount;   /* Prior to SMB3.1.1 was Reserved & MBZ */
-       __u8   ServerGUID[16];
-       __le32 Capabilities;
-       __le32 MaxTransactSize;
-       __le32 MaxReadSize;
-       __le32 MaxWriteSize;
-       __le64 SystemTime;      /* MBZ */
-       __le64 ServerStartTime;
-       __le16 SecurityBufferOffset;
-       __le16 SecurityBufferLength;
-       __le32 NegotiateContextOffset;  /* Pre:SMB3.1.1 was reserved/ignored */
-       __u8   Buffer[1];       /* variable length GSS security buffer */
-} __packed;
-
-/* Flags */
-#define SMB2_SESSION_REQ_FLAG_BINDING          0x01
-#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA     0x04
-
-struct smb2_sess_setup_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 25 */
-       __u8   Flags;
-       __u8   SecurityMode;
-       __le32 Capabilities;
-       __le32 Channel;
-       __le16 SecurityBufferOffset;
-       __le16 SecurityBufferLength;
-       __u64 PreviousSessionId;
-       __u8   Buffer[1];       /* variable length GSS security buffer */
-} __packed;
-
-/* Currently defined SessionFlags */
-#define SMB2_SESSION_FLAG_IS_GUEST     0x0001
-#define SMB2_SESSION_FLAG_IS_NULL      0x0002
-#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
-struct smb2_sess_setup_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 9 */
-       __le16 SessionFlags;
-       __le16 SecurityBufferOffset;
-       __le16 SecurityBufferLength;
-       __u8   Buffer[1];       /* variable length GSS security buffer */
-} __packed;
-
-struct smb2_logoff_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 4 */
-       __le16 Reserved;
-} __packed;
-
-struct smb2_logoff_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 4 */
-       __le16 Reserved;
-} __packed;
-
-/* Flags/Reserved for SMB3.1.1 */
-#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
-#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
-#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
-
-struct smb2_tree_connect_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 9 */
-       __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */
-       __le16 PathOffset;
-       __le16 PathLength;
-       __u8   Buffer[1];       /* variable length */
-} __packed;
-
-/* See MS-SMB2 section 2.2.9.2 */
-/* Context Types */
-#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
-#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
-
-struct tree_connect_contexts {
-       __le16 ContextType;
-       __le16 DataLength;
-       __le32 Reserved;
-       __u8   Data[];
-} __packed;
-
-/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
-struct smb3_blob_data {
-       __le16 BlobSize;
-       __u8   BlobData[];
-} __packed;
-
-/* Valid values for Attr */
-#define SE_GROUP_MANDATORY             0x00000001
-#define SE_GROUP_ENABLED_BY_DEFAULT    0x00000002
-#define SE_GROUP_ENABLED               0x00000004
-#define SE_GROUP_OWNER                 0x00000008
-#define SE_GROUP_USE_FOR_DENY_ONLY     0x00000010
-#define SE_GROUP_INTEGRITY             0x00000020
-#define SE_GROUP_INTEGRITY_ENABLED     0x00000040
-#define SE_GROUP_RESOURCE              0x20000000
-#define SE_GROUP_LOGON_ID              0xC0000000
-
-/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
-
-struct sid_array_data {
-       __le16 SidAttrCount;
-       /* SidAttrList - array of sid_attr_data structs */
-} __packed;
-
-struct luid_attr_data {
-
-} __packed;
-
-/*
- * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
- * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR_DATA
- */
-
-struct privilege_array_data {
-       __le16 PrivilegeCount;
-       /* array of privilege_data structs */
-} __packed;
-
-struct remoted_identity_tcon_context {
-       __le16 TicketType; /* must be 0x0001 */
-       __le16 TicketSize; /* total size of this struct */
-       __le16 User; /* offset to SID_ATTR_DATA struct with user info */
-       __le16 UserName; /* offset to null terminated Unicode username string */
-       __le16 Domain; /* offset to null terminated Unicode domain name */
-       __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
-       __le16 RestrictedGroups; /* similar to above */
-       __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
-       __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
-       __le16 Owner; /* offset to BLOB_DATA struct */
-       __le16 DefaultDacl; /* offset to BLOB_DATA struct */
-       __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
-       __le16 UserClaims; /* offset to BLOB_DATA struct */
-       __le16 DeviceClaims; /* offset to BLOB_DATA struct */
-       __u8   TicketInfo[]; /* variable length buf - remoted identity data */
-} __packed;
-
-struct smb2_tree_connect_req_extension {
-       __le32 TreeConnectContextOffset;
-       __le16 TreeConnectContextCount;
-       __u8  Reserved[10];
-       __u8  PathName[]; /* variable sized array */
-       /* followed by array of TreeConnectContexts */
-} __packed;
-
-struct smb2_tree_connect_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 16 */
-       __u8   ShareType;  /* see below */
-       __u8   Reserved;
-       __le32 ShareFlags; /* see below */
-       __le32 Capabilities; /* see below */
-       __le32 MaximalAccess;
-} __packed;
-
-/* Possible ShareType values */
-#define SMB2_SHARE_TYPE_DISK   0x01
-#define SMB2_SHARE_TYPE_PIPE   0x02
-#define        SMB2_SHARE_TYPE_PRINT   0x03
-
-/*
- * Possible ShareFlags - exactly one of the first 4 caching flags must be
- * set (any of the remaining, SHI1005, flags may be set individually or in
- * combination).
- */
-#define SMB2_SHAREFLAG_MANUAL_CACHING                  0x00000000
-#define SMB2_SHAREFLAG_AUTO_CACHING                    0x00000010
-#define SMB2_SHAREFLAG_VDO_CACHING                     0x00000020
-#define SMB2_SHAREFLAG_NO_CACHING                      0x00000030
-#define SHI1005_FLAGS_DFS                              0x00000001
-#define SHI1005_FLAGS_DFS_ROOT                         0x00000002
-#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS         0x00000100
-#define SHI1005_FLAGS_FORCE_SHARED_DELETE              0x00000200
-#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING          0x00000400
-#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM      0x00000800
-#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK             0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH_V1                   0x00002000
-#define SHI1005_FLAGS_ENABLE_HASH_V2                   0x00004000
-#define SHI1005_FLAGS_ENCRYPT_DATA                     0x00008000
-#define SMB2_SHAREFLAG_IDENTITY_REMOTING               0x00040000 /* 3.1.1 */
-#define SMB2_SHAREFLAG_COMPRESS_DATA                   0x00100000 /* 3.1.1 */
-#define SHI1005_FLAGS_ALL                              0x0014FF33
-
-/* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS     cpu_to_le32(0x00000008) /* all dialects */
-#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
-#define SMB2_SHARE_CAP_SCALEOUT        cpu_to_le32(0x00000020) /* 3.0 */
-#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
-#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
-#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
-
-struct smb2_tree_disconnect_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 4 */
-       __le16 Reserved;
-} __packed;
-
-struct smb2_tree_disconnect_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 4 */
-       __le16 Reserved;
-} __packed;
-
-/* File Attributes */
-#define FILE_ATTRIBUTE_READONLY                        0x00000001
-#define FILE_ATTRIBUTE_HIDDEN                  0x00000002
-#define FILE_ATTRIBUTE_SYSTEM                  0x00000004
-#define FILE_ATTRIBUTE_DIRECTORY               0x00000010
-#define FILE_ATTRIBUTE_ARCHIVE                 0x00000020
-#define FILE_ATTRIBUTE_NORMAL                  0x00000080
-#define FILE_ATTRIBUTE_TEMPORARY               0x00000100
-#define FILE_ATTRIBUTE_SPARSE_FILE             0x00000200
-#define FILE_ATTRIBUTE_REPARSE_POINT           0x00000400
-#define FILE_ATTRIBUTE_COMPRESSED              0x00000800
-#define FILE_ATTRIBUTE_OFFLINE                 0x00001000
-#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED     0x00002000
-#define FILE_ATTRIBUTE_ENCRYPTED               0x00004000
-#define FILE_ATTRIBUTE_INTEGRITY_STREAM                0x00008000
-#define FILE_ATTRIBUTE_NO_SCRUB_DATA           0x00020000
-
-/* Oplock levels */
-#define SMB2_OPLOCK_LEVEL_NONE         0x00
-#define SMB2_OPLOCK_LEVEL_II           0x01
-#define SMB2_OPLOCK_LEVEL_EXCLUSIVE    0x08
-#define SMB2_OPLOCK_LEVEL_BATCH                0x09
-#define SMB2_OPLOCK_LEVEL_LEASE                0xFF
-/* Non-spec internal type */
-#define SMB2_OPLOCK_LEVEL_NOCHANGE     0x99
-
-/* Desired Access Flags */
-#define FILE_READ_DATA_LE              cpu_to_le32(0x00000001)
-#define FILE_WRITE_DATA_LE             cpu_to_le32(0x00000002)
-#define FILE_APPEND_DATA_LE            cpu_to_le32(0x00000004)
-#define FILE_READ_EA_LE                        cpu_to_le32(0x00000008)
-#define FILE_WRITE_EA_LE               cpu_to_le32(0x00000010)
-#define FILE_EXECUTE_LE                        cpu_to_le32(0x00000020)
-#define FILE_READ_ATTRIBUTES_LE                cpu_to_le32(0x00000080)
-#define FILE_WRITE_ATTRIBUTES_LE       cpu_to_le32(0x00000100)
-#define FILE_DELETE_LE                 cpu_to_le32(0x00010000)
-#define FILE_READ_CONTROL_LE           cpu_to_le32(0x00020000)
-#define FILE_WRITE_DAC_LE              cpu_to_le32(0x00040000)
-#define FILE_WRITE_OWNER_LE            cpu_to_le32(0x00080000)
-#define FILE_SYNCHRONIZE_LE            cpu_to_le32(0x00100000)
-#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
-#define FILE_MAXIMAL_ACCESS_LE         cpu_to_le32(0x02000000)
-#define FILE_GENERIC_ALL_LE            cpu_to_le32(0x10000000)
-#define FILE_GENERIC_EXECUTE_LE                cpu_to_le32(0x20000000)
-#define FILE_GENERIC_WRITE_LE          cpu_to_le32(0x40000000)
-#define FILE_GENERIC_READ_LE           cpu_to_le32(0x80000000)
-
-/* ShareAccess Flags */
-#define FILE_SHARE_READ_LE             cpu_to_le32(0x00000001)
-#define FILE_SHARE_WRITE_LE            cpu_to_le32(0x00000002)
-#define FILE_SHARE_DELETE_LE           cpu_to_le32(0x00000004)
-#define FILE_SHARE_ALL_LE              cpu_to_le32(0x00000007)
-
-/* CreateDisposition Flags */
-#define FILE_SUPERSEDE_LE              cpu_to_le32(0x00000000)
-#define FILE_OPEN_LE                   cpu_to_le32(0x00000001)
-#define FILE_CREATE_LE                 cpu_to_le32(0x00000002)
-#define        FILE_OPEN_IF_LE                 cpu_to_le32(0x00000003)
-#define FILE_OVERWRITE_LE              cpu_to_le32(0x00000004)
-#define FILE_OVERWRITE_IF_LE           cpu_to_le32(0x00000005)
-
-/* CreateOptions Flags */
-#define FILE_DIRECTORY_FILE_LE         cpu_to_le32(0x00000001)
-/* same as #define CREATE_NOT_FILE_LE  cpu_to_le32(0x00000001) */
-#define FILE_WRITE_THROUGH_LE          cpu_to_le32(0x00000002)
-#define FILE_SEQUENTIAL_ONLY_LE                cpu_to_le32(0x00000004)
-#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008)
-#define FILE_SYNCHRONOUS_IO_ALERT_LE   cpu_to_le32(0x00000010)
-#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE       cpu_to_le32(0x00000020)
-#define FILE_NON_DIRECTORY_FILE_LE     cpu_to_le32(0x00000040)
-#define FILE_COMPLETE_IF_OPLOCKED_LE   cpu_to_le32(0x00000100)
-#define FILE_NO_EA_KNOWLEDGE_LE                cpu_to_le32(0x00000200)
-#define FILE_RANDOM_ACCESS_LE          cpu_to_le32(0x00000800)
-#define FILE_DELETE_ON_CLOSE_LE                cpu_to_le32(0x00001000)
-#define FILE_OPEN_BY_FILE_ID_LE                cpu_to_le32(0x00002000)
-#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
-#define FILE_NO_COMPRESSION_LE         cpu_to_le32(0x00008000)
-#define FILE_RESERVE_OPFILTER_LE       cpu_to_le32(0x00100000)
-#define FILE_OPEN_REPARSE_POINT_LE     cpu_to_le32(0x00200000)
-#define FILE_OPEN_NO_RECALL_LE         cpu_to_le32(0x00400000)
-#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000)
-
-#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
-                       | FILE_READ_ATTRIBUTES_LE)
-#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
-                       | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
-#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
-
-/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
-#define IL_ANONYMOUS           cpu_to_le32(0x00000000)
-#define IL_IDENTIFICATION      cpu_to_le32(0x00000001)
-#define IL_IMPERSONATION       cpu_to_le32(0x00000002)
-#define IL_DELEGATE            cpu_to_le32(0x00000003)
-
-/* Create Context Values */
-#define SMB2_CREATE_EA_BUFFER                  "ExtA" /* extended attributes */
-#define SMB2_CREATE_SD_BUFFER                  "SecD" /* security descriptor */
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST     "DHnQ"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT   "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE            "AISi"
-#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
-#define SMB2_CREATE_TIMEWARP_REQUEST           "TWrp"
-#define SMB2_CREATE_QUERY_ON_DISK_ID           "QFid"
-#define SMB2_CREATE_REQUEST_LEASE              "RqLs"
-#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2  "DH2Q"
-#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2        "DH2C"
-#define SMB2_CREATE_APP_INSTANCE_ID    0x45BCA66AEFA7F74A9008FA462E144D74
-#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010
-#define SVHDX_OPEN_DEVICE_CONTEX       0x9CCBCF9E04C1E643980E158DA1F6EC83
-#define SMB2_CREATE_TAG_POSIX          0x93AD25509CB411E7B42383DE968BCD7C
-
-/* Flag (SMB3 open response) values */
-#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
-
 /*
  * Maximum number of iovs we need for an open/create request.
  * [0] : struct smb2_create_req
@@ -807,26 +128,6 @@ struct smb2_tree_disconnect_rsp {
  */
 #define SMB2_CREATE_IOV_SIZE 8
 
-struct smb2_create_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 57 */
-       __u8   SecurityFlags;
-       __u8   RequestedOplockLevel;
-       __le32 ImpersonationLevel;
-       __le64 SmbCreateFlags;
-       __le64 Reserved;
-       __le32 DesiredAccess;
-       __le32 FileAttributes;
-       __le32 ShareAccess;
-       __le32 CreateDisposition;
-       __le32 CreateOptions;
-       __le16 NameOffset;
-       __le16 NameLength;
-       __le32 CreateContextsOffset;
-       __le32 CreateContextsLength;
-       __u8   Buffer[];
-} __packed;
-
 /*
  * Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
  * 88 (fixed part of create response) + 520 (path) + 208 (contexts) +
@@ -834,37 +135,6 @@ struct smb2_create_req {
  */
 #define MAX_SMB2_CREATE_RESPONSE_SIZE 880
 
-struct smb2_create_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 89 */
-       __u8   OplockLevel;
-       __u8   Flag;  /* 0x01 if reparse point */
-       __le32 CreateAction;
-       __le64 CreationTime;
-       __le64 LastAccessTime;
-       __le64 LastWriteTime;
-       __le64 ChangeTime;
-       __le64 AllocationSize;
-       __le64 EndofFile;
-       __le32 FileAttributes;
-       __le32 Reserved2;
-       __u64  PersistentFileId; /* opaque endianness */
-       __u64  VolatileFileId; /* opaque endianness */
-       __le32 CreateContextsOffset;
-       __le32 CreateContextsLength;
-       __u8   Buffer[1];
-} __packed;
-
-struct create_context {
-       __le32 Next;
-       __le16 NameOffset;
-       __le16 NameLength;
-       __le16 Reserved;
-       __le16 DataOffset;
-       __le32 DataLength;
-       __u8 Buffer[];
-} __packed;
-
 #define SMB2_LEASE_READ_CACHING_HE     0x01
 #define SMB2_LEASE_HANDLE_CACHING_HE   0x02
 #define SMB2_LEASE_WRITE_CACHING_HE    0x04
@@ -1210,7 +480,7 @@ struct duplicate_extents_to_file {
 #define SMB2_IOCTL_IOV_SIZE 2
 
 struct smb2_ioctl_req {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize;   /* Must be 57 */
        __u16 Reserved;
        __le32 CtlCode;
@@ -1228,7 +498,7 @@ struct smb2_ioctl_req {
 } __packed;
 
 struct smb2_ioctl_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize;   /* Must be 57 */
        __u16 Reserved;
        __le32 CtlCode;
@@ -1243,161 +513,6 @@ struct smb2_ioctl_rsp {
        /* char * buffer[] */
 } __packed;
 
-/* Currently defined values for close flags */
-#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB       cpu_to_le16(0x0001)
-struct smb2_close_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 24 */
-       __le16 Flags;
-       __le32 Reserved;
-       __u64  PersistentFileId; /* opaque endianness */
-       __u64  VolatileFileId; /* opaque endianness */
-} __packed;
-
-/*
- * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
- */
-#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
-
-struct smb2_close_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* 60 */
-       __le16 Flags;
-       __le32 Reserved;
-       __le64 CreationTime;
-       __le64 LastAccessTime;
-       __le64 LastWriteTime;
-       __le64 ChangeTime;
-       __le64 AllocationSize;  /* Beginning of FILE_STANDARD_INFO equivalent */
-       __le64 EndOfFile;
-       __le32 Attributes;
-} __packed;
-
-struct smb2_flush_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;   /* Must be 24 */
-       __le16 Reserved1;
-       __le32 Reserved2;
-       __u64  PersistentFileId; /* opaque endianness */
-       __u64  VolatileFileId; /* opaque endianness */
-} __packed;
-
-struct smb2_flush_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize;
-       __le16 Reserved;
-} __packed;
-
-/* For the read request Flags field below, the following flag is defined for SMB3.02 */
-#define SMB2_READFLAG_READ_UNBUFFERED  0x01
-#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
-
-/* Channel field for read and write: exactly one of the following flags can be set */
-#define SMB2_CHANNEL_NONE      cpu_to_le32(0x00000000)
-#define SMB2_CHANNEL_RDMA_V1   cpu_to_le32(0x00000001) /* SMB3 or later */
-#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */
-#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */
-
-/* SMB2 read request without RFC1001 length at the beginning */
-struct smb2_read_plain_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 49 */
-       __u8   Padding; /* offset from start of SMB2 header to place read */
-       __u8   Flags; /* MBZ unless SMB3.02 or later */
-       __le32 Length;
-       __le64 Offset;
-       __u64  PersistentFileId; /* opaque endianness */
-       __u64  VolatileFileId; /* opaque endianness */
-       __le32 MinimumCount;
-       __le32 Channel; /* MBZ except for SMB3 or later */
-       __le32 RemainingBytes;
-       __le16 ReadChannelInfoOffset;
-       __le16 ReadChannelInfoLength;
-       __u8   Buffer[1];
-} __packed;
-
-/* Read flags */
-#define SMB2_READFLAG_RESPONSE_NONE    0x00000000
-#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM  0x00000001
-
-struct smb2_read_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 17 */
-       __u8   DataOffset;
-       __u8   Reserved;
-       __le32 DataLength;
-       __le32 DataRemaining;
-       __u32  Flags;
-       __u8   Buffer[1];
-} __packed;
-
-/* For write request Flags field below the following flags are defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH   0x00000001      /* SMB2.1 or later */
-#define SMB2_WRITEFLAG_WRITE_UNBUFFERED        0x00000002      /* SMB3.02 or later */
-
-struct smb2_write_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 49 */
-       __le16 DataOffset; /* offset from start of SMB2 header to write data */
-       __le32 Length;
-       __le64 Offset;
-       __u64  PersistentFileId; /* opaque endianness */
-       __u64  VolatileFileId; /* opaque endianness */
-       __le32 Channel; /* MBZ unless SMB3.02 or later */
-       __le32 RemainingBytes;
-       __le16 WriteChannelInfoOffset;
-       __le16 WriteChannelInfoLength;
-       __le32 Flags;
-       __u8   Buffer[1];
-} __packed;
-
-struct smb2_write_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16 StructureSize; /* Must be 17 */
-       __u8   DataOffset;
-       __u8   Reserved;
-       __le32 DataLength;
-       __le32 DataRemaining;
-       __u32  Reserved2;
-       __u8   Buffer[1];
-} __packed;
-
-/* notify flags */
-#define SMB2_WATCH_TREE                        0x0001
-
-/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
-#define FILE_NOTIFY_CHANGE_FILE_NAME           0x00000001
-#define FILE_NOTIFY_CHANGE_DIR_NAME            0x00000002
-#define FILE_NOTIFY_CHANGE_ATTRIBUTES          0x00000004
-#define FILE_NOTIFY_CHANGE_SIZE                        0x00000008
-#define FILE_NOTIFY_CHANGE_LAST_WRITE          0x00000010
-#define FILE_NOTIFY_CHANGE_LAST_ACCESS         0x00000020
-#define FILE_NOTIFY_CHANGE_CREATION            0x00000040
-#define FILE_NOTIFY_CHANGE_EA                  0x00000080
-#define FILE_NOTIFY_CHANGE_SECURITY            0x00000100
-#define FILE_NOTIFY_CHANGE_STREAM_NAME         0x00000200
-#define FILE_NOTIFY_CHANGE_STREAM_SIZE         0x00000400
-#define FILE_NOTIFY_CHANGE_STREAM_WRITE                0x00000800
-
-struct smb2_change_notify_req {
-       struct smb2_sync_hdr sync_hdr;
-       __le16  StructureSize;
-       __le16  Flags;
-       __le32  OutputBufferLength;
-       __u64   PersistentFileId; /* opaque endianness */
-       __u64   VolatileFileId; /* opaque endianness */
-       __le32  CompletionFilter;
-       __u32   Reserved;
-} __packed;
-
-struct smb2_change_notify_rsp {
-       struct smb2_sync_hdr sync_hdr;
-       __le16  StructureSize;  /* Must be 9 */
-       __le16  OutputBufferOffset;
-       __le32  OutputBufferLength;
-       __u8    Buffer[1]; /* array of file notify structs */
-} __packed;
-
 #define SMB2_LOCKFLAG_SHARED_LOCK      0x0001
 #define SMB2_LOCKFLAG_EXCLUSIVE_LOCK   0x0002
 #define SMB2_LOCKFLAG_UNLOCK           0x0004
@@ -1411,7 +526,7 @@ struct smb2_lock_element {
 } __packed;
 
 struct smb2_lock_req {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 48 */
        __le16 LockCount;
        /*
@@ -1426,19 +541,19 @@ struct smb2_lock_req {
 } __packed;
 
 struct smb2_lock_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 4 */
        __le16 Reserved;
 } __packed;
 
 struct smb2_echo_req {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize;   /* Must be 4 */
        __u16  Reserved;
 } __packed;
 
 struct smb2_echo_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize;   /* Must be 4 */
        __u16  Reserved;
 } __packed;
@@ -1468,7 +583,7 @@ struct smb2_echo_rsp {
  */
 
 struct smb2_query_directory_req {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 33 */
        __u8   FileInformationClass;
        __u8   Flags;
@@ -1482,7 +597,7 @@ struct smb2_query_directory_req {
 } __packed;
 
 struct smb2_query_directory_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 9 */
        __le16 OutputBufferOffset;
        __le32 OutputBufferLength;
@@ -1515,7 +630,7 @@ struct smb2_query_directory_rsp {
 #define SL_INDEX_SPECIFIED     0x00000004
 
 struct smb2_query_info_req {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 41 */
        __u8   InfoType;
        __u8   FileInfoClass;
@@ -1531,7 +646,7 @@ struct smb2_query_info_req {
 } __packed;
 
 struct smb2_query_info_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 9 */
        __le16 OutputBufferOffset;
        __le32 OutputBufferLength;
@@ -1548,7 +663,7 @@ struct smb2_query_info_rsp {
 #define SMB2_SET_INFO_IOV_SIZE 3
 
 struct smb2_set_info_req {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 33 */
        __u8   InfoType;
        __u8   FileInfoClass;
@@ -1562,12 +677,12 @@ struct smb2_set_info_req {
 } __packed;
 
 struct smb2_set_info_rsp {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 2 */
 } __packed;
 
 struct smb2_oplock_break {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 24 */
        __u8   OplockLevel;
        __u8   Reserved;
@@ -1579,7 +694,7 @@ struct smb2_oplock_break {
 #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01)
 
 struct smb2_lease_break {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 44 */
        __le16 Epoch;
        __le32 Flags;
@@ -1592,7 +707,7 @@ struct smb2_lease_break {
 } __packed;
 
 struct smb2_lease_ack {
-       struct smb2_sync_hdr sync_hdr;
+       struct smb2_hdr hdr;
        __le16 StructureSize; /* Must be 36 */
        __le16 Reserved;
        __le32 Flags;
index 547945443fa7d0658d52e75f29663b4b02cc72b7..096fada16ebd8c0e84a52a80491aa0ac3339f6c5 100644 (file)
@@ -25,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length,
                              struct TCP_Server_Info *server);
 extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server);
 extern char *smb2_get_data_area_len(int *off, int *len,
-                                   struct smb2_sync_hdr *shdr);
+                                   struct smb2_hdr *shdr);
 extern __le16 *cifs_convert_path_to_utf16(const char *from,
                                          struct cifs_sb_info *cifs_sb);
 
index f59b956f9d2502cdaea6bcb9e89319d24c4d4100..2bf047b390a98bcc702ac6a6f1c26948b9499caa 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/mempool.h>
 #include <linux/highmem.h>
 #include <crypto/aead.h>
-#include "smb2pdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "smb2proto.h"
@@ -213,14 +212,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
        unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
        unsigned char *sigptr = smb2_signature;
        struct kvec *iov = rqst->rq_iov;
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
        struct cifs_ses *ses;
        struct shash_desc *shash;
        struct crypto_shash *hash;
        struct sdesc *sdesc = NULL;
        struct smb_rqst drqst;
 
-       ses = smb2_find_smb_ses(server, shdr->SessionId);
+       ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
        if (!ses) {
                cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
                return 0;
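
The le64_to_cpu() added here is not cosmetic: the old private header typed SessionId as an opaque __u64 ("do not make little endian"), while the shared definition makes it __le64, so the session lookup must now convert the wire value into host order before comparing it against stored session ids. A minimal sketch of the conversion (helper name hypothetical):

    static inline u64 smb2_hdr_session_id(const struct smb2_hdr *shdr)
    {
            /* no-op on little-endian hosts, byte-swap on big-endian */
            return le64_to_cpu(shdr->SessionId);
    }
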
@@ -534,14 +533,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
        unsigned char smb3_signature[SMB2_CMACAES_SIZE];
        unsigned char *sigptr = smb3_signature;
        struct kvec *iov = rqst->rq_iov;
-       struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+       struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base;
        struct shash_desc *shash;
        struct crypto_shash *hash;
        struct sdesc *sdesc = NULL;
        struct smb_rqst drqst;
        u8 key[SMB3_SIGN_KEY_SIZE];
 
-       rc = smb2_get_sign_key(shdr->SessionId, server, key);
+       rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key);
        if (rc)
                return 0;
 
@@ -611,12 +610,12 @@ static int
 smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
        int rc = 0;
-       struct smb2_sync_hdr *shdr;
+       struct smb2_hdr *shdr;
        struct smb2_sess_setup_req *ssr;
        bool is_binding;
        bool is_signed;
 
-       shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+       shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
        ssr = (struct smb2_sess_setup_req *)shdr;
 
        is_binding = shdr->Command == SMB2_SESSION_SETUP &&
@@ -642,8 +641,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
        unsigned int rc;
        char server_response_sig[SMB2_SIGNATURE_SIZE];
-       struct smb2_sync_hdr *shdr =
-                       (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+       struct smb2_hdr *shdr =
+                       (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
 
        if ((shdr->Command == SMB2_NEGOTIATE) ||
            (shdr->Command == SMB2_SESSION_SETUP) ||
@@ -689,7 +688,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
  */
 static inline void
 smb2_seq_num_into_buf(struct TCP_Server_Info *server,
-                     struct smb2_sync_hdr *shdr)
+                     struct smb2_hdr *shdr)
 {
        unsigned int i, num = le16_to_cpu(shdr->CreditCharge);
 
@@ -700,7 +699,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server,
 }
 
 static struct mid_q_entry *
-smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
+smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
                     struct TCP_Server_Info *server)
 {
        struct mid_q_entry *temp;
@@ -732,14 +731,15 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
 
        atomic_inc(&midCount);
        temp->mid_state = MID_REQUEST_ALLOCATED;
-       trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId,
-               le16_to_cpu(shdr->Command), temp->mid);
+       trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId),
+                            le64_to_cpu(shdr->SessionId),
+                            le16_to_cpu(shdr->Command), temp->mid);
        return temp;
 }
 
 static int
 smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
-                  struct smb2_sync_hdr *shdr, struct mid_q_entry **mid)
+                  struct smb2_hdr *shdr, struct mid_q_entry **mid)
 {
        if (server->tcpStatus == CifsExiting)
                return -ENOENT;
@@ -807,8 +807,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
                   struct smb_rqst *rqst)
 {
        int rc;
-       struct smb2_sync_hdr *shdr =
-                       (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+       struct smb2_hdr *shdr =
+                       (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
        struct mid_q_entry *mid;
 
        smb2_seq_num_into_buf(server, shdr);
@@ -833,8 +833,8 @@ struct mid_q_entry *
 smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 {
        int rc;
-       struct smb2_sync_hdr *shdr =
-                       (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+       struct smb2_hdr *shdr =
+                       (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
        struct mid_q_entry *mid;
 
        if (server->tcpStatus == CifsNeedNegotiate &&
index dafcb6ab050dd9218d824b4808af3267b5c0e316..6cecf302dcfdc2ad0fe15c3affc34d125e38bf1d 100644 (file)
@@ -11,6 +11,8 @@
 #define _CIFS_TRACE_H
 
 #include <linux/tracepoint.h>
+#include <linux/net.h>
+#include <linux/inet.h>
 
 /*
  * Please use this 3-part article as a reference for writing new tracepoints:
@@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name,  \
 
 DEFINE_SMB3_LEASE_ERR_EVENT(lease_err);
 
+DECLARE_EVENT_CLASS(smb3_connect_class,
+       TP_PROTO(char *hostname,
+               __u64 conn_id,
+               const struct __kernel_sockaddr_storage *dst_addr),
+       TP_ARGS(hostname, conn_id, dst_addr),
+       TP_STRUCT__entry(
+               __string(hostname, hostname)
+               __field(__u64, conn_id)
+               __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+       ),
+       TP_fast_assign(
+               struct sockaddr_storage *pss = NULL;
+
+               __entry->conn_id = conn_id;
+               pss = (struct sockaddr_storage *)__entry->dst_addr;
+               *pss = *dst_addr;
+               __assign_str(hostname, hostname);
+       ),
+       TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
+               __entry->conn_id,
+               __get_str(hostname),
+               __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_EVENT(name)        \
+DEFINE_EVENT(smb3_connect_class, smb3_##name,  \
+       TP_PROTO(char *hostname,                \
+               __u64 conn_id,                  \
+               const struct __kernel_sockaddr_storage *addr),  \
+       TP_ARGS(hostname, conn_id, addr))
+
+DEFINE_SMB3_CONNECT_EVENT(connect_done);
+
+DECLARE_EVENT_CLASS(smb3_connect_err_class,
+       TP_PROTO(char *hostname, __u64 conn_id,
+               const struct __kernel_sockaddr_storage *dst_addr, int rc),
+       TP_ARGS(hostname, conn_id, dst_addr, rc),
+       TP_STRUCT__entry(
+               __string(hostname, hostname)
+               __field(__u64, conn_id)
+               __array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+               __field(int, rc)
+       ),
+       TP_fast_assign(
+               struct sockaddr_storage *pss = NULL;
+
+               __entry->conn_id = conn_id;
+               __entry->rc = rc;
+               pss = (struct sockaddr_storage *)__entry->dst_addr;
+               *pss = *dst_addr;
+               __assign_str(hostname, hostname);
+       ),
+       TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
+               __entry->rc,
+               __entry->conn_id,
+               __get_str(hostname),
+               __entry->dst_addr)
+)
+
+#define DEFINE_SMB3_CONNECT_ERR_EVENT(name)        \
+DEFINE_EVENT(smb3_connect_err_class, smb3_##name,  \
+       TP_PROTO(char *hostname,                \
+               __u64 conn_id,                  \
+               const struct __kernel_sockaddr_storage *addr,   \
+               int rc),                        \
+       TP_ARGS(hostname, conn_id, addr, rc))
+
+DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err);
+
 DECLARE_EVENT_CLASS(smb3_reconnect_class,
        TP_PROTO(__u64  currmid,
                __u64 conn_id,
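
The two event classes added above give cifs connect-path tracing: smb3_connect_done fires on a successful connection and smb3_connect_err additionally carries the failing return code, each recording the connection id and a copy of the destination sockaddr (printed with the %pISpsfc format). A hypothetical call-site sketch, assuming a TCP_Server_Info with the hostname, conn_id and dstaddr fields cifs keeps elsewhere, and an assumed connect helper:

    rc = generic_ip_connect(server);        /* assumed connect helper */
    if (rc)
            trace_smb3_connect_err(server->hostname, server->conn_id,
                    (struct __kernel_sockaddr_storage *)&server->dstaddr, rc);
    else
            trace_smb3_connect_done(server->hostname, server->conn_id,
                    (struct __kernel_sockaddr_storage *)&server->dstaddr);
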
index cd60c7535181a17ea77948e829b33d9ae1aa892d..e4e0ebad1f1538e1a0d0b3f4f8431ffb5e6b7168 100644 (file)
@@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
 
 /**
  * prepend_name - prepend a pathname in front of current buffer pointer
- * @buffer: buffer pointer
- * @buflen: allocated length of the buffer
- * @name:   name string and length qstr structure
+ * @p: prepend buffer which contains buffer pointer and allocated length
+ * @name: name string and length qstr structure
  *
  * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
  * make sure that either the old or the new name pointer and length are
@@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
  * prepend_path - Prepend path string to a buffer
  * @path: the dentry/vfsmount to report
  * @root: root vfsmnt/dentry
- * @buffer: pointer to the end of the buffer
- * @buflen: pointer to buffer length
+ * @p: prepend buffer which contains buffer pointer and allocated length
  *
  * The function will first try to write out the pathname without taking any
  * lock other than the RCU read lock to make sure that dentries won't go away.
index 79b6a0c47f6fdbc04a7e6cf1cd5550526b4ce90d..a320c54202d95e0ee56c1daf571d0896c7821e69 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/part_stat.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/fsnotify.h>
 
 #include "ext4.h"
 #include "ext4_extents.h"      /* Needed for trace points definition */
@@ -759,6 +760,8 @@ void __ext4_error(struct super_block *sb, const char *function,
                       sb->s_id, function, line, current->comm, &vaf);
                va_end(args);
        }
+       fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
+
        ext4_handle_error(sb, force_ro, error, 0, block, function, line);
 }
 
@@ -789,6 +792,8 @@ void __ext4_error_inode(struct inode *inode, const char *function,
                               current->comm, &vaf);
                va_end(args);
        }
+       fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED);
+
        ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block,
                          function, line);
 }
@@ -827,6 +832,8 @@ void __ext4_error_file(struct file *file, const char *function,
                               current->comm, path, &vaf);
                va_end(args);
        }
+       fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED);
+
        ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block,
                          function, line);
 }
@@ -894,6 +901,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
                printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
                       sb->s_id, function, line, errstr);
        }
+       fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED);
 
        ext4_handle_error(sb, false, -errno, 0, 0, function, line);
 }
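
All four ext4 error paths now notify fsnotify before ext4_handle_error() decides whether to continue, remount read-only or panic, substituting EFSCORRUPTED when no specific errno is available. What the backend receives is a small report structure; the layout below is assumed, but its error and inode fields match what fanotify_alloc_error_event() consumes later in this series:

    /* sketch; the exact definition lives in the fsnotify headers */
    struct fs_error_report {
            int error;              /* errno-style code, e.g. EFSCORRUPTED */
            struct inode *inode;    /* may be NULL for sb-wide errors */
            struct super_block *sb;
    };
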
index 678e2c51b855c05718bba33f1f223c9c2ad931cb..0c6eacfcbeef1a5728b7a18d833cc5f6210c46e8 100644 (file)
@@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
 
        de = (struct iso_directory_record *) (bh->b_data + offset);
        de_len = *(unsigned char *) de;
+       if (de_len < sizeof(struct iso_directory_record))
+               goto fail;
 
        if (offset + de_len > bufsize) {
                int frag1 = bufsize - offset;
index be3c1aad50ea3974433f39b3e6f362f2f0839155..fdf89fcf1a0ca17ac03ab3027b1b36512c408aa0 100644 (file)
@@ -602,6 +602,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
                                struct inode *inode, struct inode *dir,
                                const struct qstr *name, u32 cookie)
 {
+       if (WARN_ON_ONCE(!inode))
+               return 0;
+
        trace_nfsd_file_fsnotify_handle_event(inode, mask);
 
        /* Should be no marks on non-regular files */
index 057abd2cf8875ad8786acd496c30ec8add429834..b6091775aa6efe3677e9e83e50cc664dfa15a0df 100644 (file)
@@ -111,6 +111,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
        return fanotify_info_equal(info1, info2);
 }
 
+static bool fanotify_error_event_equal(struct fanotify_error_event *fee1,
+                                      struct fanotify_error_event *fee2)
+{
+       /* Error events against the same file system are always merged. */
+       if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid))
+               return false;
+
+       return true;
+}
+
 static bool fanotify_should_merge(struct fanotify_event *old,
                                  struct fanotify_event *new)
 {
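
Error events against the same filesystem therefore always merge, whatever the error value: the first queued event is kept and later ones only bump its counter in the fanotify_merge() hunk below. An illustrative flow, assuming two back-to-back ext4 errors arrive while the first event is still queued:

    /*
     * fsnotify_sb_error(sb, NULL, EFSCORRUPTED);  -> new event, err_count = 1
     * fsnotify_sb_error(sb, NULL, EFSCORRUPTED);  -> merged,    err_count = 2
     *
     * userspace later reads one FAN_FS_ERROR record with error_count == 2
     */
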
@@ -141,6 +151,9 @@ static bool fanotify_should_merge(struct fanotify_event *old,
        case FANOTIFY_EVENT_TYPE_FID_NAME:
                return fanotify_name_event_equal(FANOTIFY_NE(old),
                                                 FANOTIFY_NE(new));
+       case FANOTIFY_EVENT_TYPE_FS_ERROR:
+               return fanotify_error_event_equal(FANOTIFY_EE(old),
+                                                 FANOTIFY_EE(new));
        default:
                WARN_ON_ONCE(1);
        }
@@ -176,6 +189,10 @@ static int fanotify_merge(struct fsnotify_group *group,
                        break;
                if (fanotify_should_merge(old, new)) {
                        old->mask |= new->mask;
+
+                       if (fanotify_is_error_event(old->mask))
+                               FANOTIFY_EE(old)->err_count++;
+
                        return 1;
                }
        }
@@ -343,13 +360,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
 static int fanotify_encode_fh_len(struct inode *inode)
 {
        int dwords = 0;
+       int fh_len;
 
        if (!inode)
                return 0;
 
        exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
+       fh_len = dwords << 2;
+
+       /*
+        * struct fanotify_error_event might be preallocated and is
+        * limited to MAX_HANDLE_SZ.  A handle exceeding that should
+        * never happen, but safeguard by forcing an invalid file handle.
+        */
+       if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ))
+               return 0;
 
-       return dwords << 2;
+       return fh_len;
 }
 
 /*
@@ -370,8 +397,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
        fh->type = FILEID_ROOT;
        fh->len = 0;
        fh->flags = 0;
+
+       /*
+        * Invalid FHs are used by FAN_FS_ERROR for errors not
+        * linked to any inode. The f_handle won't be reported
+        * back to userspace.
+        */
        if (!inode)
-               return 0;
+               goto out;
 
        /*
         * !gpf means preallocated variable size fh, but fh_len could
@@ -403,8 +436,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
        fh->type = type;
        fh->len = fh_len;
 
-       /* Mix fh into event merge key */
-       *hash ^= fanotify_hash_fh(fh);
+out:
+       /*
+        * Mix fh into event merge key.  Hash might be NULL in case of
+        * unhashed FID events (i.e. FAN_FS_ERROR).
+        */
+       if (hash)
+               *hash ^= fanotify_hash_fh(fh);
 
        return FANOTIFY_FH_HDR_LEN + fh_len;
 
@@ -452,7 +490,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data,
        if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
                return dir;
 
-       if (S_ISDIR(inode->i_mode))
+       if (inode && S_ISDIR(inode->i_mode))
                return inode;
 
        return dir;
@@ -563,6 +601,44 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
        return &fne->fae;
 }
 
+static struct fanotify_event *fanotify_alloc_error_event(
+                                               struct fsnotify_group *group,
+                                               __kernel_fsid_t *fsid,
+                                               const void *data, int data_type,
+                                               unsigned int *hash)
+{
+       struct fs_error_report *report =
+                       fsnotify_data_error_report(data, data_type);
+       struct inode *inode;
+       struct fanotify_error_event *fee;
+       int fh_len;
+
+       if (WARN_ON_ONCE(!report))
+               return NULL;
+
+       fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS);
+       if (!fee)
+               return NULL;
+
+       fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
+       fee->error = report->error;
+       fee->err_count = 1;
+       fee->fsid = *fsid;
+
+       inode = report->inode;
+       fh_len = fanotify_encode_fh_len(inode);
+
+       /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */
+       if (!fh_len && inode)
+               inode = NULL;
+
+       fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0);
+
+       *hash ^= fanotify_hash_fsid(fsid);
+
+       return &fee->fae;
+}
+
 static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
                                                   u32 mask, const void *data,
                                                   int data_type, struct inode *dir,
@@ -630,6 +706,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
 
        if (fanotify_is_perm_event(mask)) {
                event = fanotify_alloc_perm_event(path, gfp);
+       } else if (fanotify_is_error_event(mask)) {
+               event = fanotify_alloc_error_event(group, fsid, data,
+                                                  data_type, &hash);
        } else if (name_event && (file_name || child)) {
                event = fanotify_alloc_name_event(id, fsid, file_name, child,
                                                  &hash, gfp);
@@ -702,6 +781,9 @@ static void fanotify_insert_event(struct fsnotify_group *group,
 
        assert_spin_locked(&group->notification_lock);
 
+       if (!fanotify_is_hashed_event(event->mask))
+               return;
+
        pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
                 group, event, bucket);
 
@@ -738,8 +820,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
        BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
        BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
        BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
+       BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
 
-       BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
+       BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20);
 
        mask = fanotify_group_event_mask(group, iter_info, mask, data,
                                         data_type, dir);
@@ -778,9 +861,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
        }
 
        fsn_event = &event->fse;
-       ret = fsnotify_add_event(group, fsn_event, fanotify_merge,
-                                fanotify_is_hashed_event(mask) ?
-                                fanotify_insert_event : NULL);
+       ret = fsnotify_insert_event(group, fsn_event, fanotify_merge,
+                                   fanotify_insert_event);
        if (ret) {
                /* Permission events shouldn't be merged */
                BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
@@ -805,6 +887,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
        if (group->fanotify_data.ucounts)
                dec_ucount(group->fanotify_data.ucounts,
                           UCOUNT_FANOTIFY_GROUPS);
+
+       if (mempool_initialized(&group->fanotify_data.error_events_pool))
+               mempool_exit(&group->fanotify_data.error_events_pool);
 }
 
 static void fanotify_free_path_event(struct fanotify_event *event)
@@ -833,7 +918,16 @@ static void fanotify_free_name_event(struct fanotify_event *event)
        kfree(FANOTIFY_NE(event));
 }
 
-static void fanotify_free_event(struct fsnotify_event *fsn_event)
+static void fanotify_free_error_event(struct fsnotify_group *group,
+                                     struct fanotify_event *event)
+{
+       struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+       mempool_free(fee, &group->fanotify_data.error_events_pool);
+}
+
+static void fanotify_free_event(struct fsnotify_group *group,
+                               struct fsnotify_event *fsn_event)
 {
        struct fanotify_event *event;
 
@@ -855,6 +949,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
        case FANOTIFY_EVENT_TYPE_OVERFLOW:
                kfree(event);
                break;
+       case FANOTIFY_EVENT_TYPE_FS_ERROR:
+               fanotify_free_error_event(group, event);
+               break;
        default:
                WARN_ON_ONCE(1);
        }
index 4a5e555dc3d25d4f33104f35c9750409e4225713..d25f500bf7e79a4fb3e8e7af939b0911507f6518 100644 (file)
@@ -141,6 +141,7 @@ enum fanotify_event_type {
        FANOTIFY_EVENT_TYPE_PATH,
        FANOTIFY_EVENT_TYPE_PATH_PERM,
        FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
+       FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
        __FANOTIFY_EVENT_TYPE_NUM
 };
 
@@ -170,12 +171,18 @@ static inline void fanotify_init_event(struct fanotify_event *event,
        event->pid = NULL;
 }
 
+#define FANOTIFY_INLINE_FH(name, size)                                 \
+struct {                                                               \
+       struct fanotify_fh (name);                                      \
+       /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+       unsigned char _inline_fh_buf[(size)];                           \
+}
+
 struct fanotify_fid_event {
        struct fanotify_event fae;
        __kernel_fsid_t fsid;
-       struct fanotify_fh object_fh;
-       /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */
-       unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN];
+
+       FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN);
 };
 
 static inline struct fanotify_fid_event *
@@ -196,12 +203,30 @@ FANOTIFY_NE(struct fanotify_event *event)
        return container_of(event, struct fanotify_name_event, fae);
 }
 
+struct fanotify_error_event {
+       struct fanotify_event fae;
+       s32 error; /* Error reported by the Filesystem. */
+       u32 err_count; /* Suppressed errors count */
+
+       __kernel_fsid_t fsid; /* FSID this error refers to. */
+
+       FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ);
+};
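
Because FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ) reserves the handle
buffer inline, one preallocated pool element is big enough for any file
handle the kernel can encode. A hypothetical compile-time check of that
reservation (a sketch, not part of the patch):

	/* Hypothetical sanity check: the inline buffer makes each pool
	 * element large enough for a MAX_HANDLE_SZ file handle. */
	static_assert(sizeof(struct fanotify_error_event) >=
		      sizeof(struct fanotify_event) +
		      sizeof(struct fanotify_fh) + MAX_HANDLE_SZ);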
+
+static inline struct fanotify_error_event *
+FANOTIFY_EE(struct fanotify_event *event)
+{
+       return container_of(event, struct fanotify_error_event, fae);
+}
+
 static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
 {
        if (event->type == FANOTIFY_EVENT_TYPE_FID)
                return &FANOTIFY_FE(event)->fsid;
        else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
                return &FANOTIFY_NE(event)->fsid;
+       else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+               return &FANOTIFY_EE(event)->fsid;
        else
                return NULL;
 }
@@ -213,6 +238,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh(
                return &FANOTIFY_FE(event)->object_fh;
        else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
                return fanotify_info_file_fh(&FANOTIFY_NE(event)->info);
+       else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+               return &FANOTIFY_EE(event)->object_fh;
        else
                return NULL;
 }
@@ -244,6 +271,19 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event)
        return info ? fanotify_info_dir_fh_len(info) : 0;
 }
 
+static inline bool fanotify_event_has_object_fh(struct fanotify_event *event)
+{
+       /* For error events, even a zeroed fh is reported. */
+       if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
+               return true;
+       return fanotify_event_object_fh_len(event) > 0;
+}
+
+static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event)
+{
+       return fanotify_event_dir_fh_len(event) > 0;
+}
+
 struct fanotify_path_event {
        struct fanotify_event fae;
        struct path path;
@@ -287,6 +327,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
        return container_of(fse, struct fanotify_event, fse);
 }
 
+static inline bool fanotify_is_error_event(u32 mask)
+{
+       return mask & FAN_FS_ERROR;
+}
+
 static inline bool fanotify_event_has_path(struct fanotify_event *event)
 {
        return event->type == FANOTIFY_EVENT_TYPE_PATH ||
@@ -315,7 +360,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
  */
 static inline bool fanotify_is_hashed_event(u32 mask)
 {
-       return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW);
+       return !(fanotify_is_perm_event(mask) ||
+                fsnotify_is_overflow_event(mask));
 }
 
 static inline unsigned int fanotify_event_hash_bucket(
index 6facdf476255d13c7ff7dc9aa36def05fcfdf896..559bc1e9926d6a4e41bf7265977f4cfffbba8360 100644 (file)
@@ -30,6 +30,7 @@
 #define FANOTIFY_DEFAULT_MAX_EVENTS    16384
 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
 #define FANOTIFY_DEFAULT_MAX_GROUPS    128
+#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32
 
 /*
  * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
@@ -114,6 +115,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
        (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
 #define FANOTIFY_PIDFD_INFO_HDR_LEN \
        sizeof(struct fanotify_event_info_pidfd)
+#define FANOTIFY_ERROR_INFO_LEN \
+       (sizeof(struct fanotify_event_info_error))
 
 static int fanotify_fid_info_len(int fh_len, int name_len)
 {
@@ -126,17 +129,26 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
                       FANOTIFY_EVENT_ALIGN);
 }
 
-static int fanotify_event_info_len(unsigned int info_mode,
-                                  struct fanotify_event *event)
+static size_t fanotify_event_len(unsigned int info_mode,
+                                struct fanotify_event *event)
 {
-       struct fanotify_info *info = fanotify_event_info(event);
-       int dir_fh_len = fanotify_event_dir_fh_len(event);
-       int fh_len = fanotify_event_object_fh_len(event);
-       int info_len = 0;
+       size_t event_len = FAN_EVENT_METADATA_LEN;
+       struct fanotify_info *info;
+       int dir_fh_len;
+       int fh_len;
        int dot_len = 0;
 
-       if (dir_fh_len) {
-               info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
+       if (!info_mode)
+               return event_len;
+
+       if (fanotify_is_error_event(event->mask))
+               event_len += FANOTIFY_ERROR_INFO_LEN;
+
+       info = fanotify_event_info(event);
+
+       if (fanotify_event_has_dir_fh(event)) {
+               dir_fh_len = fanotify_event_dir_fh_len(event);
+               event_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
        } else if ((info_mode & FAN_REPORT_NAME) &&
                   (event->mask & FAN_ONDIR)) {
                /*
@@ -147,12 +159,14 @@ static int fanotify_event_info_len(unsigned int info_mode,
        }
 
        if (info_mode & FAN_REPORT_PIDFD)
-               info_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+               event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
 
-       if (fh_len)
-               info_len += fanotify_fid_info_len(fh_len, dot_len);
+       if (fanotify_event_has_object_fh(event)) {
+               fh_len = fanotify_event_object_fh_len(event);
+               event_len += fanotify_fid_info_len(fh_len, dot_len);
+       }
 
-       return info_len;
+       return event_len;
 }
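
To make the accounting concrete: with FAN_REPORT_FID enabled, a single
FAN_FS_ERROR event in the read() stream is sized as follows (illustrative
breakdown, matching fanotify_event_len() above):

	/*
	 * event_len = FAN_EVENT_METADATA_LEN           (metadata header)
	 *           + FANOTIFY_ERROR_INFO_LEN          (error info record)
	 *           + fanotify_fid_info_len(fh_len, 0) (object fid record)
	 *
	 * fh_len is 0 (reported as FILEID_INVALID) for errors that are
	 * not tied to any inode; error events carry no dir fh or name.
	 */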
 
 /*
@@ -181,7 +195,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group,
 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
                                            size_t count)
 {
-       size_t event_size = FAN_EVENT_METADATA_LEN;
+       size_t event_size;
        struct fanotify_event *event = NULL;
        struct fsnotify_event *fsn_event;
        unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
@@ -194,8 +208,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
                goto out;
 
        event = FANOTIFY_E(fsn_event);
-       if (info_mode)
-               event_size += fanotify_event_info_len(info_mode, event);
+       event_size = fanotify_event_len(info_mode, event);
 
        if (event_size > count) {
                event = ERR_PTR(-EINVAL);
@@ -316,6 +329,28 @@ static int process_access_response(struct fsnotify_group *group,
        return -ENOENT;
 }
 
+static size_t copy_error_info_to_user(struct fanotify_event *event,
+                                     char __user *buf, int count)
+{
+       struct fanotify_event_info_error info;
+       struct fanotify_error_event *fee = FANOTIFY_EE(event);
+
+       info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
+       info.hdr.pad = 0;
+       info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
+
+       if (WARN_ON(count < info.hdr.len))
+               return -EFAULT;
+
+       info.error = fee->error;
+       info.error_count = fee->err_count;
+
+       if (copy_to_user(buf, &info, sizeof(info)))
+               return -EFAULT;
+
+       return info.hdr.len;
+}
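
On the consuming side, a minimal userspace listener might look as follows
(a sketch assuming <linux/fanotify.h> carries the FAN_FS_ERROR additions
from this series; error handling trimmed):

	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/fanotify.h>

	int main(int argc, char *argv[])
	{
		char buf[4096];
		ssize_t len;
		int fd;

		if (argc != 2)
			errx(1, "usage: %s <mountpoint>", argv[0]);

		fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);
		if (fd < 0)
			err(1, "fanotify_init");

		/* FAN_FS_ERROR is only accepted on filesystem-wide marks. */
		if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
				  FAN_FS_ERROR, AT_FDCWD, argv[1]))
			err(1, "fanotify_mark");

		while ((len = read(fd, buf, sizeof(buf))) > 0) {
			struct fanotify_event_metadata *md = (void *)buf;

			for (; FAN_EVENT_OK(md, len);
			     md = FAN_EVENT_NEXT(md, len)) {
				/* Info records follow the metadata header. */
				char *p = (char *)md + md->metadata_len;
				char *end = (char *)md + md->event_len;

				while (p < end) {
					struct fanotify_event_info_header *h =
						(void *)p;

					if (h->info_type ==
					    FAN_EVENT_INFO_TYPE_ERROR) {
						struct fanotify_event_info_error *e =
							(void *)h;
						printf("fs error %d, seen %u times\n",
						       e->error, e->error_count);
					}
					p += h->len;
				}
			}
		}
		return 0;
	}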
+
 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
                                 int info_type, const char *name,
                                 size_t name_len,
@@ -331,9 +366,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
        pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
                 __func__, fh_len, name_len, info_len, count);
 
-       if (!fh_len)
-               return 0;
-
        if (WARN_ON_ONCE(len < sizeof(info) || len > count))
                return -EFAULT;
 
@@ -368,6 +400,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
 
        handle.handle_type = fh->type;
        handle.handle_bytes = fh_len;
+
+       /* Mangle handle_type for bad file_handle */
+       if (!fh_len)
+               handle.handle_type = FILEID_INVALID;
+
        if (copy_to_user(buf, &handle, sizeof(handle)))
                return -EFAULT;
 
@@ -444,7 +481,7 @@ static int copy_info_records_to_user(struct fanotify_event *event,
        /*
         * Event info records order is as follows: dir fid + name, child fid.
         */
-       if (fanotify_event_dir_fh_len(event)) {
+       if (fanotify_event_has_dir_fh(event)) {
                info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
                                             FAN_EVENT_INFO_TYPE_DFID;
                ret = copy_fid_info_to_user(fanotify_event_fsid(event),
@@ -460,7 +497,7 @@ static int copy_info_records_to_user(struct fanotify_event *event,
                total_bytes += ret;
        }
 
-       if (fanotify_event_object_fh_len(event)) {
+       if (fanotify_event_has_object_fh(event)) {
                const char *dot = NULL;
                int dot_len = 0;
 
@@ -520,6 +557,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
                total_bytes += ret;
        }
 
+       if (fanotify_is_error_event(event->mask)) {
+               ret = copy_error_info_to_user(event, buf, count);
+               if (ret < 0)
+                       return ret;
+               buf += ret;
+               count -= ret;
+               total_bytes += ret;
+       }
+
        return total_bytes;
 }
 
@@ -537,8 +583,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 
        pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-       metadata.event_len = FAN_EVENT_METADATA_LEN +
-                               fanotify_event_info_len(info_mode, event);
+       metadata.event_len = fanotify_event_len(info_mode, event);
        metadata.metadata_len = FAN_EVENT_METADATA_LEN;
        metadata.vers = FANOTIFY_METADATA_VERSION;
        metadata.reserved = 0;
@@ -1049,6 +1094,15 @@ out_dec_ucounts:
        return ERR_PTR(ret);
 }
 
+static int fanotify_group_init_error_pool(struct fsnotify_group *group)
+{
+       if (mempool_initialized(&group->fanotify_data.error_events_pool))
+               return 0;
+
+       return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
+                                        FANOTIFY_DEFAULT_FEE_POOL_SIZE,
+                                        sizeof(struct fanotify_error_event));
+}
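
The pool is what guarantees delivery under memory pressure:
mempool_init_kmalloc_pool() preallocates FANOTIFY_DEFAULT_FEE_POOL_SIZE (32)
elements, and mempool_alloc() falls back on that reserve when kmalloc()
cannot make progress. The general pattern, as a minimal standalone sketch
(kernel context):

	static mempool_t pool;

	static int pool_setup(void)
	{
		/* Preallocates 32 elements; returns -ENOMEM on failure. */
		return mempool_init_kmalloc_pool(&pool, 32,
				sizeof(struct fanotify_error_event));
	}

	static void pool_use(void)
	{
		struct fanotify_error_event *fee;

		fee = mempool_alloc(&pool, GFP_NOFS); /* may use the reserve */
		if (!fee)
			return;
		/* ... fill in and queue the event ... */
		mempool_free(fee, &pool); /* tops up the reserve first */
	}

	static void pool_teardown(void)
	{
		mempool_exit(&pool); /* frees the preallocated reserve */
	}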
 
 static int fanotify_add_mark(struct fsnotify_group *group,
                             fsnotify_connp_t *connp, unsigned int type,
@@ -1057,6 +1111,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 {
        struct fsnotify_mark *fsn_mark;
        __u32 added;
+       int ret = 0;
 
        mutex_lock(&group->mark_mutex);
        fsn_mark = fsnotify_find_mark(connp, group);
@@ -1067,13 +1122,26 @@ static int fanotify_add_mark(struct fsnotify_group *group,
                        return PTR_ERR(fsn_mark);
                }
        }
+
+       /*
+        * Error events are pre-allocated per group, but only if strictly
+        * needed (i.e. FAN_FS_ERROR was requested).
+        */
+       if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+               ret = fanotify_group_init_error_pool(group);
+               if (ret)
+                       goto out;
+       }
+
        added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
        if (added & ~fsnotify_conn_mask(fsn_mark->connector))
                fsnotify_recalc_mask(fsn_mark->connector);
+
+out:
        mutex_unlock(&group->mark_mutex);
 
        fsnotify_put_mark(fsn_mark);
-       return 0;
+       return ret;
 }
 
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
@@ -1295,16 +1363,15 @@ out_destroy_group:
        return fd;
 }
 
-/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
 {
        __kernel_fsid_t root_fsid;
        int err;
 
        /*
-        * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+        * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
         */
-       err = vfs_get_fsid(path->dentry, fsid);
+       err = vfs_get_fsid(dentry, fsid);
        if (err)
                return err;
 
@@ -1312,10 +1379,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
                return -ENODEV;
 
        /*
-        * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+        * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
         * which uses a different fsid than sb root.
         */
-       err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
+       err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
        if (err)
                return err;
 
@@ -1323,6 +1390,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
            root_fsid.val[1] != fsid->val[1])
                return -EXDEV;
 
+       return 0;
+}
+
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct dentry *dentry)
+{
        /*
         * We need to make sure that the file system supports at least
         * encoding a file handle so user can use name_to_handle_at() to
@@ -1330,8 +1403,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
         * objects. However, name_to_handle_at() requires that the
         * filesystem also supports decoding file handles.
         */
-       if (!path->dentry->d_sb->s_export_op ||
-           !path->dentry->d_sb->s_export_op->fh_to_dentry)
+       if (!dentry->d_sb->s_export_op ||
+           !dentry->d_sb->s_export_op->fh_to_dentry)
                return -EOPNOTSUPP;
 
        return 0;
@@ -1447,15 +1520,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
            group->priority == FS_PRIO_0)
                goto fput_and_out;
 
+       if (mask & FAN_FS_ERROR &&
+           mark_type != FAN_MARK_FILESYSTEM)
+               goto fput_and_out;
+
        /*
-        * Events with data type inode do not carry enough information to report
-        * event->fd, so we do not allow setting a mask for inode events unless
-        * group supports reporting fid.
-        * inode events are not supported on a mount mark, because they do not
-        * carry enough information (i.e. path) to be filtered by mount point.
+        * Events that do not carry enough information to report
+        * event->fd require a group that supports reporting fid.  Those
+        * events are not supported on a mount mark, because they do not
+        * carry enough information (i.e. path) to be filtered by mount
+        * point.
         */
        fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
-       if (mask & FANOTIFY_INODE_EVENTS &&
+       if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
            (!fid_mode || mark_type == FAN_MARK_MOUNT))
                goto fput_and_out;
 
@@ -1482,7 +1559,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
        }
 
        if (fid_mode) {
-               ret = fanotify_test_fid(&path, &__fsid);
+               ret = fanotify_test_fsid(path.dentry, &__fsid);
+               if (ret)
+                       goto path_put_and_out;
+
+               ret = fanotify_test_fid(path.dentry);
                if (ret)
                        goto path_put_and_out;
 
index 963e6ce75b96113c5739a68698082b2aea7f3195..4034ca566f95cedcb01480b0640be4b770539eec 100644 (file)
@@ -252,6 +252,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
        if (WARN_ON_ONCE(!ops->handle_inode_event))
                return 0;
 
+       if (WARN_ON_ONCE(!inode && !dir))
+               return 0;
+
        if ((inode_mark->mask & FS_EXCL_UNLINK) &&
            path && d_unlinked(path->dentry))
                return 0;
@@ -455,16 +458,16 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
  *             @file_name is relative to
  * @file_name: optional file name associated with event
  * @inode:     optional inode associated with event -
- *             either @dir or @inode must be non-NULL.
- *             if both are non-NULL event may be reported to both.
+ *             If @dir and @inode are both non-NULL, event may be
+ *             reported to both.
  * @cookie:    inotify rename cookie
  */
 int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
             const struct qstr *file_name, struct inode *inode, u32 cookie)
 {
        const struct path *path = fsnotify_data_path(data, data_type);
+       struct super_block *sb = fsnotify_data_sb(data, data_type);
        struct fsnotify_iter_info iter_info = {};
-       struct super_block *sb;
        struct mount *mnt = NULL;
        struct inode *parent = NULL;
        int ret = 0;
@@ -483,7 +486,6 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
                 */
                parent = dir;
        }
-       sb = inode->i_sb;
 
        /*
         * Optimization: srcu_read_lock() has a memory barrier which can
index fb89c351295d6ee1e8156c176417915e60969812..6a297efc478874b10346b1d8ac0e8a9c941d6bd3 100644 (file)
@@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
         * that deliberately ignores overflow events.
         */
        if (group->overflow_event)
-               group->ops->free_event(group->overflow_event);
+               group->ops->free_event(group, group->overflow_event);
 
        fsnotify_put_group(group);
 }
index d1a64daa0171ec851903be87cd7357bf4ecba556..d92d7b0adc9a2764f16923fbf797df68730c0e7d 100644 (file)
@@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
        if (len)
                strcpy(event->name, name->name);
 
-       ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL);
+       ret = fsnotify_add_event(group, fsn_event, inotify_merge);
        if (ret) {
                /* Our event wasn't used in the end. Free it. */
                fsnotify_destroy_event(group, fsn_event);
@@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
                dec_inotify_instances(group->inotify_data.ucounts);
 }
 
-static void inotify_free_event(struct fsnotify_event *fsn_event)
+static void inotify_free_event(struct fsnotify_group *group,
+                              struct fsnotify_event *fsn_event)
 {
        kfree(INOTIFY_E(fsn_event));
 }
index 62051247f6d21d4b0fed55a914ccf37ae7e530c2..29fca3284bb5c83c25a669b213dfa3da870631ed 100644 (file)
@@ -94,10 +94,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
        __u32 mask;
 
        /*
-        * Everything should accept their own ignored and should receive events
-        * when the inode is unmounted.  All directories care about children.
+        * Everything should receive events when the inode is unmounted.
+        * All directories care about children.
         */
-       mask = (FS_IN_IGNORED | FS_UNMOUNT);
+       mask = (FS_UNMOUNT);
        if (S_ISDIR(inode->i_mode))
                mask |= FS_EVENT_ON_CHILD;
 
index 32f45543b9c649bef2bfcf0ef78ad2681c05e89c..9022ae650cf863ea5276027204fcae754024cf62 100644 (file)
@@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
                WARN_ON(!list_empty(&event->list));
                spin_unlock(&group->notification_lock);
        }
-       group->ops->free_event(event);
+       group->ops->free_event(group, event);
 }
 
 /*
@@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
  * 2 if the event was not queued - either the queue of events has overflown
  *   or the group is shutting down.
  */
-int fsnotify_add_event(struct fsnotify_group *group,
-                      struct fsnotify_event *event,
-                      int (*merge)(struct fsnotify_group *,
-                                   struct fsnotify_event *),
-                      void (*insert)(struct fsnotify_group *,
-                                     struct fsnotify_event *))
+int fsnotify_insert_event(struct fsnotify_group *group,
+                         struct fsnotify_event *event,
+                         int (*merge)(struct fsnotify_group *,
+                                      struct fsnotify_event *),
+                         void (*insert)(struct fsnotify_group *,
+                                        struct fsnotify_event *))
 {
        int ret = 0;
        struct list_head *list = &group->notification_list;
index 5d9ae17bd443f209ce4098e1bc30ef737cfca9bd..bb247bc349e461eac5c8a4a9790613313a7e6aee 100644 (file)
@@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
                status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
+                       ocfs2_commit_trans(osb, handle);
                        mlog_errno(status);
                        goto bail;
                }
@@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
                                                     data_alloc_bh, start_blk,
                                                     num_clusters);
                        if (status < 0) {
+                               ocfs2_commit_trans(osb, handle);
                                mlog_errno(status);
                                goto bail;
                        }
@@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
 }
 
 /*
- * Zero the area past i_size but still within an allocated
- * cluster. This avoids exposing nonzero data on subsequent file
- * extends.
+ * Zero partial cluster for a hole punch or truncate. This avoids exposing
+ * nonzero data on subsequent file extends.
  *
  * We need to call this before i_size is updated on the inode because
  * otherwise block_write_full_page() will skip writeout of pages past
- * i_size. The new_i_size parameter is passed for this reason.
+ * i_size.
  */
 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
                                  u64 range_start, u64 range_end)
@@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
        if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
                return 0;
 
+       /*
+        * Avoid zeroing pages fully beyond current i_size. It is pointless as
+        * underlying blocks of those pages should already be zeroed out and
+        * page writeback will skip them anyway.
+        */
+       range_end = min_t(u64, range_end, i_size_read(inode));
+       if (range_start >= range_end)
+               return 0;
+
        pages = kcalloc(ocfs2_pages_per_cluster(sb),
                        sizeof(struct page *), GFP_NOFS);
        if (pages == NULL) {
@@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
                goto out;
        }
 
-       if (range_start == range_end)
-               goto out;
-
        ret = ocfs2_extent_map_get_blocks(inode,
                                          range_start >> sb->s_blocksize_bits,
                                          &phys, NULL, &ext_flags);
index 0e7aad1b11cc4cd2ea6fc4381307c1451c303e4a..5cd5f7511dacd8908167da64092327d03551cbe3 100644 (file)
@@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
                        continue;
                }
 retry:
-               ret = -EINVAL;
                mlog(0, "attempting to send begin reco msg to %d\n",
                          nodenum);
                ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
index 54d7843c021144c53e15dce96886a24b99e5cfaf..fc5f780fa2355af37071aa95db0e2429ed289250 100644 (file)
@@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode,
         * greater than page size, so we have to truncate them
         * anyway.
         */
-       unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
-       truncate_inode_pages(inode->i_mapping, new_i_size);
 
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               unmap_mapping_range(inode->i_mapping,
+                                   new_i_size + PAGE_SIZE - 1, 0, 1);
+               truncate_inode_pages(inode->i_mapping, new_i_size);
                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
                                               i_size_read(inode), 1);
                if (status)
@@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode,
                goto bail_unlock_sem;
        }
 
+       unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+       truncate_inode_pages(inode->i_mapping, new_i_size);
+
        status = ocfs2_commit_truncate(osb, inode, di_bh);
        if (status < 0) {
                mlog_errno(status);
index bc8f32fab964cde1196f9aedcfd153be86541e6d..6c2411c2afcf1c8ecb9ce4fb6d4839a30a2d9ae6 100644 (file)
@@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
        struct inode *inode = NULL;
        struct super_block *sb = osb->sb;
        struct ocfs2_find_inode_args args;
-       journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
 
        trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
                               sysfile_type);
@@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
         * part of the transaction - the inode could have been reclaimed and
         * now it is reread from disk.
         */
-       if (journal) {
+       if (osb->journal) {
                transaction_t *transaction;
                tid_t tid;
                struct ocfs2_inode_info *oi = OCFS2_I(inode);
+               journal_t *journal = osb->journal->j_journal;
 
                read_lock(&journal->j_state_lock);
                if (journal->j_running_transaction)
index 4f15750aac5d5415fc0be0e5b3826cc7531ad8d4..b9c339335a53db461eff6a405a3e82ce4d7da159 100644 (file)
@@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
        write_unlock(&journal->j_state_lock);
 }
 
-int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
 {
        int status = -1;
        struct inode *inode = NULL; /* the journal inode */
        journal_t *j_journal = NULL;
+       struct ocfs2_journal *journal = NULL;
        struct ocfs2_dinode *di = NULL;
        struct buffer_head *bh = NULL;
-       struct ocfs2_super *osb;
        int inode_lock = 0;
 
-       BUG_ON(!journal);
+       /* initialize our journal structure */
+       journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+       if (!journal) {
+               mlog(ML_ERROR, "unable to alloc journal\n");
+               status = -ENOMEM;
+               goto done;
+       }
+       osb->journal = journal;
+       journal->j_osb = osb;
 
-       osb = journal->j_osb;
+       atomic_set(&journal->j_num_trans, 0);
+       init_rwsem(&journal->j_trans_barrier);
+       init_waitqueue_head(&journal->j_checkpointed);
+       spin_lock_init(&journal->j_lock);
+       journal->j_trans_id = 1UL;
+       INIT_LIST_HEAD(&journal->j_la_cleanups);
+       INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+       journal->j_state = OCFS2_JOURNAL_FREE;
 
        /* already have the inode for our journal */
        inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
@@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
        journal->j_state = OCFS2_JOURNAL_FREE;
 
-//     up_write(&journal->j_trans_barrier);
 done:
        iput(inode);
+       kfree(journal);
+       osb->journal = NULL;
 }
 
 static void ocfs2_clear_journal_error(struct super_block *sb,
index d158acb8b38a8177213ebca7e91be18e1f9456e5..8dcb2f2cadbc5bd932a66db13248d05ceaf051ee 100644 (file)
@@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
  *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
  */
 void   ocfs2_set_journal_params(struct ocfs2_super *osb);
-int    ocfs2_journal_init(struct ocfs2_journal *journal,
-                         int *dirty);
+int    ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
 void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
                          int full);
index 5c914ce9b3ac95636296e308cc2f61400f41ef55..1286b88b6fa17917e0f76d351621f933e865bf56 100644 (file)
@@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        /* This will disable recovery and flush any recovery work. */
        ocfs2_recovery_exit(osb);
 
-       ocfs2_journal_shutdown(osb);
-
        ocfs2_sync_blockdev(sb);
 
        ocfs2_purge_refcount_trees(osb);
@@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
        ocfs2_release_system_inodes(osb);
 
+       ocfs2_journal_shutdown(osb);
+
        /*
         * If we're dismounting due to mount error, mount.ocfs2 will clean
         * up heartbeat.  If we're a local mount, there is no heartbeat.
@@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
        int i, cbits, bbits;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
        struct inode *inode = NULL;
-       struct ocfs2_journal *journal;
        struct ocfs2_super *osb;
        u64 total_blocks;
 
@@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
        get_random_bytes(&osb->s_next_generation, sizeof(u32));
 
-       /* FIXME
-        * This should be done in ocfs2_journal_init(), but unknown
-        * ordering issues will cause the filesystem to crash.
-        * If anyone wants to figure out what part of the code
-        * refers to osb->journal before ocfs2_journal_init() is run,
-        * be my guest.
-        */
-       /* initialize our journal structure */
-
-       journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
-       if (!journal) {
-               mlog(ML_ERROR, "unable to alloc journal\n");
-               status = -ENOMEM;
-               goto bail;
-       }
-       osb->journal = journal;
-       journal->j_osb = osb;
-
-       atomic_set(&journal->j_num_trans, 0);
-       init_rwsem(&journal->j_trans_barrier);
-       init_waitqueue_head(&journal->j_checkpointed);
-       spin_lock_init(&journal->j_lock);
-       journal->j_trans_id = (unsigned long) 1;
-       INIT_LIST_HEAD(&journal->j_la_cleanups);
-       INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
-       journal->j_state = OCFS2_JOURNAL_FREE;
-
        INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
        init_llist_head(&osb->dquot_drop_list);
 
@@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
                                                  * ourselves. */
 
        /* Init our journal object. */
-       status = ocfs2_journal_init(osb->journal, &dirty);
+       status = ocfs2_journal_init(osb, &dirty);
        if (status < 0) {
                mlog(ML_ERROR, "Could not initialize journal!\n");
                goto finally;
@@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 
        kfree(osb->osb_orphan_wipes);
        kfree(osb->slot_recovery_generations);
-       /* FIXME
-        * This belongs in journal shutdown, but because we have to
-        * allocate osb->journal at the start of ocfs2_initialize_osb(),
-        * we free it here.
-        */
-       kfree(osb->journal);
        kfree(osb->local_alloc_copy);
        kfree(osb->uuid_str);
        kfree(osb->vol_label);
index a7f6cab812672665ae230232906a60057b75d56c..f732fb94600ce0e7c3671f431d8c98aaff825813 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f,
                 * of THPs into the page cache will fail.
                 */
                smp_mb();
-               if (filemap_nr_thps(inode->i_mapping))
-                       truncate_pagecache(inode, 0);
+               if (filemap_nr_thps(inode->i_mapping)) {
+                       struct address_space *mapping = inode->i_mapping;
+
+                       filemap_invalidate_lock(inode->i_mapping);
+                       /*
+                        * unmap_mapping_range() only needs to be called once
+                        * here, because private pages (e.g. the data segment
+                        * of dynamically linked shared libraries) do not need
+                        * to be unmapped.
+                        */
+                       unmap_mapping_range(mapping, 0, 0, 0);
+                       truncate_inode_pages(mapping, 0);
+                       filemap_invalidate_unlock(inode->i_mapping);
+               }
        }
 
        return 0;
index f5c25f580dd92feebdd0188401a065a6cff4fd50..9323a854a60aeb27e7e36db9da4164941a8730c0 100644 (file)
@@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type)
         * to just call ->get_acl to fetch the ACL ourself.  (This is going to
         * be an unlikely race.)
         */
-       if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED)
-               /* fall through */ ;
+       cmpxchg(p, ACL_NOT_CACHED, sentinel);
 
        /*
         * Normally, the ACL returned by ->get_acl will be cached.
index cf25be3e0321206b5a45f66ae2cfc2084e7f947c..ad667dbc96f5cbb928e55678b092ed184dc3aaa7 100644 (file)
@@ -397,7 +397,6 @@ struct mem_size_stats {
        u64 pss_shmem;
        u64 pss_locked;
        u64 swap_pss;
-       bool check_shmem_swap;
 };
 
 static void smaps_page_accumulate(struct mem_size_stats *mss,
@@ -478,9 +477,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
                          __always_unused int depth, struct mm_walk *walk)
 {
        struct mem_size_stats *mss = walk->private;
+       struct vm_area_struct *vma = walk->vma;
 
-       mss->swap += shmem_partial_swap_usage(
-                       walk->vma->vm_file->f_mapping, addr, end);
+       mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
+                                             linear_page_index(vma, addr),
+                                             linear_page_index(vma, end));
 
        return 0;
 }
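
This hunk is a unit fix as much as a cleanup: shmem_partial_swap_usage()
takes page cache indices, while smaps_pte_hole() receives byte addresses,
so the range has to be converted with linear_page_index(). For ordinary
VMAs the conversion amounts to (per its definition in
include/linux/pagemap.h; hugetlb special-casing omitted):

	/* What linear_page_index() computes for a non-hugetlb VMA. */
	static inline pgoff_t vma_addr_to_pgoff(struct vm_area_struct *vma,
						unsigned long address)
	{
		return ((address - vma->vm_start) >> PAGE_SHIFT)
			+ vma->vm_pgoff;
	}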
@@ -488,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
 #define smaps_pte_hole         NULL
 #endif /* CONFIG_SHMEM */
 
+static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
+{
+#ifdef CONFIG_SHMEM
+       if (walk->ops->pte_hole) {
+               /* depth is not used */
+               smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
+       }
+#endif
+}
+
 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
                struct mm_walk *walk)
 {
@@ -516,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
                        }
                } else if (is_pfn_swap_entry(swpent))
                        page = pfn_swap_entry_to_page(swpent);
-       } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
-                                                       && pte_none(*pte))) {
-               page = xa_load(&vma->vm_file->f_mapping->i_pages,
-                                               linear_page_index(vma, addr));
-               if (xa_is_value(page))
-                       mss->swap += PAGE_SIZE;
+       } else {
+               smaps_pte_hole_lookup(addr, walk);
                return;
        }
 
@@ -735,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
                return;
 
 #ifdef CONFIG_SHMEM
-       /* In case of smaps_rollup, reset the value from previous vma */
-       mss->check_shmem_swap = false;
        if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
                /*
                 * For shared or readonly shmem mappings we know that all
@@ -754,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
                                        !(vma->vm_flags & VM_WRITE))) {
                        mss->swap += shmem_swapped;
                } else {
-                       mss->check_shmem_swap = true;
                        ops = &smaps_shmem_walk_ops;
                }
        }
index d3e995e1046fb317172a14c3ae6d5ca874fb5f3d..5f2405994280a1161796225d34d83c2e46541ee9 100644 (file)
@@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                quota_error(dquot->dq_sb, "Quota structure has offset to "
                        "other block (%u) than it should (%u)", blk,
                        (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+               ret = -EIO;
                goto out_buf;
        }
        ret = read_blk(info, blk, buf);
@@ -479,6 +480,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
                goto out_buf;
        }
        newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+       if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) {
+               quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+                           newblk, info->dqi_blocks);
+               ret = -EUCLEAN;
+               goto out_buf;
+       }
+
        if (depth == info->dqi_qtree_depth - 1) {
                ret = free_dqentry(info, dquot, newblk);
                newblk = 0;
@@ -578,6 +586,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
        blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
        if (!blk)       /* No reference? */
                goto out_buf;
+       if (blk < QT_TREEOFF || blk >= info->dqi_blocks) {
+               quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)",
+                           blk, info->dqi_blocks);
+               ret = -EUCLEAN;
+               goto out_buf;
+       }
+
        if (depth < info->dqi_qtree_depth - 1)
                ret = find_tree_dqentry(info, dquot, blk, depth+1);
        else
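
Both quota hunks above add the same sanity check: a block reference loaded
from the on-disk tree must fall inside the quota file, otherwise the file
is corrupted and the operation is aborted with -EUCLEAN. A hypothetical
helper (not in the patch) capturing the invariant:

	/* Hypothetical: a tree block reference is sane iff it lies
	 * within [QT_TREEOFF, info->dqi_blocks). */
	static inline int quota_blk_sane(struct qtree_mem_dqinfo *info,
					 uint blk)
	{
		return blk >= QT_TREEOFF && blk < info->dqi_blocks;
	}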
index 076f9ab943060786306ab69d2016912b4b3f6129..82e09901462e7c92490dcd5540083be96527b224 100644 (file)
@@ -1435,7 +1435,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        unsigned long safe_mask = 0;
        unsigned int commit_max_age = (unsigned int)-1;
        struct reiserfs_journal *journal = SB_JOURNAL(s);
-       char *new_opts;
        int err;
        char *qf_names[REISERFS_MAXQUOTAS];
        unsigned int qfmt = 0;
@@ -1443,10 +1442,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        int i;
 #endif
 
-       new_opts = kstrdup(arg, GFP_KERNEL);
-       if (arg && !new_opts)
-               return -ENOMEM;
-
        sync_filesystem(s);
        reiserfs_write_lock(s);
 
@@ -1597,7 +1592,6 @@ out_ok_unlocked:
 out_err_unlock:
        reiserfs_write_unlock(s);
 out_err:
-       kfree(new_opts);
        return err;
 }
 
diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h
new file mode 100644 (file)
index 0000000..7ccadcb
--- /dev/null
@@ -0,0 +1,989 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
+#ifndef _COMMON_SMB2PDU_H
+#define _COMMON_SMB2PDU_H
+
+/*
+ * Note that, due to trying to use names similar to the protocol specifications,
+ * there are many mixed case field names in the structures below.  Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specification.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0, bcc=0), or uninteresting, responses
+ * (i.e. no useful data other than the SMB error code itself) and are marked
+ * as such.  Knowing this helps avoid response buffer allocations and copies
+ * in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE      0x0000
+#define SMB2_SESSION_SETUP_HE  0x0001
+#define SMB2_LOGOFF_HE         0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE   0x0003
+#define SMB2_TREE_DISCONNECT_HE        0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE         0x0005
+#define SMB2_CLOSE_HE          0x0006
+#define SMB2_FLUSH_HE          0x0007 /* trivial resp */
+#define SMB2_READ_HE           0x0008
+#define SMB2_WRITE_HE          0x0009
+#define SMB2_LOCK_HE           0x000A
+#define SMB2_IOCTL_HE          0x000B
+#define SMB2_CANCEL_HE         0x000C
+#define SMB2_ECHO_HE           0x000D
+#define SMB2_QUERY_DIRECTORY_HE        0x000E
+#define SMB2_CHANGE_NOTIFY_HE  0x000F
+#define SMB2_QUERY_INFO_HE     0x0010
+#define SMB2_SET_INFO_HE       0x0011
+#define SMB2_OPLOCK_BREAK_HE   0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE         cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP     cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF            cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT      cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT   cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE            cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE             cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH             cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ              cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE             cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK              cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL             cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL            cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO              cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY   cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY     cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO                cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO          cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK      cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+#define SMB2_INTERNAL_CMD      cpu_to_le16(0xFFFF)
+
+#define NUMBER_OF_SMB2_COMMANDS        0x0013
+
+/*
+ * SMB2 Header Definition
+ *
+ * "MBZ" :  Must be Zero
+ * "BB"  :  BugBug, Something to check/review/analyze later
+ * "PDU" :  "Protocol Data Unit" (ie a network "frame")
+ *
+ */
+
+#define __SMB2_HEADER_STRUCTURE_SIZE   64
+#define SMB2_HEADER_STRUCTURE_SIZE                             \
+       cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE)
+
+#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
+#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc)
+
+/*
+ *     SMB2 flag definitions
+ */
+#define SMB2_FLAGS_SERVER_TO_REDIR     cpu_to_le32(0x00000001)
+#define SMB2_FLAGS_ASYNC_COMMAND       cpu_to_le32(0x00000002)
+#define SMB2_FLAGS_RELATED_OPERATIONS  cpu_to_le32(0x00000004)
+#define SMB2_FLAGS_SIGNED              cpu_to_le32(0x00000008)
+#define SMB2_FLAGS_PRIORITY_MASK       cpu_to_le32(0x00000070) /* SMB3.1.1 */
+#define SMB2_FLAGS_DFS_OPERATIONS      cpu_to_le32(0x10000000)
+#define SMB2_FLAGS_REPLAY_OPERATION    cpu_to_le32(0x20000000) /* SMB3 & up */
+
+/* See MS-SMB2 section 2.2.1 */
+struct smb2_hdr {
+       __le32 ProtocolId;      /* 0xFE 'S' 'M' 'B' */
+       __le16 StructureSize;   /* 64 */
+       __le16 CreditCharge;    /* MBZ */
+       __le32 Status;          /* Error from server */
+       __le16 Command;
+       __le16 CreditRequest;   /* CreditResponse */
+       __le32 Flags;
+       __le32 NextCommand;
+       __le64 MessageId;
+       union {
+               struct {
+                       __le32 ProcessId;
+                       __le32  TreeId;
+               } __packed SyncId;
+               __le64  AsyncId;
+       } __packed Id;
+       __le64  SessionId;
+       __u8   Signature[16];
+} __packed;
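
Every on-the-wire field is little endian, which is why the command list
above exists in both host and little-endian forms: code inspecting received
PDUs can compare __le fields directly against the pre-swapped constants.
A minimal header sanity check along these lines (illustrative sketch, not
part of this header):

	/* Sketch: cheap validity check on a received 64-byte header. */
	static bool smb2_hdr_sane(const struct smb2_hdr *hdr)
	{
		return hdr->ProtocolId == SMB2_PROTO_NUMBER &&
		       hdr->StructureSize == SMB2_HEADER_STRUCTURE_SIZE &&
		       le16_to_cpu(hdr->Command) < NUMBER_OF_SMB2_COMMANDS;
	}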
+
+struct smb2_pdu {
+       struct smb2_hdr hdr;
+       __le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+#define SMB3_AES_CCM_NONCE 11
+#define SMB3_AES_GCM_NONCE 12
+
+/* Transform flags (for the 3.0 dialect this flag indicates CCM) */
+#define TRANSFORM_FLAG_ENCRYPTED       0x0001
+struct smb2_transform_hdr {
+       __le32 ProtocolId;      /* 0xFD 'S' 'M' 'B' */
+       __u8   Signature[16];
+       __u8   Nonce[16];
+       __le32 OriginalMessageSize;
+       __u16  Reserved1;
+       __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */
+       __le64  SessionId;
+} __packed;
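
Nonce[] is 16 bytes on the wire, but only the first SMB3_AES_CCM_NONCE (11)
or SMB3_AES_GCM_NONCE (12) bytes are significant, depending on the
negotiated cipher. A sketch of the selection, assuming the
SMB2_ENCRYPTION_* cipher ids defined further down in this header (the
AES-256 variants pair the same way):

	/* Sketch: nonce length by negotiated cipher (GCM vs. CCM). */
	static inline unsigned int smb3_nonce_len(__le16 cipher)
	{
		return cipher == SMB2_ENCRYPTION_AES128_GCM ?
			SMB3_AES_GCM_NONCE : SMB3_AES_CCM_NONCE;
	}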
+
+
+/* See MS-SMB2 2.2.42 */
+struct smb2_compression_transform_hdr_unchained {
+       __le32 ProtocolId;      /* 0xFC 'S' 'M' 'B' */
+       __le32 OriginalCompressedSegmentSize;
+       __le16 CompressionAlgorithm;
+       __le16 Flags;
+       __le16 Length; /* if chained it is length, else offset */
+} __packed;
+
+/* See MS-SMB2 2.2.42.1 */
+#define SMB2_COMPRESSION_FLAG_NONE     0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED  0x0001
+
+struct compression_payload_header {
+       __le16  CompressionAlgorithm;
+       __le16  Flags;
+       __le32  Length; /* length of compressed payload including field below if present */
+       /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2 */
+struct smb2_compression_transform_hdr_chained {
+       __le32 ProtocolId;      /* 0xFC 'S' 'M' 'B' */
+       __le32 OriginalCompressedSegmentSize;
+       /* struct compression_payload_header[] */
+} __packed;
+
+/* See MS-SMB2 2.2.42.2.2 */
+struct compression_pattern_payload_v1 {
+       __le16  Pattern;
+       __le16  Reserved1;
+       __le16  Reserved2;
+       __le32  Repetitions;
+} __packed;
+
+/* See MS-SMB2 section 2.2.9.2 */
+/* Context Types */
+#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
+#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
+
+struct tree_connect_contexts {
+       __le16 ContextType;
+       __le16 DataLength;
+       __le32 Reserved;
+       __u8   Data[];
+} __packed;
+
+/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
+struct smb3_blob_data {
+       __le16 BlobSize;
+       __u8   BlobData[];
+} __packed;
+
+/* Valid values for Attr */
+#define SE_GROUP_MANDATORY             0x00000001
+#define SE_GROUP_ENABLED_BY_DEFAULT    0x00000002
+#define SE_GROUP_ENABLED               0x00000004
+#define SE_GROUP_OWNER                 0x00000008
+#define SE_GROUP_USE_FOR_DENY_ONLY     0x00000010
+#define SE_GROUP_INTEGRITY             0x00000020
+#define SE_GROUP_INTEGRITY_ENABLED     0x00000040
+#define SE_GROUP_RESOURCE              0x20000000
+#define SE_GROUP_LOGON_ID              0xC0000000
+
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+
+struct sid_array_data {
+       __le16 SidAttrCount;
+       /* SidAttrList - array of sid_attr_data structs */
+} __packed;
+
+struct luid_attr_data {
+} __packed;
+
+/*
+ * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
+ * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR_DATA
+ */
+
+struct privilege_array_data {
+       __le16 PrivilegeCount;
+       /* array of privilege_data structs */
+} __packed;
+
+struct remoted_identity_tcon_context {
+       __le16 TicketType; /* must be 0x0001 */
+       __le16 TicketSize; /* total size of this struct */
+       __le16 User; /* offset to SID_ATTR_DATA struct with user info */
+       __le16 UserName; /* offset to null terminated Unicode username string */
+       __le16 Domain; /* offset to null terminated Unicode domain name */
+       __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
+       __le16 RestrictedGroups; /* similar to above */
+       __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
+       __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
+       __le16 Owner; /* offset to BLOB_DATA struct */
+       __le16 DefaultDacl; /* offset to BLOB_DATA struct */
+       __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
+       __le16 UserClaims; /* offset to BLOB_DATA struct */
+       __le16 DeviceClaims; /* offset to BLOB_DATA struct */
+       __u8   TicketInfo[]; /* variable length buf - remoted identity data */
+} __packed;
+
+struct smb2_tree_connect_req_extension {
+       __le32 TreeConnectContextOffset;
+       __le16 TreeConnectContextCount;
+       __u8  Reserved[10];
+       __u8  PathName[]; /* variable sized array */
+       /* followed by array of TreeConnectContexts */
+} __packed;
+
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
+#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
+#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
+
+struct smb2_tree_connect_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 9 */
+       __le16 Flags;           /* Flags in SMB3.1.1 */
+       __le16 PathOffset;
+       __le16 PathLength;
+       __u8   Buffer[1];       /* variable length */
+} __packed;
+
+/* Possible ShareType values */
+#define SMB2_SHARE_TYPE_DISK   0x01
+#define SMB2_SHARE_TYPE_PIPE   0x02
+#define        SMB2_SHARE_TYPE_PRINT   0x03
+
+/*
+ * Possible ShareFlags - exactly one of the first 4 caching flags must be
+ * set; any of the remaining (SHI1005) flags may be set individually or in
+ * combination.
+ */
+#define SMB2_SHAREFLAG_MANUAL_CACHING                  0x00000000
+#define SMB2_SHAREFLAG_AUTO_CACHING                    0x00000010
+#define SMB2_SHAREFLAG_VDO_CACHING                     0x00000020
+#define SMB2_SHAREFLAG_NO_CACHING                      0x00000030
+#define SHI1005_FLAGS_DFS                              0x00000001
+#define SHI1005_FLAGS_DFS_ROOT                         0x00000002
+#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS         0x00000100
+#define SHI1005_FLAGS_FORCE_SHARED_DELETE              0x00000200
+#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING          0x00000400
+#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM      0x00000800
+#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK             0x00001000
+#define SHI1005_FLAGS_ENABLE_HASH_V1                   0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2                   0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA                     0x00008000
+#define SMB2_SHAREFLAG_IDENTITY_REMOTING               0x00040000 /* 3.1.1 */
+#define SMB2_SHAREFLAG_COMPRESS_DATA                   0x00100000 /* 3.1.1 */
+#define SHI1005_FLAGS_ALL                              0x0014FF33
+
+/* Possible share capabilities */
+#define SMB2_SHARE_CAP_DFS     cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT        cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
+
+struct smb2_tree_connect_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 16 */
+       __u8   ShareType;       /* see below */
+       __u8   Reserved;
+       __le32 ShareFlags;      /* see below */
+       __le32 Capabilities;    /* see below */
+       __le32 MaximalAccess;
+} __packed;
+
+struct smb2_tree_disconnect_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 4 */
+       __le16 Reserved;
+} __packed;
+
+struct smb2_tree_disconnect_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 4 */
+       __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NEGOTIATE_PROTOCOL  See MS-SMB2 section 2.2.3
+ */
+/* SecurityMode flags */
+#define SMB2_NEGOTIATE_SIGNING_ENABLED     0x0001
+#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE  cpu_to_le16(0x0001)
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED           0x0002
+#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002)
+#define SMB2_SEC_MODE_FLAGS_ALL            0x0003
+
+/* Capabilities flags */
+#define SMB2_GLOBAL_CAP_DFS            0x00000001
+#define SMB2_GLOBAL_CAP_LEASING                0x00000002 /* Resp only. New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU      0x00000004 /* Resp only. New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_MULTI_CHANNEL  0x00000008 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING  0x00000020 /* New to SMB3 */
+#define SMB2_GLOBAL_CAP_ENCRYPTION     0x00000040 /* New to SMB3 */
+/* Internal types */
+#define SMB2_NT_FIND                   0x00100000
+#define SMB2_LARGE_FILES               0x00200000
+
+#define SMB2_CLIENT_GUID_SIZE          16
+#define SMB2_CREATE_GUID_SIZE          16
+
+/* Dialects */
+#define SMB10_PROT_ID  0x0000 /* local only, not sent on wire w/CIFS negprot */
+#define SMB20_PROT_ID  0x0202
+#define SMB21_PROT_ID  0x0210
+#define SMB2X_PROT_ID  0x02FF
+#define SMB30_PROT_ID  0x0300
+#define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
+#define BAD_PROT_ID    0xFFFF
+
+#define SMB311_SALT_SIZE                       32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512  cpu_to_le16(0x0001)
+#define SMB2_PREAUTH_HASH_SIZE 64
+
+/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES    cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES           cpu_to_le16(2)
+#define SMB2_COMPRESSION_CAPABILITIES          cpu_to_le16(3)
+#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID      cpu_to_le16(5)
+#define SMB2_TRANSPORT_CAPABILITIES            cpu_to_le16(6)
+#define SMB2_RDMA_TRANSFORM_CAPABILITIES       cpu_to_le16(7)
+#define SMB2_SIGNING_CAPABILITIES              cpu_to_le16(8)
+#define SMB2_POSIX_EXTENSIONS_AVAILABLE                cpu_to_le16(0x100)
+
+struct smb2_neg_context {
+       __le16  ContextType;
+       __le16  DataLength;
+       __le32  Reserved;
+       /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */
+} __packed;
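
Given the padding note above, a parser has to advance by the aligned size rather than the raw DataLength. A walking sketch (hypothetical helper, bounds checking omitted):

static struct smb2_neg_context *next_neg_context(struct smb2_neg_context *ctx)
{
	/* 8 byte fixed header plus variable data, rounded up to 8 bytes */
	size_t len = sizeof(*ctx) + le16_to_cpu(ctx->DataLength);

	return (struct smb2_neg_context *)((u8 *)ctx + ALIGN(len, 8));
}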
+
+/*
+ * The SaltLength that the server sends can be zero, so the only three required
+ * fields (all __le16) end up six bytes total; thus the minimum context data
+ * length in the response is six bytes, which accounts for
+ *
+ *      HashAlgorithmCount, SaltLength, and 1 HashAlgorithm.
+ */
+#define MIN_PREAUTH_CTXT_DATA_LEN 6
+
+struct smb2_preauth_neg_context {
+       __le16  ContextType; /* 1 */
+       __le16  DataLength;
+       __le32  Reserved;
+       __le16  HashAlgorithmCount; /* 1 */
+       __le16  SaltLength;
+       __le16  HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+       __u8    Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM     cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM     cpu_to_le16(0x0002)
+#define SMB2_ENCRYPTION_AES256_CCM      cpu_to_le16(0x0003)
+#define SMB2_ENCRYPTION_AES256_GCM      cpu_to_le16(0x0004)
+
+/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */
+#define MIN_ENCRYPT_CTXT_DATA_LEN      4
+struct smb2_encryption_neg_context {
+       __le16  ContextType; /* 2 */
+       __le16  DataLength;
+       __le32  Reserved;
+       /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
+       __le16  CipherCount; /* AES128-GCM and AES128-CCM by default */
+       __le16  Ciphers[];
+} __packed;
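
For illustration, a server-side sketch (hypothetical helpers) that walks Ciphers[] honoring CipherCount and picks the first mutually supported cipher, here assuming only the two SMB3.1.1 defaults are supported:

static bool supports_cipher(__le16 cipher)
{
	return cipher == SMB2_ENCRYPTION_AES128_GCM ||
	       cipher == SMB2_ENCRYPTION_AES128_CCM;
}

static __le16 choose_cipher(const struct smb2_encryption_neg_context *ctx)
{
	int i, count = le16_to_cpu(ctx->CipherCount);

	for (i = 0; i < count; i++)
		if (supports_cipher(ctx->Ciphers[i]))
			return ctx->Ciphers[i];
	return 0;	/* no common cipher */
}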
+
+/* See MS-SMB2 2.2.3.1.3 */
+#define SMB3_COMPRESS_NONE     cpu_to_le16(0x0000)
+#define SMB3_COMPRESS_LZNT1    cpu_to_le16(0x0001)
+#define SMB3_COMPRESS_LZ77     cpu_to_le16(0x0002)
+#define SMB3_COMPRESS_LZ77_HUFF        cpu_to_le16(0x0003)
+/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
+#define SMB3_COMPRESS_PATTERN  cpu_to_le16(0x0004) /* Pattern_V1 */
+
+/* Compression Flags */
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE                cpu_to_le32(0x00000000)
+#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED     cpu_to_le32(0x00000001)
+
+struct smb2_compression_capabilities_context {
+       __le16  ContextType; /* 3 */
+       __le16  DataLength;
+       __le32  Reserved;
+       __le16  CompressionAlgorithmCount;
+       __le16  Padding;
+       __le32  Flags;
+       __le16  CompressionAlgorithms[3];
+       __u16   Pad;  /* Some servers require pad to DataLen multiple of 8 */
+       /* Check whether the pad is needed before sending */
+} __packed;
+
+/*
+ * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4.
+ * Its struct simply contains NetName, an array of Unicode characters
+ */
+struct smb2_netname_neg_context {
+       __le16  ContextType; /* 5 */
+       __le16  DataLength;
+       __le32  Reserved;
+       __le16  NetName[]; /* hostname of target converted to UCS-2 */
+} __packed;
+
+/*
+ * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5
+ * and 2.2.4.1.5
+ */
+
+/* Flags */
+#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY   0x00000001
+
+struct smb2_transport_capabilities_context {
+       __le16  ContextType; /* 6 */
+       __le16  DataLength;
+       __u32   Reserved;
+       __le32  Flags;
+       __u32   Pad;
+} __packed;
+
+/*
+ * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6
+ * and 2.2.4.1.6
+ */
+
+/* RDMA Transform IDs */
+#define SMB2_RDMA_TRANSFORM_NONE       0x0000
+#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001
+#define SMB2_RDMA_TRANSFORM_SIGNING    0x0002
+
+struct smb2_rdma_transform_capabilities_context {
+       __le16  ContextType; /* 7 */
+       __le16  DataLength;
+       __u32   Reserved;
+       __le16  TransformCount;
+       __u16   Reserved1;
+       __u32   Reserved2;
+       __le16  RDMATransformIds[];
+} __packed;
+
+/*
+ * For signing capabilities context see MS-SMB2 2.2.3.1.7
+ * and 2.2.4.1.7
+ */
+
+/* Signing algorithms */
+#define SIGNING_ALG_HMAC_SHA256    0
+#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0)
+#define SIGNING_ALG_AES_CMAC       1
+#define SIGNING_ALG_AES_CMAC_LE    cpu_to_le16(1)
+#define SIGNING_ALG_AES_GMAC       2
+#define SIGNING_ALG_AES_GMAC_LE    cpu_to_le16(2)
+
+struct smb2_signing_capabilities {
+       __le16  ContextType; /* 8 */
+       __le16  DataLength;
+       __le32  Reserved;
+       __le16  SigningAlgorithmCount;
+       __le16  SigningAlgorithms[];
+       /*  Followed by padding to 8 byte boundary (required by some servers) */
+} __packed;
+
+#define POSIX_CTXT_DATA_LEN    16
+struct smb2_posix_neg_context {
+       __le16  ContextType; /* 0x100 */
+       __le16  DataLength;
+       __le32  Reserved;
+       __u8    Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */
+} __packed;
+
+struct smb2_negotiate_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 36 */
+       __le16 DialectCount;
+       __le16 SecurityMode;
+       __le16 Reserved;        /* MBZ */
+       __le32 Capabilities;
+       __u8   ClientGUID[SMB2_CLIENT_GUID_SIZE];
+       /* In SMB3.02 and earlier the next three fields were an MBZ le64 ClientStartTime */
+       __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+       __le16 NegotiateContextCount;  /* SMB3.1.1 only. MBZ earlier */
+       __le16 Reserved2;
+       __le16 Dialects[];
+} __packed;
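
A sketch (hypothetical helper) of how the flexible Dialects[] array pairs with DialectCount; the dialect IDs are the ones defined above:

static void fill_dialects(struct smb2_negotiate_req *req)
{
	req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID);
	req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID);
	req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID);
	req->Dialects[3] = cpu_to_le16(SMB311_PROT_ID);
	req->DialectCount = cpu_to_le16(4);
}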
+
+struct smb2_negotiate_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 65 */
+       __le16 SecurityMode;
+       __le16 DialectRevision;
+       __le16 NegotiateContextCount;   /* Prior to SMB3.1.1 was Reserved & MBZ */
+       __u8   ServerGUID[16];
+       __le32 Capabilities;
+       __le32 MaxTransactSize;
+       __le32 MaxReadSize;
+       __le32 MaxWriteSize;
+       __le64 SystemTime;      /* MBZ */
+       __le64 ServerStartTime;
+       __le16 SecurityBufferOffset;
+       __le16 SecurityBufferLength;
+       __le32 NegotiateContextOffset;  /* Pre:SMB3.1.1 was reserved/ignored */
+       __u8   Buffer[1];       /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_SESSION_SETUP  See MS-SMB2 section 2.2.5
+ */
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING          0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA     0x04
+
+struct smb2_sess_setup_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 25 */
+       __u8   Flags;
+       __u8   SecurityMode;
+       __le32 Capabilities;
+       __le32 Channel;
+       __le16 SecurityBufferOffset;
+       __le16 SecurityBufferLength;
+       __le64 PreviousSessionId;
+       __u8   Buffer[1];       /* variable length GSS security buffer */
+} __packed;
+
+/* Currently defined SessionFlags */
+#define SMB2_SESSION_FLAG_IS_GUEST        0x0001
+#define SMB2_SESSION_FLAG_IS_GUEST_LE     cpu_to_le16(0x0001)
+#define SMB2_SESSION_FLAG_IS_NULL         0x0002
+#define SMB2_SESSION_FLAG_IS_NULL_LE      cpu_to_le16(0x0002)
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA    0x0004
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004)
+
+struct smb2_sess_setup_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 9 */
+       __le16 SessionFlags;
+       __le16 SecurityBufferOffset;
+       __le16 SecurityBufferLength;
+       __u8   Buffer[1];       /* variable length GSS security buffer */
+} __packed;
+
+
+/*
+ * SMB2_LOGOFF  See MS-SMB2 section 2.2.7
+ */
+struct smb2_logoff_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 4 */
+       __le16 Reserved;
+} __packed;
+
+struct smb2_logoff_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 4 */
+       __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_CLOSE  See MS-SMB2 section 2.2.15
+ */
+/* Currently defined values for close flags */
+#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB       cpu_to_le16(0x0001)
+struct smb2_close_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 24 */
+       __le16 Flags;
+       __le32 Reserved;
+       __le64  PersistentFileId; /* opaque endianness */
+       __le64  VolatileFileId; /* opaque endianness */
+} __packed;
+
+/*
+ * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data)
+ */
+#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124
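
The 124 comes from the 64 byte SMB2 header plus the 60 byte close response body. Since struct smb2_close_rsp embeds the header, a build-time check could document the arithmetic (sketch, assuming static_assert from <linux/build_bug.h> is available in this context):

/* 64 (header) + 60 (body) == 124 */
static_assert(sizeof(struct smb2_close_rsp) == MAX_SMB2_CLOSE_RESPONSE_SIZE,
	      "SMB2 close response size mismatch");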
+
+struct smb2_close_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* 60 */
+       __le16 Flags;
+       __le32 Reserved;
+       __le64 CreationTime;
+       __le64 LastAccessTime;
+       __le64 LastWriteTime;
+       __le64 ChangeTime;
+       __le64 AllocationSize;  /* Beginning of FILE_STANDARD_INFO equivalent */
+       __le64 EndOfFile;
+       __le32 Attributes;
+} __packed;
+
+
+/*
+ * SMB2_READ  See MS-SMB2 section 2.2.19
+ */
+/* For the read request Flags field below, the following flags are defined for SMB3.02 and later */
+#define SMB2_READFLAG_READ_UNBUFFERED  0x01
+#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */
+
+/* Channel field for read and write: exactly one of the following flags can be set */
+#define SMB2_CHANNEL_NONE               cpu_to_le32(0x00000000)
+#define SMB2_CHANNEL_RDMA_V1            cpu_to_le32(0x00000001)
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002)
+#define SMB2_CHANNEL_RDMA_TRANSFORM     cpu_to_le32(0x00000003)
+
+/* SMB2 read request without RFC1001 length at the beginning */
+struct smb2_read_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 49 */
+       __u8   Padding; /* offset from start of SMB2 header at which to place read data */
+       __u8   Flags; /* MBZ unless SMB3.02 or later */
+       __le32 Length;
+       __le64 Offset;
+       __le64  PersistentFileId;
+       __le64  VolatileFileId;
+       __le32 MinimumCount;
+       __le32 Channel; /* MBZ unless SMB3 or later */
+       __le32 RemainingBytes;
+       __le16 ReadChannelInfoOffset;
+       __le16 ReadChannelInfoLength;
+       __u8   Buffer[1];
+} __packed;
+
+/* Read flags */
+#define SMB2_READFLAG_RESPONSE_NONE            cpu_to_le32(0x00000000)
+#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM  cpu_to_le32(0x00000001)
+
+struct smb2_read_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 17 */
+       __u8   DataOffset;
+       __u8   Reserved;
+       __le32 DataLength;
+       __le32 DataRemaining;
+       __le32 Flags;
+       __u8   Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_WRITE  See MS-SMB2 section 2.2.21
+ */
+/* For the write request Flags field below, the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH   0x00000001      /* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED        0x00000002      /* SMB3.02 or later */
+
+struct smb2_write_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 49 */
+       __le16 DataOffset; /* offset from start of SMB2 header to write data */
+       __le32 Length;
+       __le64 Offset;
+       __le64  PersistentFileId; /* opaque endianness */
+       __le64  VolatileFileId; /* opaque endianness */
+       __le32 Channel; /* MBZ unless SMB3.02 or later */
+       __le32 RemainingBytes;
+       __le16 WriteChannelInfoOffset;
+       __le16 WriteChannelInfoLength;
+       __le32 Flags;
+       __u8   Buffer[1];
+} __packed;
+
+struct smb2_write_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize; /* Must be 17 */
+       __u8   DataOffset;
+       __u8   Reserved;
+       __le32 DataLength;
+       __le32 DataRemaining;
+       __u32  Reserved2;
+       __u8   Buffer[1];
+} __packed;
+
+
+/*
+ * SMB2_FLUSH  See MS-SMB2 section 2.2.17
+ */
+struct smb2_flush_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 24 */
+       __le16 Reserved1;
+       __le32 Reserved2;
+       __le64  PersistentFileId;
+       __le64  VolatileFileId;
+} __packed;
+
+struct smb2_flush_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;
+       __le16 Reserved;
+} __packed;
+
+
+/*
+ * SMB2_NOTIFY  See MS-SMB2 section 2.2.35
+ */
+/* notify flags */
+#define SMB2_WATCH_TREE                        0x0001
+
+/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */
+#define FILE_NOTIFY_CHANGE_FILE_NAME           0x00000001
+#define FILE_NOTIFY_CHANGE_DIR_NAME            0x00000002
+#define FILE_NOTIFY_CHANGE_ATTRIBUTES          0x00000004
+#define FILE_NOTIFY_CHANGE_SIZE                        0x00000008
+#define FILE_NOTIFY_CHANGE_LAST_WRITE          0x00000010
+#define FILE_NOTIFY_CHANGE_LAST_ACCESS         0x00000020
+#define FILE_NOTIFY_CHANGE_CREATION            0x00000040
+#define FILE_NOTIFY_CHANGE_EA                  0x00000080
+#define FILE_NOTIFY_CHANGE_SECURITY            0x00000100
+#define FILE_NOTIFY_CHANGE_STREAM_NAME         0x00000200
+#define FILE_NOTIFY_CHANGE_STREAM_SIZE         0x00000400
+#define FILE_NOTIFY_CHANGE_STREAM_WRITE                0x00000800
+
+/* SMB2 Notify Action Flags */
+#define FILE_ACTION_ADDED                       0x00000001
+#define FILE_ACTION_REMOVED                     0x00000002
+#define FILE_ACTION_MODIFIED                    0x00000003
+#define FILE_ACTION_RENAMED_OLD_NAME            0x00000004
+#define FILE_ACTION_RENAMED_NEW_NAME            0x00000005
+#define FILE_ACTION_ADDED_STREAM                0x00000006
+#define FILE_ACTION_REMOVED_STREAM              0x00000007
+#define FILE_ACTION_MODIFIED_STREAM             0x00000008
+#define FILE_ACTION_REMOVED_BY_DELETE           0x00000009
+
+struct smb2_change_notify_req {
+       struct smb2_hdr hdr;
+       __le16  StructureSize;
+       __le16  Flags;
+       __le32  OutputBufferLength;
+       __le64  PersistentFileId; /* opaque endianness */
+       __le64  VolatileFileId; /* opaque endianness */
+       __le32  CompletionFilter;
+       __u32   Reserved;
+} __packed;
+
+struct smb2_change_notify_rsp {
+       struct smb2_hdr hdr;
+       __le16  StructureSize;  /* Must be 9 */
+       __le16  OutputBufferOffset;
+       __le32  OutputBufferLength;
+       __u8    Buffer[1]; /* array of file notify structs */
+} __packed;
+
+
+/*
+ * SMB2_CREATE  See MS-SMB2 section 2.2.13
+ */
+/* Oplock levels */
+#define SMB2_OPLOCK_LEVEL_NONE         0x00
+#define SMB2_OPLOCK_LEVEL_II           0x01
+#define SMB2_OPLOCK_LEVEL_EXCLUSIVE    0x08
+#define SMB2_OPLOCK_LEVEL_BATCH                0x09
+#define SMB2_OPLOCK_LEVEL_LEASE                0xFF
+/* Non-spec internal type */
+#define SMB2_OPLOCK_LEVEL_NOCHANGE     0x99
+
+/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */
+#define IL_ANONYMOUS           cpu_to_le32(0x00000000)
+#define IL_IDENTIFICATION      cpu_to_le32(0x00000001)
+#define IL_IMPERSONATION       cpu_to_le32(0x00000002)
+#define IL_DELEGATE            cpu_to_le32(0x00000003)
+
+/* File Attributes */
+#define FILE_ATTRIBUTE_READONLY                        0x00000001
+#define FILE_ATTRIBUTE_HIDDEN                  0x00000002
+#define FILE_ATTRIBUTE_SYSTEM                  0x00000004
+#define FILE_ATTRIBUTE_DIRECTORY               0x00000010
+#define FILE_ATTRIBUTE_ARCHIVE                 0x00000020
+#define FILE_ATTRIBUTE_NORMAL                  0x00000080
+#define FILE_ATTRIBUTE_TEMPORARY               0x00000100
+#define FILE_ATTRIBUTE_SPARSE_FILE             0x00000200
+#define FILE_ATTRIBUTE_REPARSE_POINT           0x00000400
+#define FILE_ATTRIBUTE_COMPRESSED              0x00000800
+#define FILE_ATTRIBUTE_OFFLINE                 0x00001000
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED     0x00002000
+#define FILE_ATTRIBUTE_ENCRYPTED               0x00004000
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM                0x00008000
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA           0x00020000
+#define FILE_ATTRIBUTE__MASK                   0x00007FB7
+
+#define FILE_ATTRIBUTE_READONLY_LE              cpu_to_le32(0x00000001)
+#define FILE_ATTRIBUTE_HIDDEN_LE               cpu_to_le32(0x00000002)
+#define FILE_ATTRIBUTE_SYSTEM_LE               cpu_to_le32(0x00000004)
+#define FILE_ATTRIBUTE_DIRECTORY_LE            cpu_to_le32(0x00000010)
+#define FILE_ATTRIBUTE_ARCHIVE_LE              cpu_to_le32(0x00000020)
+#define FILE_ATTRIBUTE_NORMAL_LE               cpu_to_le32(0x00000080)
+#define FILE_ATTRIBUTE_TEMPORARY_LE            cpu_to_le32(0x00000100)
+#define FILE_ATTRIBUTE_SPARSE_FILE_LE          cpu_to_le32(0x00000200)
+#define FILE_ATTRIBUTE_REPARSE_POINT_LE                cpu_to_le32(0x00000400)
+#define FILE_ATTRIBUTE_COMPRESSED_LE           cpu_to_le32(0x00000800)
+#define FILE_ATTRIBUTE_OFFLINE_LE              cpu_to_le32(0x00001000)
+#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE  cpu_to_le32(0x00002000)
+#define FILE_ATTRIBUTE_ENCRYPTED_LE            cpu_to_le32(0x00004000)
+#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE     cpu_to_le32(0x00008000)
+#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE                cpu_to_le32(0x00020000)
+#define FILE_ATTRIBUTE_MASK_LE                 cpu_to_le32(0x00007FB7)
+
+/* Desired Access Flags */
+#define FILE_READ_DATA_LE              cpu_to_le32(0x00000001)
+#define FILE_LIST_DIRECTORY_LE         cpu_to_le32(0x00000001)
+#define FILE_WRITE_DATA_LE             cpu_to_le32(0x00000002)
+#define FILE_APPEND_DATA_LE            cpu_to_le32(0x00000004)
+#define FILE_ADD_SUBDIRECTORY_LE       cpu_to_le32(0x00000004)
+#define FILE_READ_EA_LE                        cpu_to_le32(0x00000008)
+#define FILE_WRITE_EA_LE               cpu_to_le32(0x00000010)
+#define FILE_EXECUTE_LE                        cpu_to_le32(0x00000020)
+#define FILE_DELETE_CHILD_LE           cpu_to_le32(0x00000040)
+#define FILE_READ_ATTRIBUTES_LE                cpu_to_le32(0x00000080)
+#define FILE_WRITE_ATTRIBUTES_LE       cpu_to_le32(0x00000100)
+#define FILE_DELETE_LE                 cpu_to_le32(0x00010000)
+#define FILE_READ_CONTROL_LE           cpu_to_le32(0x00020000)
+#define FILE_WRITE_DAC_LE              cpu_to_le32(0x00040000)
+#define FILE_WRITE_OWNER_LE            cpu_to_le32(0x00080000)
+#define FILE_SYNCHRONIZE_LE            cpu_to_le32(0x00100000)
+#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000)
+#define FILE_MAXIMAL_ACCESS_LE         cpu_to_le32(0x02000000)
+#define FILE_GENERIC_ALL_LE            cpu_to_le32(0x10000000)
+#define FILE_GENERIC_EXECUTE_LE                cpu_to_le32(0x20000000)
+#define FILE_GENERIC_WRITE_LE          cpu_to_le32(0x40000000)
+#define FILE_GENERIC_READ_LE           cpu_to_le32(0x80000000)
+#define DESIRED_ACCESS_MASK             cpu_to_le32(0xF21F01FF)
+
+
+#define FILE_READ_DESIRED_ACCESS_LE     (FILE_READ_DATA_LE        |    \
+                                        FILE_READ_EA_LE          |     \
+                                        FILE_GENERIC_READ_LE)
+#define FILE_WRITE_DESIRE_ACCESS_LE     (FILE_WRITE_DATA_LE       |    \
+                                        FILE_APPEND_DATA_LE      |     \
+                                        FILE_WRITE_EA_LE         |     \
+                                        FILE_WRITE_ATTRIBUTES_LE |     \
+                                        FILE_GENERIC_WRITE_LE)
+
+/* ShareAccess Flags */
+#define FILE_SHARE_READ_LE             cpu_to_le32(0x00000001)
+#define FILE_SHARE_WRITE_LE            cpu_to_le32(0x00000002)
+#define FILE_SHARE_DELETE_LE           cpu_to_le32(0x00000004)
+#define FILE_SHARE_ALL_LE              cpu_to_le32(0x00000007)
+
+/* CreateDisposition Flags */
+#define FILE_SUPERSEDE_LE              cpu_to_le32(0x00000000)
+#define FILE_OPEN_LE                   cpu_to_le32(0x00000001)
+#define FILE_CREATE_LE                 cpu_to_le32(0x00000002)
+#define FILE_OPEN_IF_LE                cpu_to_le32(0x00000003)
+#define FILE_OVERWRITE_LE              cpu_to_le32(0x00000004)
+#define FILE_OVERWRITE_IF_LE           cpu_to_le32(0x00000005)
+#define FILE_CREATE_MASK_LE             cpu_to_le32(0x00000007)
+
+#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \
+                       | FILE_READ_ATTRIBUTES)
+#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
+                       | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES)
+#define FILE_EXEC_RIGHTS (FILE_EXECUTE)
+
+/* CreateOptions Flags */
+#define FILE_DIRECTORY_FILE_LE         cpu_to_le32(0x00000001)
+/* same as #define CREATE_NOT_FILE_LE  cpu_to_le32(0x00000001) */
+#define FILE_WRITE_THROUGH_LE          cpu_to_le32(0x00000002)
+#define FILE_SEQUENTIAL_ONLY_LE                cpu_to_le32(0x00000004)
+#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008)
+#define FILE_NON_DIRECTORY_FILE_LE     cpu_to_le32(0x00000040)
+#define FILE_COMPLETE_IF_OPLOCKED_LE   cpu_to_le32(0x00000100)
+#define FILE_NO_EA_KNOWLEDGE_LE                cpu_to_le32(0x00000200)
+#define FILE_RANDOM_ACCESS_LE          cpu_to_le32(0x00000800)
+#define FILE_DELETE_ON_CLOSE_LE                cpu_to_le32(0x00001000)
+#define FILE_OPEN_BY_FILE_ID_LE                cpu_to_le32(0x00002000)
+#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000)
+#define FILE_NO_COMPRESSION_LE         cpu_to_le32(0x00008000)
+#define FILE_OPEN_REPARSE_POINT_LE     cpu_to_le32(0x00200000)
+#define FILE_OPEN_NO_RECALL_LE         cpu_to_le32(0x00400000)
+#define CREATE_OPTIONS_MASK_LE          cpu_to_le32(0x00FFFFFF)
+
+#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \
+                       | FILE_READ_ATTRIBUTES_LE)
+#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \
+                       | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE)
+#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE)
+
+/* Create Context Values */
+#define SMB2_CREATE_EA_BUFFER                  "ExtA" /* extended attributes */
+#define SMB2_CREATE_SD_BUFFER                  "SecD" /* security descriptor */
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST     "DHnQ"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT   "DHnC"
+#define SMB2_CREATE_ALLOCATION_SIZE            "AISi"
+#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
+#define SMB2_CREATE_TIMEWARP_REQUEST           "TWrp"
+#define SMB2_CREATE_QUERY_ON_DISK_ID           "QFid"
+#define SMB2_CREATE_REQUEST_LEASE              "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2  "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2        "DH2C"
+#define SMB2_CREATE_TAG_POSIX          "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C"
+
+/* Flag (SMB3 open response) values */
+#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01
+
+struct create_context {
+       __le32 Next;
+       __le16 NameOffset;
+       __le16 NameLength;
+       __le16 Reserved;
+       __le16 DataOffset;
+       __le32 DataLength;
+       __u8 Buffer[];
+} __packed;
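
Create contexts are chained via Next, which is the offset in bytes to the following context (0 terminates the chain). A walking sketch (hypothetical helper, bounds checks omitted):

static struct create_context *next_create_context(struct create_context *cc)
{
	u32 next = le32_to_cpu(cc->Next);

	return next ? (struct create_context *)((u8 *)cc + next) : NULL;
}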
+
+struct smb2_create_req {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 57 */
+       __u8   SecurityFlags;
+       __u8   RequestedOplockLevel;
+       __le32 ImpersonationLevel;
+       __le64 SmbCreateFlags;
+       __le64 Reserved;
+       __le32 DesiredAccess;
+       __le32 FileAttributes;
+       __le32 ShareAccess;
+       __le32 CreateDisposition;
+       __le32 CreateOptions;
+       __le16 NameOffset;
+       __le16 NameLength;
+       __le32 CreateContextsOffset;
+       __le32 CreateContextsLength;
+       __u8   Buffer[];
+} __packed;
+
+struct smb2_create_rsp {
+       struct smb2_hdr hdr;
+       __le16 StructureSize;   /* Must be 89 */
+       __u8   OplockLevel;
+       __u8   Flags;  /* 0x01 if reparse point */
+       __le32 CreateAction;
+       __le64 CreationTime;
+       __le64 LastAccessTime;
+       __le64 LastWriteTime;
+       __le64 ChangeTime;
+       __le64 AllocationSize;
+       __le64 EndofFile;
+       __le32 FileAttributes;
+       __le32 Reserved2;
+       __le64  PersistentFileId;
+       __le64  VolatileFileId;
+       __le32 CreateContextsOffset;
+       __le32 CreateContextsLength;
+       __u8   Buffer[1];
+} __packed;
+
+
+#endif                         /* _COMMON_SMB2PDU_H */
index bcef3a6f4c4b586ad559c67c90cc61d262a9a21f..3bfc0f8fbd5bc7fbe3ee94b58e1724fffacf2eee 100644 (file)
@@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb)
        spin_unlock(&sb_lock);
        up_write(&sb->s_umount);
        if (sb->s_bdi != &noop_backing_dev_info) {
+               if (sb->s_iflags & SB_I_PERSB_BDI)
+                       bdi_unregister(sb->s_bdi);
                bdi_put(sb->s_bdi);
                sb->s_bdi = &noop_backing_dev_info;
        }
@@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
        }
        WARN_ON(sb->s_bdi != &noop_backing_dev_info);
        sb->s_bdi = bdi;
+       sb->s_iflags |= SB_I_PERSB_BDI;
 
        return 0;
 }
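
Taken together, the two hunks pair registration with unregistration: super_setup_bdi_name() now tags the superblock with SB_I_PERSB_BDI, and generic_shutdown_super() unregisters such a per-sb bdi before dropping the last reference. A filesystem-side sketch (all names hypothetical):

static atomic_t examplefs_seq = ATOMIC_INIT(0);

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	int err;

	/* Registers a per-sb bdi and sets SB_I_PERSB_BDI, so
	 * generic_shutdown_super() unregisters it before bdi_put(). */
	err = super_setup_bdi_name(sb, "examplefs-%d",
				   atomic_inc_return(&examplefs_seq));
	if (err)
		return err;
	/* ... remaining superblock setup ... */
	return 0;
}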
index d16302d3eb5971200843c82efe837f4c9d56429c..596ab20922892046de8d38fd8db83dab68ce4aba 100644 (file)
@@ -80,20 +80,6 @@ static inline int arch_is_kernel_data(unsigned long addr)
 }
 #endif
 
-/*
- * Check if an address is part of freed initmem. This is needed on architectures
- * with virt == phys kernel mapping, for code that wants to check if an address
- * is part of a static object within [_stext, _end]. After initmem is freed,
- * memory can be allocated from it, and such allocations would then have
- * addresses within the range [_stext, _end].
- */
-#ifndef arch_is_kernel_initmem_freed
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-       return 0;
-}
-#endif
-
 /**
  * memory_contains - checks if an object is contained within a memory region
  * @begin: virtual address of the beginning of the memory region
index fbc2146050a4eb8befc36b08b9421e60db71b78e..375715b0535fb0ae7a7b85336e755bad98804087 100644 (file)
@@ -577,7 +577,6 @@ extern u32 osc_sb_native_usb4_control;
 #define OSC_PCI_MSI_SUPPORT                    0x00000010
 #define OSC_PCI_EDR_SUPPORT                    0x00000080
 #define OSC_PCI_HPX_TYPE_3_SUPPORT             0x00000100
-#define OSC_PCI_SUPPORT_MASKS                  0x0000019f
 
 /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */
 #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL      0x00000001
@@ -587,7 +586,6 @@ extern u32 osc_sb_native_usb4_control;
 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL     0x00000010
 #define OSC_PCI_EXPRESS_LTR_CONTROL            0x00000020
 #define OSC_PCI_EXPRESS_DPC_CONTROL            0x00000080
-#define OSC_PCI_CONTROL_MASKS                  0x000000bf
 
 #define ACPI_GSB_ACCESS_ATTRIB_QUICK           0x00000002
 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV         0x00000004
index 33207004cfded8da3942a0e5ef66392bd6f62880..993c5628a72638f2cd3d36efdf5c006353076d78 100644 (file)
@@ -103,6 +103,9 @@ struct wb_completion {
  * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
  * is tested for blkcg after lookup and removed from index on mismatch so
  * that a new wb for the combination can be created.
+ *
+ * Each bdi_writeback that is not embedded into the backing_dev_info must hold
+ * a reference to the parent backing_dev_info.  See cgwb_create() for details.
  */
 struct bdi_writeback {
        struct backing_dev_info *bdi;   /* our parent bdi */
index 9c14f0a8dbe5ba4a27b8ba1cccfa2eb9e085720c..483979c1b9f43d67ea16e85622400f67f37f7a0a 100644 (file)
@@ -141,7 +141,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(int sync, long timeout);
 
 static inline bool mapping_can_writeback(struct address_space *mapping)
 {
index 53fd8c3cdbd04441425e857edee39b626fe8563e..bd801023504b29197f103f49030a5fd87c1bf4d7 100644 (file)
@@ -46,6 +46,7 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
                                        struct cma **res_cma);
 extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align,
                              bool no_warn);
+extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count);
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
index 7bbd8df025325b2629829da5170e0f3cf602f1cf..ccbbd31b3aae52f17a6d2bb2e2e73f1ce9fff25f 100644 (file)
 #else
 #define __diag_GCC_8(s)
 #endif
+
+/*
+ * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size"
+ * attribute) does not work, and must be disabled.
+ */
+#if GCC_VERSION < 90100
+#undef __alloc_size__
+#endif
index e6ec634039658e714f1c9ad9f2854580d646a318..b9121afd873316f0778e7e63d8f36365a66672dd 100644 (file)
 #define __aligned(x)                    __attribute__((__aligned__(x)))
 #define __aligned_largest               __attribute__((__aligned__))
 
+/*
+ * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally
+ * available and includes other attributes.
+ *
+ *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size
+ */
+#define __alloc_size__(x, ...)         __attribute__((__alloc_size__(x, ## __VA_ARGS__)))
+
 /*
  * Note: users of __always_inline currently do not write "inline" themselves,
  * which seems to be required by gcc to apply the attribute according
 #define __deprecated
 
 /*
- * Optional: only supported since gcc >= 5.1
  * Optional: not supported by clang
  * Optional: not supported by icc
  *
 
 /*
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#malloc
  */
 #define __malloc                        __attribute__((__malloc__))
 
index 05ceb2e92b0e72fa12c79b7ab7f94666a83ae300..1d32f4c03c9ef1a3371ebeb347b656fce972d5ac 100644 (file)
@@ -250,6 +250,18 @@ struct ftrace_likely_data {
 # define __cficanonical
 #endif
 
+/*
+ * Any place that could be marked with the "alloc_size" attribute is also
+ * a place to be marked with the "malloc" attribute. Do this as part of the
+ * __alloc_size macro to avoid redundant attributes and to avoid missing a
+ * __malloc marking.
+ */
+#ifdef __alloc_size__
+# define __alloc_size(x, ...)  __alloc_size__(x, ## __VA_ARGS__) __malloc
+#else
+# define __alloc_size(x, ...)  __malloc
+#endif
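
Usage looks like the following sketch (hypothetical allocator prototypes): the attribute tells the compiler how large the returned object is, which feeds bounds diagnostics, and __malloc is applied automatically:

void *example_malloc(size_t size) __alloc_size(1);		/* size bytes */
void *example_calloc(size_t n, size_t size) __alloc_size(1, 2);	/* n * size bytes */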
+
 #ifndef asm_volatile_goto
 #define asm_volatile_goto(x...) asm goto(x)
 #endif
@@ -293,7 +305,13 @@ struct ftrace_likely_data {
 #ifdef __OPTIMIZE__
 # define __compiletime_assert(condition, msg, prefix, suffix)          \
        do {                                                            \
-               extern void prefix ## suffix(void) __compiletime_error(msg); \
+               /*                                                      \
+                * __noreturn is needed to give the compiler enough     \
+                * information to avoid certain possibly-uninitialized  \
+                * warnings (regardless of the build failing).          \
+                */                                                     \
+               __noreturn extern void prefix ## suffix(void)           \
+                       __compiletime_error(msg);                       \
                if (!(condition))                                       \
                        prefix ## suffix();                             \
        } while (0)
index d2b9c41c8edf5e8f552cf0fee5fe5bb9414fb090..d58e0476ee8e3be7a48d14d89c45d24438c684ca 100644 (file)
@@ -34,6 +34,8 @@
  */
 extern struct static_key_false cpusets_pre_enable_key;
 extern struct static_key_false cpusets_enabled_key;
+extern struct static_key_false cpusets_insane_config_key;
+
 static inline bool cpusets_enabled(void)
 {
        return static_branch_unlikely(&cpusets_enabled_key);
@@ -51,6 +53,19 @@ static inline void cpuset_dec(void)
        static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
 }
 
+/*
+ * This will get enabled whenever a cpuset configuration is considered
+ * unsupportable in general, e.g. a movable-only node which cannot satisfy
+ * any non-movable allocations (see update_nodemask). The page allocator
+ * needs to make additional checks for those configurations and this
+ * check is meant to guard those checks without any overhead for sane
+ * configurations.
+ */
+static inline bool cpusets_insane_config(void)
+{
+       return static_branch_unlikely(&cpusets_insane_config_key);
+}
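
Because this is a static branch, sane configurations pay only a patched-out nop; callers are expected to gate their extra validation on it, roughly like this sketch (the surrounding check is illustrative, not from this patch):

static bool alloc_needs_extra_checks(gfp_t gfp_mask)
{
	if (!cpusets_insane_config())
		return false;	/* common case: no overhead beyond a nop */
	/* hypothetical extra validation for movable-only-node setups */
	return !(gfp_mask & __GFP_MOVABLE);
}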
+
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_force_rebuild(void);
@@ -167,6 +182,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 
 static inline bool cpusets_enabled(void) { return false; }
 
+static inline bool cpusets_insane_config(void) { return false; }
+
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
index d68b67b8d458d76be4a0763e112f4263af0dc071..b4d4be3cc987f44c5ee5a6aa1b1520b347016a33 100644 (file)
@@ -14,6 +14,8 @@
 
 /* Minimal region size.  Every damon_region is aligned by this. */
 #define DAMON_MIN_REGION       PAGE_SIZE
+/* Max priority score for DAMON-based operation schemes */
+#define DAMOS_MAX_SCORE                (99)
 
 /**
  * struct damon_addr_range - Represents an address region of [@start, @end).
@@ -31,12 +33,22 @@ struct damon_addr_range {
  * @sampling_addr:     Address of the sample for the next access check.
  * @nr_accesses:       Access frequency of this region.
  * @list:              List head for siblings.
+ * @age:               Age of this region.
+ *
+ * @age is initially zero, increased for each aggregation interval, and reset
+ * to zero again if the access frequency is significantly changed.  If two
+ * regions are merged into a new region, both @nr_accesses and @age of the new
+ * region are set to the size-weighted average of those of the two regions.
  */
 struct damon_region {
        struct damon_addr_range ar;
        unsigned long sampling_addr;
        unsigned int nr_accesses;
        struct list_head list;
+
+       unsigned int age;
+/* private: Internal value for age calculation. */
+       unsigned int last_nr_accesses;
 };
 
 /**
@@ -59,16 +71,180 @@ struct damon_target {
        struct list_head list;
 };
 
+/**
+ * enum damos_action - Represents an action of a Data Access Monitoring-based
+ * Operation Scheme.
+ *
+ * @DAMOS_WILLNEED:    Call ``madvise()`` for the region with MADV_WILLNEED.
+ * @DAMOS_COLD:                Call ``madvise()`` for the region with MADV_COLD.
+ * @DAMOS_PAGEOUT:     Call ``madvise()`` for the region with MADV_PAGEOUT.
+ * @DAMOS_HUGEPAGE:    Call ``madvise()`` for the region with MADV_HUGEPAGE.
+ * @DAMOS_NOHUGEPAGE:  Call ``madvise()`` for the region with MADV_NOHUGEPAGE.
+ * @DAMOS_STAT:                Do nothing but count the stat.
+ */
+enum damos_action {
+       DAMOS_WILLNEED,
+       DAMOS_COLD,
+       DAMOS_PAGEOUT,
+       DAMOS_HUGEPAGE,
+       DAMOS_NOHUGEPAGE,
+       DAMOS_STAT,             /* Do nothing but only record the stat */
+};
+
+/**
+ * struct damos_quota - Controls the aggressiveness of the given scheme.
+ * @ms:                        Maximum milliseconds that the scheme can use.
+ * @sz:                        Maximum bytes of memory that the action can be applied to.
+ * @reset_interval:    Charge reset interval in milliseconds.
+ *
+ * @weight_sz:         Weight of the region's size for prioritization.
+ * @weight_nr_accesses:        Weight of the region's nr_accesses for prioritization.
+ * @weight_age:                Weight of the region's age for prioritization.
+ *
+ * To avoid consuming too much CPU time or IO resources for applying the
+ * &struct damos->action to large memory, DAMON allows users to set time and/or
+ * size quotas.  The quotas can be set by writing non-zero values to &ms and
+ * &sz, respectively.  If the time quota is set, DAMON tries to use only up to
+ * &ms milliseconds within &reset_interval for applying the action.  If the
+ * size quota is set, DAMON tries to apply the action only up to &sz bytes
+ * within &reset_interval.
+ *
+ * Internally, the time quota is transformed to a size quota using the
+ * estimated throughput of the scheme's action.  DAMON then compares it
+ * against &sz and uses the smaller one as the effective quota.
+ *
+ * For selecting regions within the quota, DAMON prioritizes the current
+ * scheme's target memory regions using the &struct damon_primitive->get_scheme_score.
+ * You could customize the prioritization logic by setting &weight_sz,
+ * &weight_nr_accesses, and &weight_age, because monitoring primitives are
+ * encouraged to respect those.
+ */
+struct damos_quota {
+       unsigned long ms;
+       unsigned long sz;
+       unsigned long reset_interval;
+
+       unsigned int weight_sz;
+       unsigned int weight_nr_accesses;
+       unsigned int weight_age;
+
+/* private: */
+       /* For throughput estimation */
+       unsigned long total_charged_sz;
+       unsigned long total_charged_ns;
+
+       unsigned long esz;      /* Effective size quota in bytes */
+
+       /* For charging the quota */
+       unsigned long charged_sz;
+       unsigned long charged_from;
+       struct damon_target *charge_target_from;
+       unsigned long charge_addr_from;
+
+       /* For prioritization */
+       unsigned long histogram[DAMOS_MAX_SCORE + 1];
+       unsigned int min_score;
+};
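
An illustrative initialization (values are made up): a quota allowing at most 100 ms of CPU time or 1 GiB of memory per 1 s reset interval, prioritizing hotter regions; the private fields are left zeroed for DAMON to manage:

struct damos_quota example_quota = {
	.ms			= 100,			/* <= 100 ms per interval */
	.sz			= 1024UL * 1024 * 1024,	/* or <= 1 GiB */
	.reset_interval		= 1000,			/* charge reset: 1 s */
	.weight_sz		= 0,
	.weight_nr_accesses	= 1,	/* prioritize by access frequency */
	.weight_age		= 0,
};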
+
+/**
+ * enum damos_wmark_metric - Represents the watermark metric.
+ *
+ * @DAMOS_WMARK_NONE:          Ignore the watermarks of the given scheme.
+ * @DAMOS_WMARK_FREE_MEM_RATE: Free memory rate of the system in [0,1000].
+ */
+enum damos_wmark_metric {
+       DAMOS_WMARK_NONE,
+       DAMOS_WMARK_FREE_MEM_RATE,
+};
+
+/**
+ * struct damos_watermarks - Controls when a given scheme should be activated.
+ * @metric:    Metric for the watermarks.
+ * @interval:  Watermarks check time interval in microseconds.
+ * @high:      High watermark.
+ * @mid:       Middle watermark.
+ * @low:       Low watermark.
+ *
+ * If &metric is &DAMOS_WMARK_NONE, the scheme is always active.  Being active
+ * means DAMON monitors and applies the action of the scheme to appropriate
+ * memory regions.  Else, DAMON checks &metric of the system at least every
+ * &interval microseconds and works as below.
+ *
+ * If &metric is higher than &high, the scheme is inactivated.  If &metric is
+ * between &mid and &low, the scheme is activated.  If &metric is lower than
+ * &low, the scheme is inactivated.
+ */
+struct damos_watermarks {
+       enum damos_wmark_metric metric;
+       unsigned long interval;
+       unsigned long high;
+       unsigned long mid;
+       unsigned long low;
+
+/* private: */
+       bool activated;
+};
+
+/**
+ * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
+ * @min_sz_region:     Minimum size of target regions.
+ * @max_sz_region:     Maximum size of target regions.
+ * @min_nr_accesses:   Minimum ``->nr_accesses`` of target regions.
+ * @max_nr_accesses:   Maximum ``->nr_accesses`` of target regions.
+ * @min_age_region:    Minimum age of target regions.
+ * @max_age_region:    Maximum age of target regions.
+ * @action:            &damos_action to be applied to the target regions.
+ * @quota:             Control the aggressiveness of this scheme.
+ * @wmarks:            Watermarks for automated (in)activation of this scheme.
+ * @stat_count:                Total number of regions that this scheme was applied to.
+ * @stat_sz:           Total size of regions that this scheme was applied to.
+ * @list:              List head for siblings.
+ *
+ * For each aggregation interval, DAMON finds regions which fit in the
+ * condition (&min_sz_region, &max_sz_region, &min_nr_accesses,
+ * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to
+ * those.  To avoid consuming too much CPU time or IO resources for the
+ * &action, &quota is used.
+ *
+ * To do the work only when needed, schemes can be activated for specific
+ * system situations using &wmarks.  If all schemes that registered to the
+ * monitoring context are inactive, DAMON stops monitoring as well, and just
+ * repeatedly checks the watermarks.
+ *
+ * After applying the &action to each region, &stat_count and &stat_sz are
+ * updated to reflect the number of regions and the total size of regions that
+ * the &action has been applied to.
+ */
+struct damos {
+       unsigned long min_sz_region;
+       unsigned long max_sz_region;
+       unsigned int min_nr_accesses;
+       unsigned int max_nr_accesses;
+       unsigned int min_age_region;
+       unsigned int max_age_region;
+       enum damos_action action;
+       struct damos_quota quota;
+       struct damos_watermarks wmarks;
+       unsigned long stat_count;
+       unsigned long stat_sz;
+       struct list_head list;
+};
+
 struct damon_ctx;
 
 /**
- * struct damon_primitive      Monitoring primitives for given use cases.
+ * struct damon_primitive - Monitoring primitives for given use cases.
  *
  * @init:                      Initialize primitive-internal data structures.
  * @update:                    Update primitive-internal data structures.
  * @prepare_access_checks:     Prepare next access check of target regions.
  * @check_accesses:            Check the accesses to target regions.
  * @reset_aggregated:          Reset aggregated accesses monitoring results.
+ * @get_scheme_score:          Get the score of a region for a scheme.
+ * @apply_scheme:              Apply a DAMON-based operation scheme.
  * @target_valid:              Determine if the target is valid.
  * @cleanup:                   Clean up the context.
  *
@@ -94,6 +270,11 @@ struct damon_ctx;
  * of its update.  The value will be used for regions adjustment threshold.
  * @reset_aggregated should reset the access monitoring results that were
  * aggregated by @check_accesses.
+ * @get_scheme_score should return the priority score of a region for a scheme
+ * as an integer in [0, &DAMOS_MAX_SCORE].
+ * @apply_scheme is called from @kdamond when a region for a user provided
+ * DAMON-based operation scheme is found.  It should apply the scheme's action
+ * to the region.  This is not used for the &DAMON_ARBITRARY_TARGET case.
  * @target_valid should check whether the target is still valid for the
  * monitoring.
  * @cleanup is called from @kdamond just before its termination.
@@ -104,12 +285,17 @@ struct damon_primitive {
        void (*prepare_access_checks)(struct damon_ctx *context);
        unsigned int (*check_accesses)(struct damon_ctx *context);
        void (*reset_aggregated)(struct damon_ctx *context);
+       int (*get_scheme_score)(struct damon_ctx *context,
+                       struct damon_target *t, struct damon_region *r,
+                       struct damos *scheme);
+       int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t,
+                       struct damon_region *r, struct damos *scheme);
        bool (*target_valid)(void *target);
        void (*cleanup)(struct damon_ctx *context);
 };
 
-/*
- * struct damon_callback       Monitoring events notification callbacks.
+/**
+ * struct damon_callback - Monitoring events notification callbacks.
  *
  * @before_start:      Called before starting the monitoring.
  * @after_sampling:    Called after each sampling.
@@ -136,7 +322,7 @@ struct damon_callback {
        int (*before_start)(struct damon_ctx *context);
        int (*after_sampling)(struct damon_ctx *context);
        int (*after_aggregation)(struct damon_ctx *context);
-       int (*before_terminate)(struct damon_ctx *context);
+       void (*before_terminate)(struct damon_ctx *context);
 };
 
 /**
@@ -182,6 +368,7 @@ struct damon_callback {
  * @min_nr_regions:    The minimum number of adaptive monitoring regions.
  * @max_nr_regions:    The maximum number of adaptive monitoring regions.
  * @adaptive_targets:  Head of monitoring targets (&damon_target) list.
+ * @schemes:           Head of schemes (&damos) list.
  */
 struct damon_ctx {
        unsigned long sample_interval;
@@ -194,7 +381,6 @@ struct damon_ctx {
 
 /* public: */
        struct task_struct *kdamond;
-       bool kdamond_stop;
        struct mutex kdamond_lock;
 
        struct damon_primitive primitive;
@@ -203,6 +389,7 @@ struct damon_ctx {
        unsigned long min_nr_regions;
        unsigned long max_nr_regions;
        struct list_head adaptive_targets;
+       struct list_head schemes;
 };
 
 #define damon_next_region(r) \
@@ -211,6 +398,9 @@ struct damon_ctx {
 #define damon_prev_region(r) \
        (container_of(r->list.prev, struct damon_region, list))
 
+#define damon_last_region(t) \
+       (list_last_entry(&t->regions_list, struct damon_region, list))
+
 #define damon_for_each_region(r, t) \
        list_for_each_entry(r, &t->regions_list, list)
 
@@ -223,6 +413,12 @@ struct damon_ctx {
 #define damon_for_each_target_safe(t, next, ctx)       \
        list_for_each_entry_safe(t, next, &(ctx)->adaptive_targets, list)
 
+#define damon_for_each_scheme(s, ctx) \
+       list_for_each_entry(s, &(ctx)->schemes, list)
+
+#define damon_for_each_scheme_safe(s, next, ctx) \
+       list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
+
 #ifdef CONFIG_DAMON
 
 struct damon_region *damon_new_region(unsigned long start, unsigned long end);
@@ -232,8 +428,18 @@ inline void damon_insert_region(struct damon_region *r,
 void damon_add_region(struct damon_region *r, struct damon_target *t);
 void damon_destroy_region(struct damon_region *r, struct damon_target *t);
 
+struct damos *damon_new_scheme(
+               unsigned long min_sz_region, unsigned long max_sz_region,
+               unsigned int min_nr_accesses, unsigned int max_nr_accesses,
+               unsigned int min_age_region, unsigned int max_age_region,
+               enum damos_action action, struct damos_quota *quota,
+               struct damos_watermarks *wmarks);
+void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
+void damon_destroy_scheme(struct damos *s);
+
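Putting the pieces together, a sketch (illustrative values, hypothetical caller) that builds a scheme paging out regions of any size which stayed unaccessed for at least ten aggregation intervals, with no quota limits and no watermark gating:

static int example_install_scheme(struct damon_ctx *ctx)
{
	struct damos_quota quota = {};	/* 0 == unlimited time and size */
	struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_NONE, };
	struct damos *s;

	s = damon_new_scheme(0, ULONG_MAX,	/* any region size */
			     0, 0,		/* nr_accesses == 0 */
			     10, UINT_MAX,	/* age >= 10 intervals */
			     DAMOS_PAGEOUT, &quota, &wmarks);
	if (!s)
		return -ENOMEM;
	damon_add_scheme(ctx, s);
	return 0;
}
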
 struct damon_target *damon_new_target(unsigned long id);
 void damon_add_target(struct damon_ctx *ctx, struct damon_target *t);
+bool damon_targets_empty(struct damon_ctx *ctx);
 void damon_free_target(struct damon_target *t);
 void damon_destroy_target(struct damon_target *t);
 unsigned int damon_nr_regions(struct damon_target *t);
@@ -245,6 +451,8 @@ int damon_set_targets(struct damon_ctx *ctx,
 int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
                unsigned long aggr_int, unsigned long primitive_upd_int,
                unsigned long min_nr_reg, unsigned long max_nr_reg);
+int damon_set_schemes(struct damon_ctx *ctx,
+                       struct damos **schemes, ssize_t nr_schemes);
 int damon_nr_running_ctxs(void);
 
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs);
@@ -261,8 +469,26 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx);
 unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
 bool damon_va_target_valid(void *t);
 void damon_va_cleanup(struct damon_ctx *ctx);
+int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme);
+int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme);
 void damon_va_set_primitives(struct damon_ctx *ctx);
 
 #endif /* CONFIG_DAMON_VADDR */
 
+#ifdef CONFIG_DAMON_PADDR
+
+/* Monitoring primitives for the physical memory address space */
+void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
+unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
+bool damon_pa_target_valid(void *t);
+int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme);
+int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme);
+void damon_pa_set_primitives(struct damon_ctx *ctx);
+
+#endif /* CONFIG_DAMON_PADDR */
+
 #endif /* _DAMON_H */
index eec3b7c40811528dfab87563cacffea4396a6021..616af2ea20f301067faccb5e18f2814750d8b046 100644 (file)
@@ -84,13 +84,20 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */
  */
 #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE)
 
+/* Events that can be reported with event->fd */
+#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
+
 /* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */
 #define FANOTIFY_INODE_EVENTS  (FANOTIFY_DIRENT_EVENTS | \
                                 FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
 
+/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */
+#define FANOTIFY_ERROR_EVENTS  (FAN_FS_ERROR)
+
 /* Events that user can request to be notified on */
 #define FANOTIFY_EVENTS                (FANOTIFY_PATH_EVENTS | \
-                                FANOTIFY_INODE_EVENTS)
+                                FANOTIFY_INODE_EVENTS | \
+                                FANOTIFY_ERROR_EVENTS)
 
 /* Events that require a permission response from user */
 #define FANOTIFY_PERM_EVENTS   (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
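
On the userspace side, the new FAN_FS_ERROR event is requested like any other fanotify event; it needs FAN_REPORT_FID and a filesystem-wide mark. A sketch (assuming userspace headers that already carry FAN_FS_ERROR):

#include <fcntl.h>
#include <sys/fanotify.h>

int watch_fs_errors(const char *mountpoint)
{
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);

	if (fd < 0)
		return -1;
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
			  FAN_FS_ERROR, AT_FDCWD, mountpoint) < 0)
		return -1;
	return fd;	/* read fanotify_event_metadata records from fd */
}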
index f3cfca5edc9ae20774207a32931c86a0e9b35bb4..4137a9bfae7a95d75f7deeccbb0e0769bdc8aa0f 100644 (file)
@@ -1440,6 +1440,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_UNTRUSTED_MOUNTER         0x00000040
 
 #define SB_I_SKIP_SYNC 0x00000100      /* Skip superblock at global sync */
+#define SB_I_PERSB_BDI 0x00000200      /* has a per-sb bdi */
 
 /* Possible states of 'frozen' field */
 enum {
index 12d3a7d308ab9ae40afd886b80cabb0a319f3e6e..787545e87eeb053d8016ebfecc1a5c67f806ff8b 100644 (file)
  * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
  * the child is interested and not the parent.
  */
-static inline void fsnotify_name(struct inode *dir, __u32 mask,
-                                struct inode *child,
-                                const struct qstr *name, u32 cookie)
+static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
+                               struct inode *dir, const struct qstr *name,
+                               u32 cookie)
 {
        if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0)
-               return;
+               return 0;
 
-       fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie);
+       return fsnotify(mask, data, data_type, dir, name, NULL, cookie);
 }
 
 static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
 {
-       fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0);
+       fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0);
 }
 
 static inline void fsnotify_inode(struct inode *inode, __u32 mask)
@@ -86,7 +86,7 @@ notify_child:
  */
 static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
 {
-       fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE);
+       fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
 }
 
 static inline int fsnotify_file(struct file *file, __u32 mask)
@@ -154,8 +154,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                new_dir_mask |= FS_ISDIR;
        }
 
-       fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie);
-       fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie);
+       fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE,
+                     old_dir, old_name, fs_cookie);
+       fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE,
+                     new_dir, new_name, fs_cookie);
 
        if (target)
                fsnotify_link_count(target);
@@ -190,16 +192,22 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 
 /*
  * fsnotify_create - 'name' was linked in
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
+ * ->d_inode later
  */
-static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
+static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
 {
-       audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+       audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-       fsnotify_dirent(inode, dentry, FS_CREATE);
+       fsnotify_dirent(dir, dentry, FS_CREATE);
 }
 
 /*
  * fsnotify_link - new hardlink in 'inode' directory
+ *
+ * Caller must make sure that new_dentry->d_name is stable.
  * Note: We also have to pass the linked inode ptr as some filesystems leave
  *   new_dentry->d_inode NULL and instantiate inode pointer later
  */
@@ -209,7 +217,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);
 
-       fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0);
+       fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
+                     dir, &new_dentry->d_name, 0);
 }
 
 /*
@@ -227,12 +236,16 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
 
 /*
  * fsnotify_mkdir - directory 'name' was created
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
+ * ->d_inode later
  */
-static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
+static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
 {
-       audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+       audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-       fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
+       fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
 }
 
 /*
@@ -326,4 +339,17 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
                fsnotify_dentry(dentry, mask);
 }
 
+static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
+                                   int error)
+{
+       struct fs_error_report report = {
+               .error = error,
+               .inode = inode,
+               .sb = sb,
+       };
+
+       return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
+                       NULL, NULL, NULL, 0);
+}
+
 #endif /* _LINUX_FS_NOTIFY_H */
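
A minimal sketch (not part of this patch) of how a filesystem error path
might call the new fsnotify_sb_error() hook added above; the function name
and the -EIO value are illustrative:

    #include <linux/fsnotify.h>
    #include <linux/errno.h>

    /* Hypothetical error path: report an I/O error on @inode to any
     * fanotify listener watching the superblock. @inode may be NULL
     * when the error is not tied to a specific inode. */
    static void example_report_io_error(struct super_block *sb,
                                        struct inode *inode)
    {
            fsnotify_sb_error(sb, inode, -EIO);
    }
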
index 1ce66748a2d299dbfac1b3438ed7f1ac48f48527..51ef2b079bfa0c7ea4bf8a8f1d010cb67a546d95 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
 #include <linux/refcount.h>
+#include <linux/mempool.h>
 
 /*
  * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 
 #define FS_UNMOUNT             0x00002000      /* inode on umount fs */
 #define FS_Q_OVERFLOW          0x00004000      /* Event queued overflowed */
+#define FS_ERROR               0x00008000      /* Filesystem Error (fanotify) */
+
+/*
+ * FS_IN_IGNORED overloads FS_ERROR.  It is only used internally by inotify
+ * which does not support FS_ERROR.
+ */
 #define FS_IN_IGNORED          0x00008000      /* last inotify event here */
 
 #define FS_OPEN_PERM           0x00010000      /* open event in a permission hook */
 #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
-                            FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)
+                            FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
+                            FS_ERROR)
 
 /* Extra flags that may be reported with event or control handling of events */
 #define ALL_FSNOTIFY_FLAGS  (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
@@ -136,6 +144,7 @@ struct mem_cgroup;
  * @dir:       optional directory associated with event -
  *             if @file_name is not NULL, this is the directory that
  *             @file_name is relative to.
+ *             Either @inode or @dir must be non-NULL.
  * @file_name: optional file name associated with event
  * @cookie:    inotify rename cookie
  *
@@ -155,7 +164,7 @@ struct fsnotify_ops {
                            const struct qstr *file_name, u32 cookie);
        void (*free_group_priv)(struct fsnotify_group *group);
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
-       void (*free_event)(struct fsnotify_event *event);
+       void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
 };
@@ -238,6 +247,7 @@ struct fsnotify_group {
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        struct ucounts *ucounts;
+                       mempool_t error_events_pool;
                } fanotify_data;
 #endif /* CONFIG_FANOTIFY */
        };
@@ -248,6 +258,14 @@ enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,
        FSNOTIFY_EVENT_PATH,
        FSNOTIFY_EVENT_INODE,
+       FSNOTIFY_EVENT_DENTRY,
+       FSNOTIFY_EVENT_ERROR,
+};
+
+struct fs_error_report {
+       int error;
+       struct inode *inode;
+       struct super_block *sb;
 };
 
 static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
@@ -255,8 +273,25 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
        switch (data_type) {
        case FSNOTIFY_EVENT_INODE:
                return (struct inode *)data;
+       case FSNOTIFY_EVENT_DENTRY:
+               return d_inode(data);
        case FSNOTIFY_EVENT_PATH:
                return d_inode(((const struct path *)data)->dentry);
+       case FSNOTIFY_EVENT_ERROR:
+               return ((struct fs_error_report *)data)->inode;
+       default:
+               return NULL;
+       }
+}
+
+static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
+{
+       switch (data_type) {
+       case FSNOTIFY_EVENT_DENTRY:
+               /* Non const is needed for dget() */
+               return (struct dentry *)data;
+       case FSNOTIFY_EVENT_PATH:
+               return ((const struct path *)data)->dentry;
        default:
                return NULL;
        }
@@ -273,6 +308,35 @@ static inline const struct path *fsnotify_data_path(const void *data,
        }
 }
 
+static inline struct super_block *fsnotify_data_sb(const void *data,
+                                                  int data_type)
+{
+       switch (data_type) {
+       case FSNOTIFY_EVENT_INODE:
+               return ((struct inode *)data)->i_sb;
+       case FSNOTIFY_EVENT_DENTRY:
+               return ((struct dentry *)data)->d_sb;
+       case FSNOTIFY_EVENT_PATH:
+               return ((const struct path *)data)->dentry->d_sb;
+       case FSNOTIFY_EVENT_ERROR:
+               return ((struct fs_error_report *) data)->sb;
+       default:
+               return NULL;
+       }
+}
+
+static inline struct fs_error_report *fsnotify_data_error_report(
+                                                       const void *data,
+                                                       int data_type)
+{
+       switch (data_type) {
+       case FSNOTIFY_EVENT_ERROR:
+               return (struct fs_error_report *) data;
+       default:
+               return NULL;
+       }
+}
+
 enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_PARENT,
@@ -482,16 +546,30 @@ extern int fsnotify_fasync(int fd, struct file *file, int on);
 extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
 /* attach the event to the group notification queue */
-extern int fsnotify_add_event(struct fsnotify_group *group,
-                             struct fsnotify_event *event,
-                             int (*merge)(struct fsnotify_group *,
-                                          struct fsnotify_event *),
-                             void (*insert)(struct fsnotify_group *,
-                                            struct fsnotify_event *));
+extern int fsnotify_insert_event(struct fsnotify_group *group,
+                                struct fsnotify_event *event,
+                                int (*merge)(struct fsnotify_group *,
+                                             struct fsnotify_event *),
+                                void (*insert)(struct fsnotify_group *,
+                                               struct fsnotify_event *));
+
+static inline int fsnotify_add_event(struct fsnotify_group *group,
+                                    struct fsnotify_event *event,
+                                    int (*merge)(struct fsnotify_group *,
+                                                 struct fsnotify_event *))
+{
+       return fsnotify_insert_event(group, event, merge, NULL);
+}
+
 /* Queue overflow event to a notification group */
 static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
 {
-       fsnotify_add_event(group, group->overflow_event, NULL, NULL);
+       fsnotify_add_event(group, group->overflow_event, NULL);
+}
+
+static inline bool fsnotify_is_overflow_event(u32 mask)
+{
+       return mask & FS_Q_OVERFLOW;
 }
 
 static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
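
A short sketch, assuming only the accessors added above, of how a backend
might pull the errno out of an FS_ERROR event payload; the helper name is
illustrative:

    /* Hypothetical helper: extract the errno carried by an FS_ERROR
     * event. fsnotify_data_error_report() returns NULL for any other
     * data type, in which case no error is reported. */
    static int example_event_errno(const void *data, int data_type)
    {
            struct fs_error_report *report =
                    fsnotify_data_error_report(data, data_type);

            return report ? report->error : 0;
    }
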
index 3745efd21cf6df3fc976c87112bd386da6e58f98..b976c4177299523c6f35cea9b236aba146e70d64 100644 (file)
@@ -531,6 +531,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                                struct list_head *page_list,
                                struct page **page_array);
 
+unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+                               unsigned long nr_pages,
+                               struct page **page_array);
+
 /* Bulk allocate order-0 pages */
 static inline unsigned long
 alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
@@ -618,9 +622,9 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
-void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
+void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1);
 void free_pages_exact(void *virt, size_t size);
-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
+__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1);
 
 #define __get_free_page(gfp_mask) \
                __get_free_pages((gfp_mask), 0)
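
A hedged sketch of the new bulk interface declared above: it fills @pages
with order-0 pages placed according to the calling task's mempolicy and
returns how many entries it populated (the wrapper name is illustrative):

    #include <linux/gfp.h>

    /* Hypothetical wrapper: bulk-allocate @nr order-0 pages following
     * the current task's NUMA mempolicy. May return fewer than @nr. */
    static unsigned long example_bulk_alloc(struct page **pages,
                                            unsigned long nr)
    {
            return alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr, pages);
    }
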
index 27cdd715c5f94f498f1ab836f4ff75016a86b07b..25aff0f2ed0b0053b86467da3b2e157b07495760 100644 (file)
@@ -180,9 +180,9 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
 #ifndef clear_user_highpage
 static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
 {
-       void *addr = kmap_atomic(page);
+       void *addr = kmap_local_page(page);
        clear_user_page(addr, vaddr, page);
-       kunmap_atomic(addr);
+       kunmap_local(addr);
 }
 #endif
 
@@ -214,9 +214,9 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
 
 static inline void clear_highpage(struct page *page)
 {
-       void *kaddr = kmap_atomic(page);
+       void *kaddr = kmap_local_page(page);
        clear_page(kaddr);
-       kunmap_atomic(kaddr);
+       kunmap_local(kaddr);
 }
 
 #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
@@ -239,7 +239,7 @@ static inline void zero_user_segments(struct page *page,
                unsigned start1, unsigned end1,
                unsigned start2, unsigned end2)
 {
-       void *kaddr = kmap_atomic(page);
+       void *kaddr = kmap_local_page(page);
        unsigned int i;
 
        BUG_ON(end1 > page_size(page) || end2 > page_size(page));
@@ -250,7 +250,7 @@ static inline void zero_user_segments(struct page *page,
        if (end2 > start2)
                memset(kaddr + start2, 0, end2 - start2);
 
-       kunmap_atomic(kaddr);
+       kunmap_local(kaddr);
        for (i = 0; i < compound_nr(page); i++)
                flush_dcache_page(page + i);
 }
@@ -275,11 +275,11 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
 {
        char *vfrom, *vto;
 
-       vfrom = kmap_atomic(from);
-       vto = kmap_atomic(to);
+       vfrom = kmap_local_page(from);
+       vto = kmap_local_page(to);
        copy_user_page(vto, vfrom, vaddr, to);
-       kunmap_atomic(vto);
-       kunmap_atomic(vfrom);
+       kunmap_local(vto);
+       kunmap_local(vfrom);
 }
 
 #endif
@@ -290,11 +290,11 @@ static inline void copy_highpage(struct page *to, struct page *from)
 {
        char *vfrom, *vto;
 
-       vfrom = kmap_atomic(from);
-       vto = kmap_atomic(to);
+       vfrom = kmap_local_page(from);
+       vto = kmap_local_page(to);
        copy_page(vto, vfrom);
-       kunmap_atomic(vto);
-       kunmap_atomic(vfrom);
+       kunmap_local(vto);
+       kunmap_local(vfrom);
 }
 
 #endif
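
The conversions above replace kmap_atomic() with kmap_local_page(), which
maps per-CPU, nests, and must be unmapped in reverse (LIFO) order, as
copy_highpage() does with vto before vfrom. A minimal sketch of the same
pattern (the function name is illustrative):

    #include <linux/highmem.h>
    #include <linux/string.h>

    /* Hypothetical: zero a page through a short-lived local mapping.
     * Unlike kmap_atomic(), kmap_local_page() does not disable
     * preemption or pagefaults. */
    static void example_zero_page(struct page *page)
    {
            void *kaddr = kmap_local_page(page);

            memset(kaddr, 0, PAGE_SIZE);
            kunmap_local(kaddr);
    }
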
index 1faebe1cd0ed5b86ab02beb66d2d63426b5dbc2c..44c2ab0dfa5911dc5b91a430441dd410be18517b 100644 (file)
@@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
@@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
 
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+                            struct vm_area_struct *new_vma,
+                            unsigned long old_addr, unsigned long new_addr,
+                            unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                         struct page **, struct vm_area_struct **,
@@ -143,9 +148,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
                          struct page *ref_page);
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-                               unsigned long start, unsigned long end,
-                               struct page *ref_page);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(char *buf, int len, int nid);
 void hugetlb_show_meminfo(void);
@@ -218,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
 
+static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
 static inline unsigned long hugetlb_total_pages(void)
 {
        return 0;
@@ -265,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst,
        return 0;
 }
 
+static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
+                                          struct vm_area_struct *new_vma,
+                                          unsigned long old_addr,
+                                          unsigned long new_addr,
+                                          unsigned long len)
+{
+       BUG();
+       return 0;
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
@@ -385,13 +401,6 @@ static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
        BUG();
 }
 
-static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
-                       struct vm_area_struct *vma, unsigned long start,
-                       unsigned long end, struct page *ref_page)
-{
-       BUG();
-}
-
 static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned int flags)
@@ -596,6 +605,7 @@ struct hstate {
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;
+       unsigned int demote_order;
        unsigned long mask;
        unsigned long max_huge_pages;
        unsigned long nr_huge_pages;
@@ -605,6 +615,7 @@ struct hstate {
        unsigned long nr_overcommit_huge_pages;
        struct list_head hugepage_activelist;
        struct list_head hugepage_freelists[MAX_NUMNODES];
+       unsigned int max_huge_pages_node[MAX_NUMNODES];
        unsigned int nr_huge_pages_node[MAX_NUMNODES];
        unsigned int free_huge_pages_node[MAX_NUMNODES];
        unsigned int surplus_huge_pages_node[MAX_NUMNODES];
@@ -637,8 +648,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
                                unsigned long address, struct page *page);
 
 /* arch callback */
-int __init __alloc_bootmem_huge_page(struct hstate *h);
-int __init alloc_bootmem_huge_page(struct hstate *h);
+int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
+bool __init hugetlb_node_alloc_supported(void);
 
 void __init hugetlb_add_hstate(unsigned order);
 bool __init arch_hugetlb_valid_size(unsigned long size);
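
The boot-time allocator above gains a node argument plus a capability
check. A sketch, assuming a __init caller, of reserving one boot-time huge
page per online node (names and the error handling are illustrative):

    #include <linux/hugetlb.h>
    #include <linux/nodemask.h>

    /* Hypothetical early-boot loop: only attempt node-specific bootmem
     * allocation where the architecture supports it.
     * alloc_bootmem_huge_page() returns 1 on success, 0 on failure. */
    static void __init example_reserve_per_node(struct hstate *h)
    {
            int nid;

            if (!hugetlb_node_alloc_supported())
                    return;

            for_each_online_node(nid)
                    if (!alloc_bootmem_huge_page(h, nid))
                            pr_warn("no boot huge page on node %d\n", nid);
    }
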
index e9743cfd858527467002830df395e612ca8d857f..66a774d2710e6663e2ab2f65aa6427b330273486 100644 (file)
@@ -132,13 +132,7 @@ io_mapping_init_wc(struct io_mapping *iomap,
 
        iomap->base = base;
        iomap->size = size;
-#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */
-       iomap->prot = pgprot_noncached_wc(PAGE_KERNEL);
-#elif defined(pgprot_writecombine)
        iomap->prot = pgprot_writecombine(PAGE_KERNEL);
-#else
-       iomap->prot = pgprot_noncached(PAGE_KERNEL);
-#endif
 
        return iomap;
 }
index 9ee238ad29ce91ebe9c888c6e9ee9b8b3aaccf85..553da4899f5536570005c932786c055c7def5b90 100644 (file)
@@ -64,6 +64,10 @@ struct irq_fwspec {
        u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS];
 };
 
+/* Conversion function from of_phandle_args fields to fwspec */
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+                              unsigned int count, struct irq_fwspec *fwspec);
+
 /*
  * Should several domains have the same device node, but serve
  * different purposes (for example one domain is for PCI/MSI, and the
index de5f5913374de3999ae4df5efa25837ea1ee0388..d8783b68266957a5ddde5f0d2fb28bbcec355a2e 100644 (file)
@@ -375,12 +375,14 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
 void kasan_cache_shrink(struct kmem_cache *cache);
 void kasan_cache_shutdown(struct kmem_cache *cache);
 void kasan_record_aux_stack(void *ptr);
+void kasan_record_aux_stack_noalloc(void *ptr);
 
 #else /* CONFIG_KASAN_GENERIC */
 
 static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
 static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 static inline void kasan_record_aux_stack(void *ptr) {}
+static inline void kasan_record_aux_stack_noalloc(void *ptr) {}
 
 #endif /* CONFIG_KASAN_GENERIC */
 
@@ -439,6 +441,8 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
                           unsigned long free_region_start,
                           unsigned long free_region_end);
 
+void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
+
 #else /* CONFIG_KASAN_VMALLOC */
 
 static inline int kasan_populate_vmalloc(unsigned long start,
@@ -456,6 +460,10 @@ static inline void kasan_release_vmalloc(unsigned long start,
                                         unsigned long free_region_start,
                                         unsigned long free_region_end) {}
 
+static inline void kasan_populate_early_vm_area_shadow(void *start,
+                                                      unsigned long size)
+{ }
+
 #endif /* CONFIG_KASAN_VMALLOC */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
index e8696e4a45aa0b6a0072e424e54ccc4a4a64a60d..e5359b09de1dec577e1c089f9d49fc76beab456e 100644 (file)
@@ -247,6 +247,7 @@ extern bool early_boot_irqs_disabled;
 extern enum system_states {
        SYSTEM_BOOTING,
        SYSTEM_SCHEDULING,
+       SYSTEM_FREEING_INITMEM,
        SYSTEM_RUNNING,
        SYSTEM_HALT,
        SYSTEM_POWER_OFF,
index 3fe6dd8a18c19607ad1116f3977929d7c86d856e..4b5e3679a72c78caf5a6aa2c185658328403fa2f 100644 (file)
@@ -14,6 +14,9 @@
 
 #ifdef CONFIG_KFENCE
 
+#include <linux/atomic.h>
+#include <linux/static_key.h>
+
 /*
  * We allocate an even number of pages, as it simplifies calculations to map
  * address to metadata indices; effectively, the very first page serves as an
 #define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE)
 extern char *__kfence_pool;
 
-#ifdef CONFIG_KFENCE_STATIC_KEYS
-#include <linux/static_key.h>
 DECLARE_STATIC_KEY_FALSE(kfence_allocation_key);
-#else
-#include <linux/atomic.h>
 extern atomic_t kfence_allocation_gate;
-#endif
 
 /**
  * is_kfence_address() - check if an address belongs to KFENCE pool
@@ -116,13 +114,16 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags);
  */
 static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
-#ifdef CONFIG_KFENCE_STATIC_KEYS
-       if (static_branch_unlikely(&kfence_allocation_key))
+#if defined(CONFIG_KFENCE_STATIC_KEYS) || CONFIG_KFENCE_SAMPLE_INTERVAL == 0
+       if (!static_branch_unlikely(&kfence_allocation_key))
+               return NULL;
 #else
-       if (unlikely(!atomic_read(&kfence_allocation_gate)))
+       if (!static_branch_likely(&kfence_allocation_key))
+               return NULL;
 #endif
-               return __kfence_alloc(s, size, flags);
-       return NULL;
+       if (likely(atomic_read(&kfence_allocation_gate)))
+               return NULL;
+       return __kfence_alloc(s, size, flags);
 }
 
 /**
index 34de69b3b8badcdf971cd3f34b805d1b5b23f52d..7df557b16c1e83294add2fb8d5925f20f6253c9e 100644 (file)
@@ -28,17 +28,26 @@ extern unsigned long long max_possible_pfn;
 /**
  * enum memblock_flags - definition of memory region attributes
  * @MEMBLOCK_NONE: no special request
- * @MEMBLOCK_HOTPLUG: hotpluggable region
+ * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory
+ * map during early boot as hot(un)pluggable system RAM (e.g., memory range
+ * that might get hotunplugged later). With "movable_node" set on the kernel
+ * command line, try keeping this memory region hotunpluggable. Does not apply
+ * to memblocks added ("hotplugged") after early boot.
  * @MEMBLOCK_MIRROR: mirrored region
  * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
  * reserved in the memory map; refer to memblock_mark_nomap() description
  * for further details
+ * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added
+ * via a driver, and never indicated in the firmware-provided memory map as
+ * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
+ * kernel resource tree.
  */
 enum memblock_flags {
        MEMBLOCK_NONE           = 0x0,  /* No special request */
        MEMBLOCK_HOTPLUG        = 0x1,  /* hotpluggable region */
        MEMBLOCK_MIRROR         = 0x2,  /* mirrored region */
        MEMBLOCK_NOMAP          = 0x4,  /* don't add to kernel direct mapping */
+       MEMBLOCK_DRIVER_MANAGED = 0x8,  /* always detected via a driver */
 };
 
 /**
@@ -100,10 +109,11 @@ static inline void memblock_discard(void) {}
 #endif
 
 void memblock_allow_resize(void);
-int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
+int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid,
+                     enum memblock_flags flags);
 int memblock_add(phys_addr_t base, phys_addr_t size);
 int memblock_remove(phys_addr_t base, phys_addr_t size);
-int memblock_free(phys_addr_t base, phys_addr_t size);
+int memblock_phys_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
@@ -118,7 +128,7 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
 int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
 
 void memblock_free_all(void);
-void memblock_free_ptr(void *ptr, size_t size);
+void memblock_free(void *ptr, size_t size);
 void reset_node_managed_pages(pg_data_t *pgdat);
 void reset_all_zones_managed_pages(void);
 
@@ -133,7 +143,7 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
                          struct memblock_type *type_b, phys_addr_t *out_start,
                          phys_addr_t *out_end, int *out_nid);
 
-void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+void memblock_free_late(phys_addr_t base, phys_addr_t size);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
@@ -208,7 +218,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
  */
 #define for_each_mem_range(i, p_start, p_end) \
        __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,   \
-                            MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
+                            MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \
+                            p_start, p_end, NULL)
 
 /**
  * for_each_mem_range_rev - reverse iterate through memblock areas from
@@ -219,7 +230,8 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
  */
 #define for_each_mem_range_rev(i, p_start, p_end)                      \
        __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
-                                MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
+                                MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\
+                                p_start, p_end, NULL)
 
 /**
  * for_each_reserved_mem_range - iterate over all reserved memblock areas
@@ -249,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
        return m->flags & MEMBLOCK_NOMAP;
 }
 
+static inline bool memblock_is_driver_managed(struct memblock_region *m)
+{
+       return m->flags & MEMBLOCK_DRIVER_MANAGED;
+}
+
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
                            unsigned long  *end_pfn);
 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
@@ -441,23 +458,6 @@ static inline void *memblock_alloc_node(phys_addr_t size,
                                      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
-static inline void memblock_free_early(phys_addr_t base,
-                                             phys_addr_t size)
-{
-       memblock_free(base, size);
-}
-
-static inline void memblock_free_early_nid(phys_addr_t base,
-                                                 phys_addr_t size, int nid)
-{
-       memblock_free(base, size);
-}
-
-static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
-{
-       __memblock_free_late(base, size);
-}
-
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
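
With the renames above, memblock_free() now takes a virtual pointer (the
old memblock_free_ptr()) while physical ranges go through
memblock_phys_free(). A small sketch pairing each allocator with its
matching free (illustrative, for an early __init context):

    #include <linux/memblock.h>
    #include <linux/sizes.h>

    /* Hypothetical: virtual allocations pair with memblock_free(),
     * raw physical ranges with memblock_phys_free(). */
    static void __init example_memblock_roundtrip(void)
    {
            void *buf = memblock_alloc(SZ_4K, SZ_4K);
            phys_addr_t pa = memblock_phys_alloc(SZ_4K, SZ_4K);

            if (buf)
                    memblock_free(buf, SZ_4K);
            if (pa)
                    memblock_phys_free(pa, SZ_4K);
    }
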
index e34bf0cbdf55a0492984d2b47e86398c1e2bee67..0c5c403f4be6ba111600fad90cda0862b2a216da 100644 (file)
@@ -180,12 +180,6 @@ struct mem_cgroup_thresholds {
        struct mem_cgroup_threshold_ary *spare;
 };
 
-enum memcg_kmem_state {
-       KMEM_NONE,
-       KMEM_ALLOCATED,
-       KMEM_ONLINE,
-};
-
 #if defined(CONFIG_SMP)
 struct memcg_padding {
        char x[0];
@@ -318,7 +312,6 @@ struct mem_cgroup {
 
 #ifdef CONFIG_MEMCG_KMEM
        int kmemcg_id;
-       enum memcg_kmem_state kmem_state;
        struct obj_cgroup __rcu *objcg;
        struct list_head objcg_list; /* list of inherited objcgs */
 #endif
@@ -1667,7 +1660,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
                return true;
        do {
-               if (time_before(jiffies, memcg->socket_pressure))
+               if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
                        return true;
        } while ((memcg = parent_mem_cgroup(memcg)));
        return false;
index 182c606adb060ff962f6f34e4bd76afb9ab63b50..88eb587b514382b7b5aed884237250550b9356ec 100644 (file)
@@ -96,7 +96,6 @@ struct memory_notify {
        unsigned long start_pfn;
        unsigned long nr_pages;
        int status_change_nid_normal;
-       int status_change_nid_high;
        int status_change_nid;
 };
 
@@ -110,7 +109,7 @@ struct mem_section;
 #define SLAB_CALLBACK_PRI       1
 #define IPC_CALLBACK_PRI        10
 
-#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifndef CONFIG_MEMORY_HOTPLUG
 static inline void memory_dev_init(void)
 {
        return;
@@ -126,7 +125,14 @@ static inline int memory_notify(unsigned long val, void *v)
 {
        return 0;
 }
-#else
+static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
+{
+       return 0;
+}
+/* These aren't inline functions due to a GCC bug. */
+#define register_hotmemory_notifier(nb)    ({ (void)(nb); 0; })
+#define unregister_hotmemory_notifier(nb)  ({ (void)(nb); })
+#else /* CONFIG_MEMORY_HOTPLUG */
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size,
@@ -140,7 +146,6 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
                              void *arg, walk_memory_blocks_func_t func);
 extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);
-#define CONFIG_MEM_BLOCK_SIZE  (PAGES_PER_SECTION<<PAGE_SHIFT)
 
 extern int memory_group_register_static(int nid, unsigned long max_pages);
 extern int memory_group_register_dynamic(int nid, unsigned long unit_pages);
@@ -149,9 +154,6 @@ struct memory_group *memory_group_find_by_id(int mgid);
 typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
                               struct memory_group *excluded, void *arg);
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-
-#ifdef CONFIG_MEMORY_HOTPLUG
 #define hotplug_memory_notifier(fn, pri) ({            \
        static __meminitdata struct notifier_block fn##_mem_nb =\
                { .notifier_call = fn, .priority = pri };\
@@ -159,15 +161,7 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
 })
 #define register_hotmemory_notifier(nb)                register_memory_notifier(nb)
 #define unregister_hotmemory_notifier(nb)      unregister_memory_notifier(nb)
-#else
-static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
-{
-       return 0;
-}
-/* These aren't inline functions due to a GCC bug. */
-#define register_hotmemory_notifier(nb)    ({ (void)(nb); 0; })
-#define unregister_hotmemory_notifier(nb)  ({ (void)(nb); })
-#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
  * Kernel text modification mutex, used for code patching. Users of this lock
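
After the consolidation above, hotplug_memory_notifier() and the
register_hotmemory_notifier() wrappers compile to no-op stubs whenever
CONFIG_MEMORY_HOTPLUG is off. A sketch of a caller (the callback name and
message are illustrative):

    #include <linux/memory.h>

    /* Hypothetical notifier: log memory-online events. */
    static int example_mem_cb(struct notifier_block *nb,
                              unsigned long action, void *arg)
    {
            struct memory_notify *mn = arg;

            if (action == MEM_ONLINE)
                    pr_info("onlined %lu pages from PFN %lu\n",
                            mn->nr_pages, mn->start_pfn);
            return NOTIFY_OK;
    }

    static int __init example_init(void)
    {
            return hotplug_memory_notifier(example_mem_cb, 0);
    }
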
index e5a867c950b27841f158be5724f47316b50561a8..be48e003a51836bde05dde926c8666d0e611f311 100644 (file)
@@ -98,9 +98,6 @@ static inline void zone_seqlock_init(struct zone *zone)
 {
        seqlock_init(&zone->span_seqlock);
 }
-extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
-extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
-extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 extern void adjust_present_page_count(struct page *page,
                                      struct memory_group *group,
                                      long nr_pages);
index 4091692bed8c00b753c4e8a8a920a50ccb1146f4..3c7595e81150b783c7c666966dea2e18def99be1 100644 (file)
@@ -8,7 +8,6 @@
 
 #include <linux/sched.h>
 #include <linux/mmzone.h>
-#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
@@ -184,8 +183,6 @@ extern bool vma_migratable(struct vm_area_struct *vma);
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);
 
-extern bool numa_demotion_enabled;
-
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
 {
        return  (pol->mode == MPOL_PREFERRED_MANY);
@@ -301,8 +298,6 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
        return NULL;
 }
 
-#define numa_demotion_enabled  false
-
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
 {
        return  false;
index 0d2aeb9b0f66a76cdfd40454b7c7c428a463bf97..eeb818c4fc78206e5cba929c6f92651c8dfab540 100644 (file)
@@ -19,24 +19,7 @@ struct migration_target_control;
  */
 #define MIGRATEPAGE_SUCCESS            0
 
-/*
- * Keep sync with:
- * - macro MIGRATE_REASON in include/trace/events/migrate.h
- * - migrate_reason_names[MR_TYPES] in mm/debug.c
- */
-enum migrate_reason {
-       MR_COMPACTION,
-       MR_MEMORY_FAILURE,
-       MR_MEMORY_HOTPLUG,
-       MR_SYSCALL,             /* also applies to cpusets */
-       MR_MEMPOLICY_MBIND,
-       MR_NUMA_MISPLACED,
-       MR_CONTIG_RANGE,
-       MR_LONGTERM_PIN,
-       MR_DEMOTION,
-       MR_TYPES
-};
-
+/* Defined in mm/debug.c: */
 extern const char *migrate_reason_names[MR_TYPES];
 
 #ifdef CONFIG_MIGRATION
@@ -61,6 +44,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
 void folio_migrate_copy(struct folio *newfolio, struct folio *folio);
 int folio_migrate_mapping(struct address_space *mapping,
                struct folio *newfolio, struct folio *folio, int extra_count);
+
+extern bool numa_demotion_enabled;
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -86,6 +71,8 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 {
        return -ENOSYS;
 }
+
+#define numa_demotion_enabled  false
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
index 883c992490334ba27dcbcd5ba4dcad897e5dfff6..f37cc03f9369ed61e49f7be69564fa017ab51801 100644 (file)
@@ -19,4 +19,17 @@ enum migrate_mode {
        MIGRATE_SYNC_NO_COPY,
 };
 
+enum migrate_reason {
+       MR_COMPACTION,
+       MR_MEMORY_FAILURE,
+       MR_MEMORY_HOTPLUG,
+       MR_SYSCALL,             /* also applies to cpusets */
+       MR_MEMPOLICY_MBIND,
+       MR_NUMA_MISPLACED,
+       MR_CONTIG_RANGE,
+       MR_LONGTERM_PIN,
+       MR_DEMOTION,
+       MR_TYPES
+};
+
 #endif         /* MIGRATE_MODE_H_INCLUDED */
index a62b91e769c89132a3ca0261988a96b31afbafb0..a7e4a9e7d807a39bc549bcfc7238e03c4e7237bd 100644 (file)
@@ -794,40 +794,6 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
-static inline void *kvmalloc(size_t size, gfp_t flags)
-{
-       return kvmalloc_node(size, flags, NUMA_NO_NODE);
-}
-static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
-{
-       return kvmalloc_node(size, flags | __GFP_ZERO, node);
-}
-static inline void *kvzalloc(size_t size, gfp_t flags)
-{
-       return kvmalloc(size, flags | __GFP_ZERO);
-}
-
-static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
-{
-       size_t bytes;
-
-       if (unlikely(check_mul_overflow(n, size, &bytes)))
-               return NULL;
-
-       return kvmalloc(bytes, flags);
-}
-
-static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
-{
-       return kvmalloc_array(n, size, flags | __GFP_ZERO);
-}
-
-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
-               gfp_t flags);
-extern void kvfree(const void *addr);
-extern void kvfree_sensitive(const void *addr, size_t len);
-
 static inline int head_compound_mapcount(struct page *head)
 {
        return atomic_read(compound_mapcount_ptr(head)) + 1;
@@ -904,6 +870,8 @@ void put_pages_list(struct list_head *pages);
 void split_page(struct page *page, unsigned int order);
 void folio_copy(struct folio *dst, struct folio *src);
 
+unsigned long nr_free_buffer_pages(void);
+
 /*
  * Compound pages have a destructor function.  Provide a
  * prototype for that function and accessor functions.
@@ -1861,12 +1829,24 @@ extern void user_shm_unlock(size_t, struct ucounts *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-       struct address_space *check_mapping;    /* Check page->mapping if set */
-       pgoff_t first_index;                    /* Lowest page->index to unmap */
-       pgoff_t last_index;                     /* Highest page->index to unmap */
+       struct address_space *zap_mapping;      /* Check page->mapping if set */
        struct page *single_page;               /* Locked page to be unmapped */
 };
 
+/*
+ * We set details->zap_mapping when we want to unmap shared but keep private
+ * pages. Return true if we should skip zapping this page, false otherwise.
+ */
+static inline bool
+zap_skip_check_mapping(struct zap_details *details, struct page *page)
+{
+       if (!details || !page)
+               return false;
+
+       return details->zap_mapping &&
+           (details->zap_mapping != page_rmapping(page));
+}
+
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -2576,7 +2556,7 @@ static inline unsigned long get_num_physpages(void)
  * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
  *                                                      max_highmem_pfn};
  * for_each_valid_physical_page_range()
- *     memblock_add_node(base, size, nid)
+ *     memblock_add_node(base, size, nid, MEMBLOCK_NONE)
  * free_area_init(max_zone_pfns);
  */
 void free_area_init(unsigned long *max_zone_pfn);
@@ -2604,6 +2584,7 @@ extern void memmap_init_range(unsigned long, int, unsigned long,
                unsigned long, unsigned long, enum meminit_context,
                struct vmem_altmap *, int migratetype);
 extern void setup_per_zone_wmarks(void);
+extern void calculate_min_free_kbytes(void);
 extern int __meminit init_per_zone_wmark_min(void);
 extern void mem_init(void);
 extern void __init mmap_init(void);
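
The simplified zap_details above keeps a single zap_mapping: when set, the
zap path skips pages that belong to a different address_space (shared
page-cache pages) while still tearing down private COW copies. A sketch of
the check as a caller would use it (the wrapper is illustrative):

    /* Hypothetical fragment of a pte-zap loop. */
    static bool example_should_zap_page(struct zap_details *details,
                                        struct page *page)
    {
            /* NULL details or NULL page: always zap. */
            return !zap_skip_check_mapping(details, page);
    }
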
index f7326c8704bb3bdd8970821af831430dfc4e89a6..bb8c6f5f19bcaa41b3ad7ca343287c5908ebd331 100644 (file)
@@ -114,10 +114,8 @@ struct page {
                                        struct page *next;
 #ifdef CONFIG_64BIT
                                        int pages;      /* Nr of pages left */
-                                       int pobjects;   /* Approximate count */
 #else
                                        short int pages;
-                                       short int pobjects;
 #endif
                                };
                        };
index 6a1d79d84675a783400aa3bfbdb14acbaf741209..58e744b78c2c1f22a9142c2d5c221c83851f817f 100644 (file)
@@ -199,6 +199,7 @@ enum node_stat_item {
        NR_VMSCAN_IMMEDIATE,    /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,             /* page dirtyings since bootup */
        NR_WRITTEN,             /* page writings since bootup */
+       NR_THROTTLED_WRITTEN,   /* NR_WRITTEN while reclaim throttled */
        NR_KERNEL_MISC_RECLAIMABLE,     /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,   /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,   /* pages returned via unpin_user_page() */
@@ -272,6 +273,13 @@ enum lru_list {
        NR_LRU_LISTS
 };
 
+enum vmscan_throttle_state {
+       VMSCAN_THROTTLE_WRITEBACK,
+       VMSCAN_THROTTLE_ISOLATED,
+       VMSCAN_THROTTLE_NOPROGRESS,
+       NR_VMSCAN_THROTTLE,
+};
+
 #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)
 
 #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)
@@ -841,6 +849,13 @@ typedef struct pglist_data {
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;
+
+       /* workqueues for throttling reclaim for different reasons. */
+       wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];
+
+       atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
+       unsigned long nr_reclaim_start; /* nr pages written while throttled
+                                        * when throttling started. */
        struct task_struct *kswapd;     /* Protected by
                                           mem_hotplug_begin/end() */
        int kswapd_order;
@@ -1220,6 +1235,28 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 #define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
 
+/* Whether the 'nodes' are all movable nodes */
+static inline bool movable_only_nodes(nodemask_t *nodes)
+{
+       struct zonelist *zonelist;
+       struct zoneref *z;
+       int nid;
+
+       if (nodes_empty(*nodes))
+               return false;
+
+       /*
+        * We can choose an arbitrary node from the nodemask to get a
+        * zonelist as they are interlinked. We just need to find
+        * at least one zone that can satisfy kernel allocations.
+        */
+       nid = first_node(*nodes);
+       zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
+       z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes);
+       return !z->zone;
+}
+
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
 #endif
@@ -1481,7 +1518,7 @@ static inline int pfn_valid(unsigned long pfn)
 
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
-       ms = __nr_to_section(pfn_to_section_nr(pfn));
+       ms = __pfn_to_section(pfn);
        if (!valid_section(ms))
                return 0;
        /*
@@ -1496,7 +1533,7 @@ static inline int pfn_in_present_section(unsigned long pfn)
 {
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
-       return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
+       return present_section(__pfn_to_section(pfn));
 }
 
 static inline unsigned long next_present_section_nr(unsigned long section_nr)
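
movable_only_nodes() above answers whether a nodemask contains no node
that can back kernel (unmovable) allocations. A sketch of the intended
guard (the function name and errno are illustrative):

    #include <linux/mmzone.h>
    #include <linux/errno.h>

    /* Hypothetical: refuse an operation needing kernel allocations when
     * every node in @nodes hosts only ZONE_MOVABLE memory. */
    static int example_check_target_nodes(nodemask_t *nodes)
    {
            return movable_only_nodes(nodes) ? -EINVAL : 0;
    }
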
index 8e5a29897936c8b37f625561755eb857ca04dac9..bb21fd631b1621110953fd4dffebc43766b5ca00 100644 (file)
@@ -85,7 +85,7 @@ struct node {
        struct device   dev;
        struct list_head access_list;
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
        struct work_struct      node_work;
 #endif
 #ifdef CONFIG_HMEM_REPORTING
@@ -98,7 +98,7 @@ struct memory_block;
 extern struct node *node_devices[];
 typedef  void (*node_registration_func_t)(struct node *);
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
 void link_mem_sections(int nid, unsigned long start_pfn,
                       unsigned long end_pfn,
                       enum meminit_context context);
index 981341a3c3c4a9e4941b1fc18c147736d89da7c4..52ec4b5e561566d7835ce1df5b703fa4f9f993e4 100644 (file)
@@ -245,7 +245,7 @@ static __always_inline int PageCompound(struct page *page)
 #define        PAGE_POISON_PATTERN     -1l
 static inline int PagePoisoned(const struct page *page)
 {
-       return page->flags == PAGE_POISON_PATTERN;
+       return READ_ONCE(page->flags) == PAGE_POISON_PATTERN;
 }
 
 #ifdef CONFIG_DEBUG_VM
index cd8aa6fce2041c47725b2009f34669ac274f2290..b4dbcc86b3f1334c8567a485366d5c30d5611f35 100644 (file)
@@ -342,7 +342,6 @@ struct pci_dev {
        u16             pcie_flags_reg; /* Cached PCIe Capabilities Register */
        unsigned long   *dma_alias_mask;/* Mask of enabled devfn aliases */
 
-       struct pci_driver *driver;      /* Driver bound to this device */
        u64             dma_mask;       /* Mask of the bits of bus address this
                                           device implements.  Normally this is
                                           0xffffffff.  You only need to change
@@ -900,7 +899,10 @@ struct pci_driver {
        struct pci_dynids       dynids;
 };
 
-#define        to_pci_driver(drv) container_of(drv, struct pci_driver, driver)
+static inline struct pci_driver *to_pci_driver(struct device_driver *drv)
+{
+       return drv ? container_of(drv, struct pci_driver, driver) : NULL;
+}
 
 /**
  * PCI_DEVICE - macro used to describe a specific PCI device
@@ -1350,6 +1352,8 @@ void pci_unlock_rescan_remove(void);
 /* Vital Product Data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
+ssize_t pci_read_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
+ssize_t pci_write_vpd_any(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
 resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
@@ -1498,19 +1502,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 #define PCI_IRQ_ALL_TYPES \
        (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX)
 
-/* kmem_cache style wrapper around pci_alloc_consistent() */
-
 #include <linux/dmapool.h>
 
-#define        pci_pool dma_pool
-#define pci_pool_create(name, pdev, size, align, allocation) \
-               dma_pool_create(name, &pdev->dev, size, align, allocation)
-#define        pci_pool_destroy(pool) dma_pool_destroy(pool)
-#define        pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
-#define        pci_pool_zalloc(pool, flags, handle) \
-               dma_pool_zalloc(pool, flags, handle)
-#define        pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
-
 struct msix_entry {
        u32     vector; /* Kernel uses to write allocated vector */
        u16     entry;  /* Driver uses to specify entry, OS writes */
@@ -2126,7 +2119,7 @@ void pcibios_disable_device(struct pci_dev *dev);
 void pcibios_set_master(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,
                                 enum pcie_reset_state state);
-int pcibios_add_device(struct pci_dev *dev);
+int pcibios_device_add(struct pci_dev *dev);
 void pcibios_release_device(struct pci_dev *dev);
 #ifdef CONFIG_PCI
 void pcibios_penalize_isa_irq(int irq, int active);
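
With struct pci_dev no longer caching a driver pointer, callers read it
through the device core; the new inline to_pci_driver() tolerates a NULL
device_driver. A sketch (the helper name is illustrative):

    #include <linux/pci.h>

    /* Hypothetical: fetch the bound driver, or NULL if unbound. */
    static struct pci_driver *example_get_driver(struct pci_dev *pdev)
    {
            return to_pci_driver(pdev->dev.driver);
    }
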
index 5e76af742c807afd3630e630156ab7962c26acdb..98a9371133f8f137ba641479fc489ec1ad7b2677 100644 (file)
@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
                                pcpu_fc_populate_pte_fn_t populate_pte_fn);
 #endif
 
-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
+extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1);
 extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
 extern bool is_kernel_percpu_address(unsigned long addr);
 
@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr);
 extern void __init setup_per_cpu_areas(void);
 #endif
 
-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
-extern void __percpu *__alloc_percpu(size_t size, size_t align);
+extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
+extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
 extern void free_percpu(void __percpu *__pdata);
 extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 
index 083f3ce550bcae6064e635f51f6db2e666374e0d..181045148b0654eae388e9c747c53a60ec4abc8a 100644 (file)
@@ -142,8 +142,6 @@ struct mem_cgroup;
 void __init kmem_cache_init(void);
 bool slab_is_available(void);
 
-extern bool usercopy_fallback;
-
 struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
                        unsigned int align, slab_flags_t flags,
                        void (*ctor)(void *));
@@ -152,8 +150,8 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
                        slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize,
                        void (*ctor)(void *));
-void kmem_cache_destroy(struct kmem_cache *);
-int kmem_cache_shrink(struct kmem_cache *);
+void kmem_cache_destroy(struct kmem_cache *s);
+int kmem_cache_shrink(struct kmem_cache *s);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -181,11 +179,11 @@ int kmem_cache_shrink(struct kmem_cache *);
 /*
  * Common kmalloc functions provided by all allocators
  */
-void * __must_check krealloc(const void *, size_t, gfp_t);
-void kfree(const void *);
-void kfree_sensitive(const void *);
-size_t __ksize(const void *);
-size_t ksize(const void *);
+void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2);
+void kfree(const void *objp);
+void kfree_sensitive(const void *objp);
+size_t __ksize(const void *objp);
+size_t ksize(const void *objp);
 #ifdef CONFIG_PRINTK
 bool kmem_valid_obj(void *object);
 void kmem_dump_obj(void *object);
@@ -425,9 +423,9 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 #define kmalloc_index(s) __kmalloc_index(s, true)
 #endif /* !CONFIG_SLOB */
 
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
-void kmem_cache_free(struct kmem_cache *, void *);
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
+void kmem_cache_free(struct kmem_cache *s, void *objp);
 
 /*
  * Bulk allocation and freeing operations. These are accelerated in an
@@ -436,8 +434,8 @@ void kmem_cache_free(struct kmem_cache *, void *);
  *
  * Note that interrupts must be enabled when calling these functions.
  */
-void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
 
 /*
  * Caller must not use kfree_bulk() on memory not originally allocated
@@ -449,10 +447,12 @@ static __always_inline void kfree_bulk(size_t size, void **p)
 }
 
 #ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
+                                                        __alloc_size(1);
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
+                                                                        __malloc;
 #else
-static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
        return __kmalloc(size, flags);
 }
@@ -464,25 +464,24 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
 #endif
 
 #ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
+                                  __assume_slab_alignment __alloc_size(3);
 
 #ifdef CONFIG_NUMA
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
-                                          gfp_t gfpflags,
-                                          int node, size_t size) __assume_slab_alignment __malloc;
+extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+                                        int node, size_t size) __assume_slab_alignment
+                                                               __alloc_size(4);
 #else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
-                             gfp_t gfpflags,
-                             int node, size_t size)
+static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+                                                gfp_t gfpflags, int node, size_t size)
 {
        return kmem_cache_alloc_trace(s, gfpflags, size);
 }
 #endif /* CONFIG_NUMA */
 
 #else /* CONFIG_TRACING */
-static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
-               gfp_t flags, size_t size)
+static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s,
+                                                                   gfp_t flags, size_t size)
 {
        void *ret = kmem_cache_alloc(s, flags);
 
@@ -490,10 +489,8 @@ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
        return ret;
 }
 
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
-                             gfp_t gfpflags,
-                             int node, size_t size)
+static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+                                                        int node, size_t size)
 {
        void *ret = kmem_cache_alloc_node(s, gfpflags, node);
 
@@ -502,19 +499,21 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 }
 #endif /* CONFIG_TRACING */
 
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment
+                                                                        __alloc_size(1);
 
 #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+                               __assume_page_alignment __alloc_size(1);
 #else
-static __always_inline void *
-kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags,
+                                                                unsigned int order)
 {
        return kmalloc_order(size, flags, order);
 }
 #endif
 
-static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags)
 {
        unsigned int order = get_order(size);
        return kmalloc_order_trace(size, flags, order);
@@ -574,7 +573,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  *     Try really hard to succeed the allocation but fail
  *     eventually.
  */
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
 {
        if (__builtin_constant_p(size)) {
 #ifndef CONFIG_SLOB
@@ -596,7 +595,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
        return __kmalloc(size, flags);
 }
 
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 #ifndef CONFIG_SLOB
        if (__builtin_constant_p(size) &&
@@ -620,7 +619,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags)
 {
        size_t bytes;
 
@@ -638,8 +637,10 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
  * @new_size: new size of a single member of the array
  * @flags: the type of memory to allocate (see kmalloc)
  */
-static __must_check inline void *
-krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
+static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p,
+                                                                   size_t new_n,
+                                                                   size_t new_size,
+                                                                   gfp_t flags)
 {
        size_t bytes;
 
@@ -655,7 +656,7 @@ krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t flags)
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
        return kmalloc_array(n, size, flags | __GFP_ZERO);
 }
@@ -668,12 +669,13 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
  * allocator where we care about the real place the memory allocation
  * request comes from.
  */
-extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
+extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+                                  __alloc_size(1);
 #define kmalloc_track_caller(size, flags) \
        __kmalloc_track_caller(size, flags, _RET_IP_)
 
-static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
-                                      int node)
+static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
+                                                         int node)
 {
        size_t bytes;
 
@@ -684,14 +686,15 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
        return __kmalloc_node(bytes, flags, node);
 }
 
-static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
+static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
 {
        return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
 }
 
 
 #ifdef CONFIG_NUMA
-extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
+extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+                                        unsigned long caller) __alloc_size(1);
 #define kmalloc_node_track_caller(size, flags, node) \
        __kmalloc_node_track_caller(size, flags, node, \
                        _RET_IP_)
@@ -716,7 +719,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline void *kzalloc(size_t size, gfp_t flags)
+static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags)
 {
        return kmalloc(size, flags | __GFP_ZERO);
 }
@@ -727,11 +730,45 @@ static inline void *kzalloc(size_t size, gfp_t flags)
  * @flags: the type of memory to allocate (see kmalloc).
  * @node: memory node from which to allocate
  */
-static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
+static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node)
 {
        return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1);
+static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags)
+{
+       return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+       return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags)
+{
+       return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+       size_t bytes;
+
+       if (unlikely(check_mul_overflow(n, size, &bytes)))
+               return NULL;
+
+       return kvmalloc(bytes, flags);
+}
+
+static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags)
+{
+       return kvmalloc_array(n, size, flags | __GFP_ZERO);
+}
+
+extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+                     __alloc_size(3);
+extern void kvfree(const void *addr);
+extern void kvfree_sensitive(const void *addr, size_t len);
+
 unsigned int kmem_cache_size(struct kmem_cache *s);
 void __init kmem_cache_init_late(void);
 
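The __alloc_size() annotations added throughout slab.h tell the compiler which argument carries the allocation size, so out-of-bounds accesses on kmalloc()-family buffers can be diagnosed at compile time and by FORTIFY_SOURCE. A minimal user-space sketch of the same mechanism (illustrative names, not from this patch):

#include <stdlib.h>
#include <string.h>

/* Same idea as the kernel's __alloc_size(1): argument 1 is the size. */
__attribute__((alloc_size(1))) static void *my_alloc(size_t size)
{
	return malloc(size);
}

int main(void)
{
	char *p = my_alloc(8);

	/* With optimization enabled, GCC can now flag this 16-byte write
	 * into an 8-byte object (e.g. via -Wstringop-overflow); the write
	 * is deliberately bogus to show the diagnostic. */
	memset(p, 0, 16);
	free(p);
	return 0;
}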
index 85499f0586b06c55d2a494e5888bda1e2d651b9d..0fa751b946fa0b93e7e011039fee4ad9e4db048a 100644
@@ -99,6 +99,8 @@ struct kmem_cache {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
        /* Number of per cpu partial objects to keep around */
        unsigned int cpu_partial;
+       /* Number of per cpu partial pages to keep around */
+       unsigned int cpu_partial_pages;
 #endif
        struct kmem_cache_order_objects oo;
 
@@ -141,17 +143,6 @@ struct kmem_cache {
        struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_cpu_partial(s)            ((s)->cpu_partial)
-#define slub_set_cpu_partial(s, n)             \
-({                                             \
-       slub_cpu_partial(s) = (n);              \
-})
-#else
-#define slub_cpu_partial(s)            (0)
-#define slub_set_cpu_partial(s, n)
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
 #ifdef CONFIG_SYSFS
 #define SLAB_SUPPORTS_SYSFS
 void sysfs_slab_unlink(struct kmem_cache *);
index 6bb4bc1a5f5459fc06b90548558a4745d0973b05..d29860966bc91423cc73fd8a4eb019b1b9a1c7d4 100644
 #ifndef _LINUX_STACKDEPOT_H
 #define _LINUX_STACKDEPOT_H
 
+#include <linux/gfp.h>
+
 typedef u32 depot_stack_handle_t;
 
+depot_stack_handle_t __stack_depot_save(unsigned long *entries,
+                                       unsigned int nr_entries,
+                                       gfp_t gfp_flags, bool can_alloc);
+
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
                                      unsigned int nr_entries, gfp_t gfp_flags);
 
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
                               unsigned long **entries);
 
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
 #ifdef CONFIG_STACKDEPOT
 int stack_depot_init(void);
 #else
index 9edecb494e9e2d5414367598238f7a5396dfc679..bef158815e83d6071cd186b6f5e8c4de5dfd88b1 100644
@@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
 unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
                                   unsigned int size, unsigned int skipnr);
 unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
 
 /* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
index cdf0957a88a49a343307646f1ea0c5cbb90cdd33..d1ea44b31f19f511108193f5344d5c6777083068 100644
@@ -341,7 +341,6 @@ void workingset_update_node(struct xa_node *node);
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalreserve_pages;
-extern unsigned long nr_free_buffer_pages(void);
 
 /* Definition of global_zone_page_state not available yet */
 #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
index 082f1d51957a4cd3c1d0660052c545a049e01775..be24056ac00fd131feace26b7cb8a7d3b5dd3bea 100644
@@ -19,6 +19,7 @@
 #define SWITCHTEC_EVENT_EN_CLI   BIT(2)
 #define SWITCHTEC_EVENT_EN_IRQ   BIT(3)
 #define SWITCHTEC_EVENT_FATAL    BIT(4)
+#define SWITCHTEC_EVENT_NOT_SUPP BIT(31)
 
 #define SWITCHTEC_DMA_MRPC_EN  BIT(0)
 
index 671d402c3778fa0ca540dadf86d622630ee4e4ab..6e022cc712e611635cd12720eb49ac90cf7f5d00 100644
@@ -22,7 +22,7 @@ struct notifier_block;                /* in notifier.h */
 #define VM_USERMAP             0x00000008      /* suitable for remap_vmalloc_range */
 #define VM_DMA_COHERENT                0x00000010      /* dma_alloc_coherent */
 #define VM_UNINITIALIZED       0x00000020      /* vm_struct is not fully initialized */
-#define VM_NO_GUARD            0x00000040      /* don't add guard page */
+#define VM_NO_GUARD            0x00000040      /* ***DANGEROUS*** don't add guard page */
 #define VM_KASAN               0x00000080      /* has allocated kasan shadow memory */
 #define VM_FLUSH_RESET_PERMS   0x00000100      /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
 #define VM_MAP_PUT_PAGES       0x00000200      /* put pages and free array in vfree */
@@ -136,21 +136,21 @@ static inline void vmalloc_init(void)
 static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
-extern void *vmalloc(unsigned long size);
-extern void *vzalloc(unsigned long size);
-extern void *vmalloc_user(unsigned long size);
-extern void *vmalloc_node(unsigned long size, int node);
-extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_32(unsigned long size);
-extern void *vmalloc_32_user(unsigned long size);
-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
+extern void *vmalloc(unsigned long size) __alloc_size(1);
+extern void *vzalloc(unsigned long size) __alloc_size(1);
+extern void *vmalloc_user(unsigned long size) __alloc_size(1);
+extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
+extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
+extern void *vmalloc_32(unsigned long size) __alloc_size(1);
+extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
 extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
-                       const void *caller);
+                       const void *caller) __alloc_size(1);
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
-               int node, const void *caller);
-void *vmalloc_no_huge(unsigned long size);
+               int node, const void *caller) __alloc_size(1);
+void *vmalloc_no_huge(unsigned long size) __alloc_size(1);
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
index 0abff67b96f09a84e89e29d360103ce328d5ac96..14db8044c1ff2c18936d004021fe336950a051b7 100644
@@ -13,7 +13,7 @@ struct mm_struct;
 extern int trace_mmap_lock_reg(void);
 extern void trace_mmap_lock_unreg(void);
 
-TRACE_EVENT_FN(mmap_lock_start_locking,
+DECLARE_EVENT_CLASS(mmap_lock,
 
        TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
 
@@ -32,15 +32,23 @@ TRACE_EVENT_FN(mmap_lock_start_locking,
        ),
 
        TP_printk(
-               "mm=%p memcg_path=%s write=%s\n",
+               "mm=%p memcg_path=%s write=%s",
                __entry->mm,
                __get_str(memcg_path),
                __entry->write ? "true" : "false"
-       ),
-
-       trace_mmap_lock_reg, trace_mmap_lock_unreg
+       )
 );
 
+#define DEFINE_MMAP_LOCK_EVENT(name)                                    \
+       DEFINE_EVENT_FN(mmap_lock, name,                                \
+               TP_PROTO(struct mm_struct *mm, const char *memcg_path,  \
+                       bool write),                                    \
+               TP_ARGS(mm, memcg_path, write),                         \
+               trace_mmap_lock_reg, trace_mmap_lock_unreg)
+
+DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking);
+DEFINE_MMAP_LOCK_EVENT(mmap_lock_released);
+
 TRACE_EVENT_FN(mmap_lock_acquire_returned,
 
        TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write,
@@ -63,7 +71,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned,
        ),
 
        TP_printk(
-               "mm=%p memcg_path=%s write=%s success=%s\n",
+               "mm=%p memcg_path=%s write=%s success=%s",
                __entry->mm,
                __get_str(memcg_path),
                __entry->write ? "true" : "false",
@@ -73,34 +81,6 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned,
        trace_mmap_lock_reg, trace_mmap_lock_unreg
 );
 
-TRACE_EVENT_FN(mmap_lock_released,
-
-       TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write),
-
-       TP_ARGS(mm, memcg_path, write),
-
-       TP_STRUCT__entry(
-               __field(struct mm_struct *, mm)
-               __string(memcg_path, memcg_path)
-               __field(bool, write)
-       ),
-
-       TP_fast_assign(
-               __entry->mm = mm;
-               __assign_str(memcg_path, memcg_path);
-               __entry->write = write;
-       ),
-
-       TP_printk(
-               "mm=%p memcg_path=%s write=%s\n",
-               __entry->mm,
-               __get_str(memcg_path),
-               __entry->write ? "true" : "false"
-       ),
-
-       trace_mmap_lock_reg, trace_mmap_lock_unreg
-);
-
 #endif /* _TRACE_MMAP_LOCK_H */
 
 /* This part must be outside protection */
index 88faf2400ec253f9006ff52951a7ac7e07d99e3d..f25a6149d3ba56517ad42781eef1bf3b33f27e29 100644
                {RECLAIM_WB_ASYNC,      "RECLAIM_WB_ASYNC"}     \
                ) : "RECLAIM_WB_NONE"
 
+#define _VMSCAN_THROTTLE_WRITEBACK     (1 << VMSCAN_THROTTLE_WRITEBACK)
+#define _VMSCAN_THROTTLE_ISOLATED      (1 << VMSCAN_THROTTLE_ISOLATED)
+#define _VMSCAN_THROTTLE_NOPROGRESS    (1 << VMSCAN_THROTTLE_NOPROGRESS)
+
+#define show_throttle_flags(flags)                                             \
+       (flags) ? __print_flags(flags, "|",                                     \
+               {_VMSCAN_THROTTLE_WRITEBACK,    "VMSCAN_THROTTLE_WRITEBACK"},   \
+               {_VMSCAN_THROTTLE_ISOLATED,     "VMSCAN_THROTTLE_ISOLATED"},    \
+               {_VMSCAN_THROTTLE_NOPROGRESS,   "VMSCAN_THROTTLE_NOPROGRESS"}   \
+               ) : "VMSCAN_THROTTLE_NONE"
+
+
 #define trace_reclaim_flags(file) ( \
        (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
        (RECLAIM_WB_ASYNC) \
@@ -454,6 +466,32 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,
        TP_ARGS(nr_reclaimed)
 );
 
+TRACE_EVENT(mm_vmscan_throttled,
+
+       TP_PROTO(int nid, int usec_timeout, int usec_delayed, int reason),
+
+       TP_ARGS(nid, usec_timeout, usec_delayed, reason),
+
+       TP_STRUCT__entry(
+               __field(int, nid)
+               __field(int, usec_timeout)
+               __field(int, usec_delayed)
+               __field(int, reason)
+       ),
+
+       TP_fast_assign(
+               __entry->nid = nid;
+               __entry->usec_timeout = usec_timeout;
+               __entry->usec_delayed = usec_delayed;
+               __entry->reason = 1U << reason;
+       ),
+
+       TP_printk("nid=%d usec_timeout=%d usect_delayed=%d reason=%s",
+               __entry->nid,
+               __entry->usec_timeout,
+               __entry->usec_delayed,
+               show_throttle_flags(__entry->reason))
+);
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
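The new event stores the throttle reason as a set bit (1U << reason) so that show_throttle_flags() can render it symbolically via __print_flags(). An illustrative rendered trace line (format taken directly from the TP_printk() above):

	mm_vmscan_throttled: nid=0 usec_timeout=100000 usect_delayed=4000 reason=VMSCAN_THROTTLE_WRITEBACK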
index 7dccb66474f73593c3a7c95689e04d47f17c3366..a345b1e12daf32b957312e9811ba4c9b2767096b 100644
@@ -763,13 +763,6 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
        TP_ARGS(usec_timeout, usec_delayed)
 );
 
-DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
-
-       TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
-
-       TP_ARGS(usec_timeout, usec_delayed)
-);
-
 DECLARE_EVENT_CLASS(writeback_single_inode_template,
 
        TP_PROTO(struct inode *inode,
index 64553df9d7350d03b50e62f6419a78df1bbe64e3..bd1932c2074d52c74c7efa9f86ef48e3c8d39bb0 100644
@@ -20,6 +20,7 @@
 #define FAN_OPEN_EXEC          0x00001000      /* File was opened for exec */
 
 #define FAN_Q_OVERFLOW         0x00004000      /* Event queued overflowed */
+#define FAN_FS_ERROR           0x00008000      /* Filesystem error */
 
 #define FAN_OPEN_PERM          0x00010000      /* File open in perm check */
 #define FAN_ACCESS_PERM                0x00020000      /* File accessed in perm check */
@@ -125,6 +126,7 @@ struct fanotify_event_metadata {
 #define FAN_EVENT_INFO_TYPE_DFID_NAME  2
 #define FAN_EVENT_INFO_TYPE_DFID       3
 #define FAN_EVENT_INFO_TYPE_PIDFD      4
+#define FAN_EVENT_INFO_TYPE_ERROR      5
 
 /* Variable length info record following event metadata */
 struct fanotify_event_info_header {
@@ -159,6 +161,12 @@ struct fanotify_event_info_pidfd {
        __s32 pidfd;
 };
 
+struct fanotify_event_info_error {
+       struct fanotify_event_info_header hdr;
+       __s32 error;
+       __u32 error_count;
+};
+
 struct fanotify_response {
        __s32 fd;
        __u32 response;
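FAN_FS_ERROR events deliver the new info record above alongside the event metadata. A hedged user-space sketch of consuming it (assumes a fanotify group already initialized with FAN_REPORT_FID and marked with FAN_FS_ERROR; only the first info record per event is inspected):

#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

static void drain_fs_errors(int fan_fd)
{
	char buf[4096];
	ssize_t len = read(fan_fd, buf, sizeof(buf));
	struct fanotify_event_metadata *md = (void *)buf;

	for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
		/* Info records, if any, follow the fixed metadata. */
		struct fanotify_event_info_header *hdr = (void *)(md + 1);

		if (md->event_len > md->metadata_len &&
		    hdr->info_type == FAN_EVENT_INFO_TYPE_ERROR) {
			struct fanotify_event_info_error *err = (void *)hdr;

			printf("fs error %d, seen %u times\n",
			       err->error, err->error_count);
		}
	}
}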
diff --git a/include/uapi/linux/map_to_14segment.h b/include/uapi/linux/map_to_14segment.h
new file mode 100644
index 0000000..0346ef7
--- /dev/null
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2021 Glider bv
+ *
+ * Based on include/uapi/linux/map_to_7segment.h:
+
+ * Copyright (c) 2005 Henk Vergonet <Henk.Vergonet@gmail.com>
+ */
+
+#ifndef MAP_TO_14SEGMENT_H
+#define MAP_TO_14SEGMENT_H
+
+/* This file provides translation primitives and tables for the conversion
+ * of (ASCII) characters to a 14-segments notation.
+ *
+ * The 14 segment's wikipedia notation below is used as standard.
+ * See: https://en.wikipedia.org/wiki/Fourteen-segment_display
+ *
+ * Notation:   +---a---+
+ *             |\  |  /|
+ *             f h i j b
+ *             |  \|/  |
+ *             +-g1+-g2+
+ *             |  /|\  |
+ *             e k l m c
+ *             |/  |  \|
+ *             +---d---+
+ *
+ * Usage:
+ *
+ *   Register a map variable, and fill it with a character set:
+ *     static SEG14_DEFAULT_MAP(map_seg14);
+ *
+ *
+ *   Then use for conversion:
+ *     seg14 = map_to_seg14(&map_seg14, some_char);
+ *     ...
+ *
+ * In device drivers it is recommended, if required, to make the char map
+ * accessible via the sysfs interface using the following scheme:
+ *
+ * static ssize_t map_seg14_show(struct device *dev,
+ *                              struct device_attribute *attr, char *buf)
+ * {
+ *     memcpy(buf, &map_seg14, sizeof(map_seg14));
+ *     return sizeof(map_seg14);
+ * }
+ * static ssize_t map_seg14_store(struct device *dev,
+ *                               struct device_attribute *attr,
+ *                               const char *buf, size_t cnt)
+ * {
+ *     if (cnt != sizeof(map_seg14))
+ *             return -EINVAL;
+ *     memcpy(&map_seg14, buf, cnt);
+ *     return cnt;
+ * }
+ * static DEVICE_ATTR_RW(map_seg14);
+ */
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#include <asm/byteorder.h>
+
+#define BIT_SEG14_A            0
+#define BIT_SEG14_B            1
+#define BIT_SEG14_C            2
+#define BIT_SEG14_D            3
+#define BIT_SEG14_E            4
+#define BIT_SEG14_F            5
+#define BIT_SEG14_G1           6
+#define BIT_SEG14_G2           7
+#define BIT_SEG14_H            8
+#define BIT_SEG14_I            9
+#define BIT_SEG14_J            10
+#define BIT_SEG14_K            11
+#define BIT_SEG14_L            12
+#define BIT_SEG14_M            13
+#define BIT_SEG14_RESERVED1    14
+#define BIT_SEG14_RESERVED2    15
+
+struct seg14_conversion_map {
+       __be16 table[128];
+};
+
+static __inline__ int map_to_seg14(struct seg14_conversion_map *map, int c)
+{
+       if (c < 0 || c >= sizeof(map->table) / sizeof(map->table[0]))
+               return -EINVAL;
+
+       return __be16_to_cpu(map->table[c]);
+}
+
+#define SEG14_CONVERSION_MAP(_name, _map)      \
+       struct seg14_conversion_map _name = { .table = { _map } }
+
+/*
+ * It is recommended to use a facility that allows user space to redefine
+ * custom character sets for LCD devices. Please use a sysfs interface
+ * as described above.
+ */
+#define MAP_TO_SEG14_SYSFS_FILE        "map_seg14"
+
+/*******************************************************************************
+ * ASCII conversion table
+ ******************************************************************************/
+
+#define _SEG14(sym, a, b, c, d, e, f, g1, g2, h, j, k, l, m, n)        \
+       __cpu_to_be16( a << BIT_SEG14_A  |  b << BIT_SEG14_B  | \
+                      c << BIT_SEG14_C  |  d << BIT_SEG14_D  | \
+                      e << BIT_SEG14_E  |  f << BIT_SEG14_F  | \
+                     g1 << BIT_SEG14_G1 | g2 << BIT_SEG14_G2 | \
+                      h << BIT_SEG14_H  |  j << BIT_SEG14_I  | \
+                      k << BIT_SEG14_J  |  l << BIT_SEG14_K  | \
+                      m << BIT_SEG14_L  |  n << BIT_SEG14_M )
+
+#define _MAP_0_32_ASCII_SEG14_NON_PRINTABLE                            \
+       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+
+#define _MAP_33_47_ASCII_SEG14_SYMBOL                          \
+       _SEG14('!', 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('"', 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),  \
+       _SEG14('#', 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('$', 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('%', 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0),  \
+       _SEG14('&', 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1),  \
+       _SEG14('\'',0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),  \
+       _SEG14('(', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1),  \
+       _SEG14(')', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0),  \
+       _SEG14('*', 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1),  \
+       _SEG14('+', 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0),  \
+       _SEG14(',', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),  \
+       _SEG14('-', 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('.', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),  \
+       _SEG14('/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0),
+
+#define _MAP_48_57_ASCII_SEG14_NUMERIC                         \
+       _SEG14('0', 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0),  \
+       _SEG14('1', 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),  \
+       _SEG14('2', 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('3', 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('4', 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('5', 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1),  \
+       _SEG14('6', 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('7', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0),  \
+       _SEG14('8', 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('9', 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0),
+
+#define _MAP_58_64_ASCII_SEG14_SYMBOL                          \
+       _SEG14(':', 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0),  \
+       _SEG14(';', 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0),  \
+       _SEG14('<', 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1),  \
+       _SEG14('=', 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('>', 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0),  \
+       _SEG14('?', 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0),  \
+       _SEG14('@', 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0),
+
+#define _MAP_65_90_ASCII_SEG14_ALPHA_UPPER                     \
+       _SEG14('A', 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('B', 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('C', 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('D', 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('E', 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('F', 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('G', 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('H', 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('I', 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('J', 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('K', 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1),  \
+       _SEG14('L', 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('M', 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0),  \
+       _SEG14('N', 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1),  \
+       _SEG14('O', 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('P', 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('Q', 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1),  \
+       _SEG14('R', 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1),  \
+       _SEG14('S', 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('T', 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('U', 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('V', 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0),  \
+       _SEG14('W', 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1),  \
+       _SEG14('X', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1),  \
+       _SEG14('Y', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0),  \
+       _SEG14('Z', 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0),
+
+#define _MAP_91_96_ASCII_SEG14_SYMBOL                          \
+       _SEG14('[', 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('\\',0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1),  \
+       _SEG14(']', 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('^', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1),  \
+       _SEG14('_', 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('`', 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
+
+#define _MAP_97_122_ASCII_SEG14_ALPHA_LOWER                    \
+       _SEG14('a', 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0),  \
+       _SEG14('b', 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1),  \
+       _SEG14('c', 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('d', 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0),  \
+       _SEG14('e', 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0),  \
+       _SEG14('f', 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0),  \
+       _SEG14('g', 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0),  \
+       _SEG14('h', 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0),  \
+       _SEG14('i', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),  \
+       _SEG14('j', 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0),  \
+       _SEG14('k', 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1),  \
+       _SEG14('l', 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('m', 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0),  \
+       _SEG14('n', 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0),  \
+       _SEG14('o', 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('p', 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0),  \
+       _SEG14('q', 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0),  \
+       _SEG14('r', 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('s', 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1),  \
+       _SEG14('t', 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('u', 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),  \
+       _SEG14('v', 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0),  \
+       _SEG14('w', 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1),  \
+       _SEG14('x', 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1),  \
+       _SEG14('y', 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0),  \
+       _SEG14('z', 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0),
+
+#define _MAP_123_126_ASCII_SEG14_SYMBOL                                \
+       _SEG14('{', 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0),  \
+       _SEG14('|', 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0),  \
+       _SEG14('}', 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1),  \
+       _SEG14('~', 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0),
+
+/* Maps */
+#define MAP_ASCII14SEG_ALPHANUM                        \
+       _MAP_0_32_ASCII_SEG14_NON_PRINTABLE     \
+       _MAP_33_47_ASCII_SEG14_SYMBOL           \
+       _MAP_48_57_ASCII_SEG14_NUMERIC          \
+       _MAP_58_64_ASCII_SEG14_SYMBOL           \
+       _MAP_65_90_ASCII_SEG14_ALPHA_UPPER      \
+       _MAP_91_96_ASCII_SEG14_SYMBOL           \
+       _MAP_97_122_ASCII_SEG14_ALPHA_LOWER     \
+       _MAP_123_126_ASCII_SEG14_SYMBOL
+
+#define SEG14_DEFAULT_MAP(_name)               \
+       SEG14_CONVERSION_MAP(_name, MAP_ASCII14SEG_ALPHANUM)
+
+#endif /* MAP_TO_14SEGMENT_H */
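Beyond the sysfs pattern documented in the header itself, conversion is a plain table lookup. A small sketch (assuming the UAPI header is installed):

#include <linux/map_to_14segment.h>
#include <stdio.h>

static SEG14_DEFAULT_MAP(map_seg14);

int main(void)
{
	/* 'A' lights segments a, b, c, e, f, g1, g2 -> bits 0x00f7 */
	printf("0x%04x\n", map_to_seg14(&map_seg14, 'A'));
	return 0;
}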
index e709ae8235e7fd2deeeb63b0ad507a1fd8de12b3..ff6ccbc6efe96b44953f63ad1a61792317a6e769 100644
 #define  PCI_EXP_DEVCTL_URRE   0x0008  /* Unsupported Request Reporting En. */
 #define  PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */
 #define  PCI_EXP_DEVCTL_PAYLOAD        0x00e0  /* Max_Payload_Size */
+#define  PCI_EXP_DEVCTL_PAYLOAD_128B 0x0000 /* 128 Bytes */
+#define  PCI_EXP_DEVCTL_PAYLOAD_256B 0x0020 /* 256 Bytes */
+#define  PCI_EXP_DEVCTL_PAYLOAD_512B 0x0040 /* 512 Bytes */
+#define  PCI_EXP_DEVCTL_PAYLOAD_1024B 0x0060 /* 1024 Bytes */
+#define  PCI_EXP_DEVCTL_PAYLOAD_2048B 0x0080 /* 2048 Bytes */
+#define  PCI_EXP_DEVCTL_PAYLOAD_4096B 0x00a0 /* 4096 Bytes */
 #define  PCI_EXP_DEVCTL_EXT_TAG        0x0100  /* Extended Tag Field Enable */
 #define  PCI_EXP_DEVCTL_PHANTOM        0x0200  /* Phantom Functions Enable */
 #define  PCI_EXP_DEVCTL_AUX_PME        0x0400  /* Auxiliary Power PM Enable */
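The named payload values replace bare masks when programming Max_Payload_Size. A hedged in-kernel sketch using the existing pcie_capability_clear_and_set_word() helper:

#include <linux/pci.h>

/* Program a device's Max_Payload_Size to 256 bytes. */
static int set_mps_256(struct pci_dev *dev)
{
	return pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
						  PCI_EXP_DEVCTL_PAYLOAD,
						  PCI_EXP_DEVCTL_PAYLOAD_256B);
}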
index 11f8a845f259da63299a77001991631be81c0eac..21b1f4870c807fc69ed2872a1a93b1d705d46cd7 100644
@@ -901,7 +901,7 @@ config NUMA_BALANCING
        bool "Memory placement aware NUMA scheduler"
        depends on ARCH_SUPPORTS_NUMA_BALANCING
        depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
-       depends on SMP && NUMA && MIGRATION
+       depends on SMP && NUMA && MIGRATION && !PREEMPT_RT
        help
          This option adds support for automatic NUMA aware memory/task placement.
          The mechanism is quite primitive and is based on migrating memory when
index a842c05447456a31040c5db04461b5e583305a4b..2f3d96dc3db6d0a689f5c17a5f572ea5bd07f2f2 100644
@@ -607,7 +607,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
        unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE);
        unsigned long aligned_end = ALIGN(end, PAGE_SIZE);
 
-       memblock_free(__pa(aligned_start), aligned_end - aligned_start);
+       memblock_free((void *)aligned_start, aligned_end - aligned_start);
 #endif
 
        free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
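This and the following hunks reflect the memblock API cleanup in this series: memblock_free() now takes a virtual address (absorbing the old memblock_free_ptr()), so callers drop the __pa() conversion. Schematically:

	/* before */
	memblock_free(__pa(vaddr), size);	/* physical address */
	/* after */
	memblock_free(vaddr, size);		/* virtual address */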
index 183f861707e5218269e614c8cc3e0b425c1289f3..0c4a6e0d8234a50d9515c800a1ad845dfc2398de 100644
@@ -381,7 +381,7 @@ static char * __init xbc_make_cmdline(const char *key)
        ret = xbc_snprint_cmdline(new_cmdline, len + 1, root);
        if (ret < 0 || ret > len) {
                pr_err("Failed to print extra kernel cmdline.\n");
-               memblock_free_ptr(new_cmdline, len + 1);
+               memblock_free(new_cmdline, len + 1);
                return NULL;
        }
 
@@ -916,7 +916,7 @@ static void __init print_unknown_bootoptions(void)
                end += sprintf(end, " %s", *p);
 
        pr_notice("Unknown command line parameters:%s\n", unknown_options);
-       memblock_free_ptr(unknown_options, len);
+       memblock_free(unknown_options, len);
 }
 
 asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
@@ -1497,6 +1497,8 @@ static int __ref kernel_init(void *unused)
        kernel_init_freeable();
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();
+
+       system_state = SYSTEM_FREEING_INITMEM;
        kprobe_free_init_mem();
        ftrace_free_init_mem();
        kgdb_free_init_mem();
index 60739d5e3373f8da84e9732b93822a5b8007f8de..02348b48447cf72b906bf0195c6c42afb91da6f2 100644
@@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
 
        audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
 
-       if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
-           WARN_ON_ONCE(!inode))
+       if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
                return 0;
 
        if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
index 698b62b4a2ec1757a76f12a68b95c5a7940abdab..713b256be944c89402a2f6206d5adeafbb18b53c 100644
@@ -473,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
 
        parent = container_of(inode_mark, struct audit_parent, mark);
 
-       if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
-           WARN_ON_ONCE(!inode))
+       if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
                return 0;
 
        if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
index 2a9695ccb65f539c713fdce1cc9bb2fd15c4779f..d0e163a02099795e8b186e8a21edfd8fd3dc8c00 100644
 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgement
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
@@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+       if (!cpusets_insane_config() &&
+               movable_only_nodes(nodes)) {
+               static_branch_enable(&cpusets_insane_config_key);
+               pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+                       "Cpuset allocations might fail even with a lot of memory available.\n",
+                       nodemask_pr_args(nodes));
+       }
+}
+
 /*
  * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
  * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -1870,6 +1888,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        if (retval < 0)
                goto done;
 
+       check_insane_mems_config(&trialcs->mems_allowed);
+
        spin_lock_irq(&callback_lock);
        cs->mems_allowed = trialcs->mems_allowed;
        spin_unlock_irq(&callback_lock);
@@ -3173,6 +3193,9 @@ update_tasks:
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
+       if (mems_updated)
+               check_insane_mems_config(&new_mems);
+
        if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
index 6fab31ea47d6c5097963443fb71009d917c0760b..8e840fbbed7c7a89653a00a13e1c06922edd3dbf 100644
@@ -247,7 +247,7 @@ swiotlb_init(int verbose)
        return;
 
 fail_free_mem:
-       memblock_free_early(__pa(tlb), bytes);
+       memblock_free(tlb, bytes);
 fail:
        pr_warn("Cannot allocate buffer");
 }
index b0ea5eb0c3b43da49b8a94aa3337d666878b6af1..290661f68e6b331ecd835c96fc4126a3bb2631d2 100644
@@ -76,7 +76,7 @@ int notrace core_kernel_text(unsigned long addr)
            addr < (unsigned long)_etext)
                return 1;
 
-       if (system_state < SYSTEM_RUNNING &&
+       if (system_state < SYSTEM_FREEING_INITMEM &&
            init_kernel_text(addr))
                return 1;
        return 0;
index 4d8fc65cf38f4177c34b3138a94fcab86830892e..bf38c546aa2566d8edcc4cb3edc81d4ad476c856 100644
@@ -744,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d,
        return 0;
 }
 
-static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
-                                     unsigned int count,
-                                     struct irq_fwspec *fwspec)
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+                              unsigned int count, struct irq_fwspec *fwspec)
 {
        int i;
 
@@ -756,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
        for (i = 0; i < count; i++)
                fwspec->param[i] = args[i];
 }
+EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
 
 unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 {
@@ -1502,6 +1502,7 @@ out_free_desc:
        irq_free_descs(virq, nr_irqs);
        return ret;
 }
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
 
 /* The irq_data was moved, fix the revmap to refer to the new location */
 static void irq_domain_fix_revmap(struct irq_data *d)
index 33400ff051a848b05989b82753358451bfb4c53b..8347fc158d2b96ff2169389cd79ef42d4f513cf9 100644
@@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
        if (kbuf->image->type == KEXEC_TYPE_CRASH)
                return func(&crashk_res, kbuf);
 
+       /*
+        * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
+        * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
+        * locate_mem_hole_callback().
+        */
        if (kbuf->top_down) {
                for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
                                                &mstart, &mend, NULL) {
index 7096384dc60f1f0fa9218e4f4964310c9b99b352..74d37166574734beece4f43831a48c8db2f339b5 100644
@@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class)
  * Is this the address of a static object:
  */
 #ifdef __KERNEL__
+/*
+ * Check if an address is part of freed initmem. After initmem is freed,
+ * memory can be allocated from it, and such allocations would then have
+ * addresses within the range [_stext, _end].
+ */
+#ifndef arch_is_kernel_initmem_freed
+static int arch_is_kernel_initmem_freed(unsigned long addr)
+{
+       if (system_state < SYSTEM_FREEING_INITMEM)
+               return 0;
+
+       return init_section_contains((void *)addr, 1);
+}
+#endif
+
 static int static_obj(const void *obj)
 {
        unsigned long start = (unsigned long) &_stext,
index 5c26a76e800b579a1e450b1aab962da2df7fc0d6..84a9141a5e159a01642c6a67e4250e016de39797 100644
@@ -2942,7 +2942,11 @@ static int module_sig_check(struct load_info *info, int flags)
 
 static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
 {
+#if defined(CONFIG_64BIT)
+       unsigned long long secend;
+#else
        unsigned long secend;
+#endif
 
        /*
         * Check for both overflow and offset/size being
@@ -2967,14 +2971,29 @@ static int elf_validity_check(struct load_info *info)
        Elf_Shdr *shdr, *strhdr;
        int err;
 
-       if (info->len < sizeof(*(info->hdr)))
-               return -ENOEXEC;
+       if (info->len < sizeof(*(info->hdr))) {
+               pr_err("Invalid ELF header len %lu\n", info->len);
+               goto no_exec;
+       }
 
-       if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
-           || info->hdr->e_type != ET_REL
-           || !elf_check_arch(info->hdr)
-           || info->hdr->e_shentsize != sizeof(Elf_Shdr))
-               return -ENOEXEC;
+       if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+               pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+               goto no_exec;
+       }
+       if (info->hdr->e_type != ET_REL) {
+               pr_err("Invalid ELF header type: %u != %u\n",
+                      info->hdr->e_type, ET_REL);
+               goto no_exec;
+       }
+       if (!elf_check_arch(info->hdr)) {
+               pr_err("Invalid architecture in ELF header: %u\n",
+                      info->hdr->e_machine);
+               goto no_exec;
+       }
+       if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+               pr_err("Invalid ELF section header size\n");
+               goto no_exec;
+       }
 
        /*
         * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
@@ -2983,8 +3002,10 @@ static int elf_validity_check(struct load_info *info)
         */
        if (info->hdr->e_shoff >= info->len
            || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
-               info->len - info->hdr->e_shoff))
-               return -ENOEXEC;
+               info->len - info->hdr->e_shoff)) {
+               pr_err("Invalid ELF section header overflow\n");
+               goto no_exec;
+       }
 
        info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
 
@@ -2992,13 +3013,19 @@ static int elf_validity_check(struct load_info *info)
         * Verify if the section name table index is valid.
         */
        if (info->hdr->e_shstrndx == SHN_UNDEF
-           || info->hdr->e_shstrndx >= info->hdr->e_shnum)
-               return -ENOEXEC;
+           || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+               pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+                      info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+                      info->hdr->e_shnum);
+               goto no_exec;
+       }
 
        strhdr = &info->sechdrs[info->hdr->e_shstrndx];
        err = validate_section_offset(info, strhdr);
-       if (err < 0)
+       if (err < 0) {
+               pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
                return err;
+       }
 
        /*
         * The section name table must be NUL-terminated, as required
@@ -3006,8 +3033,10 @@ static int elf_validity_check(struct load_info *info)
         * strings in the section safe.
         */
        info->secstrings = (void *)info->hdr + strhdr->sh_offset;
-       if (info->secstrings[strhdr->sh_size - 1] != '\0')
-               return -ENOEXEC;
+       if (info->secstrings[strhdr->sh_size - 1] != '\0') {
+               pr_err("ELF Spec violation: section name table isn't null terminated\n");
+               goto no_exec;
+       }
 
        /*
         * The code assumes that section 0 has a length of zero and
@@ -3015,8 +3044,11 @@ static int elf_validity_check(struct load_info *info)
         */
        if (info->sechdrs[0].sh_type != SHT_NULL
            || info->sechdrs[0].sh_size != 0
-           || info->sechdrs[0].sh_addr != 0)
-               return -ENOEXEC;
+           || info->sechdrs[0].sh_addr != 0) {
+               pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+                      info->sechdrs[0].sh_type);
+               goto no_exec;
+       }
 
        for (i = 1; i < info->hdr->e_shnum; i++) {
                shdr = &info->sechdrs[i];
@@ -3026,8 +3058,12 @@ static int elf_validity_check(struct load_info *info)
                        continue;
                case SHT_SYMTAB:
                        if (shdr->sh_link == SHN_UNDEF
-                           || shdr->sh_link >= info->hdr->e_shnum)
-                               return -ENOEXEC;
+                           || shdr->sh_link >= info->hdr->e_shnum) {
+                               pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+                                      shdr->sh_link, shdr->sh_link,
+                                      info->hdr->e_shnum);
+                               goto no_exec;
+                       }
                        fallthrough;
                default:
                        err = validate_section_offset(info, shdr);
@@ -3049,6 +3085,9 @@ static int elf_validity_check(struct load_info *info)
        }
 
        return 0;
+
+no_exec:
+       return -ENOEXEC;
 }
 
 #define COPY_CHUNK_SIZE (16*PAGE_SIZE)
@@ -3940,10 +3979,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
         * sections.
         */
        err = elf_validity_check(info);
-       if (err) {
-               pr_err("Module has invalid ELF structures\n");
+       if (err)
                goto free_copy;
-       }
 
        /*
         * Everything checks out, so set up the section info
index 9e5dfb1896a95d1cc7815a706ce652ca1ce070ef..013bfd6dcc34af711765f1e14377af2c57d4cec0 100644
@@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early)
        return;
 
 err_free_descs:
-       memblock_free_ptr(new_descs, new_descs_size);
+       memblock_free(new_descs, new_descs_size);
 err_free_log_buf:
-       memblock_free_ptr(new_log_buf, new_log_buf_len);
+       memblock_free(new_log_buf, new_log_buf_len);
 }
 
 static bool __read_mostly ignore_loglevel;
index 30169c7685b641da3ed14b93697a7dec9b7c06f7..d201a7052a299fbc231f79ad5d98ebde3073bbe4 100644
@@ -1492,7 +1492,6 @@ static int                        sched_domains_curr_level;
 int                            sched_max_numa_distance;
 static int                     *sched_domains_numa_distance;
 static struct cpumask          ***sched_domains_numa_masks;
-int __read_mostly              node_reclaim_distance = RECLAIM_DISTANCE;
 
 static unsigned long __read_mostly *sched_numa_onlined_nodes;
 #endif
index 9f8117c7cfddee066477b27b6bafc397155da03a..9c625257023d295cc0d5946ca1186a2792e10a06 100644
@@ -13,6 +13,7 @@
 #include <linux/export.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
+#include <linux/interrupt.h>
 
 /**
  * stack_trace_print - Print the entries in the stack trace
@@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
 #endif /* CONFIG_USER_STACKTRACE_SUPPORT */
 
 #endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+       return (ptr >= (unsigned long)&__irqentry_text_start &&
+               ptr < (unsigned long)&__irqentry_text_end) ||
+               (ptr >= (unsigned long)&__softirqentry_text_start &&
+                ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries:   Pointer to stack trace array
+ * @nr_entries:        Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+       unsigned int i;
+
+       for (i = 0; i < nr_entries; i++) {
+               if (in_irqentry_text(entries[i])) {
+                       /* Include the irqentry function into the stack. */
+                       return i + 1;
+               }
+       }
+       return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
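With filter_irq_stacks() now in generic stacktrace code, any trace producer can trim entries past the IRQ entry point before storing them. A sketch of the usual pairing (roughly how KASAN saves allocation stacks; simplified):

#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t save_trace(gfp_t flags)
{
	unsigned long entries[64];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	/* Everything above the irq entry is arbitrary context; drop it. */
	nr = filter_irq_stacks(entries, nr);
	return stack_depot_save(entries, nr, flags);
}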
index 257ffb993ea2358bd81412f305f5e28f6982c060..f00de83d02462714526f7c6086e95de172ad9ce5 100644
@@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
         * the rest of the math is done in xacct_add_tsk.
         */
        tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
-       tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
+       tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
 }
 
 /**
index 1a7df882f55e2c2bb2c0392170f2141da84e7971..613917bbc4e733c1cd2e8bfd1b86e2f74f8859f5 100644
@@ -1351,7 +1351,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
        struct worker_pool *pool = pwq->pool;
 
        /* record the work call stack in order to print it in KASAN reports */
-       kasan_record_aux_stack(work);
+       kasan_record_aux_stack_noalloc(work);
 
        /* we own @work, set data and link */
        set_work_pwq(work, pwq, extra_flags);
index 6fdbf9613aec4b5588476465f2630eec34e090c2..9ef7ce18b4f56195b28a4ae3f189af08700922f0 100644
@@ -877,7 +877,7 @@ config DEBUG_MEMORY_INIT
 
 config MEMORY_NOTIFIER_ERROR_INJECT
        tristate "Memory hotplug notifier error injection module"
-       depends on MEMORY_HOTPLUG_SPARSE && NOTIFIER_ERROR_INJECTION
+       depends on MEMORY_HOTPLUG && NOTIFIER_ERROR_INJECTION
        help
          This option provides the ability to inject artificial errors to
          memory hotplug notifier chain callbacks.  It is controlled through
index e641add33947548fe764e2a22d0bf09500814c7b..912f252a41fc63744b168ddb43dcd42aec07433e 100644
@@ -25,17 +25,6 @@ menuconfig KFENCE
 
 if KFENCE
 
-config KFENCE_STATIC_KEYS
-       bool "Use static keys to set up allocations"
-       default y
-       depends on JUMP_LABEL # To ensure performance, require jump labels
-       help
-         Use static keys (static branches) to set up KFENCE allocations. Using
-         static keys is normally recommended, because it avoids a dynamic
-         branch in the allocator's fast path. However, with very low sample
-         intervals, or on systems that do not support jump labels, a dynamic
-         branch may still be an acceptable performance trade-off.
-
 config KFENCE_SAMPLE_INTERVAL
        int "Default sample interval in milliseconds"
        default 100
@@ -56,6 +45,21 @@ config KFENCE_NUM_OBJECTS
          pages are required; with one containing the object and two adjacent
          ones used as guard pages.
 
+config KFENCE_STATIC_KEYS
+       bool "Use static keys to set up allocations" if EXPERT
+       depends on JUMP_LABEL
+       help
+         Use static keys (static branches) to set up KFENCE allocations. This
+         option is only recommended when using very large sample intervals, or
+         performance has carefully been evaluated with this option.
+
+         Using static keys comes with trade-offs that need to be carefully
+         evaluated given target workloads and system architectures. Notably,
+         enabling and disabling static keys invoke IPI broadcasts, the latency
+         and impact of which is much harder to predict than a dynamic branch.
+
+         Say N if you are unsure.
+
 config KFENCE_STRESS_TEST_FAULTS
        int "Stress testing of fault handling and error reporting" if EXPERT
        default 0
index 70e0d52ffd24c37994d15fb2bc25b9f88c04fb61..74f3201ab8e59e6201c02b5e97839cb148c9c880 100644
@@ -50,7 +50,7 @@ static inline void * __init xbc_alloc_mem(size_t size)
 
 static inline void __init xbc_free_mem(void *addr, size_t size)
 {
-       memblock_free_ptr(addr, size);
+       memblock_free(addr, size);
 }
 
 #else /* !__KERNEL__ */
index c3c76b833384686e43f0b9389874625ebf30709b..a971a82d2f43607c41a6a4f1b16e49a1a1449421 100644
@@ -188,7 +188,7 @@ EXPORT_SYMBOL(free_cpumask_var);
  */
 void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 {
-       memblock_free_early(__pa(mask), cpumask_size());
+       memblock_free(mask, cpumask_size());
 }
 #endif
 
index c770570bfe4f2d939a1331657f45a43edade9467..45e17619422b4ef4b3c278b242806668d034b19f 100644
@@ -14,6 +14,8 @@ hostprogs     += mktables
 
 ifeq ($(CONFIG_ALTIVEC),y)
 altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
+# Enable <altivec.h>
+altivec_flags += -isystem $(shell $(CC) -print-file-name=include)
 
 ifdef CONFIG_CC_IS_CLANG
 # clang ppc port does not yet support -maltivec when -msoft-float is
@@ -34,6 +36,8 @@ endif
 # ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
 ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
 NEON_FLAGS := -ffreestanding
+# Enable <arm_neon.h>
+NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
 ifeq ($(ARCH),arm)
 NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
 endif
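-print-file-name=include resolves to the compiler's private header directory, which the kernel's -nostdinc build otherwise hides; adding it back makes intrinsics headers such as <altivec.h> and <arm_neon.h> visible. Illustratively:

	$ gcc -print-file-name=include
	/usr/lib/gcc/x86_64-linux-gnu/11/include	# path varies by toolchain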
index 0a2e417f83cbae313d4800628c5323b89e8fcec1..09485dc5bd1281b945b2ac60e577fe1ca28f88d2 100644
@@ -20,7 +20,6 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/interrupt.h>
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -102,8 +101,8 @@ static bool init_stack_slab(void **prealloc)
 }
 
 /* Allocation of a new stack in raw storage */
-static struct stack_record *depot_alloc_stack(unsigned long *entries, int size,
-               u32 hash, void **prealloc, gfp_t alloc_flags)
+static struct stack_record *
+depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
 {
        struct stack_record *stack;
        size_t required_size = struct_size(stack, entries, size);
@@ -248,17 +247,28 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 EXPORT_SYMBOL_GPL(stack_depot_fetch);
 
 /**
- * stack_depot_save - Save a stack trace from an array
+ * __stack_depot_save - Save a stack trace from an array
  *
  * @entries:           Pointer to storage array
  * @nr_entries:                Size of the storage array
  * @alloc_flags:       Allocation gfp flags
+ * @can_alloc:         Allocate stack slabs (increased chance of failure if false)
+ *
+ * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
+ * %true, is allowed to replenish the stack slab pool in case no space is left
+ * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids
+ * any allocations and will fail if no space is left to store the stack trace.
  *
- * Return: The handle of the stack struct stored in depot
+ * Context: Any context, but setting @can_alloc to %false is required if
+ *          alloc_pages() cannot be used from the current context. Currently
+ *          this is the case from contexts where neither %GFP_ATOMIC nor
+ *          %GFP_NOWAIT can be used (NMI, raw_spin_lock).
+ *
+ * Return: The handle of the stack struct stored in depot, 0 on failure.
  */
-depot_stack_handle_t stack_depot_save(unsigned long *entries,
-                                     unsigned int nr_entries,
-                                     gfp_t alloc_flags)
+depot_stack_handle_t __stack_depot_save(unsigned long *entries,
+                                       unsigned int nr_entries,
+                                       gfp_t alloc_flags, bool can_alloc)
 {
        struct stack_record *found = NULL, **bucket;
        depot_stack_handle_t retval = 0;
@@ -291,7 +301,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
         * The smp_load_acquire() here pairs with smp_store_release() to
         * |next_slab_inited| in depot_alloc_stack() and init_stack_slab().
         */
-       if (unlikely(!smp_load_acquire(&next_slab_inited))) {
+       if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) {
                /*
                 * Zero out zone modifiers, as we don't have specific zone
                 * requirements. Keep the flags related to allocation in atomic
@@ -309,9 +319,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 
        found = find_stack(*bucket, entries, nr_entries, hash);
        if (!found) {
-               struct stack_record *new =
-                       depot_alloc_stack(entries, nr_entries,
-                                         hash, &prealloc, alloc_flags);
+               struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc);
+
                if (new) {
                        new->next = *bucket;
                        /*
@@ -340,27 +349,24 @@ exit:
 fast_exit:
        return retval;
 }
-EXPORT_SYMBOL_GPL(stack_depot_save);
-
-static inline int in_irqentry_text(unsigned long ptr)
-{
-       return (ptr >= (unsigned long)&__irqentry_text_start &&
-               ptr < (unsigned long)&__irqentry_text_end) ||
-               (ptr >= (unsigned long)&__softirqentry_text_start &&
-                ptr < (unsigned long)&__softirqentry_text_end);
-}
+EXPORT_SYMBOL_GPL(__stack_depot_save);
 
-unsigned int filter_irq_stacks(unsigned long *entries,
-                                            unsigned int nr_entries)
+/**
+ * stack_depot_save - Save a stack trace from an array
+ *
+ * @entries:           Pointer to storage array
+ * @nr_entries:                Size of the storage array
+ * @alloc_flags:       Allocation gfp flags
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed.
+ *          See __stack_depot_save() for more details.
+ *
+ * Return: The handle of the stack struct stored in depot, 0 on failure.
+ */
+depot_stack_handle_t stack_depot_save(unsigned long *entries,
+                                     unsigned int nr_entries,
+                                     gfp_t alloc_flags)
 {
-       unsigned int i;
-
-       for (i = 0; i < nr_entries; i++) {
-               if (in_irqentry_text(entries[i])) {
-                       /* Include the irqentry function into the stack. */
-                       return i + 1;
-               }
-       }
-       return nr_entries;
+       return __stack_depot_save(entries, nr_entries, alloc_flags, true);
 }
-EXPORT_SYMBOL_GPL(filter_irq_stacks);
+EXPORT_SYMBOL_GPL(stack_depot_save);
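A hedged sketch of the new entry point from a context where alloc_pages() is off limits (per the kernel-doc above, can_alloc=false never replenishes the slab pool and the call fails with 0 when no space is left):

#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t save_trace_atomic(void)
{
	unsigned long entries[32];
	unsigned int nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

	/* E.g. under a raw spinlock or in NMI: no allocations allowed. */
	return __stack_depot_save(entries, nr, GFP_NOWAIT, false);
}

A zero return means no preallocated space was left; the caller drops or defers the trace.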
index ebed755ebf34c1ef692f11c5346713656a6cade0..67ed689a0b1bc78c2d5320691423d59e3906f1c4 100644
@@ -440,6 +440,7 @@ static void kmalloc_oob_memset_2(struct kunit *test)
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
+       OPTIMIZER_HIDE_VAR(size);
        KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2));
        kfree(ptr);
 }
@@ -452,6 +453,7 @@ static void kmalloc_oob_memset_4(struct kunit *test)
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
+       OPTIMIZER_HIDE_VAR(size);
        KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4));
        kfree(ptr);
 }
@@ -464,6 +466,7 @@ static void kmalloc_oob_memset_8(struct kunit *test)
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
+       OPTIMIZER_HIDE_VAR(size);
        KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8));
        kfree(ptr);
 }
@@ -476,6 +479,7 @@ static void kmalloc_oob_memset_16(struct kunit *test)
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
+       OPTIMIZER_HIDE_VAR(size);
        KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16));
        kfree(ptr);
 }
@@ -488,16 +492,17 @@ static void kmalloc_oob_in_memset(struct kunit *test)
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
+       OPTIMIZER_HIDE_VAR(size);
        KUNIT_EXPECT_KASAN_FAIL(test,
                                memset(ptr, 0, size + KASAN_GRANULE_SIZE));
        kfree(ptr);
 }
 
-static void kmalloc_memmove_invalid_size(struct kunit *test)
+static void kmalloc_memmove_negative_size(struct kunit *test)
 {
        char *ptr;
        size_t size = 64;
-       volatile size_t invalid_size = -2;
+       size_t invalid_size = -2;
 
        /*
         * Hardware tag-based mode doesn't check memmove for negative size.
@@ -509,6 +514,22 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
+       memset((char *)ptr, 0, 64);
+       OPTIMIZER_HIDE_VAR(invalid_size);
+       KUNIT_EXPECT_KASAN_FAIL(test,
+               memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+       kfree(ptr);
+}
+
+static void kmalloc_memmove_invalid_size(struct kunit *test)
+{
+       char *ptr;
+       size_t size = 64;
+       volatile size_t invalid_size = size;
+
+       ptr = kmalloc(size, GFP_KERNEL);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
        memset((char *)ptr, 0, 64);
        KUNIT_EXPECT_KASAN_FAIL(test,
                memmove((char *)ptr, (char *)ptr + 4, invalid_size));
@@ -1129,6 +1150,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
        KUNIT_CASE(kmalloc_oob_memset_4),
        KUNIT_CASE(kmalloc_oob_memset_8),
        KUNIT_CASE(kmalloc_oob_memset_16),
+       KUNIT_CASE(kmalloc_memmove_negative_size),
        KUNIT_CASE(kmalloc_memmove_invalid_size),
        KUNIT_CASE(kmalloc_uaf),
        KUNIT_CASE(kmalloc_uaf_memset),
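These test changes are fallout from the __alloc_size() annotations: once kmalloc()'s object size is visible to the compiler, the deliberately out-of-bounds memset()/memmove() calls can be flagged at build time or optimized away before KASAN ever sees them. OPTIMIZER_HIDE_VAR() makes the value opaque again; it is roughly (see include/linux/compiler.h):

	#define OPTIMIZER_HIDE_VAR(var)	__asm__ ("" : "=r" (var) : "0" (var))

The empty asm forces the compiler to assume 'var' may have changed, so 'size' is no longer a known constant at the faulting call.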
index 7ebf433edef3bb3a93cf6087af578a5182ff7294..b112cbc835e902357707c906f5095835d7ca6066 100644 (file)
@@ -35,6 +35,8 @@ static noinline void __init copy_user_test(void)
                return;
        }
 
+       OPTIMIZER_HIDE_VAR(size);
+
        pr_info("out-of-bounds in copy_from_user()\n");
        unused = copy_from_user(kmem, usermem, size + 1);
 
index e14993bc84d2d91c6064a0c9ff58fef839457ea2..cf41fd6df42a08563175d558df645fff377e2e10 100644 (file)
@@ -393,7 +393,7 @@ static struct test_driver {
 static void shuffle_array(int *arr, int n)
 {
        unsigned int rnd;
-       int i, j, x;
+       int i, j;
 
        for (i = n - 1; i > 0; i--)  {
                get_random_bytes(&rnd, sizeof(rnd));
@@ -402,9 +402,7 @@ static void shuffle_array(int *arr, int n)
                j = rnd % i;
 
                /* Swap indexes. */
-               x = arr[i];
-               arr[i] = arr[j];
-               arr[j] = x;
+               swap(arr[i], arr[j]);
        }
 }
 
index d16ba9249bc531bd34d6a5e0e85c83e84f19fcf1..ae1f151c2924192c5cadd04d58bebfb3c7890de3 100644 (file)
@@ -123,15 +123,11 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        select MEMORY_ISOLATION
-       depends on SPARSEMEM || X86_64_ACPI_NUMA
+       depends on SPARSEMEM
        depends on ARCH_ENABLE_MEMORY_HOTPLUG
-       depends on 64BIT || BROKEN
+       depends on 64BIT
        select NUMA_KEEP_MEMINFO if NUMA
 
-config MEMORY_HOTPLUG_SPARSE
-       def_bool y
-       depends on SPARSEMEM && MEMORY_HOTPLUG
-
 config MEMORY_HOTPLUG_DEFAULT_ONLINE
        bool "Online the newly added memory blocks by default"
        depends on MEMORY_HOTPLUG
@@ -371,7 +367,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 config TRANSPARENT_HUGEPAGE
        bool "Transparent Hugepage Support"
-       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
        select COMPACTION
        select XARRAY_MULTI
        help
index c878d995af06ef3b5cb1c20238c73ce0b1816b44..1eead47610112257ca01c5ff306c64716959af8f 100644 (file)
@@ -292,8 +292,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 
        memset(wb, 0, sizeof(*wb));
 
-       if (wb != &bdi->wb)
-               bdi_get(bdi);
        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
@@ -317,7 +315,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 
        err = fprop_local_init_percpu(&wb->completions, gfp);
        if (err)
-               goto out_put_bdi;
+               return err;
 
        for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
                err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -331,9 +329,6 @@ out_destroy_stat:
        while (i--)
                percpu_counter_destroy(&wb->stat[i]);
        fprop_local_destroy_percpu(&wb->completions);
-out_put_bdi:
-       if (wb != &bdi->wb)
-               bdi_put(bdi);
        return err;
 }
 
@@ -374,8 +369,6 @@ static void wb_exit(struct bdi_writeback *wb)
                percpu_counter_destroy(&wb->stat[i]);
 
        fprop_local_destroy_percpu(&wb->completions);
-       if (wb != &wb->bdi->wb)
-               bdi_put(wb->bdi);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -398,6 +391,7 @@ static void cgwb_release_workfn(struct work_struct *work)
        struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
                                                release_work);
        struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
+       struct backing_dev_info *bdi = wb->bdi;
 
        mutex_lock(&wb->bdi->cgwb_release_mutex);
        wb_shutdown(wb);
@@ -417,6 +411,7 @@ static void cgwb_release_workfn(struct work_struct *work)
 
        percpu_ref_exit(&wb->refcnt);
        wb_exit(wb);
+       bdi_put(bdi);
        WARN_ON_ONCE(!list_empty(&wb->b_attached));
        kfree_rcu(wb, rcu);
 }
@@ -498,6 +493,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
        INIT_LIST_HEAD(&wb->b_attached);
        INIT_WORK(&wb->release_work, cgwb_release_workfn);
        set_bit(WB_registered, &wb->state);
+       bdi_get(bdi);
 
        /*
         * The root wb determines the registered state of the whole bdi and
@@ -529,6 +525,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
        goto out_put;
 
 err_fprop_exit:
+       bdi_put(bdi);
        fprop_local_destroy_percpu(&wb->memcg_completions);
 err_ref_exit:
        percpu_ref_exit(&wb->refcnt);
@@ -959,14 +956,14 @@ void bdi_unregister(struct backing_dev_info *bdi)
                bdi->owner = NULL;
        }
 }
+EXPORT_SYMBOL(bdi_unregister);
 
 static void release_bdi(struct kref *ref)
 {
        struct backing_dev_info *bdi =
                        container_of(ref, struct backing_dev_info, refcnt);
 
-       if (test_bit(WB_registered, &bdi->wb.state))
-               bdi_unregister(bdi);
+       WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
        WARN_ON_ONCE(bdi->dev);
        wb_exit(&bdi->wb);
        kfree(bdi);
@@ -1058,51 +1055,3 @@ long congestion_wait(int sync, long timeout)
        return ret;
 }
 EXPORT_SYMBOL(congestion_wait);
-
-/**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * In the event of a congested backing_dev (any backing_dev) this waits
- * for up to @timeout jiffies for either a BDI to exit congestion of the
- * given @sync queue or a write to complete.
- *
- * The return value is 0 if the sleep is for the full timeout. Otherwise,
- * it is the number of jiffies that were still remaining when the function
- * returned. return_value == timeout implies the function did not sleep.
- */
-long wait_iff_congested(int sync, long timeout)
-{
-       long ret;
-       unsigned long start = jiffies;
-       DEFINE_WAIT(wait);
-       wait_queue_head_t *wqh = &congestion_wqh[sync];
-
-       /*
-        * If there is no congestion, yield if necessary instead
-        * of sleeping on the congestion queue
-        */
-       if (atomic_read(&nr_wb_congested[sync]) == 0) {
-               cond_resched();
-
-               /* In case we scheduled, work out time remaining */
-               ret = timeout - (jiffies - start);
-               if (ret < 0)
-                       ret = 0;
-
-               goto out;
-       }
-
-       /* Sleep until uncongested or a write happens */
-       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
-       ret = io_schedule_timeout(timeout);
-       finish_wait(wqh, &wait);
-
-out:
-       trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
-                                       jiffies_to_usecs(jiffies - start));
-
-       return ret;
-}
-EXPORT_SYMBOL(wait_iff_congested);
index 995e15480937fee8ceb6e88d0b78b21be8d6a5eb..bc9ca8f3c4871e3cba6ec0b6186c65de118eb932 100644 (file)
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -378,7 +378,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
        return 0;
 
 free_mem:
-       memblock_free(base, size);
+       memblock_phys_free(base, size);
 err:
        pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
        return ret;
@@ -524,6 +524,25 @@ out:
        return page;
 }
 
+bool cma_pages_valid(struct cma *cma, const struct page *pages,
+                    unsigned long count)
+{
+       unsigned long pfn;
+
+       if (!cma || !pages)
+               return false;
+
+       pfn = page_to_pfn(pages);
+
+       if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
+               pr_debug("%s(page %p, count %lu)\n", __func__,
+                                               (void *)pages, count);
+               return false;
+       }
+
+       return true;
+}
+
 /**
  * cma_release() - release allocated pages
  * @cma:   Contiguous memory region for which the allocation is performed.
@@ -539,16 +558,13 @@ bool cma_release(struct cma *cma, const struct page *pages,
 {
        unsigned long pfn;
 
-       if (!cma || !pages)
+       if (!cma_pages_valid(cma, pages, count))
                return false;
 
        pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
 
        pfn = page_to_pfn(pages);
 
-       if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
-               return false;
-
        VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
 
        free_contig_range(pfn, count);
index fbc60f964c38fbb2da85e504ac5c26f607ff1512..6e446094ce9028d5faea9a4ece20e78c940850c5 100644 (file)
@@ -761,6 +761,8 @@ isolate_freepages_range(struct compact_control *cc,
 /* Similar to reclaim, but different enough that they don't share logic */
 static bool too_many_isolated(pg_data_t *pgdat)
 {
+       bool too_many;
+
        unsigned long active, inactive, isolated;
 
        inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@@ -770,7 +772,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
        isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
                        node_page_state(pgdat, NR_ISOLATED_ANON);
 
-       return isolated > (inactive + active) / 2;
+       too_many = isolated > (inactive + active) / 2;
+       if (!too_many)
+               wake_throttle_isolated(pgdat);
+
+       return too_many;
 }
 
 /**
@@ -822,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                if (cc->mode == MIGRATE_ASYNC)
                        return -EAGAIN;
 
-               congestion_wait(BLK_RW_ASYNC, HZ/10);
+               reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
 
                if (fatal_signal_pending(current))
                        return -EINTR;
index 37024798a97caf0b33941e6632ca88cce133e03f..5bcf05851ad078305362f897d958e2d3f3dd0167 100644 (file)
@@ -30,7 +30,15 @@ config DAMON_VADDR
        select PAGE_IDLE_FLAG
        help
          This builds the default data access monitoring primitives for DAMON
-         that works for virtual address spaces.
+         that work for virtual address spaces.
+
+config DAMON_PADDR
+       bool "Data access monitoring primitives for the physical address space"
+       depends on DAMON && MMU
+       select PAGE_IDLE_FLAG
+       help
+         This builds the default data access monitoring primitives for DAMON
+         that work for the physical address space.
 
 config DAMON_VADDR_KUNIT_TEST
        bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
@@ -46,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST
 
 config DAMON_DBGFS
        bool "DAMON debugfs interface"
-       depends on DAMON_VADDR && DEBUG_FS
+       depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
        help
          This builds the debugfs interface for DAMON.  The user space admins
          can use the interface for arbitrary data access monitoring.
@@ -65,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST
 
          If unsure, say N.
 
+config DAMON_RECLAIM
+       bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
+       depends on DAMON_PADDR
+       help
+         This builds the DAMON-based reclamation subsystem.  It finds pages
+         that have not been accessed for a long time (cold) using DAMON and
+         reclaims them.
+
+         This is suggested to be used as a proactive and lightweight
+         reclamation under light memory pressure, while the traditional page
+         scanning-based reclamation is used for heavy pressure.
+
 endmenu
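
As a concrete (hypothetical, not part of this patch) illustration of what a
DAMON-based reclamation scheme looks like in terms of the interfaces added to
mm/damon/core.c below: page out any region that recorded zero accesses for at
least 120 aggregation intervals, spend at most 10ms or 128MiB of work per
second on it, and do so only while free memory is scarce. All numbers here
are illustrative, not defaults of DAMON_RECLAIM.

	static struct damos *cold_pageout_scheme(void)
	{
		struct damos_quota quota = {
			.ms = 10,			/* up to 10ms of work ... */
			.sz = 128 * 1024 * 1024,	/* ... or 128 MiB charged */
			.reset_interval = 1000,		/* per 1000ms window */
			.weight_age = 1,		/* prefer colder regions */
		};
		struct damos_watermarks wmarks = {
			.metric = DAMOS_WMARK_FREE_MEM_RATE,
			.interval = 5000000,	/* recheck every 5s (in us) */
			.high = 500,	/* deactivate above 50.0% free memory */
			.mid = 300,	/* activate below 30.0% */
			.low = 200,	/* give up below 20.0% */
		};

		/* any size, zero accesses, aged >= 120 aggregation intervals */
		return damon_new_scheme(0, ULONG_MAX, 0, 0, 120, UINT_MAX,
					DAMOS_PAGEOUT, &quota, &wmarks);
	}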
index fed4be3bace3e548b1c89e4563294cf09956346e..f7d5ac377a2bb5551e9fe60882b961825d4f492f 100644 (file)
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_DAMON)            := core.o
-obj-$(CONFIG_DAMON_VADDR)      += vaddr.o
+obj-$(CONFIG_DAMON_VADDR)      += prmtv-common.o vaddr.o
+obj-$(CONFIG_DAMON_PADDR)      += prmtv-common.o paddr.o
 obj-$(CONFIG_DAMON_DBGFS)      += dbgfs.o
+obj-$(CONFIG_DAMON_RECLAIM)    += reclaim.o
index 30e9211f494a789e04a759464cff667c4f36543d..c381b3c525d0bd23739012b081a311a10ec00fd6 100644 (file)
 #include <linux/damon.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/mm.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/damon.h>
@@ -45,6 +47,9 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
        region->nr_accesses = 0;
        INIT_LIST_HEAD(&region->list);
 
+       region->age = 0;
+       region->last_nr_accesses = 0;
+
        return region;
 }
 
@@ -82,6 +87,74 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t)
        damon_free_region(r);
 }
 
+struct damos *damon_new_scheme(
+               unsigned long min_sz_region, unsigned long max_sz_region,
+               unsigned int min_nr_accesses, unsigned int max_nr_accesses,
+               unsigned int min_age_region, unsigned int max_age_region,
+               enum damos_action action, struct damos_quota *quota,
+               struct damos_watermarks *wmarks)
+{
+       struct damos *scheme;
+
+       scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
+       if (!scheme)
+               return NULL;
+       scheme->min_sz_region = min_sz_region;
+       scheme->max_sz_region = max_sz_region;
+       scheme->min_nr_accesses = min_nr_accesses;
+       scheme->max_nr_accesses = max_nr_accesses;
+       scheme->min_age_region = min_age_region;
+       scheme->max_age_region = max_age_region;
+       scheme->action = action;
+       scheme->stat_count = 0;
+       scheme->stat_sz = 0;
+       INIT_LIST_HEAD(&scheme->list);
+
+       scheme->quota.ms = quota->ms;
+       scheme->quota.sz = quota->sz;
+       scheme->quota.reset_interval = quota->reset_interval;
+       scheme->quota.weight_sz = quota->weight_sz;
+       scheme->quota.weight_nr_accesses = quota->weight_nr_accesses;
+       scheme->quota.weight_age = quota->weight_age;
+       scheme->quota.total_charged_sz = 0;
+       scheme->quota.total_charged_ns = 0;
+       scheme->quota.esz = 0;
+       scheme->quota.charged_sz = 0;
+       scheme->quota.charged_from = 0;
+       scheme->quota.charge_target_from = NULL;
+       scheme->quota.charge_addr_from = 0;
+
+       scheme->wmarks.metric = wmarks->metric;
+       scheme->wmarks.interval = wmarks->interval;
+       scheme->wmarks.high = wmarks->high;
+       scheme->wmarks.mid = wmarks->mid;
+       scheme->wmarks.low = wmarks->low;
+       scheme->wmarks.activated = true;
+
+       return scheme;
+}
+
+void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
+{
+       list_add_tail(&s->list, &ctx->schemes);
+}
+
+static void damon_del_scheme(struct damos *s)
+{
+       list_del(&s->list);
+}
+
+static void damon_free_scheme(struct damos *s)
+{
+       kfree(s);
+}
+
+void damon_destroy_scheme(struct damos *s)
+{
+       damon_del_scheme(s);
+       damon_free_scheme(s);
+}
+
 /*
  * Construct a damon_target struct
  *
@@ -107,6 +180,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t)
        list_add_tail(&t->list, &ctx->adaptive_targets);
 }
 
+bool damon_targets_empty(struct damon_ctx *ctx)
+{
+       return list_empty(&ctx->adaptive_targets);
+}
+
 static void damon_del_target(struct damon_target *t)
 {
        list_del(&t->list);
@@ -153,6 +231,7 @@ struct damon_ctx *damon_new_ctx(void)
        ctx->max_nr_regions = 1000;
 
        INIT_LIST_HEAD(&ctx->adaptive_targets);
+       INIT_LIST_HEAD(&ctx->schemes);
 
        return ctx;
 }
@@ -172,7 +251,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
 
 void damon_destroy_ctx(struct damon_ctx *ctx)
 {
+       struct damos *s, *next_s;
+
        damon_destroy_targets(ctx);
+
+       damon_for_each_scheme_safe(s, next_s, ctx)
+               damon_destroy_scheme(s);
+
        kfree(ctx);
 }
 
@@ -247,6 +332,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
        return 0;
 }
 
+/**
+ * damon_set_schemes() - Set data access monitoring based operation schemes.
+ * @ctx:       monitoring context
+ * @schemes:   array of the schemes
+ * @nr_schemes:        number of entries in @schemes
+ *
+ * This function should not be called while the kdamond of the context is
+ * running.
+ *
+ * Return: 0 if success, or negative error code otherwise.
+ */
+int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
+                       ssize_t nr_schemes)
+{
+       struct damos *s, *next;
+       ssize_t i;
+
+       damon_for_each_scheme_safe(s, next, ctx)
+               damon_destroy_scheme(s);
+       for (i = 0; i < nr_schemes; i++)
+               damon_add_scheme(ctx, schemes[i]);
+       return 0;
+}
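
Ownership note, with a minimal usage sketch: damon_set_schemes() first
destroys whatever schemes the context already holds, then links the new ones,
and damon_destroy_ctx() later frees them together with the context, so the
caller must not free an installed scheme itself (the helper below is
hypothetical):

	/* install a single scheme on an idle context */
	static int install_one_scheme(struct damon_ctx *ctx, struct damos *s)
	{
		struct damos *schemes[] = { s };

		if (!s)
			return -ENOMEM;
		/* per the kernel-doc above: kdamond must not be running */
		return damon_set_schemes(ctx, schemes, 1);
	}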
+
 /**
  * damon_nr_running_ctxs() - Return number of currently running contexts.
  */
@@ -281,17 +390,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
        return sz;
 }
 
-static bool damon_kdamond_running(struct damon_ctx *ctx)
-{
-       bool running;
-
-       mutex_lock(&ctx->kdamond_lock);
-       running = ctx->kdamond != NULL;
-       mutex_unlock(&ctx->kdamond_lock);
-
-       return running;
-}
-
 static int kdamond_fn(void *data);
 
 /*
@@ -309,12 +407,11 @@ static int __damon_start(struct damon_ctx *ctx)
        mutex_lock(&ctx->kdamond_lock);
        if (!ctx->kdamond) {
                err = 0;
-               ctx->kdamond_stop = false;
                ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
                                nr_running_ctxs);
                if (IS_ERR(ctx->kdamond)) {
                        err = PTR_ERR(ctx->kdamond);
-                       ctx->kdamond = 0;
+                       ctx->kdamond = NULL;
                }
        }
        mutex_unlock(&ctx->kdamond_lock);
@@ -365,13 +462,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
  */
 static int __damon_stop(struct damon_ctx *ctx)
 {
+       struct task_struct *tsk;
+
        mutex_lock(&ctx->kdamond_lock);
-       if (ctx->kdamond) {
-               ctx->kdamond_stop = true;
+       tsk = ctx->kdamond;
+       if (tsk) {
+               get_task_struct(tsk);
                mutex_unlock(&ctx->kdamond_lock);
-               while (damon_kdamond_running(ctx))
-                       usleep_range(ctx->sample_interval,
-                                       ctx->sample_interval * 2);
+               kthread_stop(tsk);
+               put_task_struct(tsk);
                return 0;
        }
        mutex_unlock(&ctx->kdamond_lock);
@@ -444,11 +543,203 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
 
                damon_for_each_region(r, t) {
                        trace_damon_aggregated(t, r, damon_nr_regions(t));
+                       r->last_nr_accesses = r->nr_accesses;
                        r->nr_accesses = 0;
                }
        }
 }
 
+static void damon_split_region_at(struct damon_ctx *ctx,
+               struct damon_target *t, struct damon_region *r,
+               unsigned long sz_r);
+
+static bool __damos_valid_target(struct damon_region *r, struct damos *s)
+{
+       unsigned long sz;
+
+       sz = r->ar.end - r->ar.start;
+       return s->min_sz_region <= sz && sz <= s->max_sz_region &&
+               s->min_nr_accesses <= r->nr_accesses &&
+               r->nr_accesses <= s->max_nr_accesses &&
+               s->min_age_region <= r->age && r->age <= s->max_age_region;
+}
+
+static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
+               struct damon_region *r, struct damos *s)
+{
+       bool ret = __damos_valid_target(r, s);
+
+       if (!ret || !s->quota.esz || !c->primitive.get_scheme_score)
+               return ret;
+
+       return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score;
+}
+
+static void damon_do_apply_schemes(struct damon_ctx *c,
+                                  struct damon_target *t,
+                                  struct damon_region *r)
+{
+       struct damos *s;
+
+       damon_for_each_scheme(s, c) {
+               struct damos_quota *quota = &s->quota;
+               unsigned long sz = r->ar.end - r->ar.start;
+               struct timespec64 begin, end;
+
+               if (!s->wmarks.activated)
+                       continue;
+
+               /* Check the quota */
+               if (quota->esz && quota->charged_sz >= quota->esz)
+                       continue;
+
+               /* Skip previously charged regions */
+               if (quota->charge_target_from) {
+                       if (t != quota->charge_target_from)
+                               continue;
+                       if (r == damon_last_region(t)) {
+                               quota->charge_target_from = NULL;
+                               quota->charge_addr_from = 0;
+                               continue;
+                       }
+                       if (quota->charge_addr_from &&
+                                       r->ar.end <= quota->charge_addr_from)
+                               continue;
+
+                       if (quota->charge_addr_from && r->ar.start <
+                                       quota->charge_addr_from) {
+                               sz = ALIGN_DOWN(quota->charge_addr_from -
+                                               r->ar.start, DAMON_MIN_REGION);
+                               if (!sz) {
+                                       if (r->ar.end - r->ar.start <=
+                                                       DAMON_MIN_REGION)
+                                               continue;
+                                       sz = DAMON_MIN_REGION;
+                               }
+                               damon_split_region_at(c, t, r, sz);
+                               r = damon_next_region(r);
+                               sz = r->ar.end - r->ar.start;
+                       }
+                       quota->charge_target_from = NULL;
+                       quota->charge_addr_from = 0;
+               }
+
+               if (!damos_valid_target(c, t, r, s))
+                       continue;
+
+               /* Apply the scheme */
+               if (c->primitive.apply_scheme) {
+                       if (quota->esz &&
+                                       quota->charged_sz + sz > quota->esz) {
+                               sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
+                                               DAMON_MIN_REGION);
+                               if (!sz)
+                                       goto update_stat;
+                               damon_split_region_at(c, t, r, sz);
+                       }
+                       ktime_get_coarse_ts64(&begin);
+                       c->primitive.apply_scheme(c, t, r, s);
+                       ktime_get_coarse_ts64(&end);
+                       quota->total_charged_ns += timespec64_to_ns(&end) -
+                               timespec64_to_ns(&begin);
+                       quota->charged_sz += sz;
+                       if (quota->esz && quota->charged_sz >= quota->esz) {
+                               quota->charge_target_from = t;
+                               quota->charge_addr_from = r->ar.end + 1;
+                       }
+               }
+               if (s->action != DAMOS_STAT)
+                       r->age = 0;
+
+update_stat:
+               s->stat_count++;
+               s->stat_sz += sz;
+       }
+}
+
+/* Shouldn't be called if quota->ms and quota->sz are zero */
+static void damos_set_effective_quota(struct damos_quota *quota)
+{
+       unsigned long throughput;
+       unsigned long esz;
+
+       if (!quota->ms) {
+               quota->esz = quota->sz;
+               return;
+       }
+
+       if (quota->total_charged_ns)
+               throughput = quota->total_charged_sz * 1000000 /
+                       quota->total_charged_ns;
+       else
+               throughput = PAGE_SIZE * 1024;
+       esz = throughput * quota->ms;
+
+       if (quota->sz && quota->sz < esz)
+               esz = quota->sz;
+       quota->esz = esz;
+}
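
For concreteness about the units: total_charged_sz is in bytes and
total_charged_ns in nanoseconds, so throughput works out to bytes per
millisecond. If 10 MiB (10485760 bytes) were charged over 100ms (10^8 ns),
throughput = 10485760 * 1000000 / 100000000 = 104857 bytes/ms; with
quota->ms = 10, the time quota alone yields an esz of roughly 1 MiB per
reset_interval, and a nonzero quota->sz caps it further. Before anything has
been charged, the code assumes PAGE_SIZE * 1024 bytes/ms (4 MiB/ms with 4 KiB
pages) as the initial estimate.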
+
+static void kdamond_apply_schemes(struct damon_ctx *c)
+{
+       struct damon_target *t;
+       struct damon_region *r, *next_r;
+       struct damos *s;
+
+       damon_for_each_scheme(s, c) {
+               struct damos_quota *quota = &s->quota;
+               unsigned long cumulated_sz;
+               unsigned int score, max_score = 0;
+
+               if (!s->wmarks.activated)
+                       continue;
+
+               if (!quota->ms && !quota->sz)
+                       continue;
+
+               /* New charge window starts */
+               if (time_after_eq(jiffies, quota->charged_from +
+                                       msecs_to_jiffies(
+                                               quota->reset_interval))) {
+                       quota->total_charged_sz += quota->charged_sz;
+                       quota->charged_from = jiffies;
+                       quota->charged_sz = 0;
+                       damos_set_effective_quota(quota);
+               }
+
+               if (!c->primitive.get_scheme_score)
+                       continue;
+
+               /* Fill up the score histogram */
+               memset(quota->histogram, 0, sizeof(quota->histogram));
+               damon_for_each_target(t, c) {
+                       damon_for_each_region(r, t) {
+                               if (!__damos_valid_target(r, s))
+                                       continue;
+                               score = c->primitive.get_scheme_score(
+                                               c, t, r, s);
+                               quota->histogram[score] +=
+                                       r->ar.end - r->ar.start;
+                               if (score > max_score)
+                                       max_score = score;
+                       }
+               }
+
+               /* Set the min score limit */
+               for (cumulated_sz = 0, score = max_score; ; score--) {
+                       cumulated_sz += quota->histogram[score];
+                       if (cumulated_sz >= quota->esz || !score)
+                               break;
+               }
+               quota->min_score = score;
+       }
+
+       damon_for_each_target(t, c) {
+               damon_for_each_region_safe(r, next_r, t)
+                       damon_do_apply_schemes(c, t, r);
+       }
+}
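
A worked example of the min_score selection: suppose, for one scheme, the
regions passing __damos_valid_target() total 8 MiB at score 90, 16 MiB at
score 70 and 40 MiB at score 50, and the effective quota esz is 20 MiB.
Walking down from max_score, cumulated_sz reaches 8 MiB at score 90 and
24 MiB at score 70, which meets the quota, so quota->min_score becomes 70 and
only regions scoring at least 70 pass damos_valid_target() during this charge
window.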
+
 #define sz_damon_region(r) (r->ar.end - r->ar.start)
 
 /*
@@ -461,6 +752,7 @@ static void damon_merge_two_regions(struct damon_target *t,
 
        l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
                        (sz_l + sz_r);
+       l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
        l->ar.end = r->ar.end;
        damon_destroy_region(r, t);
 }
@@ -480,6 +772,11 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
        struct damon_region *r, *prev = NULL, *next;
 
        damon_for_each_region_safe(r, next, t) {
+               if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres)
+                       r->age = 0;
+               else
+                       r->age++;
+
                if (prev && prev->ar.end == r->ar.start &&
                    diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
                    sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
@@ -527,6 +824,9 @@ static void damon_split_region_at(struct damon_ctx *ctx,
 
        r->ar.end = new->ar.start;
 
+       new->age = r->age;
+       new->last_nr_accesses = r->last_nr_accesses;
+
        damon_insert_region(new, r, damon_next_region(r), t);
 }
 
@@ -615,12 +915,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
 static bool kdamond_need_stop(struct damon_ctx *ctx)
 {
        struct damon_target *t;
-       bool stop;
 
-       mutex_lock(&ctx->kdamond_lock);
-       stop = ctx->kdamond_stop;
-       mutex_unlock(&ctx->kdamond_lock);
-       if (stop)
+       if (kthread_should_stop())
                return true;
 
        if (!ctx->primitive.target_valid)
@@ -634,11 +930,81 @@ static bool kdamond_need_stop(struct damon_ctx *ctx)
        return true;
 }
 
-static void set_kdamond_stop(struct damon_ctx *ctx)
+static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
 {
-       mutex_lock(&ctx->kdamond_lock);
-       ctx->kdamond_stop = true;
-       mutex_unlock(&ctx->kdamond_lock);
+       struct sysinfo i;
+
+       switch (metric) {
+       case DAMOS_WMARK_FREE_MEM_RATE:
+               si_meminfo(&i);
+               return i.freeram * 1000 / i.totalram;
+       default:
+               break;
+       }
+       return -EINVAL;
+}
+
+/*
+ * Returns zero if the scheme is active.  Else, returns the time to wait
+ * before the next watermark check, in microseconds.
+ */
+static unsigned long damos_wmark_wait_us(struct damos *scheme)
+{
+       unsigned long metric;
+
+       if (scheme->wmarks.metric == DAMOS_WMARK_NONE)
+               return 0;
+
+       metric = damos_wmark_metric_value(scheme->wmarks.metric);
+       /* higher than high watermark or lower than low watermark */
+       if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
+               if (scheme->wmarks.activated)
+                       pr_debug("deactivate a scheme (%d) for %s wmark\n",
+                                       scheme->action,
+                                       metric > scheme->wmarks.high ?
+                                       "high" : "low");
+               scheme->wmarks.activated = false;
+               return scheme->wmarks.interval;
+       }
+
+       /* inactive and higher than middle watermark */
+       if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) &&
+                       !scheme->wmarks.activated)
+               return scheme->wmarks.interval;
+
+       if (!scheme->wmarks.activated)
+               pr_debug("activate a scheme (%d)\n", scheme->action);
+       scheme->wmarks.activated = true;
+       return 0;
+}
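
The three watermarks form a hysteresis band. With DAMOS_WMARK_FREE_MEM_RATE
the metric is free memory in permille, so with high = 500, mid = 300 and
low = 200: above 50.0% free (or below 20.0%) the scheme is deactivated and
rechecked every wmarks.interval microseconds; an inactive scheme stays
inactive anywhere between 30.0% and 50.0%, and only activates once free
memory falls below the mid watermark; an active scheme then keeps running
until the metric leaves the 20.0%..50.0% band again.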
+
+static void kdamond_usleep(unsigned long usecs)
+{
+       if (usecs > 100 * 1000)
+               schedule_timeout_interruptible(usecs_to_jiffies(usecs));
+       else
+               usleep_range(usecs, usecs + 1);
+}
+
+/*
+ * Returns a negative error code if kdamond should stop while waiting for a
+ * scheme to be activated.
+ */
+static int kdamond_wait_activation(struct damon_ctx *ctx)
+{
+       struct damos *s;
+       unsigned long wait_time;
+       unsigned long min_wait_time = 0;
+
+       while (!kdamond_need_stop(ctx)) {
+               damon_for_each_scheme(s, ctx) {
+                       wait_time = damos_wmark_wait_us(s);
+                       if (!min_wait_time || wait_time < min_wait_time)
+                               min_wait_time = wait_time;
+               }
+               if (!min_wait_time)
+                       return 0;
+
+               kdamond_usleep(min_wait_time);
+       }
+       return -EBUSY;
 }
 
 /*
@@ -651,24 +1017,26 @@ static int kdamond_fn(void *data)
        struct damon_region *r, *next;
        unsigned int max_nr_accesses = 0;
        unsigned long sz_limit = 0;
+       bool done = false;
 
-       mutex_lock(&ctx->kdamond_lock);
-       pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
-       mutex_unlock(&ctx->kdamond_lock);
+       pr_debug("kdamond (%d) starts\n", current->pid);
 
        if (ctx->primitive.init)
                ctx->primitive.init(ctx);
        if (ctx->callback.before_start && ctx->callback.before_start(ctx))
-               set_kdamond_stop(ctx);
+               done = true;
 
        sz_limit = damon_region_sz_limit(ctx);
 
-       while (!kdamond_need_stop(ctx)) {
+       while (!kdamond_need_stop(ctx) && !done) {
+               if (kdamond_wait_activation(ctx))
+                       continue;
+
                if (ctx->primitive.prepare_access_checks)
                        ctx->primitive.prepare_access_checks(ctx);
                if (ctx->callback.after_sampling &&
                                ctx->callback.after_sampling(ctx))
-                       set_kdamond_stop(ctx);
+                       done = true;
 
                usleep_range(ctx->sample_interval, ctx->sample_interval + 1);
 
@@ -681,7 +1049,8 @@ static int kdamond_fn(void *data)
                                        sz_limit);
                        if (ctx->callback.after_aggregation &&
                                        ctx->callback.after_aggregation(ctx))
-                               set_kdamond_stop(ctx);
+                               done = true;
+                       kdamond_apply_schemes(ctx);
                        kdamond_reset_aggregated(ctx);
                        kdamond_split_regions(ctx);
                        if (ctx->primitive.reset_aggregated)
@@ -699,13 +1068,12 @@ static int kdamond_fn(void *data)
                        damon_destroy_region(r, t);
        }
 
-       if (ctx->callback.before_terminate &&
-                       ctx->callback.before_terminate(ctx))
-               set_kdamond_stop(ctx);
+       if (ctx->callback.before_terminate)
+               ctx->callback.before_terminate(ctx);
        if (ctx->primitive.cleanup)
                ctx->primitive.cleanup(ctx);
 
-       pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
+       pr_debug("kdamond (%d) finishes\n", current->pid);
        mutex_lock(&ctx->kdamond_lock);
        ctx->kdamond = NULL;
        mutex_unlock(&ctx->kdamond_lock);
@@ -714,7 +1082,7 @@ static int kdamond_fn(void *data)
        nr_running_ctxs--;
        mutex_unlock(&damon_lock);
 
-       do_exit(0);
+       return 0;
 }
 
 #include "core-test.h"
index 4eddcfa73996f24672344652cbb087e1a50bc58a..86b9f9528231efb52bfe1b791aaaffeb01315492 100644 (file)
@@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
        dbgfs_destroy_ctx(ctx);
 }
 
+static void damon_dbgfs_test_set_init_regions(struct kunit *test)
+{
+       struct damon_ctx *ctx = damon_new_ctx();
+       unsigned long ids[] = {1, 2, 3};
+       /* Each line represents one region in ``<target id> <start> <end>`` */
+       char * const valid_inputs[] = {"2 10 20\n 2   20 30\n2 35 45",
+               "2 10 20\n",
+               "2 10 20\n1 39 59\n1 70 134\n  2  20 25\n",
+               ""};
+       /* Reading the file again will show sorted, clean output */
+       char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n",
+               "2 10 20\n",
+               "1 39 59\n1 70 134\n2 10 20\n2 20 25\n",
+               ""};
+       char * const invalid_inputs[] = {"4 10 20\n",   /* target not exists */
+               "2 10 20\n 2 14 26\n",          /* regions overlap */
+               "1 10 20\n2 30 40\n 1 5 8"};    /* not sorted by address */
+       char *input, *expect;
+       int i, rc;
+       char buf[256];
+
+       damon_set_targets(ctx, ids, 3);
+
+       /* Put valid inputs and check the results */
+       for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
+               input = valid_inputs[i];
+               expect = valid_expects[i];
+
+               rc = set_init_regions(ctx, input, strnlen(input, 256));
+               KUNIT_EXPECT_EQ(test, rc, 0);
+
+               memset(buf, 0, 256);
+               sprint_init_regions(ctx, buf, 256);
+
+               KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
+       }
+       /* Put invalid inputs and check the return error code */
+       for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
+               input = invalid_inputs[i];
+               pr_info("input: %s\n", input);
+               rc = set_init_regions(ctx, input, strnlen(input, 256));
+               KUNIT_EXPECT_EQ(test, rc, -EINVAL);
+
+               memset(buf, 0, 256);
+               sprint_init_regions(ctx, buf, 256);
+
+               KUNIT_EXPECT_STREQ(test, (char *)buf, "");
+       }
+
+       damon_set_targets(ctx, NULL, 0);
+       damon_destroy_ctx(ctx);
+}
+
 static struct kunit_case damon_test_cases[] = {
        KUNIT_CASE(damon_dbgfs_test_str_to_target_ids),
        KUNIT_CASE(damon_dbgfs_test_set_targets),
+       KUNIT_CASE(damon_dbgfs_test_set_init_regions),
        {},
 };
 
index faee070977d80d4efb0bc9e5a42be9b1b267618d..eccc14b3490132baef9169bbefc5f6349e10ea35 100644 (file)
@@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
        struct damon_ctx *ctx = file->private_data;
        unsigned long s, a, r, minr, maxr;
        char *kbuf;
-       ssize_t ret = count;
-       int err;
+       ssize_t ret;
 
        kbuf = user_input_str(buf, count, ppos);
        if (IS_ERR(kbuf))
@@ -88,11 +87,182 @@ static ssize_t dbgfs_attrs_write(struct file *file,
                goto unlock_out;
        }
 
-       err = damon_set_attrs(ctx, s, a, r, minr, maxr);
-       if (err)
-               ret = err;
+       ret = damon_set_attrs(ctx, s, a, r, minr, maxr);
+       if (!ret)
+               ret = count;
+unlock_out:
+       mutex_unlock(&ctx->kdamond_lock);
+out:
+       kfree(kbuf);
+       return ret;
+}
+
+static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
+{
+       struct damos *s;
+       int written = 0;
+       int rc;
+
+       damon_for_each_scheme(s, c) {
+               rc = scnprintf(&buf[written], len - written,
+                               "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n",
+                               s->min_sz_region, s->max_sz_region,
+                               s->min_nr_accesses, s->max_nr_accesses,
+                               s->min_age_region, s->max_age_region,
+                               s->action,
+                               s->quota.ms, s->quota.sz,
+                               s->quota.reset_interval,
+                               s->quota.weight_sz,
+                               s->quota.weight_nr_accesses,
+                               s->quota.weight_age,
+                               s->wmarks.metric, s->wmarks.interval,
+                               s->wmarks.high, s->wmarks.mid, s->wmarks.low,
+                               s->stat_count, s->stat_sz);
+               if (!rc)
+                       return -ENOMEM;
+
+               written += rc;
+       }
+       return written;
+}
+
+static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
+               size_t count, loff_t *ppos)
+{
+       struct damon_ctx *ctx = file->private_data;
+       char *kbuf;
+       ssize_t len;
+
+       kbuf = kmalloc(count, GFP_KERNEL);
+       if (!kbuf)
+               return -ENOMEM;
+
+       mutex_lock(&ctx->kdamond_lock);
+       len = sprint_schemes(ctx, kbuf, count);
+       mutex_unlock(&ctx->kdamond_lock);
+       if (len < 0)
+               goto out;
+       len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+       kfree(kbuf);
+       return len;
+}
+
+static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
+{
+       ssize_t i;
+
+       for (i = 0; i < nr_schemes; i++)
+               kfree(schemes[i]);
+       kfree(schemes);
+}
+
+static bool damos_action_valid(int action)
+{
+       switch (action) {
+       case DAMOS_WILLNEED:
+       case DAMOS_COLD:
+       case DAMOS_PAGEOUT:
+       case DAMOS_HUGEPAGE:
+       case DAMOS_NOHUGEPAGE:
+       case DAMOS_STAT:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/*
+ * Converts a string into an array of struct damos pointers
+ *
+ * Returns an array of struct damos pointers converted from the string if the
+ * conversion succeeds, or NULL otherwise.
+ */
+static struct damos **str_to_schemes(const char *str, ssize_t len,
+                               ssize_t *nr_schemes)
+{
+       struct damos *scheme, **schemes;
+       const int max_nr_schemes = 256;
+       int pos = 0, parsed, ret;
+       unsigned long min_sz, max_sz;
+       unsigned int min_nr_a, max_nr_a, min_age, max_age;
+       unsigned int action;
+
+       schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
+                       GFP_KERNEL);
+       if (!schemes)
+               return NULL;
+
+       *nr_schemes = 0;
+       while (pos < len && *nr_schemes < max_nr_schemes) {
+               struct damos_quota quota = {};
+               struct damos_watermarks wmarks;
+
+               ret = sscanf(&str[pos],
+                               "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
+                               &min_sz, &max_sz, &min_nr_a, &max_nr_a,
+                               &min_age, &max_age, &action, &quota.ms,
+                               &quota.sz, &quota.reset_interval,
+                               &quota.weight_sz, &quota.weight_nr_accesses,
+                               &quota.weight_age, &wmarks.metric,
+                               &wmarks.interval, &wmarks.high, &wmarks.mid,
+                               &wmarks.low, &parsed);
+               if (ret != 18)
+                       break;
+               if (!damos_action_valid(action)) {
+                       pr_err("wrong action %d\n", action);
+                       goto fail;
+               }
+
+               pos += parsed;
+               scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
+                               min_age, max_age, action, &quota, &wmarks);
+               if (!scheme)
+                       goto fail;
+
+               schemes[*nr_schemes] = scheme;
+               *nr_schemes += 1;
+       }
+       return schemes;
+fail:
+       free_schemes_arr(schemes, *nr_schemes);
+       return NULL;
+}
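
Each line written to the "schemes" debugfs file thus carries 18 fields, in
the sscanf() order above: min_sz max_sz min_nr_accesses max_nr_accesses
min_age max_age action quota.ms quota.sz quota.reset_interval
quota.weight_sz quota.weight_nr_accesses quota.weight_age wmarks.metric
wmarks.interval wmarks.high wmarks.mid wmarks.low. An illustrative line (the
action and metric values are numeric enum positions; check
include/linux/damon.h for the actual ordering):

	4096 18446744073709551615 0 0 120 4294967295 2 10 134217728 1000 0 0 1 1 5000000 500 300 200

which, assuming action 2 is DAMOS_PAGEOUT and metric 1 is
DAMOS_WMARK_FREE_MEM_RATE, encodes the same cold-page-out policy sketched
next to the DAMON_RECLAIM Kconfig entry earlier.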
+
+static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
+               size_t count, loff_t *ppos)
+{
+       struct damon_ctx *ctx = file->private_data;
+       char *kbuf;
+       struct damos **schemes;
+       ssize_t nr_schemes = 0, ret;
+
+       kbuf = user_input_str(buf, count, ppos);
+       if (IS_ERR(kbuf))
+               return PTR_ERR(kbuf);
+
+       schemes = str_to_schemes(kbuf, count, &nr_schemes);
+       if (!schemes) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       mutex_lock(&ctx->kdamond_lock);
+       if (ctx->kdamond) {
+               ret = -EBUSY;
+               goto unlock_out;
+       }
+
+       ret = damon_set_schemes(ctx, schemes, nr_schemes);
+       if (!ret) {
+               ret = count;
+               nr_schemes = 0;
+       }
+
 unlock_out:
        mutex_unlock(&ctx->kdamond_lock);
+       free_schemes_arr(schemes, nr_schemes);
 out:
        kfree(kbuf);
        return ret;
@@ -185,26 +355,31 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
                const char __user *buf, size_t count, loff_t *ppos)
 {
        struct damon_ctx *ctx = file->private_data;
+       bool id_is_pid = true;
        char *kbuf, *nrs;
        unsigned long *targets;
        ssize_t nr_targets;
-       ssize_t ret = count;
+       ssize_t ret;
        int i;
-       int err;
 
        kbuf = user_input_str(buf, count, ppos);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);
 
        nrs = kbuf;
+       if (!strncmp(kbuf, "paddr\n", count)) {
+               id_is_pid = false;
+               /* target id is meaningless here, but we set it just for fun */
+               scnprintf(kbuf, count, "42    ");
+       }
 
-       targets = str_to_target_ids(nrs, ret, &nr_targets);
+       targets = str_to_target_ids(nrs, count, &nr_targets);
        if (!targets) {
                ret = -ENOMEM;
                goto out;
        }
 
-       if (targetid_is_pid(ctx)) {
+       if (id_is_pid) {
                for (i = 0; i < nr_targets; i++) {
                        targets[i] = (unsigned long)find_get_pid(
                                        (int)targets[i]);
@@ -218,17 +393,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
 
        mutex_lock(&ctx->kdamond_lock);
        if (ctx->kdamond) {
-               if (targetid_is_pid(ctx))
+               if (id_is_pid)
                        dbgfs_put_pids(targets, nr_targets);
                ret = -EBUSY;
                goto unlock_out;
        }
 
-       err = damon_set_targets(ctx, targets, nr_targets);
-       if (err) {
-               if (targetid_is_pid(ctx))
+       /* remove targets with previously-set primitive */
+       damon_set_targets(ctx, NULL, 0);
+
+       /* Configure the context for the address space type */
+       if (id_is_pid)
+               damon_va_set_primitives(ctx);
+       else
+               damon_pa_set_primitives(ctx);
+
+       ret = damon_set_targets(ctx, targets, nr_targets);
+       if (ret) {
+               if (id_is_pid)
                        dbgfs_put_pids(targets, nr_targets);
-               ret = err;
+       } else {
+               ret = count;
        }
 
 unlock_out:
@@ -240,6 +425,152 @@ out:
        return ret;
 }
 
+static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
+{
+       struct damon_target *t;
+       struct damon_region *r;
+       int written = 0;
+       int rc;
+
+       damon_for_each_target(t, c) {
+               damon_for_each_region(r, t) {
+                       rc = scnprintf(&buf[written], len - written,
+                                       "%lu %lu %lu\n",
+                                       t->id, r->ar.start, r->ar.end);
+                       if (!rc)
+                               return -ENOMEM;
+                       written += rc;
+               }
+       }
+       return written;
+}
+
+static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
+               size_t count, loff_t *ppos)
+{
+       struct damon_ctx *ctx = file->private_data;
+       char *kbuf;
+       ssize_t len;
+
+       kbuf = kmalloc(count, GFP_KERNEL);
+       if (!kbuf)
+               return -ENOMEM;
+
+       mutex_lock(&ctx->kdamond_lock);
+       if (ctx->kdamond) {
+               mutex_unlock(&ctx->kdamond_lock);
+               len = -EBUSY;
+               goto out;
+       }
+
+       len = sprint_init_regions(ctx, kbuf, count);
+       mutex_unlock(&ctx->kdamond_lock);
+       if (len < 0)
+               goto out;
+       len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+       kfree(kbuf);
+       return len;
+}
+
+static int add_init_region(struct damon_ctx *c,
+                        unsigned long target_id, struct damon_addr_range *ar)
+{
+       struct damon_target *t;
+       struct damon_region *r, *prev;
+       unsigned long id;
+       int rc = -EINVAL;
+
+       if (ar->start >= ar->end)
+               return -EINVAL;
+
+       damon_for_each_target(t, c) {
+               id = t->id;
+               if (targetid_is_pid(c))
+                       id = (unsigned long)pid_vnr((struct pid *)id);
+               if (id == target_id) {
+                       r = damon_new_region(ar->start, ar->end);
+                       if (!r)
+                               return -ENOMEM;
+                       damon_add_region(r, t);
+                       if (damon_nr_regions(t) > 1) {
+                               prev = damon_prev_region(r);
+                               if (prev->ar.end > r->ar.start) {
+                                       damon_destroy_region(r, t);
+                                       return -EINVAL;
+                               }
+                       }
+                       rc = 0;
+               }
+       }
+       return rc;
+}
+
+static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
+{
+       struct damon_target *t;
+       struct damon_region *r, *next;
+       int pos = 0, parsed, ret;
+       unsigned long target_id;
+       struct damon_addr_range ar;
+       int err;
+
+       damon_for_each_target(t, c) {
+               damon_for_each_region_safe(r, next, t)
+                       damon_destroy_region(r, t);
+       }
+
+       while (pos < len) {
+               ret = sscanf(&str[pos], "%lu %lu %lu%n",
+                               &target_id, &ar.start, &ar.end, &parsed);
+               if (ret != 3)
+                       break;
+               err = add_init_region(c, target_id, &ar);
+               if (err)
+                       goto fail;
+               pos += parsed;
+       }
+
+       return 0;
+
+fail:
+       damon_for_each_target(t, c) {
+               damon_for_each_region_safe(r, next, t)
+                       damon_destroy_region(r, t);
+       }
+       return err;
+}
+
+static ssize_t dbgfs_init_regions_write(struct file *file,
+                                         const char __user *buf, size_t count,
+                                         loff_t *ppos)
+{
+       struct damon_ctx *ctx = file->private_data;
+       char *kbuf;
+       ssize_t ret = count;
+       int err;
+
+       kbuf = user_input_str(buf, count, ppos);
+       if (IS_ERR(kbuf))
+               return PTR_ERR(kbuf);
+
+       mutex_lock(&ctx->kdamond_lock);
+       if (ctx->kdamond) {
+               ret = -EBUSY;
+               goto unlock_out;
+       }
+
+       err = set_init_regions(ctx, kbuf, ret);
+       if (err)
+               ret = err;
+
+unlock_out:
+       mutex_unlock(&ctx->kdamond_lock);
+       kfree(kbuf);
+       return ret;
+}
+
 static ssize_t dbgfs_kdamond_pid_read(struct file *file,
                char __user *buf, size_t count, loff_t *ppos)
 {
@@ -279,12 +610,24 @@ static const struct file_operations attrs_fops = {
        .write = dbgfs_attrs_write,
 };
 
+static const struct file_operations schemes_fops = {
+       .open = damon_dbgfs_open,
+       .read = dbgfs_schemes_read,
+       .write = dbgfs_schemes_write,
+};
+
 static const struct file_operations target_ids_fops = {
        .open = damon_dbgfs_open,
        .read = dbgfs_target_ids_read,
        .write = dbgfs_target_ids_write,
 };
 
+static const struct file_operations init_regions_fops = {
+       .open = damon_dbgfs_open,
+       .read = dbgfs_init_regions_read,
+       .write = dbgfs_init_regions_write,
+};
+
 static const struct file_operations kdamond_pid_fops = {
        .open = damon_dbgfs_open,
        .read = dbgfs_kdamond_pid_read,
@@ -292,28 +635,27 @@ static const struct file_operations kdamond_pid_fops = {
 
 static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
 {
-       const char * const file_names[] = {"attrs", "target_ids",
-               "kdamond_pid"};
-       const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops,
-               &kdamond_pid_fops};
+       const char * const file_names[] = {"attrs", "schemes", "target_ids",
+               "init_regions", "kdamond_pid"};
+       const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
+               &target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
        int i;
 
        for (i = 0; i < ARRAY_SIZE(file_names); i++)
                debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
 }
 
-static int dbgfs_before_terminate(struct damon_ctx *ctx)
+static void dbgfs_before_terminate(struct damon_ctx *ctx)
 {
        struct damon_target *t, *next;
 
        if (!targetid_is_pid(ctx))
-               return 0;
+               return;
 
        damon_for_each_target_safe(t, next, ctx) {
                put_pid((struct pid *)t->id);
                damon_destroy_target(t);
        }
-       return 0;
 }
 
 static struct damon_ctx *dbgfs_new_ctx(void)
@@ -388,8 +730,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
 {
        char *kbuf;
        char *ctx_name;
-       ssize_t ret = count;
-       int err;
+       ssize_t ret;
 
        kbuf = user_input_str(buf, count, ppos);
        if (IS_ERR(kbuf))
@@ -407,9 +748,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
        }
 
        mutex_lock(&damon_dbgfs_lock);
-       err = dbgfs_mk_context(ctx_name);
-       if (err)
-               ret = err;
+       ret = dbgfs_mk_context(ctx_name);
+       if (!ret)
+               ret = count;
        mutex_unlock(&damon_dbgfs_lock);
 
 out:
@@ -478,8 +819,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
                const char __user *buf, size_t count, loff_t *ppos)
 {
        char *kbuf;
-       ssize_t ret = count;
-       int err;
+       ssize_t ret;
        char *ctx_name;
 
        kbuf = user_input_str(buf, count, ppos);
@@ -498,9 +838,9 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
        }
 
        mutex_lock(&damon_dbgfs_lock);
-       err = dbgfs_rm_context(ctx_name);
-       if (err)
-               ret = err;
+       ret = dbgfs_rm_context(ctx_name);
+       if (!ret)
+               ret = count;
        mutex_unlock(&damon_dbgfs_lock);
 
 out:
@@ -524,9 +864,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file,
 static ssize_t dbgfs_monitor_on_write(struct file *file,
                const char __user *buf, size_t count, loff_t *ppos)
 {
-       ssize_t ret = count;
+       ssize_t ret;
        char *kbuf;
-       int err;
 
        kbuf = user_input_str(buf, count, ppos);
        if (IS_ERR(kbuf))
@@ -538,15 +877,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file,
                return -EINVAL;
        }
 
-       if (!strncmp(kbuf, "on", count))
-               err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
-       else if (!strncmp(kbuf, "off", count))
-               err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
-       else
-               err = -EINVAL;
+       if (!strncmp(kbuf, "on", count)) {
+               int i;
 
-       if (err)
-               ret = err;
+               for (i = 0; i < dbgfs_nr_ctxs; i++) {
+                       if (damon_targets_empty(dbgfs_ctxs[i])) {
+                               kfree(kbuf);
+                               return -EINVAL;
+                       }
+               }
+               ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
+       } else if (!strncmp(kbuf, "off", count)) {
+               ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
+       } else {
+               ret = -EINVAL;
+       }
+
+       if (!ret)
+               ret = count;
        kfree(kbuf);
        return ret;
 }
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
new file mode 100644 (file)
index 0000000..a496d6f
--- /dev/null
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON Primitives for The Physical Address Space
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-pa: " fmt
+
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+
+#include "../internal.h"
+#include "prmtv-common.h"
+
+static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
+               unsigned long addr, void *arg)
+{
+       struct page_vma_mapped_walk pvmw = {
+               .page = page,
+               .vma = vma,
+               .address = addr,
+       };
+
+       while (page_vma_mapped_walk(&pvmw)) {
+               addr = pvmw.address;
+               if (pvmw.pte)
+                       damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr);
+               else
+                       damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr);
+       }
+       return true;
+}
+
+static void damon_pa_mkold(unsigned long paddr)
+{
+       struct page *page = damon_get_page(PHYS_PFN(paddr));
+       struct rmap_walk_control rwc = {
+               .rmap_one = __damon_pa_mkold,
+               .anon_lock = page_lock_anon_vma_read,
+       };
+       bool need_lock;
+
+       if (!page)
+               return;
+
+       if (!page_mapped(page) || !page_rmapping(page)) {
+               set_page_idle(page);
+               goto out;
+       }
+
+       need_lock = !PageAnon(page) || PageKsm(page);
+       if (need_lock && !trylock_page(page))
+               goto out;
+
+       rmap_walk(page, &rwc);
+
+       if (need_lock)
+               unlock_page(page);
+
+out:
+       put_page(page);
+}
+
+static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
+                                           struct damon_region *r)
+{
+       r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+
+       damon_pa_mkold(r->sampling_addr);
+}
+
+void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
+{
+       struct damon_target *t;
+       struct damon_region *r;
+
+       damon_for_each_target(t, ctx) {
+               damon_for_each_region(r, t)
+                       __damon_pa_prepare_access_check(ctx, r);
+       }
+}
+
+struct damon_pa_access_chk_result {
+       unsigned long page_sz;
+       bool accessed;
+};
+
+static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
+               unsigned long addr, void *arg)
+{
+       struct damon_pa_access_chk_result *result = arg;
+       struct page_vma_mapped_walk pvmw = {
+               .page = page,
+               .vma = vma,
+               .address = addr,
+       };
+
+       result->accessed = false;
+       result->page_sz = PAGE_SIZE;
+       while (page_vma_mapped_walk(&pvmw)) {
+               addr = pvmw.address;
+               if (pvmw.pte) {
+                       result->accessed = pte_young(*pvmw.pte) ||
+                               !page_is_idle(page) ||
+                               mmu_notifier_test_young(vma->vm_mm, addr);
+               } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+                       result->accessed = pmd_young(*pvmw.pmd) ||
+                               !page_is_idle(page) ||
+                               mmu_notifier_test_young(vma->vm_mm, addr);
+                       result->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
+#else
+                       WARN_ON_ONCE(1);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+               }
+               if (result->accessed) {
+                       page_vma_mapped_walk_done(&pvmw);
+                       break;
+               }
+       }
+
+       /* If accessed, stop walking */
+       return !result->accessed;
+}
+
+static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
+{
+       struct page *page = damon_get_page(PHYS_PFN(paddr));
+       struct damon_pa_access_chk_result result = {
+               .page_sz = PAGE_SIZE,
+               .accessed = false,
+       };
+       struct rmap_walk_control rwc = {
+               .arg = &result,
+               .rmap_one = __damon_pa_young,
+               .anon_lock = page_lock_anon_vma_read,
+       };
+       bool need_lock;
+
+       if (!page)
+               return false;
+
+       if (!page_mapped(page) || !page_rmapping(page)) {
+               if (page_is_idle(page))
+                       result.accessed = false;
+               else
+                       result.accessed = true;
+               put_page(page);
+               goto out;
+       }
+
+       need_lock = !PageAnon(page) || PageKsm(page);
+       if (need_lock && !trylock_page(page)) {
+               put_page(page);
+               return false;
+       }
+
+       rmap_walk(page, &rwc);
+
+       if (need_lock)
+               unlock_page(page);
+       put_page(page);
+
+out:
+       *page_sz = result.page_sz;
+       return result.accessed;
+}
+
+static void __damon_pa_check_access(struct damon_ctx *ctx,
+                                   struct damon_region *r)
+{
+       static unsigned long last_addr;
+       static unsigned long last_page_sz = PAGE_SIZE;
+       static bool last_accessed;
+
+       /* If the region is in the last checked page, reuse the result */
+       if (ALIGN_DOWN(last_addr, last_page_sz) ==
+                               ALIGN_DOWN(r->sampling_addr, last_page_sz)) {
+               if (last_accessed)
+                       r->nr_accesses++;
+               return;
+       }
+
+       last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz);
+       if (last_accessed)
+               r->nr_accesses++;
+
+       last_addr = r->sampling_addr;
+}
+
+unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
+{
+       struct damon_target *t;
+       struct damon_region *r;
+       unsigned int max_nr_accesses = 0;
+
+       damon_for_each_target(t, ctx) {
+               damon_for_each_region(r, t) {
+                       __damon_pa_check_access(ctx, r);
+                       max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+               }
+       }
+
+       return max_nr_accesses;
+}
+
+bool damon_pa_target_valid(void *t)
+{
+       return true;
+}
+
+int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme)
+{
+       unsigned long addr;
+       LIST_HEAD(page_list);
+
+       if (scheme->action != DAMOS_PAGEOUT)
+               return -EINVAL;
+
+       for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+               struct page *page = damon_get_page(PHYS_PFN(addr));
+
+               if (!page)
+                       continue;
+
+               ClearPageReferenced(page);
+               test_and_clear_page_young(page);
+               if (isolate_lru_page(page)) {
+                       put_page(page);
+                       continue;
+               }
+               if (PageUnevictable(page)) {
+                       putback_lru_page(page);
+               } else {
+                       list_add(&page->lru, &page_list);
+                       put_page(page);
+               }
+       }
+       reclaim_pages(&page_list);
+       cond_resched();
+       return 0;
+}
+
+int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme)
+{
+       switch (scheme->action) {
+       case DAMOS_PAGEOUT:
+               return damon_pageout_score(context, r, scheme);
+       default:
+               break;
+       }
+
+       return DAMOS_MAX_SCORE;
+}
+
+void damon_pa_set_primitives(struct damon_ctx *ctx)
+{
+       ctx->primitive.init = NULL;
+       ctx->primitive.update = NULL;
+       ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks;
+       ctx->primitive.check_accesses = damon_pa_check_accesses;
+       ctx->primitive.reset_aggregated = NULL;
+       ctx->primitive.target_valid = damon_pa_target_valid;
+       ctx->primitive.cleanup = NULL;
+       ctx->primitive.apply_scheme = damon_pa_apply_scheme;
+       ctx->primitive.get_scheme_score = damon_pa_scheme_score;
+}
diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c
new file mode 100644 (file)
index 0000000..92a04f5
--- /dev/null
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+
+#include "prmtv-common.h"
+
+/*
+ * Get an online page for a pfn if it's in the LRU list.  Otherwise, returns
+ * NULL.
+ *
+ * The body of this function is stolen from the 'page_idle_get_page()'.  We
+ * steal rather than reuse it because the code is quite simple.
+ */
+struct page *damon_get_page(unsigned long pfn)
+{
+       struct page *page = pfn_to_online_page(pfn);
+
+       if (!page || !PageLRU(page) || !get_page_unless_zero(page))
+               return NULL;
+
+       if (unlikely(!PageLRU(page))) {
+               put_page(page);
+               page = NULL;
+       }
+       return page;
+}
+
+void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
+{
+       bool referenced = false;
+       struct page *page = damon_get_page(pte_pfn(*pte));
+
+       if (!page)
+               return;
+
+       if (pte_young(*pte)) {
+               referenced = true;
+               *pte = pte_mkold(*pte);
+       }
+
+#ifdef CONFIG_MMU_NOTIFIER
+       if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
+               referenced = true;
+#endif /* CONFIG_MMU_NOTIFIER */
+
+       if (referenced)
+               set_page_young(page);
+
+       set_page_idle(page);
+       put_page(page);
+}
+
+void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       bool referenced = false;
+       struct page *page = damon_get_page(pmd_pfn(*pmd));
+
+       if (!page)
+               return;
+
+       if (pmd_young(*pmd)) {
+               referenced = true;
+               *pmd = pmd_mkold(*pmd);
+       }
+
+#ifdef CONFIG_MMU_NOTIFIER
+       if (mmu_notifier_clear_young(mm, addr,
+                               addr + ((1UL) << HPAGE_PMD_SHIFT)))
+               referenced = true;
+#endif /* CONFIG_MMU_NOTIFIER */
+
+       if (referenced)
+               set_page_young(page);
+
+       set_page_idle(page);
+       put_page(page);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+
+#define DAMON_MAX_SUBSCORE     (100)
+#define DAMON_MAX_AGE_IN_LOG   (32)
+
+int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+                       struct damos *s)
+{
+       unsigned int max_nr_accesses;
+       int freq_subscore;
+       unsigned int age_in_sec;
+       int age_in_log, age_subscore;
+       unsigned int freq_weight = s->quota.weight_nr_accesses;
+       unsigned int age_weight = s->quota.weight_age;
+       int hotness;
+
+       max_nr_accesses = c->aggr_interval / c->sample_interval;
+       freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+       age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+       for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+                       age_in_log++, age_in_sec >>= 1)
+               ;
+
+       /* If frequency is 0, higher age means it's colder */
+       if (freq_subscore == 0)
+               age_in_log *= -1;
+
+       /*
+        * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+        * Scale it to be in [0, 100] and set it as age subscore.
+        */
+       age_in_log += DAMON_MAX_AGE_IN_LOG;
+       age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+               DAMON_MAX_AGE_IN_LOG / 2;
+
+       hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+       if (freq_weight + age_weight)
+               hotness /= freq_weight + age_weight;
+       /*
+        * Transform it to fit in [0, DAMOS_MAX_SCORE]
+        */
+       hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+       /* Return coldness of the region */
+       return DAMOS_MAX_SCORE - hotness;
+}
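
As a worked example of the arithmetic above, assuming DAMOS_MAX_SCORE is 99 (as in include/linux/damon.h) and the DAMON_RECLAIM defaults of a 5 ms sampling and 100 ms aggregation interval, a never-accessed region idle for two minutes scores a coldness of 61. A userspace sketch replicating the computation:

/*
 * Userspace sketch of the damon_pageout_score() arithmetic above.
 * Sample values mirror DAMON_RECLAIM defaults: 5 ms sampling, 100 ms
 * aggregation, a region idle for 2 minutes, age weight 1.
 */
#include <stdio.h>

#define DAMON_MAX_SUBSCORE      100
#define DAMON_MAX_AGE_IN_LOG    32
#define DAMOS_MAX_SCORE         99      /* assumed, per include/linux/damon.h */

int main(void)
{
        unsigned long sample_us = 5000, aggr_us = 100000;
        unsigned int nr_accesses = 0, age = 1200;       /* aggr intervals */
        unsigned int freq_weight = 0, age_weight = 1;

        unsigned int max_nr_accesses = aggr_us / sample_us;
        int freq = nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
        unsigned int age_in_sec = (unsigned long)age * aggr_us / 1000000;
        int age_in_log, age_sub, hotness;

        for (age_in_log = 0;
             age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
             age_in_log++, age_in_sec >>= 1)
                ;
        if (freq == 0)
                age_in_log *= -1;
        age_in_log += DAMON_MAX_AGE_IN_LOG;
        age_sub = age_in_log * DAMON_MAX_SUBSCORE / DAMON_MAX_AGE_IN_LOG / 2;

        hotness = freq_weight * freq + age_weight * age_sub;
        if (freq_weight + age_weight)
                hotness /= freq_weight + age_weight;
        hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;

        /* prints 61: an old, never-accessed region is quite cold */
        printf("%d\n", DAMOS_MAX_SCORE - hotness);
        return 0;
}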
diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h
new file mode 100644 (file)
index 0000000..61f2703
--- /dev/null
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+#include <linux/random.h>
+
+/* Get a random number in [l, r) */
+#define damon_rand(l, r) (l + prandom_u32_max(r - l))
+
+struct page *damon_get_page(unsigned long pfn);
+
+void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
+
+int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+                       struct damos *s);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
new file mode 100644 (file)
index 0000000..dc14850
--- /dev/null
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based page reclamation
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-reclaim: " fmt
+
+#include <linux/damon.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_reclaim."
+
+/*
+ * Enable or disable DAMON_RECLAIM.
+ *
+ * You can enable DAMON_RECLAIM by setting the value of this parameter as
+ * ``Y``.  Setting it as ``N`` disables DAMON_RECLAIM.  Note that DAMON_RECLAIM
+ * could do no real monitoring and reclamation due to the watermarks-based
+ * activation condition.  Refer to the descriptions of the watermarks
+ * parameters below for details.
+ */
+static bool enabled __read_mostly;
+module_param(enabled, bool, 0600);
+
+/*
+ * Time threshold for identifying cold memory regions, in microseconds.
+ *
+ * If a memory region is not accessed for this amount of time or longer,
+ * DAMON_RECLAIM identifies the region as cold and reclaims it.  120 seconds
+ * by default.
+ */
+static unsigned long min_age __read_mostly = 120000000;
+module_param(min_age, ulong, 0600);
+
+/*
+ * Time limit for trying the reclamation, in milliseconds.
+ *
+ * DAMON_RECLAIM tries to use only up to this time within a time window
+ * (quota_reset_interval_ms) for trying reclamation of cold pages.  This can be
+ * used for limiting CPU consumption of DAMON_RECLAIM.  If the value is zero,
+ * the limit is disabled.
+ *
+ * 10 ms by default.
+ */
+static unsigned long quota_ms __read_mostly = 10;
+module_param(quota_ms, ulong, 0600);
+
+/*
+ * Size limit of memory for the reclamation, in bytes.
+ *
+ * DAMON_RECLAIM charges the amount of memory it tried to reclaim within a
+ * time window (quota_reset_interval_ms) and ensures no more than this limit
+ * is tried.  This can be used for limiting consumption of CPU and IO.  If
+ * this value is zero, the limit is disabled.
+ *
+ * 128 MiB by default.
+ */
+static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024;
+module_param(quota_sz, ulong, 0600);
+
+/*
+ * The time/size quota charge reset interval in milliseconds.
+ *
+ * The charge reset interval for the quota of time (quota_ms) and size
+ * (quota_sz).  That is, DAMON_RECLAIM does not try reclamation for more than
+ * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
+ * milliseconds.
+ *
+ * 1 second by default.
+ */
+static unsigned long quota_reset_interval_ms __read_mostly = 1000;
+module_param(quota_reset_interval_ms, ulong, 0600);
+
+/*
+ * The watermarks check time interval in microseconds.
+ *
+ * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
+ * enabled but inactive due to its watermarks rule.  5 seconds by default.
+ */
+static unsigned long wmarks_interval __read_mostly = 5000000;
+module_param(wmarks_interval, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the high watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is higher than
+ * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+ * check the watermarks.  500 (50%) by default.
+ */
+static unsigned long wmarks_high __read_mostly = 500;
+module_param(wmarks_high, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the middle watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is between this
+ * and the low watermark, DAMON_RECLAIM becomes active, so it starts the
+ * monitoring and the reclaiming.  400 (40%) by default.
+ */
+static unsigned long wmarks_mid __read_mostly = 400;
+module_param(wmarks_mid, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the low watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is lower than
+ * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+ * check the watermarks.  In that case, the system falls back to the LRU-based
+ * page granularity reclamation logic.  200 (20%) by default.
+ */
+static unsigned long wmarks_low __read_mostly = 200;
+module_param(wmarks_low, ulong, 0600);
+
+/*
+ * Sampling interval for the monitoring in microseconds.
+ *
+ * The sampling interval of DAMON for the cold memory monitoring.  Please refer
+ * to the DAMON documentation for more detail.  5 ms by default.
+ */
+static unsigned long sample_interval __read_mostly = 5000;
+module_param(sample_interval, ulong, 0600);
+
+/*
+ * Aggregation interval for the monitoring in microseconds.
+ *
+ * The aggregation interval of DAMON for the cold memory monitoring.  Please
+ * refer to the DAMON documentation for more detail.  100 ms by default.
+ */
+static unsigned long aggr_interval __read_mostly = 100000;
+module_param(aggr_interval, ulong, 0600);
+
+/*
+ * Minimum number of monitoring regions.
+ *
+ * The minimal number of monitoring regions of DAMON for the cold memory
+ * monitoring.  This can be used to set a lower bound on the monitoring
+ * quality.  However, setting this too high could result in increased
+ * monitoring overhead.
+ * Please refer to the DAMON documentation for more detail.  10 by default.
+ */
+static unsigned long min_nr_regions __read_mostly = 10;
+module_param(min_nr_regions, ulong, 0600);
+
+/*
+ * Maximum number of monitoring regions.
+ *
+ * The maximum number of monitoring regions of DAMON for the cold memory
+ * monitoring.  This can be used to set an upper bound on the monitoring
+ * overhead.  However, setting this too low could result in bad monitoring
+ * quality.
+ * Please refer to the DAMON documentation for more detail.  1000 by default.
+ */
+static unsigned long max_nr_regions __read_mostly = 1000;
+module_param(max_nr_regions, ulong, 0600);
+
+/*
+ * Start of the target memory region in physical address.
+ *
+ * The start physical address of the memory region that DAMON_RECLAIM will
+ * work on.  By default, the biggest System RAM region is used.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of the memory region that DAMON_RECLAIM will
+ * work on.  By default, the biggest System RAM region is used.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+ * Otherwise, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+struct damon_reclaim_ram_walk_arg {
+       unsigned long start;
+       unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+       struct damon_reclaim_ram_walk_arg *a = arg;
+
+       if (a->end - a->start < res->end - res->start) {
+               a->start = res->start;
+               a->end = res->end;
+       }
+       return 0;
+}
+
+/*
+ * Find the biggest 'System RAM' resource and store its start and end address
+ * in @start and @end, respectively.  If no System RAM is found, returns false.
+ */
+static bool get_monitoring_region(unsigned long *start, unsigned long *end)
+{
+       struct damon_reclaim_ram_walk_arg arg = {};
+
+       walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+       if (arg.end <= arg.start)
+               return false;
+
+       *start = arg.start;
+       *end = arg.end;
+       return true;
+}
+
+static struct damos *damon_reclaim_new_scheme(void)
+{
+       struct damos_watermarks wmarks = {
+               .metric = DAMOS_WMARK_FREE_MEM_RATE,
+               .interval = wmarks_interval,
+               .high = wmarks_high,
+               .mid = wmarks_mid,
+               .low = wmarks_low,
+       };
+       struct damos_quota quota = {
+               /*
+                * Do not try reclamation for more than quota_ms milliseconds
+                * or quota_sz bytes within quota_reset_interval_ms.
+                */
+               .ms = quota_ms,
+               .sz = quota_sz,
+               .reset_interval = quota_reset_interval_ms,
+               /* Within the quota, page out older regions first. */
+               .weight_sz = 0,
+               .weight_nr_accesses = 0,
+               .weight_age = 1
+       };
+       struct damos *scheme = damon_new_scheme(
+                       /* Find regions having PAGE_SIZE or larger size */
+                       PAGE_SIZE, ULONG_MAX,
+                       /* and not accessed at all */
+                       0, 0,
+                       /* for min_age or more microseconds, and */
+                       min_age / aggr_interval, UINT_MAX,
+                       /* page out those, as soon as found */
+                       DAMOS_PAGEOUT,
+                       /* under the quota. */
+                       &quota,
+                       /* (De)activate this according to the watermarks. */
+                       &wmarks);
+
+       return scheme;
+}
+
+static int damon_reclaim_turn(bool on)
+{
+       struct damon_region *region;
+       struct damos *scheme;
+       int err;
+
+       if (!on) {
+               err = damon_stop(&ctx, 1);
+               if (!err)
+                       kdamond_pid = -1;
+               return err;
+       }
+
+       err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
+                       min_nr_regions, max_nr_regions);
+       if (err)
+               return err;
+
+       if (monitor_region_start > monitor_region_end)
+               return -EINVAL;
+       if (!monitor_region_start && !monitor_region_end &&
+                       !get_monitoring_region(&monitor_region_start,
+                               &monitor_region_end))
+               return -EINVAL;
+       /* DAMON will free this on its own when it finishes monitoring */
+       region = damon_new_region(monitor_region_start, monitor_region_end);
+       if (!region)
+               return -ENOMEM;
+       damon_add_region(region, target);
+
+       /* Will be freed by 'damon_set_schemes()' below */
+       scheme = damon_reclaim_new_scheme();
+       if (!scheme) {
+               err = -ENOMEM;
+               goto free_region_out;
+       }
+       err = damon_set_schemes(ctx, &scheme, 1);
+       if (err)
+               goto free_scheme_out;
+
+       err = damon_start(&ctx, 1);
+       if (!err) {
+               kdamond_pid = ctx->kdamond->pid;
+               return 0;
+       }
+
+free_scheme_out:
+       damon_destroy_scheme(scheme);
+free_region_out:
+       damon_destroy_region(region, target);
+       return err;
+}
+
+#define ENABLE_CHECK_INTERVAL_MS       1000
+static struct delayed_work damon_reclaim_timer;
+static void damon_reclaim_timer_fn(struct work_struct *work)
+{
+       static bool last_enabled;
+       bool now_enabled;
+
+       now_enabled = enabled;
+       if (last_enabled != now_enabled) {
+               if (!damon_reclaim_turn(now_enabled))
+                       last_enabled = now_enabled;
+               else
+                       enabled = last_enabled;
+       }
+
+       schedule_delayed_work(&damon_reclaim_timer,
+                       msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS));
+}
+static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
+
+static int __init damon_reclaim_init(void)
+{
+       ctx = damon_new_ctx();
+       if (!ctx)
+               return -ENOMEM;
+
+       damon_pa_set_primitives(ctx);
+
+       /* 4242 means nothing but fun */
+       target = damon_new_target(4242);
+       if (!target) {
+               damon_destroy_ctx(ctx);
+               return -ENOMEM;
+       }
+       damon_add_target(ctx, target);
+
+       schedule_delayed_work(&damon_reclaim_timer, 0);
+       return 0;
+}
+
+module_init(damon_reclaim_init);
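
As a quick sanity check of the defaults above: the scheme built by damon_reclaim_new_scheme() expresses min_age in aggregation intervals, so 120 s of idleness with a 100 ms aggregation interval becomes a threshold of 1200 intervals. A trivial userspace sketch:

/*
 * Sketch of how the default parameters translate into the DAMOS
 * scheme's minimum age threshold.
 */
#include <stdio.h>

int main(void)
{
        unsigned long min_age_us = 120000000;   /* 120 seconds */
        unsigned long aggr_us = 100000;         /* 100 ms */

        printf("%lu\n", min_age_us / aggr_us);  /* prints 1200 */
        return 0;
}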
index 1f5c13257dbaf4ef8422b3de1cb83f60b6880e87..ecfd0b2ed222d065d97e1061704ef0fa96e5ad51 100644 (file)
@@ -233,7 +233,7 @@ static void damon_test_apply_three_regions3(struct kunit *test)
  * and 70-100) has totally freed and mapped to different area (30-32 and
  * 65-68).  The target regions which were in the old second and third big
  * regions should now be removed and new target regions covering the new second
- * and third big regions should be crated.
+ * and third big regions should be created.
  */
 static void damon_test_apply_three_regions4(struct kunit *test)
 {
index 58c1fb2aafa91fcf410df608927e7b67c8f6a3e8..35fe49080ee99636f045d3f8c785319947eadf4f 100644 (file)
@@ -7,25 +7,20 @@
 
 #define pr_fmt(fmt) "damon-va: " fmt
 
-#include <linux/damon.h>
+#include <asm-generic/mman-common.h>
+#include <linux/highmem.h>
 #include <linux/hugetlb.h>
-#include <linux/mm.h>
 #include <linux/mmu_notifier.h>
-#include <linux/highmem.h>
 #include <linux/page_idle.h>
 #include <linux/pagewalk.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-#include <linux/slab.h>
+
+#include "prmtv-common.h"
 
 #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
 #undef DAMON_MIN_REGION
 #define DAMON_MIN_REGION 1
 #endif
 
-/* Get a random number in [l, r) */
-#define damon_rand(l, r) (l + prandom_u32_max(r - l))
-
 /*
  * 't->id' should be the pointer to the relevant 'struct pid' having reference
  * count.  Caller must put the returned task, unless it is NULL.
@@ -311,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
                struct damon_addr_range bregions[3])
 {
        struct damon_region *r, *next;
-       unsigned int i = 0;
+       unsigned int i;
 
        /* Remove regions which are not in the three big regions now */
        damon_for_each_region_safe(r, next, t) {
@@ -372,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx)
        }
 }
 
-/*
- * Get an online page for a pfn if it's in the LRU list.  Otherwise, returns
- * NULL.
- *
- * The body of this function is stolen from the 'page_idle_get_page()'.  We
- * steal rather than reuse it because the code is quite simple.
- */
-static struct page *damon_get_page(unsigned long pfn)
-{
-       struct page *page = pfn_to_online_page(pfn);
-
-       if (!page || !PageLRU(page) || !get_page_unless_zero(page))
-               return NULL;
-
-       if (unlikely(!PageLRU(page))) {
-               put_page(page);
-               page = NULL;
-       }
-       return page;
-}
-
-static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm,
-                            unsigned long addr)
-{
-       bool referenced = false;
-       struct page *page = damon_get_page(pte_pfn(*pte));
-
-       if (!page)
-               return;
-
-       if (pte_young(*pte)) {
-               referenced = true;
-               *pte = pte_mkold(*pte);
-       }
-
-#ifdef CONFIG_MMU_NOTIFIER
-       if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
-               referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
-       if (referenced)
-               set_page_young(page);
-
-       set_page_idle(page);
-       put_page(page);
-}
-
-static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
-                            unsigned long addr)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       bool referenced = false;
-       struct page *page = damon_get_page(pmd_pfn(*pmd));
-
-       if (!page)
-               return;
-
-       if (pmd_young(*pmd)) {
-               referenced = true;
-               *pmd = pmd_mkold(*pmd);
-       }
-
-#ifdef CONFIG_MMU_NOTIFIER
-       if (mmu_notifier_clear_young(mm, addr,
-                               addr + ((1UL) << HPAGE_PMD_SHIFT)))
-               referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
-       if (referenced)
-               set_page_young(page);
-
-       set_page_idle(page);
-       put_page(page);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-}
-
 static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
                unsigned long next, struct mm_walk *walk)
 {
@@ -475,7 +394,7 @@ out:
        return 0;
 }
 
-static struct mm_walk_ops damon_mkold_ops = {
+static const struct mm_walk_ops damon_mkold_ops = {
        .pmd_entry = damon_mkold_pmd_entry,
 };
 
@@ -571,7 +490,7 @@ out:
        return 0;
 }
 
-static struct mm_walk_ops damon_young_ops = {
+static const struct mm_walk_ops damon_young_ops = {
        .pmd_entry = damon_young_pmd_entry,
 };
 
@@ -658,6 +577,76 @@ bool damon_va_target_valid(void *target)
        return false;
 }
 
+#ifndef CONFIG_ADVISE_SYSCALLS
+static int damos_madvise(struct damon_target *target, struct damon_region *r,
+                       int behavior)
+{
+       return -EINVAL;
+}
+#else
+static int damos_madvise(struct damon_target *target, struct damon_region *r,
+                       int behavior)
+{
+       struct mm_struct *mm;
+       int ret = -ENOMEM;
+
+       mm = damon_get_mm(target);
+       if (!mm)
+               goto out;
+
+       ret = do_madvise(mm, PAGE_ALIGN(r->ar.start),
+                       PAGE_ALIGN(r->ar.end - r->ar.start), behavior);
+       mmput(mm);
+out:
+       return ret;
+}
+#endif /* CONFIG_ADVISE_SYSCALLS */
+
+int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme)
+{
+       int madv_action;
+
+       switch (scheme->action) {
+       case DAMOS_WILLNEED:
+               madv_action = MADV_WILLNEED;
+               break;
+       case DAMOS_COLD:
+               madv_action = MADV_COLD;
+               break;
+       case DAMOS_PAGEOUT:
+               madv_action = MADV_PAGEOUT;
+               break;
+       case DAMOS_HUGEPAGE:
+               madv_action = MADV_HUGEPAGE;
+               break;
+       case DAMOS_NOHUGEPAGE:
+               madv_action = MADV_NOHUGEPAGE;
+               break;
+       case DAMOS_STAT:
+               return 0;
+       default:
+               pr_warn("Wrong action %d\n", scheme->action);
+               return -EINVAL;
+       }
+
+       return damos_madvise(t, r, madv_action);
+}
+
+int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
+               struct damon_region *r, struct damos *scheme)
+{
+
+       switch (scheme->action) {
+       case DAMOS_PAGEOUT:
+               return damon_pageout_score(context, r, scheme);
+       default:
+               break;
+       }
+
+       return DAMOS_MAX_SCORE;
+}
+
 void damon_va_set_primitives(struct damon_ctx *ctx)
 {
        ctx->primitive.init = damon_va_init;
@@ -667,6 +656,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx)
        ctx->primitive.reset_aggregated = NULL;
        ctx->primitive.target_valid = damon_va_target_valid;
        ctx->primitive.cleanup = NULL;
+       ctx->primitive.apply_scheme = damon_va_apply_scheme;
+       ctx->primitive.get_scheme_score = damon_va_scheme_score;
 }
 
 #include "vaddr-test.h"
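
For DAMOS_PAGEOUT, the kernel-side do_madvise() call corresponds to what a process could request on its own address space with madvise(MADV_PAGEOUT). A hedged userspace analogue (assumes a libc exposing MADV_PAGEOUT, available since Linux 5.4):

/*
 * Userspace analogue of what damon_va_apply_scheme() does for
 * DAMOS_PAGEOUT: hint the kernel to reclaim a range of pages.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2 * 1024 * 1024;
        void *buf = aligned_alloc(4096, len);

        if (!buf)
                return 1;
        /* ask the kernel to page this range out, like DAMOS_PAGEOUT */
        if (madvise(buf, len, MADV_PAGEOUT))
                perror("madvise");
        free(buf);
        return 0;
}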
index d0020fc5820271eced62eebf3be2fc73c0b77e96..a05a39ff8fe4f676a38b542dfc14c177369be5c8 100644 (file)
 #include <linux/ctype.h>
 
 #include "internal.h"
+#include <trace/events/migrate.h>
+
+/*
+ * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
+ * be used to populate migrate_reason_names[].
+ */
+#undef EM
+#undef EMe
+#define EM(a, b)       b,
+#define EMe(a, b)      b
 
 const char *migrate_reason_names[MR_TYPES] = {
-       "compaction",
-       "memory_failure",
-       "memory_hotplug",
-       "syscall_or_cpuset",
-       "mempolicy_mbind",
-       "numa_misplaced",
-       "contig_range",
-       "longterm_pin",
-       "demotion",
+       MIGRATE_REASON
 };
 
 const struct trace_print_flags pageflag_names[] = {
index 1403639302e48201a020c088e641c6654d36c8f8..228e3954b90c155ed77bd49086bbbc4dc22e1941 100644 (file)
@@ -1104,13 +1104,14 @@ static int __init init_args(struct pgtable_debug_args *args)
        /*
         * Initialize the debugging data.
         *
-        * __P000 (or even __S000) will help create page table entries with
-        * PROT_NONE permission as required for pxx_protnone_tests().
+        * protection_map[0] (or even protection_map[8]) will help create
+        * page table entries with PROT_NONE permission as required for
+        * pxx_protnone_tests().
         */
        memset(args, 0, sizeof(*args));
        args->vaddr              = get_random_vaddr();
        args->page_prot          = vm_get_page_prot(VMFLAGS);
-       args->page_prot_none     = __P000;
+       args->page_prot_none     = protection_map[0];
        args->is_contiguous_page = false;
        args->pud_pfn            = ULONG_MAX;
        args->pmd_pfn            = ULONG_MAX;
index bfcef6ff7a275a68b2a13c3f7fa7cee7e6ae7d88..615512caa0b5de0284a12ffbeab9c9355ab09a87 100644 (file)
@@ -638,6 +638,30 @@ static bool mapping_needs_writeback(struct address_space *mapping)
        return mapping->nrpages;
 }
 
+static bool filemap_range_has_writeback(struct address_space *mapping,
+                                       loff_t start_byte, loff_t end_byte)
+{
+       XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+       pgoff_t max = end_byte >> PAGE_SHIFT;
+       struct page *page;
+
+       if (end_byte < start_byte)
+               return false;
+
+       rcu_read_lock();
+       xas_for_each(&xas, page, max) {
+               if (xas_retry(&xas, page))
+                       continue;
+               if (xa_is_value(page))
+                       continue;
+               if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+                       break;
+       }
+       rcu_read_unlock();
+       return page != NULL;
+
+}
+
 /**
  * filemap_range_needs_writeback - check if range potentially needs writeback
  * @mapping:           address space within which to check
@@ -655,29 +679,12 @@ static bool mapping_needs_writeback(struct address_space *mapping)
 bool filemap_range_needs_writeback(struct address_space *mapping,
                                   loff_t start_byte, loff_t end_byte)
 {
-       XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
-       pgoff_t max = end_byte >> PAGE_SHIFT;
-       struct page *page;
-
        if (!mapping_needs_writeback(mapping))
                return false;
        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
            !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                return false;
-       if (end_byte < start_byte)
-               return false;
-
-       rcu_read_lock();
-       xas_for_each(&xas, page, max) {
-               if (xas_retry(&xas, page))
-                       continue;
-               if (xa_is_value(page))
-                       continue;
-               if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
-                       break;
-       }
-       rcu_read_unlock();
-       return page != NULL;
+       return filemap_range_has_writeback(mapping, start_byte, end_byte);
 }
 EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
 
@@ -1592,6 +1599,7 @@ void folio_end_writeback(struct folio *folio)
 
        smp_mb__after_atomic();
        folio_wake(folio, PG_writeback);
+       acct_reclaim_writeback(folio);
        folio_put(folio);
 }
 EXPORT_SYMBOL(folio_end_writeback);
@@ -2088,7 +2096,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
                if (!xa_is_value(page)) {
                        if (page->index < start)
                                goto put;
-                       VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
                        if (page->index + thp_nr_pages(page) - 1 > end)
                                goto put;
                        if (!trylock_page(page))
@@ -2621,6 +2628,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;
 
+               if (unlikely(iocb->ki_pos >= i_size_read(inode)))
+                       break;
+
                error = filemap_get_pages(iocb, iter, &pvec);
                if (error < 0)
                        break;
@@ -2733,9 +2743,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                struct file *file = iocb->ki_filp;
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
-               loff_t size;
 
-               size = i_size_read(inode);
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
                                                iocb->ki_pos + count - 1))
@@ -2767,8 +2775,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                 * the rest of the read.  Buffered reads will not work for
                 * DAX files, so don't bother trying.
                 */
-               if (retval < 0 || !count || iocb->ki_pos >= size ||
-                   IS_DAX(inode))
+               if (retval < 0 || !count || IS_DAX(inode))
+                       return retval;
+               if (iocb->ki_pos >= i_size_read(inode))
                        return retval;
        }
 
@@ -3193,24 +3202,17 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
        }
 
        if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
-           vm_fault_t ret = do_set_pmd(vmf, page);
-           if (!ret) {
-                   /* The page is mapped successfully, reference consumed. */
-                   unlock_page(page);
-                   return true;
-           }
-       }
-
-       if (pmd_none(*vmf->pmd)) {
-               vmf->ptl = pmd_lock(mm, vmf->pmd);
-               if (likely(pmd_none(*vmf->pmd))) {
-                       mm_inc_nr_ptes(mm);
-                       pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
-                       vmf->prealloc_pte = NULL;
+               vm_fault_t ret = do_set_pmd(vmf, page);
+               if (!ret) {
+                       /* The page is mapped successfully, reference consumed. */
+                       unlock_page(page);
+                       return true;
                }
-               spin_unlock(vmf->ptl);
        }
 
+       if (pmd_none(*vmf->pmd))
+               pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+
        /* See comment in handle_pte_fault() */
        if (pmd_devmap_trans_unstable(vmf->pmd)) {
                unlock_page(page);
index e1c7e4bde11fd3f68c7e93942ebb27293c566ef4..2c51e9748a6a584f4e9d0c96f3f175fab6d89c17 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2365,7 +2365,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 {
        int nr_start = *nr;
        struct dev_pagemap *pgmap = NULL;
-       int ret = 1;
 
        do {
                struct page *page = pfn_to_page(pfn);
@@ -2373,14 +2372,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
                pgmap = get_dev_pagemap(pfn, pgmap);
                if (unlikely(!pgmap)) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
-                       ret = 0;
                        break;
                }
                SetPageReferenced(page);
                pages[*nr] = page;
                if (unlikely(!try_grab_page(page, flags))) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
-                       ret = 0;
                        break;
                }
                (*nr)++;
@@ -2388,7 +2385,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
        } while (addr += PAGE_SIZE, addr != end);
 
        put_dev_pagemap(pgmap);
-       return ret;
+       return addr == end;
 }
 
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
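
The hunk above drops the local ret flag: after the do/while loop, addr == end exactly when every pfn in the range was pinned, because each failure path breaks out early and leaves addr short of end. A self-contained sketch of the idiom, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* hypothetical per-page operation: fails past the third page */
static bool process_one(unsigned long addr)
{
        return addr < 3 * PAGE_SIZE;
}

static int process_range(unsigned long addr, unsigned long end)
{
        do {
                if (!process_one(addr))
                        break;                  /* leaves addr != end */
        } while (addr += PAGE_SIZE, addr != end);

        return addr == end;     /* nonzero only if every page was handled */
}

int main(void)
{
        printf("%d %d\n", process_range(0, 2 * PAGE_SIZE),
               process_range(0, 8 * PAGE_SIZE));        /* prints "1 0" */
        return 0;
}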
index 471d9779a7f47fcf9de7682c2aa7351be093b256..88f65f1558453359e854322ef0e881e3fd1efc89 100644 (file)
@@ -382,7 +382,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
                        unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
 
                        if (end1 > start1) {
-                               kaddr = kmap_atomic(page + i);
+                               kaddr = kmap_local_page(page + i);
                                memset(kaddr + start1, 0, this_end - start1);
                        }
                        end1 -= this_end;
@@ -397,7 +397,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
 
                        if (end2 > start2) {
                                if (!kaddr)
-                                       kaddr = kmap_atomic(page + i);
+                                       kaddr = kmap_local_page(page + i);
                                memset(kaddr + start2, 0, this_end - start2);
                        }
                        end2 -= this_end;
@@ -405,7 +405,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
                }
 
                if (kaddr) {
-                       kunmap_atomic(kaddr);
+                       kunmap_local(kaddr);
                        flush_dcache_page(page + i);
                }
 
index 6378c10664599f0599f760269c1df8ca8a80328b..e09159c957e367f9adb37f9cc648d2cc39ff67ac 100644 (file)
@@ -50,6 +50,17 @@ struct hstate hstates[HUGE_MAX_HSTATE];
 
 #ifdef CONFIG_CMA
 static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_page(struct page *page, unsigned int order)
+{
+       return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
+                               1 << order);
+}
+#else
+static bool hugetlb_cma_page(struct page *page, unsigned int order)
+{
+       return false;
+}
 #endif
 static unsigned long hugetlb_cma_size __initdata;
 
@@ -66,6 +77,7 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static bool __initdata parsed_valid_hugepagesz = true;
 static bool __initdata parsed_default_hugepagesz;
+static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -321,8 +333,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
 {
 #ifdef CONFIG_CGROUP_HUGETLB
-       return rg && org &&
-              rg->reservation_counter == org->reservation_counter &&
+       return rg->reservation_counter == org->reservation_counter &&
               rg->css == org->css;
 
 #else
@@ -435,7 +446,6 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
                add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
                                            t, h, h_cg, regions_needed);
 
-       VM_BUG_ON(add < 0);
        return add;
 }
 
@@ -1004,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
                vma->vm_private_data = (void *)0;
 }
 
+/*
+ * Reset and decrement one ref on hugepage private reservation.
+ * Called with mm->mmap_sem writer semaphore held.
+ * This function should be only used by move_vma() and operate on
+ * same sized vma. It should never come here with last ref on the
+ * reservation.
+ */
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       /*
+        * Clear the old hugetlb private page reservation.
+        * It has already been transferred to new_vma.
+        *
+        * During a mremap() operation of a hugetlb vma we call move_vma()
+        * which copies vma into new_vma and unmaps vma. After the copy
+        * operation both new_vma and vma share a reference to the resv_map
+        * struct, and at that point vma is about to be unmapped. We don't
+        * want to return the reservation to the pool at unmap of vma because
+        * the reservation still lives on in new_vma, so simply decrement the
+        * ref here and remove the resv_map reference from this vma.
+        */
+       struct resv_map *reservations = vma_resv_map(vma);
+
+       if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+               kref_put(&reservations->refs, resv_map_release);
+
+       reset_vma_resv_huge_pages(vma);
+}
+
 /* Returns true if the VMA has associated reserve pages */
 static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 {
@@ -1260,9 +1299,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                nr_nodes--)
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
-                                       unsigned int order)
+/* used to demote non-gigantic huge pages as well */
+static void __destroy_compound_gigantic_page(struct page *page,
+                                       unsigned int order, bool demote)
 {
        int i;
        int nr_pages = 1 << order;
@@ -1272,8 +1311,10 @@ static void destroy_compound_gigantic_page(struct page *page,
        atomic_set(compound_pincount_ptr(page), 0);
 
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+               p->mapping = NULL;
                clear_compound_head(p);
-               set_page_refcounted(p);
+               if (!demote)
+                       set_page_refcounted(p);
        }
 
        set_compound_order(page, 0);
@@ -1281,6 +1322,19 @@ static void destroy_compound_gigantic_page(struct page *page,
        __ClearPageHead(page);
 }
 
+static void destroy_compound_hugetlb_page_for_demote(struct page *page,
+                                       unsigned int order)
+{
+       __destroy_compound_gigantic_page(page, order, true);
+}
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static void destroy_compound_gigantic_page(struct page *page,
+                                       unsigned int order)
+{
+       __destroy_compound_gigantic_page(page, order, false);
+}
+
 static void free_gigantic_page(struct page *page, unsigned int order)
 {
        /*
@@ -1353,12 +1407,15 @@ static inline void destroy_compound_gigantic_page(struct page *page,
 
 /*
  * Remove hugetlb page from lists, and update dtor so that page appears
- * as just a compound page.  A reference is held on the page.
+ * as just a compound page.
+ *
+ * A reference is held on the page, except in the case of demote.
  *
  * Must be called with hugetlb lock held.
  */
-static void remove_hugetlb_page(struct hstate *h, struct page *page,
-                                                       bool adjust_surplus)
+static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus,
+                                                       bool demote)
 {
        int nid = page_to_nid(page);
 
@@ -1396,8 +1453,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
         *
         * This handles the case where more than one ref is held when and
         * after update_and_free_page is called.
+        *
+        * In the case of demote we do not ref count the page as it will soon
+        * be turned into a page of smaller size.
         */
-       set_page_refcounted(page);
+       if (!demote)
+               set_page_refcounted(page);
        if (hstate_is_gigantic(h))
                set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
        else
@@ -1407,6 +1468,18 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
        h->nr_huge_pages_node[nid]--;
 }
 
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       __remove_hugetlb_page(h, page, adjust_surplus, false);
+}
+
+static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       __remove_hugetlb_page(h, page, adjust_surplus, true);
+}
+
 static void add_hugetlb_page(struct hstate *h, struct page *page,
                             bool adjust_surplus)
 {
@@ -1476,7 +1549,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }
-       if (hstate_is_gigantic(h)) {
+
+       /*
+        * Non-gigantic pages demoted from CMA allocated gigantic pages
+        * need to be given back to CMA in free_gigantic_page.
+        */
+       if (hstate_is_gigantic(h) ||
+           hugetlb_cma_page(page, huge_page_order(h))) {
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
        } else {
@@ -1664,7 +1743,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        spin_unlock_irq(&hugetlb_lock);
 }
 
-static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
+                                                               bool demote)
 {
        int i, j;
        int nr_pages = 1 << order;
@@ -1702,12 +1782,17 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
                 * the set of pages can not be converted to a gigantic page.
                 * The caller who allocated the pages should then discard the
                 * pages using the appropriate free interface.
+                *
+                * In the case of demote, the ref count will be zero.
                 */
-               if (!page_ref_freeze(p, 1)) {
-                       pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
-                       goto out_error;
+               if (!demote) {
+                       if (!page_ref_freeze(p, 1)) {
+                               pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+                               goto out_error;
+                       }
+               } else {
+                       VM_BUG_ON_PAGE(page_count(p), p);
                }
-               set_page_count(p, 0);
                set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
@@ -1730,6 +1815,17 @@ out_error:
        return false;
 }
 
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+{
+       return __prep_compound_gigantic_page(page, order, false);
+}
+
+static bool prep_compound_gigantic_page_for_demote(struct page *page,
+                                                       unsigned int order)
+{
+       return __prep_compound_gigantic_page(page, order, true);
+}
+
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
  * transparent huge pages.  See the PageTransHuge() documentation for more
@@ -2868,33 +2964,39 @@ out_subpool_put:
        return ERR_PTR(-ENOSPC);
 }
 
-int alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h, int nid)
        __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
-int __alloc_bootmem_huge_page(struct hstate *h)
+int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 {
-       struct huge_bootmem_page *m;
+       struct huge_bootmem_page *m = NULL; /* initialize for clang */
        int nr_nodes, node;
 
+       if (nid >= nr_online_nodes)
+               return 0;
+       /* do node specific alloc */
+       if (nid != NUMA_NO_NODE) {
+               m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+                               0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+               if (!m)
+                       return 0;
+               goto found;
+       }
+       /* allocate from next node when distributing huge pages */
        for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
-               void *addr;
-
-               addr = memblock_alloc_try_nid_raw(
+               m = memblock_alloc_try_nid_raw(
                                huge_page_size(h), huge_page_size(h),
                                0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
-               if (addr) {
-                       /*
-                        * Use the beginning of the huge page to store the
-                        * huge_bootmem_page struct (until gather_bootmem
-                        * puts them into the mem_map).
-                        */
-                       m = addr;
-                       goto found;
-               }
+               /*
+                * Use the beginning of the huge page to store the
+                * huge_bootmem_page struct (until gather_bootmem
+                * puts them into the mem_map).
+                */
+               if (!m)
+                       return 0;
+               goto found;
        }
-       return 0;
 
 found:
-       BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
        /* Put them into a private list first because mem_map is not up yet */
        INIT_LIST_HEAD(&m->list);
        list_add(&m->list, &huge_boot_pages);
@@ -2934,12 +3036,61 @@ static void __init gather_bootmem_prealloc(void)
                cond_resched();
        }
 }
+static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
+{
+       unsigned long i;
+       char buf[32];
+
+       for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
+               if (hstate_is_gigantic(h)) {
+                       if (!alloc_bootmem_huge_page(h, nid))
+                               break;
+               } else {
+                       struct page *page;
+                       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+                       page = alloc_fresh_huge_page(h, gfp_mask, nid,
+                                       &node_states[N_MEMORY], NULL);
+                       if (!page)
+                               break;
+                       put_page(page); /* free it into the hugepage allocator */
+               }
+               cond_resched();
+       }
+       if (i == h->max_huge_pages_node[nid])
+               return;
+
+       string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+       pr_warn("HugeTLB: allocating %u of page size %s failed node%d.  Only allocated %lu hugepages.\n",
+               h->max_huge_pages_node[nid], buf, nid, i);
+       h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
+       h->max_huge_pages_node[nid] = i;
+}
 
 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 {
        unsigned long i;
        nodemask_t *node_alloc_noretry;
+       bool node_specific_alloc = false;
+
+       /* skip gigantic hugepages allocation if hugetlb_cma enabled */
+       if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+               pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+               return;
+       }
+
+       /* do node specific alloc */
+       for (i = 0; i < nr_online_nodes; i++) {
+               if (h->max_huge_pages_node[i] > 0) {
+                       hugetlb_hstate_alloc_pages_onenode(h, i);
+                       node_specific_alloc = true;
+               }
+       }
 
+       if (node_specific_alloc)
+               return;
+
+       /* below will do all node balanced alloc */
        if (!hstate_is_gigantic(h)) {
                /*
                 * Bit mask controlling how hard we retry per-node allocations.
@@ -2960,11 +3111,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 
        for (i = 0; i < h->max_huge_pages; ++i) {
                if (hstate_is_gigantic(h)) {
-                       if (hugetlb_cma_size) {
-                               pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
-                               goto free;
-                       }
-                       if (!alloc_bootmem_huge_page(h))
+                       if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
                                break;
                } else if (!alloc_pool_huge_page(h,
                                         &node_states[N_MEMORY],
@@ -2980,13 +3127,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                        h->max_huge_pages, buf, i);
                h->max_huge_pages = i;
        }
-free:
        kfree(node_alloc_noretry);
 }
 
 static void __init hugetlb_init_hstates(void)
 {
-       struct hstate *h;
+       struct hstate *h, *h2;
 
        for_each_hstate(h) {
                if (minimum_order > huge_page_order(h))
@@ -2995,6 +3141,26 @@ static void __init hugetlb_init_hstates(void)
                /* oversize hugepages were init'ed in early boot */
                if (!hstate_is_gigantic(h))
                        hugetlb_hstate_alloc_pages(h);
+
+               /*
+                * Set demote order for each hstate.  Note that
+                * h->demote_order is initially 0.
+                * - We can not demote gigantic pages if runtime freeing
+                *   is not supported, so skip this.
+                * - If CMA allocation is possible, we can not demote
+                *   HUGETLB_PAGE_ORDER or smaller size pages.
+                */
+               if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+                       continue;
+               if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+                       continue;
+               for_each_hstate(h2) {
+                       if (h2 == h)
+                               continue;
+                       if (h2->order < h->order &&
+                           h2->order > h->demote_order)
+                               h->demote_order = h2->order;
+               }
        }
        VM_BUG_ON(minimum_order == UINT_MAX);
 }
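
The demote_order selection above picks, for each hstate, the largest order still smaller than its own: e.g. on x86-64 with 1 GB (order 18) and 2 MB (order 9) hstates, 1 GB pages demote to 2 MB, while the smallest hstate keeps demote_order 0 (no demotion). A userspace sketch of the selection:

/*
 * Userspace sketch of the demote_order selection in
 * hugetlb_init_hstates() above.  Orders assume x86-64 with 4K base
 * pages: 2MB = order 9, 1GB = order 18.
 */
#include <stdio.h>

int main(void)
{
        unsigned int orders[] = { 18, 9 };      /* 1GB, 2MB hstates */
        int n = 2;

        for (int i = 0; i < n; i++) {
                unsigned int demote = 0;

                for (int j = 0; j < n; j++)
                        if (j != i && orders[j] < orders[i] &&
                            orders[j] > demote)
                                demote = orders[j];
                /* prints "18 -> 9" and "9 -> 0" (0: no demotion) */
                printf("%u -> %u\n", orders[i], demote);
        }
        return 0;
}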
@@ -3235,9 +3401,100 @@ out:
        return 0;
 }
 
+static int demote_free_huge_page(struct hstate *h, struct page *page)
+{
+       int i, nid = page_to_nid(page);
+       struct hstate *target_hstate;
+       int rc = 0;
+
+       target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
+
+       remove_hugetlb_page_for_demote(h, page, false);
+       spin_unlock_irq(&hugetlb_lock);
+
+       rc = alloc_huge_page_vmemmap(h, page);
+       if (rc) {
+               /* Allocation of vmemmap failed, we cannot demote the page */
+               spin_lock_irq(&hugetlb_lock);
+               set_page_refcounted(page);
+               add_hugetlb_page(h, page, false);
+               return rc;
+       }
+
+       /*
+        * Use destroy_compound_hugetlb_page_for_demote for all huge page
+        * sizes as it will not ref count pages.
+        */
+       destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
+
+       /*
+        * Taking target hstate mutex synchronizes with set_max_huge_pages.
+        * Without the mutex, pages added to target hstate could be marked
+        * as surplus.
+        *
+        * Note that we already hold h->resize_lock.  To prevent deadlock,
+        * use the convention of always taking larger size hstate mutex first.
+        */
+       mutex_lock(&target_hstate->resize_lock);
+       for (i = 0; i < pages_per_huge_page(h);
+                               i += pages_per_huge_page(target_hstate)) {
+               if (hstate_is_gigantic(target_hstate))
+                       prep_compound_gigantic_page_for_demote(page + i,
+                                                       target_hstate->order);
+               else
+                       prep_compound_page(page + i, target_hstate->order);
+               set_page_private(page + i, 0);
+               set_page_refcounted(page + i);
+               prep_new_huge_page(target_hstate, page + i, nid);
+               put_page(page + i);
+       }
+       mutex_unlock(&target_hstate->resize_lock);
+
+       spin_lock_irq(&hugetlb_lock);
+
+       /*
+        * Not absolutely necessary, but for consistency update max_huge_pages
+        * based on pool changes for the demoted page.
+        */
+       h->max_huge_pages--;
+       target_hstate->max_huge_pages += pages_per_huge_page(h);
+
+       return rc;
+}
+
+static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+       __must_hold(&hugetlb_lock)
+{
+       int nr_nodes, node;
+       struct page *page;
+       int rc = 0;
+
+       lockdep_assert_held(&hugetlb_lock);
+
+       /* We should never get here if no demote order */
+       if (!h->demote_order) {
+               pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
+               return -EINVAL;         /* internal error */
+       }
+
+       for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+               if (!list_empty(&h->hugepage_freelists[node])) {
+                       page = list_entry(h->hugepage_freelists[node].next,
+                                       struct page, lru);
+                       rc = demote_free_huge_page(h, page);
+                       break;
+               }
+       }
+
+       return rc;
+}
+
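
For scale, assuming 4 KiB base pages: one 1 GiB source page spans 262144 base pages, and the split loop in demote_free_huge_page() advances in steps of pages_per_huge_page(2 MiB) = 512, so

    262144 / 512 = 512 new 2 MiB pages per demoted 1 GiB page
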
 #define HSTATE_ATTR_RO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 
+#define HSTATE_ATTR_WO(_name) \
+       static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
 #define HSTATE_ATTR(_name) \
        static struct kobj_attribute _name##_attr = \
                __ATTR(_name, 0644, _name##_show, _name##_store)
@@ -3433,6 +3690,103 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
 }
 HSTATE_ATTR_RO(surplus_hugepages);
 
+static ssize_t demote_store(struct kobject *kobj,
+              struct kobj_attribute *attr, const char *buf, size_t len)
+{
+       unsigned long nr_demote;
+       unsigned long nr_available;
+       nodemask_t nodes_allowed, *n_mask;
+       struct hstate *h;
+       int err = 0;
+       int nid;
+
+       err = kstrtoul(buf, 10, &nr_demote);
+       if (err)
+               return err;
+       h = kobj_to_hstate(kobj, &nid);
+
+       if (nid != NUMA_NO_NODE) {
+               init_nodemask_of_node(&nodes_allowed, nid);
+               n_mask = &nodes_allowed;
+       } else {
+               n_mask = &node_states[N_MEMORY];
+       }
+
+       /* Synchronize with other sysfs operations modifying huge pages */
+       mutex_lock(&h->resize_lock);
+       spin_lock_irq(&hugetlb_lock);
+
+       while (nr_demote) {
+               /*
+                * Check for available pages to demote each time through the
+                * loop as demote_pool_huge_page will drop hugetlb_lock.
+                */
+               if (nid != NUMA_NO_NODE)
+                       nr_available = h->free_huge_pages_node[nid];
+               else
+                       nr_available = h->free_huge_pages;
+               nr_available -= h->resv_huge_pages;
+               if (!nr_available)
+                       break;
+
+               err = demote_pool_huge_page(h, n_mask);
+               if (err)
+                       break;
+
+               nr_demote--;
+       }
+
+       spin_unlock_irq(&hugetlb_lock);
+       mutex_unlock(&h->resize_lock);
+
+       if (err)
+               return err;
+       return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr, char *buf)
+{
+       int nid;
+       struct hstate *h = kobj_to_hstate(kobj, &nid);
+       unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
+
+       return sysfs_emit(buf, "%lukB\n", demote_size);
+}
+
+static ssize_t demote_size_store(struct kobject *kobj,
+                                       struct kobj_attribute *attr,
+                                       const char *buf, size_t count)
+{
+       struct hstate *h, *demote_hstate;
+       unsigned long demote_size;
+       unsigned int demote_order;
+       int nid;
+
+       demote_size = (unsigned long)memparse(buf, NULL);
+
+       demote_hstate = size_to_hstate(demote_size);
+       if (!demote_hstate)
+               return -EINVAL;
+       demote_order = demote_hstate->order;
+       if (demote_order < HUGETLB_PAGE_ORDER)
+               return -EINVAL;
+
+       /* demote order must be smaller than hstate order */
+       h = kobj_to_hstate(kobj, &nid);
+       if (demote_order >= h->order)
+               return -EINVAL;
+
+       /* resize_lock synchronizes concurrent reads and writes of demote_order */
+       mutex_lock(&h->resize_lock);
+       h->demote_order = demote_order;
+       mutex_unlock(&h->resize_lock);
+
+       return count;
+}
+HSTATE_ATTR(demote_size);
+
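
A hypothetical user-space sketch of driving the new files (the sysfs path assumes an x86_64 box with a 1 GiB hstate; demote_size can first be changed by writing a size such as "2048kB" to it):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/mm/hugepages/hugepages-1048576kB/demote", "w");

	if (!f)
		return 1;
	fprintf(f, "2\n");	/* asks demote_store() above to demote two pages */
	return fclose(f) ? 1 : 0;
}
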
 static struct attribute *hstate_attrs[] = {
        &nr_hugepages_attr.attr,
        &nr_overcommit_hugepages_attr.attr,
@@ -3449,6 +3803,16 @@ static const struct attribute_group hstate_attr_group = {
        .attrs = hstate_attrs,
 };
 
+static struct attribute *hstate_demote_attrs[] = {
+       &demote_size_attr.attr,
+       &demote_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group hstate_demote_attr_group = {
+       .attrs = hstate_demote_attrs,
+};
+
 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
                                    struct kobject **hstate_kobjs,
                                    const struct attribute_group *hstate_attr_group)
@@ -3466,6 +3830,12 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
                hstate_kobjs[hi] = NULL;
        }
 
+       if (h->demote_order) {
+               if (sysfs_create_group(hstate_kobjs[hi],
+                                       &hstate_demote_attr_group))
+                       pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+       }
+
        return retval;
 }
 
@@ -3671,6 +4041,10 @@ static int __init hugetlb_init(void)
                        }
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;
+
+                       for (i = 0; i < nr_online_nodes; i++)
+                               default_hstate.max_huge_pages_node[i] =
+                                       default_hugepages_in_node[i];
                }
        }
 
@@ -3731,6 +4105,10 @@ void __init hugetlb_add_hstate(unsigned int order)
        parsed_hstate = h;
 }
 
+bool __init __weak hugetlb_node_alloc_supported(void)
+{
+       return true;
+}
+
 /*
  * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
@@ -3742,6 +4120,10 @@ static int __init hugepages_setup(char *s)
 {
        unsigned long *mhp;
        static unsigned long *last_mhp;
+       int node = NUMA_NO_NODE;
+       int count;
+       unsigned long tmp;
+       char *p = s;
 
        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
@@ -3765,8 +4147,40 @@ static int __init hugepages_setup(char *s)
                return 0;
        }
 
-       if (sscanf(s, "%lu", mhp) <= 0)
-               *mhp = 0;
+       while (*p) {
+               count = 0;
+               if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+                       goto invalid;
+               /* Parameter is node format */
+               if (p[count] == ':') {
+                       if (!hugetlb_node_alloc_supported()) {
+                               pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
+                               return 0;
+                       }
+                       node = tmp;
+                       p += count + 1;
+                       if (node < 0 || node >= nr_online_nodes)
+                               goto invalid;
+                       /* Parse hugepages */
+                       if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+                               goto invalid;
+                       if (!hugetlb_max_hstate)
+                               default_hugepages_in_node[node] = tmp;
+                       else
+                               parsed_hstate->max_huge_pages_node[node] = tmp;
+                       *mhp += tmp;
+                       /* Go on to parse the next node */
+                       if (p[count] == ',')
+                               p += count + 1;
+                       else
+                               break;
+               } else {
+                       if (p != s)
+                               goto invalid;
+                       *mhp = tmp;
+                       break;
+               }
+       }
 
        /*
         * Global state is always initialized later in hugetlb_init.
@@ -3779,6 +4193,10 @@ static int __init hugepages_setup(char *s)
        last_mhp = mhp;
 
        return 1;
+
+invalid:
+       pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
+       return 0;
 }
 __setup("hugepages=", hugepages_setup);
 
@@ -3840,6 +4258,7 @@ __setup("hugepagesz=", hugepagesz_setup);
 static int __init default_hugepagesz_setup(char *s)
 {
        unsigned long size;
+       int i;
 
        parsed_valid_hugepagesz = false;
        if (parsed_default_hugepagesz) {
@@ -3868,6 +4287,9 @@ static int __init default_hugepagesz_setup(char *s)
         */
        if (default_hstate_max_huge_pages) {
                default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+               for (i = 0; i < nr_online_nodes; i++)
+                       default_hstate.max_huge_pages_node[i] =
+                               default_hugepages_in_node[i];
                if (hstate_is_gigantic(&default_hstate))
                        hugetlb_hstate_alloc_pages(&default_hstate);
                default_hstate_max_huge_pages = 0;
@@ -4426,9 +4848,85 @@ again:
        return ret;
 }
 
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-                           unsigned long start, unsigned long end,
-                           struct page *ref_page)
+static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
+                         unsigned long new_addr, pte_t *src_pte)
+{
+       struct hstate *h = hstate_vma(vma);
+       struct mm_struct *mm = vma->vm_mm;
+       pte_t *dst_pte, pte;
+       spinlock_t *src_ptl, *dst_ptl;
+
+       dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
+       dst_ptl = huge_pte_lock(h, mm, dst_pte);
+       src_ptl = huge_pte_lockptr(h, mm, src_pte);
+
+       /*
+        * We don't have to worry about the ordering of src and dst ptlocks
+        * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
+        */
+       if (src_ptl != dst_ptl)
+               spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+       pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+       set_huge_pte_at(mm, new_addr, dst_pte, pte);
+
+       if (src_ptl != dst_ptl)
+               spin_unlock(src_ptl);
+       spin_unlock(dst_ptl);
+}
+
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+                            struct vm_area_struct *new_vma,
+                            unsigned long old_addr, unsigned long new_addr,
+                            unsigned long len)
+{
+       struct hstate *h = hstate_vma(vma);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       unsigned long sz = huge_page_size(h);
+       struct mm_struct *mm = vma->vm_mm;
+       unsigned long old_end = old_addr + len;
+       unsigned long old_addr_copy;
+       pte_t *src_pte, *dst_pte;
+       struct mmu_notifier_range range;
+
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+                               old_end);
+       adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+       mmu_notifier_invalidate_range_start(&range);
+       /* Prevent race with file truncation */
+       i_mmap_lock_write(mapping);
+       for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
+               src_pte = huge_pte_offset(mm, old_addr, sz);
+               if (!src_pte)
+                       continue;
+               if (huge_pte_none(huge_ptep_get(src_pte)))
+                       continue;
+
+               /*
+                * old_addr arg to huge_pmd_unshare() is a pointer and so the
+                * arg may be modified. Pass a copy instead to preserve the
+                * value in old_addr.
+                */
+               old_addr_copy = old_addr;
+
+               if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+                       continue;
+
+               dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
+               if (!dst_pte)
+                       break;
+
+               move_huge_pte(vma, old_addr, new_addr, src_pte);
+       }
+       i_mmap_unlock_write(mapping);
+       flush_tlb_range(vma, old_end - len, old_end);
+       mmu_notifier_invalidate_range_end(&range);
+
+       return len + old_addr - old_end;
+}
+
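
A minimal user-space sketch of what this enables (assuming 2 MiB huge pages have been reserved via vm.nr_hugepages; lengths must be huge-page aligned, and growing the mapping may relocate it through move_hugetlb_page_tables()):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;		/* one 2 MiB huge page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p = mremap(p, len, 2 * len, MREMAP_MAYMOVE);
	if (p == MAP_FAILED)
		return 1;
	printf("hugetlb mapping now at %p\n", p);
	return 0;
}
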
+static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end,
+                                  struct page *ref_page)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -4616,7 +5114,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 
 /*
  * Hugetlb_cow() should be called with page lock of the original hugepage held.
- * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * Called with hugetlb_fault_mutex_table held and pte_page locked so we
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
@@ -5965,12 +6463,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * sharing is possible.  For hugetlbfs, this prevents removal of any page
  * table entries associated with the address space.  This is important as we
  * are setting up sharing based on existing page table entries (mappings).
- *
- * NOTE: This routine is only called from huge_pte_alloc.  Some callers of
- * huge_pte_alloc know that sharing is not possible and do not take
- * i_mmap_rwsem as a performance optimization.  This is handled by the
- * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
- * only required for subsequent processing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
@@ -6371,7 +6863,38 @@ static bool cma_reserve_called __initdata;
 
 static int __init cmdline_parse_hugetlb_cma(char *p)
 {
-       hugetlb_cma_size = memparse(p, &p);
+       int nid, count = 0;
+       unsigned long tmp;
+       char *s = p;
+
+       while (*s) {
+               if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+                       break;
+
+               if (s[count] == ':') {
+                       nid = tmp;
+                       if (nid < 0 || nid >= MAX_NUMNODES)
+                               break;
+
+                       s += count + 1;
+                       tmp = memparse(s, &s);
+                       hugetlb_cma_size_in_node[nid] = tmp;
+                       hugetlb_cma_size += tmp;
+
+                       /*
+                        * Skip the separator if we have one; otherwise
+                        * stop parsing.
+                        */
+                       if (*s == ',')
+                               s++;
+                       else
+                               break;
+               } else {
+                       hugetlb_cma_size = memparse(p, &p);
+                       break;
+               }
+       }
+
        return 0;
 }
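
The parser above accepts either the existing global form or the new per-node form, e.g. (illustrative):

    hugetlb_cma=4G
    hugetlb_cma=0:2G,1:2G

The first reserves 4 GiB spread across online nodes as before; the second reserves 2 GiB of CMA on each of nodes 0 and 1, with hugetlb_cma_size set to the 4 GiB total.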
 
@@ -6380,37 +6903,80 @@ early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
 void __init hugetlb_cma_reserve(int order)
 {
        unsigned long size, reserved, per_node;
+       bool node_specific_cma_alloc = false;
        int nid;
 
        cma_reserve_called = true;
 
+       if (!hugetlb_cma_size)
+               return;
+
+       for (nid = 0; nid < MAX_NUMNODES; nid++) {
+               if (hugetlb_cma_size_in_node[nid] == 0)
+                       continue;
+
+               if (!node_state(nid, N_ONLINE)) {
+                       pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+                       hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+                       hugetlb_cma_size_in_node[nid] = 0;
+                       continue;
+               }
+
+               if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+                       pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+                               nid, (PAGE_SIZE << order) / SZ_1M);
+                       hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+                       hugetlb_cma_size_in_node[nid] = 0;
+               } else {
+                       node_specific_cma_alloc = true;
+               }
+       }
+
+       /* Validate the CMA size again in case invalid nodes were specified. */
        if (!hugetlb_cma_size)
                return;
 
        if (hugetlb_cma_size < (PAGE_SIZE << order)) {
                pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
                        (PAGE_SIZE << order) / SZ_1M);
+               hugetlb_cma_size = 0;
                return;
        }
 
-       /*
-        * If 3 GB area is requested on a machine with 4 numa nodes,
-        * let's allocate 1 GB on first three nodes and ignore the last one.
-        */
-       per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
-       pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
-               hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+       if (!node_specific_cma_alloc) {
+               /*
+                * If a 3 GB area is requested on a machine with 4 numa nodes,
+                * let's allocate 1 GB on the first three nodes and ignore the last one.
+                */
+               per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+               pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+                       hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+       }
 
        reserved = 0;
        for_each_node_state(nid, N_ONLINE) {
                int res;
                char name[CMA_MAX_NAME];
 
-               size = min(per_node, hugetlb_cma_size - reserved);
+               if (node_specific_cma_alloc) {
+                       if (hugetlb_cma_size_in_node[nid] == 0)
+                               continue;
+
+                       size = hugetlb_cma_size_in_node[nid];
+               } else {
+                       size = min(per_node, hugetlb_cma_size - reserved);
+               }
+
                size = round_up(size, PAGE_SIZE << order);
 
                snprintf(name, sizeof(name), "hugetlb%d", nid);
-               res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+               /*
+                * Note that 'order per bit' is based on the smallest size that
+                * may be returned to the CMA allocator in the case of
+                * huge page demotion.
+                */
+               res = cma_declare_contiguous_nid(0, size, 0,
+                                               PAGE_SIZE << HUGETLB_PAGE_ORDER,
                                                 0, false, name,
                                                 &hugetlb_cma[nid], nid);
                if (res) {
@@ -6426,6 +6992,13 @@ void __init hugetlb_cma_reserve(int order)
                if (reserved >= hugetlb_cma_size)
                        break;
        }
+
+       if (!reserved)
+               /*
+                * hugetlb_cma_size is used to determine if allocations from
+                * cma are possible.  Set to zero if no cma regions are set up.
+                */
+               hugetlb_cma_size = 0;
 }
 
 void __init hugetlb_cma_check(void)
index 5383023d0ccadde9424e4aa57f957ee5a817d0cb..79d93534ef1e8a9ad47ec98d883938d7aed0e1e7 100644 (file)
@@ -27,9 +27,6 @@
 #define MEMFILE_IDX(val)       (((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)      ((val) & 0xffff)
 
-#define hugetlb_cgroup_from_counter(counter, idx)                   \
-       container_of(counter, struct hugetlb_cgroup, hugepage[idx])
-
 static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
 
 static inline struct page_counter *
index b1001ebeb286b706fdd078a06af9aff48732c92f..3b79a5c9427a827e7646dbaf0abfe3b516c60a81 100644 (file)
@@ -41,12 +41,33 @@ static inline void *folio_raw_mapping(struct folio *folio)
        return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
 }
 
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+                                               int nr_throttled);
+static inline void acct_reclaim_writeback(struct folio *folio)
+{
+       pg_data_t *pgdat = folio_pgdat(folio);
+       int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
+
+       if (nr_throttled)
+               __acct_reclaim_writeback(pgdat, folio, nr_throttled);
+}
+
+static inline void wake_throttle_isolated(pg_data_t *pgdat)
+{
+       wait_queue_head_t *wqh;
+
+       wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
+       if (waitqueue_active(wqh))
+               wake_up(wqh);
+}
+
 vm_fault_t do_swap_page(struct vm_fault *vmf);
 void folio_rotate_reclaimable(struct folio *folio);
 bool __folio_end_writeback(struct folio *folio);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
 
 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
@@ -129,6 +150,7 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
+extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
 
 /*
  * in mm/rmap.c:
index 2baf121fb8c50030b761b8c546aa26853b8d08e3..8428da2aaf173e5337a37b204cba0641d80a1a01 100644 (file)
 #include "kasan.h"
 #include "../slab.h"
 
-depot_stack_handle_t kasan_save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
 {
        unsigned long entries[KASAN_STACK_DEPTH];
        unsigned int nr_entries;
 
        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
        nr_entries = filter_irq_stacks(entries, nr_entries);
-       return stack_depot_save(entries, nr_entries, flags);
+       return __stack_depot_save(entries, nr_entries, flags, can_alloc);
 }
 
 void kasan_set_track(struct kasan_track *track, gfp_t flags)
 {
        track->pid = current->pid;
-       track->stack = kasan_save_stack(flags);
+       track->stack = kasan_save_stack(flags, true);
 }
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
@@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache,
        /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
 #ifdef CONFIG_SLAB
        /* For SLAB assign tags based on the object index in the freelist. */
-       return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
+       return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object);
 #else
        /*
         * For SLUB assign a random tag during slab creation, otherwise reuse
index c3f5ba7a294a8ae9c988135c24971e02c6f2660f..84a038b07c6fe06c43713969cb041e330bee9951 100644 (file)
@@ -328,7 +328,7 @@ DEFINE_ASAN_SET_SHADOW(f3);
 DEFINE_ASAN_SET_SHADOW(f5);
 DEFINE_ASAN_SET_SHADOW(f8);
 
-void kasan_record_aux_stack(void *addr)
+static void __kasan_record_aux_stack(void *addr, bool can_alloc)
 {
        struct page *page = kasan_addr_to_page(addr);
        struct kmem_cache *cache;
@@ -345,7 +345,17 @@ void kasan_record_aux_stack(void *addr)
                return;
 
        alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
-       alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+       alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
+}
+
+void kasan_record_aux_stack(void *addr)
+{
+       return __kasan_record_aux_stack(addr, true);
+}
+
+void kasan_record_aux_stack_noalloc(void *addr)
+{
+       return __kasan_record_aux_stack(addr, false);
 }
 
 void kasan_set_free_info(struct kmem_cache *cache,
index b495e17445ad74f9a941af5ae32157ee4f2b0409..aebd8df86a1f2770b8caff0f61cbb9edf09ca0fe 100644 (file)
@@ -266,7 +266,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
 
 struct page *kasan_addr_to_page(const void *addr);
 
-depot_stack_handle_t kasan_save_stack(gfp_t flags);
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
 void kasan_set_track(struct kasan_track *track, gfp_t flags);
 void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
 struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
index 8d95ee52d0194e1862f1410d6bb20d64b20d734d..4a4929b29a237f7fd3617817491d173ed7d205b0 100644 (file)
@@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init);
 
 #ifdef CONFIG_KASAN_VMALLOC
 
+void __init __weak kasan_populate_early_vm_area_shadow(void *start,
+                                                      unsigned long size)
+{
+}
+
 static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
                                      void *unused)
 {
index 7a97db8bc8e75baa10ca10f4319f315d9821e72c..09945784df9e656171b0dc40bb023f7ced26fad3 100644 (file)
 #include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/debugfs.h>
+#include <linux/hash.h>
 #include <linux/irq_work.h>
+#include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/moduleparam.h>
 #include <linux/random.h>
@@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
 };
 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
 
+/* Pool usage threshold (percent) above which currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
 /* The pool of pages used for guard pages and objects. */
 char *__kfence_pool __ro_after_init;
 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -97,14 +104,41 @@ struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
 static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
 static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
 
-#ifdef CONFIG_KFENCE_STATIC_KEYS
-/* The static key to set up a KFENCE allocation. */
+/*
+ * The static key to set up a KFENCE allocation; or, if static keys are not
+ * used to gate allocations, to avoid a load and compare when KFENCE is
+ * disabled.
+ */
 DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
-#endif
 
 /* Gates the allocation, ensuring only one succeeds in a given period. */
 atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
 
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probability of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
+ */
+#define ALLOC_COVERED_HNUM     2
+#define ALLOC_COVERED_ORDER    (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE     (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK     (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Per-boot randomness for stack hashes, making it less likely that the same
+ * collisions recur across reboots and different machines.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
 /* Statistics counters for debugfs. */
 enum kfence_counter_id {
        KFENCE_COUNTER_ALLOCATED,
@@ -112,6 +146,9 @@ enum kfence_counter_id {
        KFENCE_COUNTER_FREES,
        KFENCE_COUNTER_ZOMBIES,
        KFENCE_COUNTER_BUGS,
+       KFENCE_COUNTER_SKIP_INCOMPAT,
+       KFENCE_COUNTER_SKIP_CAPACITY,
+       KFENCE_COUNTER_SKIP_COVERED,
        KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,11 +158,59 @@ static const char *const counter_names[] = {
        [KFENCE_COUNTER_FREES]          = "total frees",
        [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
        [KFENCE_COUNTER_BUGS]           = "total bugs",
+       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
+       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
+       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
 /* === Internals ============================================================ */
 
+static inline bool should_skip_covered(void)
+{
+       unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+       return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+       num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+       num_entries = filter_irq_stacks(stack_entries, num_entries);
+       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for the allocation stack trace hash
+ * @alloc_stack_hash in the Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+       int i;
+
+       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+               atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+       }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in the Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+       int i;
+
+       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+               if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+                       return false;
+               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+       }
+
+       return true;
+}
+
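
A self-contained user-space analogue of the Counting Bloom filter above (illustrative, not the kernel code: hash_32() is approximated by the kernel's golden-ratio multiply, and SIZE assumes CONFIG_KFENCE_NUM_OBJECTS=255, giving ALLOC_COVERED_ORDER = const_ilog2(255) + 2 = 9):

#include <stdint.h>
#include <stdio.h>

#define HNUM  2
#define ORDER 9			/* const_ilog2(255) + 2 */
#define SIZE  (1u << ORDER)
#define MASK  (SIZE - 1)

static int covered[SIZE];	/* atomic_t in the kernel; plain int here */

static uint32_t hnext(uint32_t h)
{
	return (h * 0x61C88647u) >> (32 - ORDER);	/* ~hash_32(h, ORDER) */
}

static void covered_add(uint32_t h, int val)
{
	for (int i = 0; i < HNUM; i++) {
		covered[h & MASK] += val;
		h = hnext(h);
	}
}

static int covered_contains(uint32_t h)
{
	for (int i = 0; i < HNUM; i++) {
		if (!covered[h & MASK])
			return 0;
		h = hnext(h);
	}
	return 1;
}

int main(void)
{
	covered_add(0xdeadbeef, 1);
	printf("%d %d\n", covered_contains(0xdeadbeef),
	       covered_contains(0x1234abcd));	/* 1, then 0 barring a false positive */
	covered_add(0xdeadbeef, -1);
	printf("%d\n", covered_contains(0xdeadbeef));	/* 0 again */
	return 0;
}
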
 static bool kfence_protect(unsigned long addr)
 {
        return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -183,19 +268,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
  * Update the object's metadata state, including updating the alloc/free stacks
  * depending on the state transition.
  */
-static noinline void metadata_update_state(struct kfence_metadata *meta,
-                                          enum kfence_object_state next)
+static noinline void
+metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
+                     unsigned long *stack_entries, size_t num_stack_entries)
 {
        struct kfence_track *track =
                next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
 
        lockdep_assert_held(&meta->lock);
 
-       /*
-        * Skip over 1 (this) functions; noinline ensures we do not accidentally
-        * skip over the caller by never inlining.
-        */
-       track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+       if (stack_entries) {
+               memcpy(track->stack_entries, stack_entries,
+                      num_stack_entries * sizeof(stack_entries[0]));
+       } else {
+               /*
+                * Skip over 1 (this) function; noinline ensures we do not
+                * accidentally skip over the caller by never inlining.
+                */
+               num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+       }
+       track->num_stack_entries = num_stack_entries;
        track->pid = task_pid_nr(current);
        track->cpu = raw_smp_processor_id();
        track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
@@ -218,12 +310,19 @@ static inline bool set_canary_byte(u8 *addr)
 /* Check canary byte at @addr. */
 static inline bool check_canary_byte(u8 *addr)
 {
+       struct kfence_metadata *meta;
+       unsigned long flags;
+
        if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
                return true;
 
        atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
-       kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
-                           KFENCE_ERROR_CORRUPTION);
+
+       meta = addr_to_metadata((unsigned long)addr);
+       raw_spin_lock_irqsave(&meta->lock, flags);
+       kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
+       raw_spin_unlock_irqrestore(&meta->lock, flags);
+
        return false;
 }
 
@@ -233,8 +332,6 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
        const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
        unsigned long addr;
 
-       lockdep_assert_held(&meta->lock);
-
        /*
         * We'll iterate over each canary byte per-side until fn() returns
         * false. However, we'll still iterate over the canary bytes to the
@@ -257,7 +354,9 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
        }
 }
 
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
+                                 unsigned long *stack_entries, size_t num_stack_entries,
+                                 u32 alloc_stack_hash)
 {
        struct kfence_metadata *meta = NULL;
        unsigned long flags;
@@ -271,8 +370,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
                list_del_init(&meta->list);
        }
        raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
-       if (!meta)
+       if (!meta) {
+               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
                return NULL;
+       }
 
        if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
                /*
@@ -314,11 +415,14 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
        addr = (void *)meta->addr;
 
        /* Update remaining metadata. */
-       metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+       metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
        /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
        WRITE_ONCE(meta->cache, cache);
        meta->size = size;
-       for_each_canary(meta, set_canary_byte);
+       meta->alloc_stack_hash = alloc_stack_hash;
+       raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+       alloc_covered_add(alloc_stack_hash, 1);
 
        /* Set required struct page fields. */
        page = virt_to_page(meta->addr);
@@ -328,9 +432,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
        if (IS_ENABLED(CONFIG_SLAB))
                page->s_mem = addr;
 
-       raw_spin_unlock_irqrestore(&meta->lock, flags);
-
        /* Memory initialization. */
+       for_each_canary(meta, set_canary_byte);
 
        /*
         * We check slab_want_init_on_alloc() ourselves, rather than letting
@@ -355,6 +458,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 {
        struct kcsan_scoped_access assert_page_exclusive;
        unsigned long flags;
+       bool init;
 
        raw_spin_lock_irqsave(&meta->lock, flags);
 
@@ -382,6 +486,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
                meta->unprotected_page = 0;
        }
 
+       /* Mark the object as freed. */
+       metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
+       init = slab_want_init_on_free(meta->cache);
+       raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+       alloc_covered_add(meta->alloc_stack_hash, -1);
+
        /* Check canary bytes for memory corruption. */
        for_each_canary(meta, check_canary_byte);
 
@@ -390,14 +501,9 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
         * data is still there, and after a use-after-free is detected, we
         * unprotect the page, so the data is still accessible.
         */
-       if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
+       if (!zombie && unlikely(init))
                memzero_explicit(addr, meta->size);
 
-       /* Mark the object as freed. */
-       metadata_update_state(meta, KFENCE_OBJECT_FREED);
-
-       raw_spin_unlock_irqrestore(&meta->lock, flags);
-
        /* Protect to detect use-after-frees. */
        kfence_protect((unsigned long)addr);
 
@@ -663,11 +769,14 @@ void __init kfence_init(void)
        if (!kfence_sample_interval)
                return;
 
+       stack_hash_seed = (u32)random_get_entropy();
        if (!kfence_init_pool()) {
                pr_err("%s failed\n", __func__);
                return;
        }
 
+       if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
+               static_branch_enable(&kfence_allocation_key);
        WRITE_ONCE(kfence_enabled, true);
        queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
        pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
@@ -736,12 +845,18 @@ void kfence_shutdown_cache(struct kmem_cache *s)
 
 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
+       unsigned long stack_entries[KFENCE_STACK_DEPTH];
+       size_t num_stack_entries;
+       u32 alloc_stack_hash;
+
        /*
         * Perform size check before switching kfence_allocation_gate, so that
         * we don't disable KFENCE without making an allocation.
         */
-       if (size > PAGE_SIZE)
+       if (size > PAGE_SIZE) {
+               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
                return NULL;
+       }
 
        /*
         * Skip allocations from non-default zones, including DMA. We cannot
@@ -749,15 +864,12 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
         * properties (e.g. reside in DMAable memory).
         */
        if ((flags & GFP_ZONEMASK) ||
-           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
                return NULL;
+       }
 
-       /*
-        * allocation_gate only needs to become non-zero, so it doesn't make
-        * sense to continue writing to it and pay the associated contention
-        * cost, in case we have a large number of concurrent allocations.
-        */
-       if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
+       if (atomic_inc_return(&kfence_allocation_gate) > 1)
                return NULL;
 #ifdef CONFIG_KFENCE_STATIC_KEYS
        /*
@@ -776,7 +888,25 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
        if (!READ_ONCE(kfence_enabled))
                return NULL;
 
-       return kfence_guarded_alloc(s, size, flags);
+       num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
+
+       /*
+        * Do expensive check for coverage of allocation in slow-path after
+        * allocation_gate has already become non-zero, even though it might
+        * mean not making any allocation within a given sample interval.
+        *
+        * This ensures reasonable allocation coverage when the pool is almost
+        * full, including avoiding long-lived allocations of the same source
+        * filling up the pool (e.g. pagecache allocations).
+        */
+       alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+       if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+               return NULL;
+       }
+
+       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+                                   alloc_stack_hash);
 }
 
 size_t kfence_ksize(const void *addr)
index c1f23c61e5f9110f785bf972b26733e3310ff368..2a2d5de9d3791624f780193074983e7edd6eb482 100644 (file)
@@ -87,6 +87,8 @@ struct kfence_metadata {
        /* Allocation and free stack information. */
        struct kfence_track alloc_track;
        struct kfence_track free_track;
+       /* For updating alloc_covered on frees. */
+       u32 alloc_stack_hash;
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
index f1690cf541998d0aa5b638ef7a8da4be9614b91b..695030c1fff8bba00cd54140f55f89bf64ac2917 100644 (file)
 #define arch_kfence_test_address(addr) (addr)
 #endif
 
+#define KFENCE_TEST_REQUIRES(test, cond) do {                  \
+       if (!(cond))                                            \
+               kunit_skip((test), "Test requires: " #cond);    \
+} while (0)
+
 /* Report as observed from console. */
 static struct {
        spinlock_t lock;
@@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test)
        };
        int i;
 
-       if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
-               return;
+       KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));
        /* Assume it hasn't been disabled on command line. */
 
        setup_test_cache(test, size, 0, NULL);
@@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test)
        char *buf1, *buf2;
        int i;
 
-       if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
-               kunit_warn(test, "skipping ... would take too long\n");
-               return;
-       }
+       /* Skip if we think it'd take too long. */
+       KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100);
 
        setup_test_cache(test, size, 0, NULL);
        buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
index 5f02fda6f26513c2c477438b04d2bc0caba04ee5..e99101162f1abf62be1249d01867e7b31c9e2bc8 100644 (file)
@@ -2299,6 +2299,11 @@ static void set_recommended_min_free_kbytes(void)
        int nr_zones = 0;
        unsigned long recommended_min;
 
+       if (!khugepaged_enabled()) {
+               calculate_min_free_kbytes();
+               goto update_wmarks;
+       }
+
        for_each_populated_zone(zone) {
                /*
                 * We don't need to worry about fragmentation of
@@ -2334,6 +2339,8 @@ static void set_recommended_min_free_kbytes(void)
 
                min_free_kbytes = recommended_min;
        }
+
+update_wmarks:
        setup_per_zone_wmarks();
 }
 
@@ -2355,12 +2362,11 @@ int start_stop_khugepaged(void)
 
                if (!list_empty(&khugepaged_scan.mm_head))
                        wake_up_interruptible(&khugepaged_wait);
-
-               set_recommended_min_free_kbytes();
        } else if (khugepaged_thread) {
                kthread_stop(khugepaged_thread);
                khugepaged_thread = NULL;
        }
+       set_recommended_min_free_kbytes();
 fail:
        mutex_unlock(&khugepaged_mutex);
        return err;
index cd58790d0fb3816711b4df2fbbb70d32ff0ede59..0cd5e89ca0631fa0ed16ec522dad263f0346ea77 100644 (file)
 #include "slab.h"
 
 #ifdef CONFIG_MEMCG_KMEM
-static LIST_HEAD(list_lrus);
+static LIST_HEAD(memcg_list_lrus);
 static DEFINE_MUTEX(list_lrus_mutex);
 
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+       return lru->memcg_aware;
+}
+
 static void list_lru_register(struct list_lru *lru)
 {
+       if (!list_lru_memcg_aware(lru))
+               return;
+
        mutex_lock(&list_lrus_mutex);
-       list_add(&lru->list, &list_lrus);
+       list_add(&lru->list, &memcg_list_lrus);
        mutex_unlock(&list_lrus_mutex);
 }
 
 static void list_lru_unregister(struct list_lru *lru)
 {
+       if (!list_lru_memcg_aware(lru))
+               return;
+
        mutex_lock(&list_lrus_mutex);
        list_del(&lru->list);
        mutex_unlock(&list_lrus_mutex);
@@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru)
        return lru->shrinker_id;
 }
 
-static inline bool list_lru_memcg_aware(struct list_lru *lru)
-{
-       return lru->memcg_aware;
-}
-
 static inline struct list_lru_one *
 list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
 {
@@ -176,13 +182,16 @@ unsigned long list_lru_count_one(struct list_lru *lru,
 {
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;
-       unsigned long count;
+       long count;
 
        rcu_read_lock();
        l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
        count = READ_ONCE(l->nr_items);
        rcu_read_unlock();
 
+       if (unlikely(count < 0))
+               count = 0;
+
        return count;
 }
 EXPORT_SYMBOL_GPL(list_lru_count_one);
@@ -354,8 +363,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
        struct list_lru_memcg *memcg_lrus;
        int size = memcg_nr_cache_ids;
 
-       memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
-                             size * sizeof(void *), GFP_KERNEL);
+       memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL);
        if (!memcg_lrus)
                return -ENOMEM;
 
@@ -389,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 
        old = rcu_dereference_protected(nlru->memcg_lrus,
                                        lockdep_is_held(&list_lrus_mutex));
-       new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
+       new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
 
@@ -398,19 +406,8 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
                return -ENOMEM;
        }
 
-       memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
-
-       /*
-        * The locking below allows readers that hold nlru->lock avoid taking
-        * rcu_read_lock (see list_lru_from_memcg_idx).
-        *
-        * Since list_lru_{add,del} may be called under an IRQ-safe lock,
-        * we have to use IRQ-safe primitives here to avoid deadlock.
-        */
-       spin_lock_irq(&nlru->lock);
+       memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size));
        rcu_assign_pointer(nlru->memcg_lrus, new);
-       spin_unlock_irq(&nlru->lock);
-
        kvfree_rcu(old, rcu);
        return 0;
 }
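
The struct_size()/flex_array_size() conversions in this file replace open-coded size arithmetic with overflow-checked helpers; a user-space sketch of the quantity they compute:

#include <stdio.h>
#include <stdlib.h>

struct demo {
	long nr;
	void *lru[];		/* flexible array member */
};

int main(void)
{
	size_t n = 8;
	/* what struct_size(d, lru, n) evaluates to, minus the overflow checks */
	size_t bytes = sizeof(struct demo) + n * sizeof(void *);
	struct demo *d = malloc(bytes);

	if (!d)
		return 1;
	printf("%zu bytes for %zu entries\n", bytes, n);
	free(d);
	return 0;
}
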
@@ -466,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru,
 {
        int i;
 
-       if (!list_lru_memcg_aware(lru))
-               return 0;
-
        for_each_node(i) {
                if (memcg_update_list_lru_node(&lru->node[i],
                                               old_size, new_size))
@@ -491,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
 {
        int i;
 
-       if (!list_lru_memcg_aware(lru))
-               return;
-
        for_each_node(i)
                memcg_cancel_update_list_lru_node(&lru->node[i],
                                                  old_size, new_size);
@@ -506,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size)
        int old_size = memcg_nr_cache_ids;
 
        mutex_lock(&list_lrus_mutex);
-       list_for_each_entry(lru, &list_lrus, list) {
+       list_for_each_entry(lru, &memcg_list_lrus, list) {
                ret = memcg_update_list_lru(lru, old_size, new_size);
                if (ret)
                        goto fail;
@@ -515,7 +506,7 @@ out:
        mutex_unlock(&list_lrus_mutex);
        return ret;
 fail:
-       list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+       list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list)
                memcg_cancel_update_list_lru(lru, old_size, new_size);
        goto out;
 }
@@ -552,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru,
 {
        int i;
 
-       if (!list_lru_memcg_aware(lru))
-               return;
-
        for_each_node(i)
                memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
 }
@@ -564,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
        struct list_lru *lru;
 
        mutex_lock(&list_lrus_mutex);
-       list_for_each_entry(lru, &list_lrus, list)
+       list_for_each_entry(lru, &memcg_list_lrus, list)
                memcg_drain_list_lru(lru, src_idx, dst_memcg);
        mutex_unlock(&list_lrus_mutex);
 }
index 5096500b2647300268bd2f56b9fc01a9c76826da..659bf0ffb0867308e23b6baf203988cbd0e30544 100644 (file)
@@ -366,14 +366,14 @@ void __init memblock_discard(void)
                addr = __pa(memblock.reserved.regions);
                size = PAGE_ALIGN(sizeof(struct memblock_region) *
                                  memblock.reserved.max);
-               __memblock_free_late(addr, size);
+               memblock_free_late(addr, size);
        }
 
        if (memblock.memory.regions != memblock_memory_init_regions) {
                addr = __pa(memblock.memory.regions);
                size = PAGE_ALIGN(sizeof(struct memblock_region) *
                                  memblock.memory.max);
-               __memblock_free_late(addr, size);
+               memblock_free_late(addr, size);
        }
 
        memblock_memory = NULL;
@@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
                kfree(old_array);
        else if (old_array != memblock_memory_init_regions &&
                 old_array != memblock_reserved_init_regions)
-               memblock_free_ptr(old_array, old_alloc_size);
+               memblock_free(old_array, old_alloc_size);
 
        /*
         * Reserve the new array if that comes from the memblock.  Otherwise, we
@@ -655,6 +655,7 @@ repeat:
  * @base: base address of the new region
  * @size: size of the new region
  * @nid: nid of the new region
+ * @flags: flags of the new region
  *
  * Add new memblock region [@base, @base + @size) to the "memory"
  * type. See memblock_add_range() description for mode details
@@ -663,14 +664,14 @@ repeat:
  * 0 on success, -errno on failure.
  */
 int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
-                                      int nid)
+                                     int nid, enum memblock_flags flags)
 {
        phys_addr_t end = base + size - 1;
 
-       memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__,
-                    &base, &end, nid, (void *)_RET_IP_);
+       memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+                    &base, &end, nid, flags, (void *)_RET_IP_);
 
-       return memblock_add_range(&memblock.memory, base, size, nid, 0);
+       return memblock_add_range(&memblock.memory, base, size, nid, flags);
 }
 
 /**
@@ -796,28 +797,28 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 }
 
 /**
- * memblock_free_ptr - free boot memory allocation
+ * memblock_free - free boot memory allocation
  * @ptr: starting address of the  boot memory allocation
  * @size: size of the boot memory block in bytes
  *
  * Free boot memory block previously allocated by memblock_alloc_xx() API.
  * The freeing memory will not be released to the buddy allocator.
  */
-void __init_memblock memblock_free_ptr(void *ptr, size_t size)
+void __init_memblock memblock_free(void *ptr, size_t size)
 {
        if (ptr)
-               memblock_free(__pa(ptr), size);
+               memblock_phys_free(__pa(ptr), size);
 }
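
Taken together with the hunks below, the renames leave each free routine named by what the caller holds (summarizing this file's changes only):

    memblock_free_ptr(ptr, size)      ->  memblock_free(ptr, size)         /* virtual address */
    memblock_free(base, size)         ->  memblock_phys_free(base, size)   /* physical address */
    __memblock_free_late(base, size)  ->  memblock_free_late(base, size)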
 
 /**
- * memblock_free - free boot memory block
+ * memblock_phys_free - free boot memory block
  * @base: phys starting address of the  boot memory block
  * @size: size of the boot memory block in bytes
  *
  * Free boot memory block previously allocated by memblock_alloc_xx() API.
  * The freeing memory will not be released to the buddy allocator.
  */
-int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
 {
        phys_addr_t end = base + size - 1;
 
@@ -981,6 +982,10 @@ static bool should_skip_region(struct memblock_type *type,
        if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
                return true;
 
+       /* skip driver-managed memory unless we were asked for it explicitly */
+       if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
+               return true;
+
        return false;
 }
 
@@ -1589,7 +1594,7 @@ void * __init memblock_alloc_try_nid(
 }
 
 /**
- * __memblock_free_late - free pages directly to buddy allocator
+ * memblock_free_late - free pages directly to buddy allocator
  * @base: phys starting address of the  boot memory block
  * @size: size of the boot memory block in bytes
  *
@@ -1597,7 +1602,7 @@ void * __init memblock_alloc_try_nid(
  * down, but we are still initializing the system.  Pages are released directly
  * to the buddy allocator.
  */
-void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
 {
        phys_addr_t cursor, end;
 
@@ -1937,7 +1942,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
         * memmap array.
         */
        if (pg < pgend)
-               memblock_free(pg, pgend - pg);
+               memblock_phys_free(pg, pgend - pg);
 }
 
 /*
index 8dab23a71fc4f67b962aeb1bdfefa8a2dd9ae692..508bcea7df5601ca416d2fa85aaaddb8b56ff733 100644 (file)
@@ -103,11 +103,6 @@ static bool do_memsw_account(void)
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
 
-/* memcg and lruvec stats flushing */
-static void flush_memcg_stats_dwork(struct work_struct *w);
-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
@@ -239,7 +234,7 @@ enum res_type {
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
 {
        return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
                (current->flags & PF_EXITING);
@@ -613,6 +608,58 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
        return mz;
 }
 
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats updates or reads are performance sensitive,
+ * and adding stats flushing in such codepaths is not desirable. So, to
+ * optimize the flushing, the kernel does two things:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds so the
+ *    rstat update tree does not grow unbounded.
+ *
+ * 2) Flush the stats synchronously on the reader side only when there are
+ *    more than (MEMCG_CHARGE_BATCH * nr_cpus) update events. This lets the
+ *    stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus) updates,
+ *    but only for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_SPINLOCK(stats_flush_lock);
+static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
+{
+       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
+               atomic_inc(&stats_flush_threshold);
+}
+
+static void __mem_cgroup_flush_stats(void)
+{
+       unsigned long flag;
+
+       if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+               return;
+
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+       atomic_set(&stats_flush_threshold, 0);
+       spin_unlock_irqrestore(&stats_flush_lock, flag);
+}
+
+void mem_cgroup_flush_stats(void)
+{
+       if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+               __mem_cgroup_flush_stats();
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+       mem_cgroup_flush_stats();
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
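To see the threshold arithmetic in (2) at work, here is a small userspace model of memcg_rstat_updated() (a sketch only; MEMCG_CHARGE_BATCH is 32 in kernels of this vintage, everything else is simplified):

	#include <stdio.h>

	#define MEMCG_CHARGE_BATCH 32	/* kernel value at the time */
	#define NR_CPUS 8

	static unsigned int stats_updates[NR_CPUS];	/* per-CPU counters */
	static int stats_flush_threshold;

	static void memcg_rstat_updated(int cpu)
	{
		/* every MEMCG_CHARGE_BATCH updates on a CPU bump the hint */
		if (!(++stats_updates[cpu] % MEMCG_CHARGE_BATCH))
			stats_flush_threshold++;
	}

	int main(void)
	{
		for (int i = 0; i < 600; i++)
			memcg_rstat_updated(i % NR_CPUS);

		/* readers flush only past MEMCG_CHARGE_BATCH * nr_cpus updates */
		printf("sync flush needed: %s\n",
		       stats_flush_threshold > NR_CPUS ? "yes" : "no");
		return 0;
	}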
 /**
  * __mod_memcg_state - update cgroup memory statistics
  * @memcg: the memory cgroup
@@ -625,7 +672,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
                return;
 
        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       memcg_rstat_updated(memcg);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -653,10 +700,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        memcg = pn->memcg;
 
        /* Update memcg */
-       __mod_memcg_state(memcg, idx, val);
+       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
        /* Update lruvec */
        __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+       memcg_rstat_updated(memcg);
 }
 
 /**
@@ -758,7 +807,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                return;
 
        __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
-       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+       memcg_rstat_updated(memcg);
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -1415,7 +1464,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
         *
         * Current memory state:
         */
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                u64 size;
@@ -1576,7 +1625,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         * A few threads which were not waiting at mutex_lock_killable() can
         * fail to bail out. Therefore, check again after holding oom_lock.
         */
-       ret = should_force_charge() || out_of_memory(&oc);
+       ret = task_is_dying() || out_of_memory(&oc);
 
 unlock:
        mutex_unlock(&oom_lock);
@@ -2544,6 +2593,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
        struct page_counter *counter;
        enum oom_status oom_status;
        unsigned long nr_reclaimed;
+       bool passed_oom = false;
        bool may_swap = true;
        bool drained = false;
        unsigned long pflags;
@@ -2578,15 +2628,6 @@ retry:
        if (gfp_mask & __GFP_ATOMIC)
                goto force;
 
-       /*
-        * Unlike in global OOM situations, memcg is not in a physical
-        * memory shortage.  Allow dying and OOM-killed tasks to
-        * bypass the last charges so that they can exit quickly and
-        * free their memory.
-        */
-       if (unlikely(should_force_charge()))
-               goto force;
-
        /*
         * Prevent unbounded recursion when reclaim operations need to
         * allocate memory. This might exceed the limits temporarily,
@@ -2644,8 +2685,9 @@ retry:
        if (gfp_mask & __GFP_RETRY_MAYFAIL)
                goto nomem;
 
-       if (fatal_signal_pending(current))
-               goto force;
+       /* Avoid endless loop for tasks bypassed by the oom killer */
+       if (passed_oom && task_is_dying())
+               goto nomem;
 
        /*
         * keep retrying as long as the memcg oom killer is able to make
@@ -2654,14 +2696,10 @@ retry:
         */
        oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
                       get_order(nr_pages * PAGE_SIZE));
-       switch (oom_status) {
-       case OOM_SUCCESS:
+       if (oom_status == OOM_SUCCESS) {
+               passed_oom = true;
                nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
-       case OOM_FAILED:
-               goto force;
-       default:
-               goto nomem;
        }
 nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
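The net effect of dropping the force-charge path and the OOM_FAILED/default cases is a simpler retry policy; paraphrased below (a control-flow sketch, not literal kernel code):

	bool passed_oom = false;
retry:
	/* reclaim, drain and throttle as before */
	if (passed_oom && task_is_dying())
		goto nomem;		/* already OOM-killed once: stop looping */
	if (mem_cgroup_oom(mem_over_limit, gfp_mask, order) == OOM_SUCCESS) {
		passed_oom = true;	/* the killer made progress */
		nr_retries = MAX_RECLAIM_RETRIES;
		goto retry;
	}
nomem:
	return -ENOMEM;			/* unless __GFP_NOFAIL */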
@@ -2736,8 +2774,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        return try_charge_memcg(memcg, gfp_mask, nr_pages);
 }
 
-#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
-static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
        if (mem_cgroup_is_root(memcg))
                return;
@@ -2746,7 +2783,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        if (do_memsw_account())
                page_counter_uncharge(&memcg->memsw, nr_pages);
 }
-#endif
 
 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 {
@@ -2965,7 +3001,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
                                   unsigned int nr_pages)
 {
-       struct page_counter *counter;
        struct mem_cgroup *memcg;
        int ret;
 
@@ -2975,21 +3010,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
        if (ret)
                goto out;
 
-       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
-           !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
-
-               /*
-                * Enforce __GFP_NOFAIL allocation because callers are not
-                * prepared to see failures and likely do not have any failure
-                * handling code.
-                */
-               if (gfp & __GFP_NOFAIL) {
-                       page_counter_charge(&memcg->kmem, nr_pages);
-                       goto out;
-               }
-               cancel_charge(memcg, nr_pages);
-               ret = -ENOMEM;
-       }
+       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               page_counter_charge(&memcg->kmem, nr_pages);
 out:
        css_put(&memcg->css);
 
@@ -3481,19 +3503,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 
        /* try to free all pages in this cgroup */
        while (nr_retries && page_counter_read(&memcg->memory)) {
-               int progress;
-
                if (signal_pending(current))
                        return -EINTR;
 
-               progress = try_to_free_mem_cgroup_pages(memcg, 1,
-                                                       GFP_KERNEL, true);
-               if (!progress) {
+               if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
                        nr_retries--;
-                       /* maybe some writeback is necessary */
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
-               }
-
        }
 
        return 0;
@@ -3534,8 +3548,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
        unsigned long val;
 
        if (mem_cgroup_is_root(memcg)) {
-               /* mem_cgroup_threshold() calls here from irqsafe context */
-               cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+               mem_cgroup_flush_stats();
                val = memcg_page_state(memcg, NR_FILE_PAGES) +
                        memcg_page_state(memcg, NR_ANON_MAPPED);
                if (swap)
@@ -3610,7 +3623,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
                return 0;
 
        BUG_ON(memcg->kmemcg_id >= 0);
-       BUG_ON(memcg->kmem_state);
 
        memcg_id = memcg_alloc_cache_id();
        if (memcg_id < 0)
@@ -3627,22 +3639,18 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
        static_branch_enable(&memcg_kmem_enabled_key);
 
        memcg->kmemcg_id = memcg_id;
-       memcg->kmem_state = KMEM_ONLINE;
 
        return 0;
 }
 
 static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
-       struct cgroup_subsys_state *css;
-       struct mem_cgroup *parent, *child;
+       struct mem_cgroup *parent;
        int kmemcg_id;
 
-       if (memcg->kmem_state != KMEM_ONLINE)
+       if (memcg->kmemcg_id == -1)
                return;
 
-       memcg->kmem_state = KMEM_ALLOCATED;
-
        parent = parent_mem_cgroup(memcg);
        if (!parent)
                parent = root_mem_cgroup;
@@ -3653,31 +3661,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
        BUG_ON(kmemcg_id < 0);
 
        /*
-        * Change kmemcg_id of this cgroup and all its descendants to the
-        * parent's id, and then move all entries from this cgroup's list_lrus
-        * to ones of the parent. After we have finished, all list_lrus
-        * corresponding to this cgroup are guaranteed to remain empty. The
-        * ordering is imposed by list_lru_node->lock taken by
+        * After we have finished memcg_reparent_objcgs(), all list_lrus
+        * corresponding to this cgroup are guaranteed to remain empty.
+        * The ordering is imposed by list_lru_node->lock taken by
         * memcg_drain_all_list_lrus().
         */
-       rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
-       css_for_each_descendant_pre(css, &memcg->css) {
-               child = mem_cgroup_from_css(css);
-               BUG_ON(child->kmemcg_id != kmemcg_id);
-               child->kmemcg_id = parent->kmemcg_id;
-       }
-       rcu_read_unlock();
-
        memcg_drain_all_list_lrus(kmemcg_id, parent);
 
        memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
-       /* css_alloc() failed, offlining didn't happen */
-       if (unlikely(memcg->kmem_state == KMEM_ONLINE))
-               memcg_offline_kmem(memcg);
+       memcg->kmemcg_id = -1;
 }
 #else
 static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3687,22 +3679,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
 }
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
-}
 #endif /* CONFIG_MEMCG_KMEM */
 
-static int memcg_update_kmem_max(struct mem_cgroup *memcg,
-                                unsigned long max)
-{
-       int ret;
-
-       mutex_lock(&memcg_max_mutex);
-       ret = page_counter_set_max(&memcg->kmem, max);
-       mutex_unlock(&memcg_max_mutex);
-       return ret;
-}
-
 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
 {
        int ret;
@@ -3768,10 +3746,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
                        break;
                case _KMEM:
-                       pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
-                                    "Please report your usecase to linux-mm@kvack.org if you "
-                                    "depend on this functionality.\n");
-                       ret = memcg_update_kmem_max(memcg, nr_pages);
+                       /* kmem.limit_in_bytes is deprecated. */
+                       ret = -EOPNOTSUPP;
                        break;
                case _TCP:
                        ret = memcg_update_tcp_max(memcg, nr_pages);
@@ -3916,7 +3892,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
        int nid;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
                seq_printf(m, "%s=%lu", stat->name,
@@ -3988,7 +3964,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                unsigned long nr;
@@ -4491,7 +4467,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 
-       cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
        *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -5324,7 +5300,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
        cancel_work_sync(&memcg->high_work);
        mem_cgroup_remove_from_trees(memcg);
        free_shrinker_info(memcg);
-       memcg_free_kmem(memcg);
+
+       /* Need to offline kmem if online_css() fails */
+       memcg_offline_kmem(memcg);
        mem_cgroup_free(memcg);
 }
 
@@ -5357,21 +5335,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg_wb_domain_size_changed(memcg);
 }
 
-void mem_cgroup_flush_stats(void)
-{
-       if (!spin_trylock(&stats_flush_lock))
-               return;
-
-       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
-       spin_unlock(&stats_flush_lock);
-}
-
-static void flush_memcg_stats_dwork(struct work_struct *w)
-{
-       mem_cgroup_flush_stats();
-       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
-}
-
 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5561,7 +5524,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 #endif
 
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
-                       unsigned long addr, pte_t ptent, swp_entry_t *entry)
+                       unsigned long addr, pte_t ptent)
 {
        if (!vma->vm_file) /* anonymous vma */
                return NULL;
@@ -5736,7 +5699,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
        else if (is_swap_pte(ptent))
                page = mc_handle_swap_pte(vma, ptent, &ent);
        else if (pte_none(ptent))
-               page = mc_handle_file_pte(vma, addr, ptent, &ent);
+               page = mc_handle_file_pte(vma, addr, ptent);
 
        if (!page && !ent.val)
                return ret;
@@ -6391,7 +6354,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
        int i;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
-       cgroup_rstat_flush(memcg->css.cgroup);
+       mem_cgroup_flush_stats();
 
        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                int nid;
index ff51edd6e9927d418d024191307b02bd06687466..f64ebb6226cbf6af8931ac0bf319f18a4ee77fdc 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,6 +39,7 @@
 #include <linux/kernel-page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
+#include <linux/dax.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
@@ -57,6 +58,7 @@
 #include <linux/ratelimit.h>
 #include <linux/page-isolation.h>
 #include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
 #include "internal.h"
 #include "ras/ras_event.h"
 
@@ -673,7 +675,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 #define hwpoison_hugetlb_range NULL
 #endif
 
-static struct mm_walk_ops hwp_walk_ops = {
+static const struct mm_walk_ops hwp_walk_ops = {
        .pmd_entry = hwpoison_pte_range,
        .hugetlb_entry = hwpoison_hugetlb_range,
 };
@@ -806,12 +808,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
        return ret;
 }
 
+struct page_state {
+       unsigned long mask;
+       unsigned long res;
+       enum mf_action_page_type type;
+
+       /* Callback ->action() has to unlock the relevant page inside it. */
+       int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+                              bool extra_pins)
+{
+       int count = page_count(p) - 1;
+
+       if (extra_pins)
+               count -= 1;
+
+       if (count > 0) {
+               pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+                      page_to_pfn(p), action_page_types[ps->type], count);
+               return true;
+       }
+
+       return false;
+}
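A worked example of the refcount arithmetic, under the assumptions of this patch:

	/*
	 * Illustration: a clean shmem page after handling. Memory-failure
	 * holds one reference, and shmem keeps the page in the page cache,
	 * which is the one "expected" extra pin:
	 *
	 *   page_count(p) == 2
	 *   count = 2 - 1 (our ref) - 1 (extra_pins) = 0  -> no stray users
	 *
	 * Any remaining count > 0 means someone else still references the
	 * page, and the action result is downgraded to MF_FAILED.
	 */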
+
 /*
  * Error hit kernel page.
  * Do nothing, try to be lucky and not touch this instead. For a few cases we
  * could be more sophisticated.
  */
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
 {
        unlock_page(p);
        return MF_IGNORED;
@@ -820,9 +854,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
 /*
  * Page in unknown state. Do nothing.
  */
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
 {
-       pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+       pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
        unlock_page(p);
        return MF_FAILED;
 }
@@ -830,10 +864,11 @@ static int me_unknown(struct page *p, unsigned long pfn)
 /*
  * Clean (or cleaned) page cache page.
  */
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
 {
        int ret;
        struct address_space *mapping;
+       bool extra_pins;
 
        delete_from_lru_cache(p);
 
@@ -862,14 +897,24 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
                goto out;
        }
 
+       /*
+        * The shmem page is kept in the page cache instead of being
+        * truncated, so it is expected to have an extra refcount after
+        * error-handling.
+        */
+       extra_pins = shmem_mapping(mapping);
+
        /*
         * Truncation is a bit tricky. Enable it per file system for now.
         *
         * Open: to take i_rwsem or not for this? Right now we don't.
         */
-       ret = truncate_error_page(p, pfn, mapping);
+       ret = truncate_error_page(p, page_to_pfn(p), mapping);
+       if (has_extra_refcount(ps, p, extra_pins))
+               ret = MF_FAILED;
+
 out:
        unlock_page(p);
+
        return ret;
 }
 
@@ -878,7 +923,7 @@ out:
  * Issues: when the error hit a hole page the error is not properly
  * propagated.
  */
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
 {
        struct address_space *mapping = page_mapping(p);
 
@@ -922,7 +967,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
                mapping_set_error(mapping, -EIO);
        }
 
-       return me_pagecache_clean(p, pfn);
+       return me_pagecache_clean(ps, p);
 }
 
 /*
@@ -944,9 +989,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
  * Clean swap cache pages can be directly isolated. A later page fault will
  * bring in the known good data from disk.
  */
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
 {
        int ret;
+       bool extra_pins = false;
 
        ClearPageDirty(p);
        /* Trigger EIO in shmem: */
@@ -954,10 +1000,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 
        ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
        unlock_page(p);
+
+       if (ret == MF_DELAYED)
+               extra_pins = true;
+
+       if (has_extra_refcount(ps, p, extra_pins))
+               ret = MF_FAILED;
+
        return ret;
 }
 
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
 {
        int ret;
 
@@ -965,6 +1018,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
 
        ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
        unlock_page(p);
+
+       if (has_extra_refcount(ps, p, false))
+               ret = MF_FAILED;
+
        return ret;
 }
 
@@ -974,7 +1031,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  *   To narrow down kill region to one page, we need to break up pmd.
  */
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
 {
        int res;
        struct page *hpage = compound_head(p);
@@ -985,7 +1042,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 
        mapping = page_mapping(hpage);
        if (mapping) {
-               res = truncate_error_page(hpage, pfn, mapping);
+               res = truncate_error_page(hpage, page_to_pfn(p), mapping);
                unlock_page(hpage);
        } else {
                res = MF_FAILED;
@@ -1003,6 +1060,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
                }
        }
 
+       if (has_extra_refcount(ps, p, false))
+               res = MF_FAILED;
+
        return res;
 }
 
@@ -1028,14 +1088,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 #define slab           (1UL << PG_slab)
 #define reserved       (1UL << PG_reserved)
 
-static struct page_state {
-       unsigned long mask;
-       unsigned long res;
-       enum mf_action_page_type type;
-
-       /* Callback ->action() has to unlock the relevant page inside it. */
-       int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
        { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
        /*
         * free pages are specially detected outside this table:
@@ -1095,19 +1148,10 @@ static int page_action(struct page_state *ps, struct page *p,
                        unsigned long pfn)
 {
        int result;
-       int count;
 
        /* page p should be unlocked after returning from ps->action().  */
-       result = ps->action(p, pfn);
+       result = ps->action(ps, p);
 
-       count = page_count(p) - 1;
-       if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
-               count--;
-       if (count > 0) {
-               pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
-                      pfn, action_page_types[ps->type], count);
-               result = MF_FAILED;
-       }
        action_result(pfn, ps->type, result);
 
        /* Could do more checks here if page looks ok */
@@ -1400,14 +1444,11 @@ static int identify_page_state(unsigned long pfn, struct page *p,
 static int try_to_split_thp_page(struct page *page, const char *msg)
 {
        lock_page(page);
-       if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+       if (unlikely(split_huge_page(page))) {
                unsigned long pfn = page_to_pfn(page);
 
                unlock_page(page);
-               if (!PageAnon(page))
-                       pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
-               else
-                       pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+               pr_info("%s: %#lx: thp split failed\n", msg, pfn);
                put_page(page);
                return -EBUSY;
        }
index bcc4b0727a63239e8ac14f25429a37552aef2d70..8f1de811a1dcb52e7120169afdff5f5a6e275668 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -433,35 +433,39 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
        }
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
 {
-       spinlock_t *ptl;
-       pgtable_t new = pte_alloc_one(mm);
-       if (!new)
-               return -ENOMEM;
+       spinlock_t *ptl = pmd_lock(mm, pmd);
 
-       /*
-        * Ensure all pte setup (eg. pte page lock and page clearing) are
-        * visible before the pte is made visible to other CPUs by being
-        * put into page tables.
-        *
-        * The other side of the story is the pointer chasing in the page
-        * table walking code (when walking the page table without locking;
-        * ie. most of the time). Fortunately, these data accesses consist
-        * of a chain of data-dependent loads, meaning most CPUs (alpha
-        * being the notable exception) will already guarantee loads are
-        * seen in-order. See the alpha page table accessors for the
-        * smp_rmb() barriers in page table walking code.
-        */
-       smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
-
-       ptl = pmd_lock(mm, pmd);
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                mm_inc_nr_ptes(mm);
-               pmd_populate(mm, pmd, new);
-               new = NULL;
+               /*
+                * Ensure all pte setup (eg. pte page lock and page clearing) are
+                * visible before the pte is made visible to other CPUs by being
+                * put into page tables.
+                *
+                * The other side of the story is the pointer chasing in the page
+                * table walking code (when walking the page table without locking;
+                * ie. most of the time). Fortunately, these data accesses consist
+                * of a chain of data-dependent loads, meaning most CPUs (alpha
+                * being the notable exception) will already guarantee loads are
+                * seen in-order. See the alpha page table accessors for the
+                * smp_rmb() barriers in page table walking code.
+                */
+               smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+               pmd_populate(mm, pmd, *pte);
+               *pte = NULL;
        }
        spin_unlock(ptl);
+}
+
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+       pgtable_t new = pte_alloc_one(mm);
+       if (!new)
+               return -ENOMEM;
+
+       pmd_install(mm, pmd, &new);
        if (new)
                pte_free(mm, new);
        return 0;
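The smp_wmb() comment above describes a classic publish/consume pairing. A generic sketch of the two sides (illustrative only; the helper names here are hypothetical):

	/* publisher: what pmd_install() does, in the abstract */
	initialize_pte_page(new);	/* page lock init, zeroing, etc. */
	smp_wmb();			/* order the init before publication */
	WRITE_ONCE(*pmdp, pmd_for(new)); /* now visible to lockless walkers */

	/* lockless walker: the consumer side */
	pmd = READ_ONCE(*pmdp);
	if (!pmd_none(pmd))
		walk(pte_page_of(pmd));	/* data-dependent load; ordered on
					 * every CPU except alpha, which adds
					 * smp_rmb() in its accessors */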
@@ -473,10 +477,9 @@ int __pte_alloc_kernel(pmd_t *pmd)
        if (!new)
                return -ENOMEM;
 
-       smp_wmb(); /* See comment in __pte_alloc */
-
        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
+               smp_wmb(); /* See comment in pmd_install() */
                pmd_populate_kernel(&init_mm, pmd, new);
                new = NULL;
        }
@@ -1333,16 +1336,8 @@ again:
                        struct page *page;
 
                        page = vm_normal_page(vma, addr, ptent);
-                       if (unlikely(details) && page) {
-                               /*
-                                * unmap_shared_mapping_pages() wants to
-                                * invalidate cache without truncating:
-                                * unmap shared but keep private pages.
-                                */
-                               if (details->check_mapping &&
-                                   details->check_mapping != page_rmapping(page))
-                                       continue;
-                       }
+                       if (unlikely(zap_skip_check_mapping(details, page)))
+                               continue;
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        tlb_remove_tlb_entry(tlb, pte, addr);
@@ -1375,17 +1370,8 @@ again:
                    is_device_exclusive_entry(entry)) {
                        struct page *page = pfn_swap_entry_to_page(entry);
 
-                       if (unlikely(details && details->check_mapping)) {
-                               /*
-                                * unmap_shared_mapping_pages() wants to
-                                * invalidate cache without truncating:
-                                * unmap shared but keep private pages.
-                                */
-                               if (details->check_mapping !=
-                                   page_rmapping(page))
-                                       continue;
-                       }
-
+                       if (unlikely(zap_skip_check_mapping(details, page)))
+                               continue;
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
 
@@ -2724,19 +2710,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
  * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-                               pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf)
 {
        int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
-               spinlock_t *ptl = pte_lockptr(mm, pmd);
+               spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
                spin_lock(ptl);
-               same = pte_same(*page_table, orig_pte);
+               same = pte_same(*vmf->pte, vmf->orig_pte);
                spin_unlock(ptl);
        }
 #endif
-       pte_unmap(page_table);
+       pte_unmap(vmf->pte);
+       vmf->pte = NULL;
        return same;
 }
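The locking in pte_unmap_same() only matters when a PTE is wider than a native word (e.g. 32-bit x86 with PAE), where a plain load is performed as two halves and can observe a torn value:

	/*
	 *   reader               writer
	 *   load low half
	 *                        store low half
	 *                        store high half
	 *   load high half      -> reader sees half old, half new
	 *
	 * Taking the PTE lock around the comparison rules this out; on
	 * 64-bit kernels sizeof(pte_t) == sizeof(unsigned long) and the
	 * whole branch compiles away.
	 */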
 
@@ -3321,20 +3307,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
 }
 
 static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
+                                           pgoff_t first_index,
+                                           pgoff_t last_index,
                                            struct zap_details *details)
 {
        struct vm_area_struct *vma;
        pgoff_t vba, vea, zba, zea;
 
-       vma_interval_tree_foreach(vma, root,
-                       details->first_index, details->last_index) {
-
+       vma_interval_tree_foreach(vma, root, first_index, last_index) {
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma) - 1;
-               zba = details->first_index;
+               zba = first_index;
                if (zba < vba)
                        zba = vba;
-               zea = details->last_index;
+               zea = last_index;
                if (zea > vea)
                        zea = vea;
 
@@ -3360,18 +3346,22 @@ void unmap_mapping_page(struct page *page)
 {
        struct address_space *mapping = page->mapping;
        struct zap_details details = { };
+       pgoff_t first_index;
+       pgoff_t last_index;
 
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageTail(page));
 
-       details.check_mapping = mapping;
-       details.first_index = page->index;
-       details.last_index = page->index + thp_nr_pages(page) - 1;
+       first_index = page->index;
+       last_index = page->index + thp_nr_pages(page) - 1;
+
+       details.zap_mapping = mapping;
        details.single_page = page;
 
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
-               unmap_mapping_range_tree(&mapping->i_mmap, &details);
+               unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+                                        last_index, &details);
        i_mmap_unlock_write(mapping);
 }
 
@@ -3391,16 +3381,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
 {
        struct zap_details details = { };
+       pgoff_t first_index = start;
+       pgoff_t last_index = start + nr - 1;
 
-       details.check_mapping = even_cows ? NULL : mapping;
-       details.first_index = start;
-       details.last_index = start + nr - 1;
-       if (details.last_index < details.first_index)
-               details.last_index = ULONG_MAX;
+       details.zap_mapping = even_cows ? NULL : mapping;
+       if (last_index < first_index)
+               last_index = ULONG_MAX;
 
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
-               unmap_mapping_range_tree(&mapping->i_mmap, &details);
+               unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+                                        last_index, &details);
        i_mmap_unlock_write(mapping);
 }
 EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@@ -3488,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        vm_fault_t ret = 0;
        void *shadow = NULL;
 
-       if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+       if (!pte_unmap_same(vmf))
                goto out;
 
        entry = pte_to_swp_entry(vmf->orig_pte);
@@ -3853,7 +3844,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
-               smp_wmb(); /* See comment in __pte_alloc() */
        }
 
        ret = vma->vm_ops->fault(vmf);
@@ -3924,7 +3914,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
-               smp_wmb(); /* See comment in __pte_alloc() */
        }
 
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -4037,17 +4026,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
                                return ret;
                }
 
-               if (vmf->prealloc_pte) {
-                       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-                       if (likely(pmd_none(*vmf->pmd))) {
-                               mm_inc_nr_ptes(vma->vm_mm);
-                               pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
-                               vmf->prealloc_pte = NULL;
-                       }
-                       spin_unlock(vmf->ptl);
-               } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
+               if (vmf->prealloc_pte)
+                       pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+               else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
                        return VM_FAULT_OOM;
-               }
        }
 
        /* See comment in handle_pte_fault() */
@@ -4156,7 +4138,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
-               smp_wmb(); /* See comment in __pte_alloc() */
        }
 
        return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
@@ -4831,13 +4812,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
        if (!new)
                return -ENOMEM;
 
-       smp_wmb(); /* See comment in __pte_alloc */
-
        spin_lock(&mm->page_table_lock);
-       if (pgd_present(*pgd))          /* Another has populated it */
+       if (pgd_present(*pgd)) {        /* Another has populated it */
                p4d_free(mm, new);
-       else
+       } else {
+               smp_wmb(); /* See comment in pmd_install() */
                pgd_populate(mm, pgd, new);
+       }
        spin_unlock(&mm->page_table_lock);
        return 0;
 }
@@ -4854,11 +4835,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
        if (!new)
                return -ENOMEM;
 
-       smp_wmb(); /* See comment in __pte_alloc */
-
        spin_lock(&mm->page_table_lock);
        if (!p4d_present(*p4d)) {
                mm_inc_nr_puds(mm);
+               smp_wmb(); /* See comment in pmd_install() */
                p4d_populate(mm, p4d, new);
        } else  /* Another has populated it */
                pud_free(mm, new);
@@ -4879,14 +4859,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
        if (!new)
                return -ENOMEM;
 
-       smp_wmb(); /* See comment in __pte_alloc */
-
        ptl = pud_lock(mm, pud);
        if (!pud_present(*pud)) {
                mm_inc_nr_pmds(mm);
+               smp_wmb(); /* See comment in pmd_install() */
                pud_populate(mm, pud, new);
-       } else  /* Another has populated it */
+       } else {        /* Another has populated it */
                pmd_free(mm, new);
+       }
        spin_unlock(ptl);
        return 0;
 }
@@ -5423,7 +5403,6 @@ long copy_huge_page_from_user(struct page *dst_page,
                                unsigned int pages_per_huge_page,
                                bool allow_pagefault)
 {
-       void *src = (void *)usr_src;
        void *page_kaddr;
        unsigned long i, rc = 0;
        unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
@@ -5436,8 +5415,7 @@ long copy_huge_page_from_user(struct page *dst_page,
                else
                        page_kaddr = kmap_atomic(subpage);
                rc = copy_from_user(page_kaddr,
-                               (const void __user *)(src + i * PAGE_SIZE),
-                               PAGE_SIZE);
+                               usr_src + i * PAGE_SIZE, PAGE_SIZE);
                if (allow_pagefault)
                        kunmap(subpage);
                else
index 9fd0be32a281e238cb17061690435777f40fe3d9..852041f6be418c317db8a40c94cf21d501ba9cb3 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,7 +21,6 @@
 #include <linux/memory.h>
 #include <linux/memremap.h>
 #include <linux/memory_hotplug.h>
-#include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
 #include <linux/delay.h>
@@ -57,7 +56,7 @@ enum {
        ONLINE_POLICY_AUTO_MOVABLE,
 };
 
-const char *online_policy_to_str[] = {
+static const char * const online_policy_to_str[] = {
        [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
        [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
 };
@@ -220,7 +219,6 @@ static void release_memory_resource(struct resource *res)
        kfree(res);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
                const char *reason)
 {
@@ -586,10 +584,6 @@ void generic_online_page(struct page *page, unsigned int order)
        debug_pagealloc_map_pages(page, 1 << order);
        __free_pages_core(page, order);
        totalram_pages_add(1UL << order);
-#ifdef CONFIG_HIGHMEM
-       if (PageHighMem(page))
-               totalhigh_pages_add(1UL << order);
-#endif
 }
 EXPORT_SYMBOL_GPL(generic_online_page);
 
@@ -626,16 +620,11 @@ static void node_states_check_changes_online(unsigned long nr_pages,
 
        arg->status_change_nid = NUMA_NO_NODE;
        arg->status_change_nid_normal = NUMA_NO_NODE;
-       arg->status_change_nid_high = NUMA_NO_NODE;
 
        if (!node_state(nid, N_MEMORY))
                arg->status_change_nid = nid;
        if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
                arg->status_change_nid_normal = nid;
-#ifdef CONFIG_HIGHMEM
-       if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
-               arg->status_change_nid_high = nid;
-#endif
 }
 
 static void node_states_set_node(int node, struct memory_notify *arg)
@@ -643,9 +632,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
        if (arg->status_change_nid_normal >= 0)
                node_set_state(node, N_NORMAL_MEMORY);
 
-       if (arg->status_change_nid_high >= 0)
-               node_set_state(node, N_HIGH_MEMORY);
-
        if (arg->status_change_nid >= 0)
                node_set_state(node, N_MEMORY);
 }
@@ -1163,7 +1149,6 @@ failed_addition:
        mem_hotplug_done();
        return ret;
 }
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 static void reset_node_present_pages(pg_data_t *pgdat)
 {
@@ -1357,6 +1342,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 {
        struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+       enum memblock_flags memblock_flags = MEMBLOCK_NONE;
        struct vmem_altmap mhp_altmap = {};
        struct memory_group *group = NULL;
        u64 start, size;
@@ -1384,8 +1370,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 
        mem_hotplug_begin();
 
-       if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
-               memblock_add_node(start, size, nid);
+       if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+               if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
+                       memblock_flags = MEMBLOCK_DRIVER_MANAGED;
+               ret = memblock_add_node(start, size, nid, memblock_flags);
+               if (ret)
+                       goto error_mem_hotplug_end;
+       }
 
        ret = __try_online_node(nid, false);
        if (ret < 0)
@@ -1458,6 +1449,7 @@ error:
                rollback_node_hotadd(nid);
        if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
                memblock_remove(start, size);
+error_mem_hotplug_end:
        mem_hotplug_done();
        return ret;
 }
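This closes the loop with the new MEMBLOCK_DRIVER_MANAGED flag from the memblock hunk above: ranges added through the driver-managed path now get tagged end to end. A hedged sketch of how a driver would trigger it (resource name borrowed from the dax/kmem driver; error handling omitted):

	rc = add_memory_driver_managed(nid, res->start, resource_size(res),
				       "System RAM (kmem)", MHP_NONE);
	/* the resource carries IORESOURCE_SYSRAM_DRIVER_MANAGED, so
	 * add_memory_resource() records the range as
	 * MEMBLOCK_DRIVER_MANAGED and generic memblock iterators skip it
	 * (e.g. when kexec picks target memory) */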
@@ -1803,7 +1795,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
 
        arg->status_change_nid = NUMA_NO_NODE;
        arg->status_change_nid_normal = NUMA_NO_NODE;
-       arg->status_change_nid_high = NUMA_NO_NODE;
 
        /*
         * Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1818,24 +1809,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
        if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
                arg->status_change_nid_normal = zone_to_nid(zone);
 
-#ifdef CONFIG_HIGHMEM
        /*
-        * node_states[N_HIGH_MEMORY] contains nodes which
-        * have normal memory or high memory.
-        * Here we add the present_pages belonging to ZONE_HIGHMEM.
-        * If the zone is within the range of [0..ZONE_HIGHMEM), and
-        * we determine that the zones in that range become empty,
-        * we need to clear the node for N_HIGH_MEMORY.
-        */
-       present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
-       if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
-               arg->status_change_nid_high = zone_to_nid(zone);
-#endif
-
-       /*
-        * We have accounted the pages from [0..ZONE_NORMAL), and
-        * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
-        * as well.
+        * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
+        * does not apply as we don't support 32-bit.
         * Here we count the possible pages from ZONE_MOVABLE.
         * If after having accounted all the pages, we see that the nr_pages
         * to be offlined is over or equal to the accounted pages,
@@ -1853,9 +1829,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
        if (arg->status_change_nid_normal >= 0)
                node_clear_state(node, N_NORMAL_MEMORY);
 
-       if (arg->status_change_nid_high >= 0)
-               node_clear_state(node, N_HIGH_MEMORY);
-
        if (arg->status_change_nid >= 0)
                node_clear_state(node, N_MEMORY);
 }
@@ -2204,7 +2177,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
        arch_remove_memory(start, size, altmap);
 
        if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
-               memblock_free(start, size);
+               memblock_phys_free(start, size);
                memblock_remove(start, size);
        }
 
index f4b4be7af4d3ad7f8b67c336acf712b3eadc010b..10e9c87260edea7af123620da62d12945696c3a4 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2206,6 +2206,88 @@ struct folio *folio_alloc(gfp_t gfp, unsigned order)
 }
 EXPORT_SYMBOL(folio_alloc);
 
+static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+               struct mempolicy *pol, unsigned long nr_pages,
+               struct page **page_array)
+{
+       int nodes;
+       unsigned long nr_pages_per_node;
+       int delta;
+       int i;
+       unsigned long nr_allocated;
+       unsigned long total_allocated = 0;
+
+       nodes = nodes_weight(pol->nodes);
+       nr_pages_per_node = nr_pages / nodes;
+       delta = nr_pages - nodes * nr_pages_per_node;
+
+       for (i = 0; i < nodes; i++) {
+               if (delta) {
+                       nr_allocated = __alloc_pages_bulk(gfp,
+                                       interleave_nodes(pol), NULL,
+                                       nr_pages_per_node + 1, NULL,
+                                       page_array);
+                       delta--;
+               } else {
+                       nr_allocated = __alloc_pages_bulk(gfp,
+                                       interleave_nodes(pol), NULL,
+                                       nr_pages_per_node, NULL, page_array);
+               }
+
+               page_array += nr_allocated;
+               total_allocated += nr_allocated;
+       }
+
+       return total_allocated;
+}
+
+static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+               struct mempolicy *pol, unsigned long nr_pages,
+               struct page **page_array)
+{
+       gfp_t preferred_gfp;
+       unsigned long nr_allocated = 0;
+
+       preferred_gfp = gfp | __GFP_NOWARN;
+       preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+
+       nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
+                                          nr_pages, NULL, page_array);
+
+       if (nr_allocated < nr_pages)
+               nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
+                               nr_pages - nr_allocated, NULL,
+                               page_array + nr_allocated);
+       return nr_allocated;
+}
+
+/*
+ * Bulk page allocation and the mempolicy have to be considered at the same
+ * time in some situations, such as vmalloc.
+ *
+ * This can accelerate memory allocation, especially for interleaved
+ * allocations.
+ */
+unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+               unsigned long nr_pages, struct page **page_array)
+{
+       struct mempolicy *pol = &default_policy;
+
+       if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+               pol = get_task_policy(current);
+
+       if (pol->mode == MPOL_INTERLEAVE)
+               return alloc_pages_bulk_array_interleave(gfp, pol,
+                                                        nr_pages, page_array);
+
+       if (pol->mode == MPOL_PREFERRED_MANY)
+               return alloc_pages_bulk_array_preferred_many(gfp,
+                               numa_node_id(), pol, nr_pages, page_array);
+
+       return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
+                                 policy_nodemask(gfp, pol), nr_pages, NULL,
+                                 page_array);
+}
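The interleave helper splits nr_pages evenly across the policy's nodes and spreads the remainder one page at a time. A standalone model of the split (assuming 10 pages over 3 nodes):

	#include <stdio.h>

	int main(void)
	{
		unsigned long nr_pages = 10;
		int nodes = 3;
		unsigned long per_node = nr_pages / nodes;	/* 3 */
		int delta = nr_pages - nodes * per_node;	/* 1 left over */

		for (int i = 0; i < nodes; i++) {
			unsigned long n = per_node + (delta > 0 ? 1 : 0);

			if (delta > 0)
				delta--;
			printf("interleave slot %d: %lu pages\n", i, n);
		}
		return 0;	/* 4 + 3 + 3 = 10 */
	}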
+
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
        struct mempolicy *pol = mpol_dup(vma_policy(src));
@@ -2985,64 +3067,3 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
 }
-
-bool numa_demotion_enabled = false;
-
-#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
-                                         struct kobj_attribute *attr, char *buf)
-{
-       return sysfs_emit(buf, "%s\n",
-                         numa_demotion_enabled? "true" : "false");
-}
-
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
-                                          struct kobj_attribute *attr,
-                                          const char *buf, size_t count)
-{
-       if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
-               numa_demotion_enabled = true;
-       else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
-               numa_demotion_enabled = false;
-       else
-               return -EINVAL;
-
-       return count;
-}
-
-static struct kobj_attribute numa_demotion_enabled_attr =
-       __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
-              numa_demotion_enabled_store);
-
-static struct attribute *numa_attrs[] = {
-       &numa_demotion_enabled_attr.attr,
-       NULL,
-};
-
-static const struct attribute_group numa_attr_group = {
-       .attrs = numa_attrs,
-};
-
-static int __init numa_init_sysfs(void)
-{
-       int err;
-       struct kobject *numa_kobj;
-
-       numa_kobj = kobject_create_and_add("numa", mm_kobj);
-       if (!numa_kobj) {
-               pr_err("failed to create numa kobject\n");
-               return -ENOMEM;
-       }
-       err = sysfs_create_group(numa_kobj, &numa_attr_group);
-       if (err) {
-               pr_err("failed to register numa group\n");
-               goto delete_obj;
-       }
-       return 0;
-
-delete_obj:
-       kobject_put(numa_kobj);
-       return err;
-}
-subsys_initcall(numa_init_sysfs);
-#endif
index efa9941ebe031cf6c7bc8be8bb1551d79007f22d..a11e948593df02f1e0a3f6f84948be175e84b3bd 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3305,3 +3305,64 @@ static int __init migrate_on_reclaim_init(void)
 }
 late_initcall(migrate_on_reclaim_init);
 #endif /* CONFIG_HOTPLUG_CPU */
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+                                         struct kobj_attribute *attr, char *buf)
+{
+       return sysfs_emit(buf, "%s\n",
+                         numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+                                          struct kobj_attribute *attr,
+                                          const char *buf, size_t count)
+{
+       if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+               numa_demotion_enabled = true;
+       else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+               numa_demotion_enabled = false;
+       else
+               return -EINVAL;
+
+       return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+       __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+              numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+       &numa_demotion_enabled_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+       .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+       int err;
+       struct kobject *numa_kobj;
+
+       numa_kobj = kobject_create_and_add("numa", mm_kobj);
+       if (!numa_kobj) {
+               pr_err("failed to create numa kobject\n");
+               return -ENOMEM;
+       }
+       err = sysfs_create_group(numa_kobj, &numa_attr_group);
+       if (err) {
+               pr_err("failed to register numa group\n");
+               goto delete_obj;
+       }
+       return 0;
+
+delete_obj:
+       kobject_put(numa_kobj);
+       return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
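The block is moved verbatim from mempolicy.c (removed above). For reference, the knob it exposes lives at /sys/kernel/mm/numa/demotion_enabled and accepts true/1/false/0; a minimal userspace toggle (illustrative):

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/kernel/mm/numa/demotion_enabled", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, "true", 4) != 4)	/* "1", "false", "0" also parse */
			return 1;
		close(fd);
		return 0;
	}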
index 88dcc5c252255f289fae6dfb85a80d43bf0f1628..b22a07f5e76145972784076fa8a86d6c9a261840 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3332,7 +3332,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
 
 void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
 {
-       mm->total_vm += npages;
+       WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) + npages);
 
        if (is_exec_mapping(flags))
                mm->exec_vm += npages;
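The WRITE_ONCE() makes the update tearing-safe for readers that sample total_vm without holding mmap_lock (for example the /proc reporting paths); such a reader would pair it with READ_ONCE():

	/* lockless reader sketch; pairs with the WRITE_ONCE() above */
	unsigned long pages = READ_ONCE(mm->total_vm);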
index 883e2cc85cad84ad0e67822b8ac22a0ecbae9aee..e552f5e0ccbded82e503eba25f45b8a3a938c2c2 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
        error = -ENOMEM;
        if (!vma)
                goto out;
-       prev = vma->vm_prev;
+
        if (unlikely(grows & PROT_GROWSDOWN)) {
                if (vma->vm_start >= end)
                        goto out;
@@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
                                goto out;
                }
        }
+
        if (start > vma->vm_start)
                prev = vma;
+       else
+               prev = vma->vm_prev;
 
        for (nstart = start ; ; ) {
                unsigned long mask_off_old_flags;
index badfe17ade1f06c57db60468bad7ebd55f96b55c..002eec83e91e5afd4b2229ccd2712a599ee31036 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
        old_end = old_addr + len;
        flush_cache_range(vma, old_addr, old_end);
 
+       if (is_vm_hugetlb_page(vma))
+               return move_hugetlb_page_tables(vma, new_vma, old_addr,
+                                               new_addr, len);
+
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
                                old_addr, old_end);
        mmu_notifier_invalidate_range_start(&range);
@@ -565,6 +569,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                bool *locked, unsigned long flags,
                struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
+       long to_account = new_len - old_len;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
@@ -583,6 +588,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;
 
+       if (unlikely(flags & MREMAP_DONTUNMAP))
+               to_account = new_len;
+
        if (vma->vm_ops && vma->vm_ops->may_split) {
                if (vma->vm_start != old_addr)
                        err = vma->vm_ops->may_split(vma, old_addr);
@@ -604,8 +612,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (err)
                return err;
 
-       if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
-               if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
+       if (vm_flags & VM_ACCOUNT) {
+               if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
                        return -ENOMEM;
        }
 
@@ -613,8 +621,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
                           &need_rmap_locks);
        if (!new_vma) {
-               if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
-                       vm_unacct_memory(new_len >> PAGE_SHIFT);
+               if (vm_flags & VM_ACCOUNT)
+                       vm_unacct_memory(to_account >> PAGE_SHIFT);
                return -ENOMEM;
        }
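A worked example of the new to_account rule: growing a 4-page VM_ACCOUNT mapping to 6 pages charges the 2-page delta, but with MREMAP_DONTUNMAP the old mapping stays in place and stays charged, so the full 6 pages of the new mapping are charged instead. As a tiny standalone calculation:

	#include <stdio.h>

	int main(void)
	{
		long old_len = 4, new_len = 6;	/* lengths in pages */
		long plain = new_len - old_len;	/* ordinary grow: 2 */
		long dontunmap = new_len;	/* MREMAP_DONTUNMAP: 6 */

		printf("plain mremap charges %ld pages\n", plain);
		printf("MREMAP_DONTUNMAP charges %ld pages\n", dontunmap);
		return 0;
	}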
 
@@ -642,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                mremap_userfaultfd_prep(new_vma, uf);
        }
 
+       if (is_vm_hugetlb_page(vma))
+               clear_vma_resv_huge_pages(vma);
+
        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
                vma->vm_flags &= ~VM_ACCOUNT;
@@ -708,8 +720,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-       unsigned long old_len, unsigned long new_len, unsigned long flags,
-       unsigned long *p)
+       unsigned long old_len, unsigned long new_len, unsigned long flags)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
@@ -736,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
                        (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
                return ERR_PTR(-EINVAL);
 
-       if (is_vm_hugetlb_page(vma))
-               return ERR_PTR(-EINVAL);
-
        /* We can't remap across vm area boundaries */
        if (old_len > vma->vm_end - addr)
                return ERR_PTR(-EFAULT);
@@ -768,13 +776,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
                                (new_len - old_len) >> PAGE_SHIFT))
                return ERR_PTR(-ENOMEM);
 
-       if (vma->vm_flags & VM_ACCOUNT) {
-               unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
-               if (security_vm_enough_memory_mm(mm, charged))
-                       return ERR_PTR(-ENOMEM);
-               *p = charged;
-       }
-
        return vma;
 }
 
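The accounting rework above moves the VM_ACCOUNT charge out of vma_to_resize() and into move_vma(): to_account is new_len - old_len for an ordinary resize, but the full new_len for MREMAP_DONTUNMAP, since the old mapping stays behind and both ranges end up charged. A minimal userspace sketch of that case (hypothetical length, error handling omitted):

    #define _GNU_SOURCE
    #include <sys/mman.h>

    size_t len = 1 << 20;
    void *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    /* old stays mapped (zero-filled on next touch), so both ranges are charged */
    void *newp = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
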
@@ -787,7 +788,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
-       unsigned long charged = 0;
        unsigned long map_flags = 0;
 
        if (offset_in_page(new_addr))
@@ -830,7 +830,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
                old_len = new_len;
        }
 
-       vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+       vma = vma_to_resize(addr, old_len, new_len, flags);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
@@ -853,7 +853,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
                                ((addr - vma->vm_start) >> PAGE_SHIFT),
                                map_flags);
        if (IS_ERR_VALUE(ret))
-               goto out1;
+               goto out;
 
        /* We got a new mapping */
        if (!(flags & MREMAP_FIXED))
@@ -862,12 +862,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
                       uf_unmap);
 
-       if (!(offset_in_page(ret)))
-               goto out;
-
-out1:
-       vm_unacct_memory(charged);
-
 out:
        return ret;
 }
@@ -899,7 +893,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
-       unsigned long charged = 0;
        bool locked = false;
        bool downgraded = false;
        struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
@@ -949,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
+       vma = find_vma(mm, addr);
+       if (!vma || vma->vm_start > addr) {
+               ret = -EFAULT;
+               goto out;
+       }
+
+       if (is_vm_hugetlb_page(vma)) {
+               struct hstate *h __maybe_unused = hstate_vma(vma);
+
+               old_len = ALIGN(old_len, huge_page_size(h));
+               new_len = ALIGN(new_len, huge_page_size(h));
+
+               /* addrs must be huge page aligned */
+               if (addr & ~huge_page_mask(h))
+                       goto out;
+               if (new_addr & ~huge_page_mask(h))
+                       goto out;
+
+               /*
+                * Don't allow remap expansion, because the underlying hugetlb
+                * reservation code cannot yet handle split reservations.
+                */
+               if (new_len > old_len)
+                       goto out;
+       }
 
        if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
                ret = mremap_to(addr, old_len, new_addr, new_len,
@@ -981,7 +999,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        /*
         * Ok, we need to grow..
         */
-       vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+       vma = vma_to_resize(addr, old_len, new_len, flags);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
@@ -992,10 +1010,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        if (old_len == vma->vm_end - addr) {
                /* can we just expand the current mapping? */
                if (vma_expandable(vma, new_len - old_len)) {
-                       int pages = (new_len - old_len) >> PAGE_SHIFT;
+                       long pages = (new_len - old_len) >> PAGE_SHIFT;
+
+                       if (vma->vm_flags & VM_ACCOUNT) {
+                               if (security_vm_enough_memory_mm(mm, pages)) {
+                                       ret = -ENOMEM;
+                                       goto out;
+                               }
+                       }
 
                        if (vma_adjust(vma, vma->vm_start, addr + new_len,
                                       vma->vm_pgoff, NULL)) {
+                               vm_unacct_memory(pages);
                                ret = -ENOMEM;
                                goto out;
                        }
@@ -1034,10 +1060,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                               &locked, flags, &uf, &uf_unmap);
        }
 out:
-       if (offset_in_page(ret)) {
-               vm_unacct_memory(charged);
+       if (offset_in_page(ret))
                locked = false;
-       }
        if (downgraded)
                mmap_read_unlock(current->mm);
        else
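Taken together, the mremap() hunks above teach the syscall about hugetlb VMAs: lengths are aligned up to the huge page size, both addresses must be huge-page aligned, and expansion is still refused because the reservation code cannot split reservations yet. A hedged userspace sketch (2 MiB huge pages assumed to be configured, error handling omitted):

    #include <sys/mman.h>
    #define HPAGE (2UL << 20)   /* assumed huge page size */

    void *p = mmap(NULL, 4 * HPAGE, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    /* shrinking and moving are now allowed ... */
    void *q = mremap(p, 4 * HPAGE, 2 * HPAGE, MREMAP_MAYMOVE);
    /* ... but growing (new_len > old_len) still fails with EINVAL */
    void *r = mremap(q, 2 * HPAGE, 4 * HPAGE, MREMAP_MAYMOVE);
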
index 41ef204e748202b312c203968bee45f33438b097..55a9e48a7a02681667f33e038bb209c13b1ccb3c 100644 (file)
@@ -1638,12 +1638,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
-unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long pgoff, unsigned long flags)
-{
-       return -ENOMEM;
-}
-
 vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
        BUG();
index 50b984d048ce5349439d4307e92310d91cef9eab..195b3661da3d746ccbd73b123d974131695903d5 100644 (file)
@@ -641,6 +641,8 @@ done:
 
 static int oom_reaper(void *unused)
 {
+       set_freezable();
+
        while (true) {
                struct task_struct *tsk = NULL;
 
@@ -1120,27 +1122,24 @@ bool out_of_memory(struct oom_control *oc)
 }
 
 /*
- * The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
- * killing is already in progress so do nothing.
+ * The pagefault handler calls here because some allocation has failed. We have
+ * to take care of the memcg OOM here because this is the only safe context with
+ * no locks held, but we leave the global OOM to the oom killer invoked from the
+ * allocation context.
  */
 void pagefault_out_of_memory(void)
 {
-       struct oom_control oc = {
-               .zonelist = NULL,
-               .nodemask = NULL,
-               .memcg = NULL,
-               .gfp_mask = 0,
-               .order = 0,
-       };
+       static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                     DEFAULT_RATELIMIT_BURST);
 
        if (mem_cgroup_oom_synchronize(true))
                return;
 
-       if (!mutex_trylock(&oom_lock))
+       if (fatal_signal_pending(current))
                return;
-       out_of_memory(&oc);
-       mutex_unlock(&oom_lock);
+
+       if (__ratelimit(&pfoom_rs))
+               pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
 }
 
 SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
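pagefault_out_of_memory() no longer runs the global OOM killer: it synchronizes memcg OOM handling and otherwise just returns so the fault is retried, with a ratelimited warning because VM_FAULT_OOM should have been handled in the allocation context before ever reaching the #PF handler. A simplified sketch of how an arch fault handler's tail is assumed to interact with this (not any specific architecture):

    fault = handle_mm_fault(vma, address, flags, regs);
    if (fault & VM_FAULT_OOM) {
            pagefault_out_of_memory();  /* memcg sync, then plain return */
            return;                     /* the faulting instruction is retried */
    }
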
index 9c64490171e048e1479b7d04954b00f9d8a72985..2d498bb622484a7264097068fb5188c1ead589e7 100644 (file)
@@ -2366,8 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
                        ret = generic_writepages(mapping, wbc);
                if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
                        break;
-               cond_resched();
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+
+               /*
+                * Lacking an allocation context or the locality or writeback
+                * state of any of the inode's pages, throttle based on
+                * writeback activity on the local node. It's as good a
+                * guess as any.
+                */
+               reclaim_throttle(NODE_DATA(numa_node_id()),
+                       VMSCAN_THROTTLE_WRITEBACK);
        }
        /*
         * Usually few pages are written by now from those we've just submitted
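do_writepages() now throttles on the per-node reclaim_wait queues (initialised in the pgdat_init_internals() hunk further below) instead of the congestion_wait() heuristic; reclaim_throttle(), whose opening appears at the end of this section in mm/vmscan.c, parks the caller until writeback makes progress or a timeout expires. A rough sketch of the wait pattern it is assumed to use internally:

    prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
    ret = schedule_timeout(timeout);
    finish_wait(wqh, &wait);
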
index fee18ada46a2f05a53c50d7aef3cefb51f3c093d..c5952749ad40bdee7287e42eff1574c81928b536 100644 (file)
@@ -677,10 +677,8 @@ static inline int pindex_to_order(unsigned int pindex)
        int order = pindex / MIGRATE_PCPTYPES;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       if (order > PAGE_ALLOC_COSTLY_ORDER) {
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
                order = pageblock_order;
-               VM_BUG_ON(order != pageblock_order);
-       }
 #else
        VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
 #endif
@@ -1430,14 +1428,8 @@ static inline void prefetch_buddy(struct page *page)
 
 /*
  * Frees a number of pages from the PCP lists
- * Assumes all pages on list are in same zone, and of same order.
+ * Assumes all pages on list are in same zone.
  * count is the number of pages to free.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
  */
 static void free_pcppages_bulk(struct zone *zone, int count,
                                        struct per_cpu_pages *pcp)
@@ -1591,7 +1583,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                struct zone *zone = &pgdat->node_zones[zid];
 
-               if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+               if (zone_spans_pfn(zone, pfn))
                        break;
        }
        __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
@@ -3149,9 +3141,9 @@ static void drain_local_pages_wq(struct work_struct *work)
         * cpu which is alright but we also have to make sure to not move to
         * a different one.
         */
-       preempt_disable();
+       migrate_disable();
        drain_local_pages(drain->zone);
-       preempt_enable();
+       migrate_enable();
 }
 
 /*
@@ -3968,6 +3960,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
@@ -4797,30 +4791,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                trace_reclaim_retry_zone(z, order, reclaimable,
                                available, min_wmark, *no_progress_loops, wmark);
                if (wmark) {
-                       /*
-                        * If we didn't make any progress and have a lot of
-                        * dirty + writeback pages then we should wait for
-                        * an IO to complete to slow down the reclaim and
-                        * prevent from pre mature OOM
-                        */
-                       if (!did_some_progress) {
-                               unsigned long write_pending;
-
-                               write_pending = zone_page_state_snapshot(zone,
-                                                       NR_ZONE_WRITE_PENDING);
-
-                               if (2 * write_pending > reclaimable) {
-                                       congestion_wait(BLK_RW_ASYNC, HZ/10);
-                                       return true;
-                               }
-                       }
-
                        ret = true;
-                       goto out;
+                       break;
                }
        }
 
-out:
        /*
         * Memory allocation/reclaim might be called from a WQ context and the
         * current implementation of the WQ concurrency control doesn't
@@ -4916,6 +4891,19 @@ retry_cpuset:
        if (!ac->preferred_zoneref->zone)
                goto nopage;
 
+       /*
+        * Check for insane configurations where the cpuset doesn't contain
+        * any suitable zone to satisfy the request - e.g. non-movable
+        * GFP_HIGHUSER allocations from MOVABLE nodes only.
+        */
+       if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
+               struct zoneref *z = first_zones_zonelist(ac->zonelist,
+                                       ac->highest_zoneidx,
+                                       &cpuset_current_mems_allowed);
+               if (!z->zone)
+                       goto nopage;
+       }
+
        if (alloc_flags & ALLOC_KSWAPD)
                wake_all_kswapds(order, gfp_mask, ac);
 
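The new cpuset check turns what used to be an endless allocate-reclaim loop into a fast failure: if, say, a cpuset confines a task to a node whose memory is entirely ZONE_MOVABLE, a non-movable GFP_HIGHUSER request with __GFP_HARDWALL can never be satisfied there, so the slow path now bails out to nopage immediately.
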
@@ -5630,8 +5618,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
        unsigned int order = get_order(size);
        unsigned long addr;
 
-       if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
-               gfp_mask &= ~__GFP_COMP;
+       if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+               gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
 
        addr = __get_free_pages(gfp_mask, order);
        return make_alloc_exact(addr, order, size);
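alloc_pages_exact() and alloc_pages_exact_nid() return a kernel virtual address, so __GFP_HIGHMEM, like __GFP_COMP before it, can never be honoured; it is now warned about once and masked off. A call that would trigger the new warning (sketch):

    /* WARN_ON_ONCE fires and __GFP_HIGHMEM is stripped */
    void *p = alloc_pages_exact(SZ_64K, GFP_KERNEL | __GFP_HIGHMEM);
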
@@ -5655,8 +5643,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
        unsigned int order = get_order(size);
        struct page *p;
 
-       if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
-               gfp_mask &= ~__GFP_COMP;
+       if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+               gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
 
        p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
@@ -5998,6 +5986,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                printk(KERN_CONT
                        "%s"
                        " free:%lukB"
+                       " boost:%lukB"
                        " min:%lukB"
                        " low:%lukB"
                        " high:%lukB"
@@ -6018,6 +6007,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                        "\n",
                        zone->name,
                        K(zone_page_state(zone, NR_FREE_PAGES)),
+                       K(zone->watermark_boost),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
@@ -6273,7 +6263,7 @@ static void build_zonelists(pg_data_t *pgdat)
                 */
                if (node_distance(local_node, node) !=
                    node_distance(local_node, prev_node))
-                       node_load[node] = load;
+                       node_load[node] += load;
 
                node_order[nr_nodes++] = node;
                prev_node = node;
@@ -6282,6 +6272,10 @@ static void build_zonelists(pg_data_t *pgdat)
 
        build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
        build_thisnode_zonelists(pgdat);
+       pr_info("Fallback order for Node %d: ", local_node);
+       for (node = 0; node < nr_nodes; node++)
+               pr_cont("%d ", node_order[node]);
+       pr_cont("\n");
 }
 
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
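Two fixes in one hunk: node_load[node] now accumulates with +=, where the old plain assignment could overwrite the weight recorded for an earlier node at the same distance and skew the fallback order, and the chosen order is printed at boot. On a hypothetical four-node machine the new message would read:

    Fallback order for Node 0: 0 1 3 2
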
@@ -7407,6 +7401,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
 
 static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 {
+       int i;
+
        pgdat_resize_init(pgdat);
 
        pgdat_init_split_queue(pgdat);
@@ -7415,6 +7411,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
        init_waitqueue_head(&pgdat->kswapd_wait);
        init_waitqueue_head(&pgdat->pfmemalloc_wait);
 
+       for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+               init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
        pgdat_page_ext_init(pgdat);
        lruvec_init(&pgdat->__lruvec);
 }
@@ -8144,8 +8143,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
        }
 
        if (pages && s)
-               pr_info("Freeing %s memory: %ldK\n",
-                       s, pages << (PAGE_SHIFT - 10));
+               pr_info("Freeing %s memory: %ldK\n", s, K(pages));
 
        return pages;
 }
@@ -8190,14 +8188,13 @@ void __init mem_init_print_info(void)
                ", %luK highmem"
 #endif
                ")\n",
-               nr_free_pages() << (PAGE_SHIFT - 10),
-               physpages << (PAGE_SHIFT - 10),
+               K(nr_free_pages()), K(physpages),
                codesize >> 10, datasize >> 10, rosize >> 10,
                (init_data_size + init_code_size) >> 10, bss_size >> 10,
-               (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
-               totalcma_pages << (PAGE_SHIFT - 10)
+               K(physpages - totalram_pages() - totalcma_pages),
+               K(totalcma_pages)
 #ifdef CONFIG_HIGHMEM
-               , totalhigh_pages() << (PAGE_SHIFT - 10)
+               , K(totalhigh_pages())
 #endif
                );
 }
@@ -8470,7 +8467,7 @@ void setup_per_zone_wmarks(void)
  * 8192MB:     11584k
  * 16384MB:    16384k
  */
-int __meminit init_per_zone_wmark_min(void)
+void calculate_min_free_kbytes(void)
 {
        unsigned long lowmem_kbytes;
        int new_min_free_kbytes;
@@ -8478,16 +8475,17 @@ int __meminit init_per_zone_wmark_min(void)
        lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
        new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
 
-       if (new_min_free_kbytes > user_min_free_kbytes) {
-               min_free_kbytes = new_min_free_kbytes;
-               if (min_free_kbytes < 128)
-                       min_free_kbytes = 128;
-               if (min_free_kbytes > 262144)
-                       min_free_kbytes = 262144;
-       } else {
+       if (new_min_free_kbytes > user_min_free_kbytes)
+               min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
+       else
                pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
                                new_min_free_kbytes, user_min_free_kbytes);
-       }
+}
+
+int __meminit init_per_zone_wmark_min(void)
+{
+       calculate_min_free_kbytes();
        setup_per_zone_wmarks();
        refresh_zone_stat_thresholds();
        setup_per_zone_lowmem_reserve();
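The clamp() form is equivalent to the removed if/else chain: min_free_kbytes becomes int_sqrt(lowmem_kbytes * 16), roughly 4 * sqrt(lowmem_kbytes), bounded to [128, 262144]. Worked example: with 8 GiB of lowmem, lowmem_kbytes = 8388608 and int_sqrt(134217728) = 11585, in line with the "8192MB: 11584k" row of the table quoted above the hunk (the table rounds slightly differently).
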
@@ -8774,7 +8772,8 @@ void *__init alloc_large_system_hash(const char *tablename,
                } else if (get_order(size) >= MAX_ORDER || hashdist) {
                        table = __vmalloc(size, gfp_flags);
                        virt = true;
-                       huge = is_vm_area_hugepages(table);
+                       if (table)
+                               huge = is_vm_area_hugepages(table);
                } else {
                        /*
                         * If bucketsize is not a power-of-two, we may free
@@ -9371,21 +9370,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 }
 #endif
 
+/*
+ * This function returns a stable result only if called under zone lock.
+ */
 bool is_free_buddy_page(struct page *page)
 {
-       struct zone *zone = page_zone(page);
        unsigned long pfn = page_to_pfn(page);
-       unsigned long flags;
        unsigned int order;
 
-       spin_lock_irqsave(&zone->lock, flags);
        for (order = 0; order < MAX_ORDER; order++) {
                struct page *page_head = page - (pfn & ((1 << order) - 1));
 
-               if (PageBuddy(page_head) && buddy_order(page_head) >= order)
+               if (PageBuddy(page_head) &&
+                   buddy_order_unsafe(page_head) >= order)
                        break;
        }
-       spin_unlock_irqrestore(&zone->lock, flags);
 
        return order < MAX_ORDER;
 }
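is_free_buddy_page() is now lockless: buddy_order_unsafe() does a racy (READ_ONCE-style) read of the order, and the new comment pushes the locking decision to the caller. A sketch of a caller that needs the stable answer:

    /* hold the zone lock across the check for a stable result */
    spin_lock_irqsave(&zone->lock, flags);
    stable = is_free_buddy_page(page);
    spin_unlock_irqrestore(&zone->lock, flags);
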
index 2a52fd9ed464aa7d780dc50440b7f09989ad15ce..6242afb24d8479118019c084211280c160e301f1 100644 (file)
@@ -201,7 +201,7 @@ fail:
        panic("Out of memory");
 }
 
-#else /* CONFIG_FLATMEM */
+#else /* CONFIG_SPARSEMEM */
 
 struct page_ext *lookup_page_ext(const struct page *page)
 {
index a95c2c6562d0f0ea9519c46b1e95a9d3d7408ab4..f67c4c70f17f66c08626f57b1999462782395eff 100644 (file)
@@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
                        buddy = page + (buddy_pfn - pfn);
 
                        if (!is_migrate_isolate_page(buddy)) {
-                               __isolate_free_page(page, order);
-                               isolated_page = true;
+                               isolated_page = !!__isolate_free_page(page, order);
+                               /*
+                                * Isolating a free page in an isolated pageblock
+                                * is expected to always work as watermarks don't
+                                * apply here.
+                                */
+                               VM_WARN_ON(!isolated_page);
                        }
                }
        }
@@ -183,7 +188,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
                             unsigned migratetype, int flags)
 {
        unsigned long pfn;
-       unsigned long undo_pfn;
        struct page *page;
 
        BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
@@ -193,25 +197,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
             pfn < end_pfn;
             pfn += pageblock_nr_pages) {
                page = __first_valid_page(pfn, pageblock_nr_pages);
-               if (page) {
-                       if (set_migratetype_isolate(page, migratetype, flags)) {
-                               undo_pfn = pfn;
-                               goto undo;
-                       }
+               if (page && set_migratetype_isolate(page, migratetype, flags)) {
+                       undo_isolate_page_range(start_pfn, pfn, migratetype);
+                       return -EBUSY;
                }
        }
        return 0;
-undo:
-       for (pfn = start_pfn;
-            pfn < undo_pfn;
-            pfn += pageblock_nr_pages) {
-               struct page *page = pfn_to_online_page(pfn);
-               if (!page)
-                       continue;
-               unset_migratetype_isolate(page, migratetype);
-       }
-
-       return -EBUSY;
 }
 
 /*
index e0a986818903dc7cc4b49b2f54749c24a636ff85..f5b2c2ea5a548adf4704ee7aacfdade4be3c1c13 100644 (file)
@@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
  */
 void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
 {
-       memblock_free_early(__pa(ai), ai->__ai_size);
+       memblock_free(ai, ai->__ai_size);
 }
 
 /**
@@ -3134,7 +3134,7 @@ out_free_areas:
 out_free:
        pcpu_free_alloc_info(ai);
        if (areas)
-               memblock_free_early(__pa(areas), areas_size);
+               memblock_free(areas, areas_size);
        return rc;
 }
 #endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -3256,7 +3256,7 @@ enomem:
                free_fn(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
 out_free_ar:
-       memblock_free_early(__pa(pages), pages_size);
+       memblock_free(pages, pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
 }
@@ -3286,7 +3286,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
 
 static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
 {
-       memblock_free_early(__pa(ptr), size);
+       memblock_free(ptr, size);
 }
 
 void __init setup_per_cpu_areas(void)
index e71e719e36c906f502509bbfe46abee0e727b3a0..6ae5693de28cef5177f80b8db747b4bf8568102d 100644 (file)
@@ -308,7 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
  * Set the initial window size, round to next power of 2 and square
  * for small size, x 4 for medium, and x 2 for large
  * for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
+ * 1-2 page = 16k, 3-4 page = 32k, 5-8 page = 64k, > 8 page = 128k initial
  */
 static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
 {
index 3a1059c284c379156698be0ade1300793cf186d1..163ac4e6bceed4fcddd842173866b0c7a32beccc 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1807,6 +1807,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
                update_hiwater_rss(mm);
 
                if (is_zone_device_page(page)) {
+                       unsigned long pfn = page_to_pfn(page);
                        swp_entry_t entry;
                        pte_t swp_pte;
 
@@ -1815,8 +1816,11 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
-                       entry = make_readable_migration_entry(
-                                                       page_to_pfn(page));
+                       entry = pte_to_swp_entry(pteval);
+                       if (is_writable_device_private_entry(entry))
+                               entry = make_writable_migration_entry(pfn);
+                       else
+                               entry = make_readable_migration_entry(pfn);
                        swp_pte = swp_entry_to_pte(entry);
 
                        /*
index 17e344e26e736089f677796030d03e3eb76d15be..23c91a8beb781d79d802479aae05ed90145dc9b8 100644 (file)
@@ -855,9 +855,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
                return swapped << PAGE_SHIFT;
 
        /* Here comes the more involved part */
-       return shmem_partial_swap_usage(mapping,
-                       linear_page_index(vma, vma->vm_start),
-                       linear_page_index(vma, vma->vm_end));
+       return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
+                                       vma->vm_pgoff + vma_pages(vma));
 }
 
 /*
@@ -2426,7 +2425,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
        shmem_recalc_inode(inode);
        spin_unlock_irq(&info->lock);
 
-       SetPageDirty(page);
        unlock_page(page);
        return 0;
 out_delete_from_cache:
@@ -2458,6 +2456,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        pgoff_t index = pos >> PAGE_SHIFT;
+       int ret = 0;
 
        /* i_rwsem is held by caller */
        if (unlikely(info->seals & (F_SEAL_GROW |
@@ -2468,7 +2467,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
                        return -EPERM;
        }
 
-       return shmem_getpage(inode, index, pagep, SGP_WRITE);
+       ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+
+       if (*pagep && PageHWPoison(*pagep)) {
+               unlock_page(*pagep);
+               put_page(*pagep);
+               ret = -EIO;
+       }
+
+       return ret;
 }
 
 static int
@@ -2555,6 +2562,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
                        if (sgp == SGP_CACHE)
                                set_page_dirty(page);
                        unlock_page(page);
+
+                       if (PageHWPoison(page)) {
+                               put_page(page);
+                               error = -EIO;
+                               break;
+                       }
                }
 
                /*
@@ -3116,7 +3129,8 @@ static const char *shmem_get_link(struct dentry *dentry,
                page = find_get_page(inode->i_mapping, 0);
                if (!page)
                        return ERR_PTR(-ECHILD);
-               if (!PageUptodate(page)) {
+               if (PageHWPoison(page) ||
+                   !PageUptodate(page)) {
                        put_page(page);
                        return ERR_PTR(-ECHILD);
                }
@@ -3124,6 +3138,11 @@ static const char *shmem_get_link(struct dentry *dentry,
                error = shmem_getpage(inode, 0, &page, SGP_READ);
                if (error)
                        return ERR_PTR(error);
+               if (page && PageHWPoison(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       return ERR_PTR(-ECHILD);
+               }
                unlock_page(page);
        }
        set_delayed_call(done, shmem_put_link, page);
@@ -3774,6 +3793,13 @@ static void shmem_destroy_inodecache(void)
        kmem_cache_destroy(shmem_inode_cachep);
 }
 
+/* Keep the page in page cache instead of truncating it */
+static int shmem_error_remove_page(struct address_space *mapping,
+                                  struct page *page)
+{
+       return 0;
+}
+
 const struct address_space_operations shmem_aops = {
        .writepage      = shmem_writepage,
        .set_page_dirty = __set_page_dirty_no_writeback,
@@ -3784,7 +3810,7 @@ const struct address_space_operations shmem_aops = {
 #ifdef CONFIG_MIGRATION
        .migratepage    = migrate_page,
 #endif
-       .error_remove_page = generic_error_remove_page,
+       .error_remove_page = shmem_error_remove_page,
 };
 EXPORT_SYMBOL(shmem_aops);
 
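The shmem hunks above implement keep-but-fail semantics for memory-poisoned pages: shmem_error_remove_page() returns 0 so a hwpoisoned page stays in the page cache rather than being truncated away, and the read, write_begin, get_link and read_mapping paths (plus the userfaultfd continue path further below) check PageHWPoison and fail with -EIO, or -ECHILD for symlinks, instead of handing out poisoned data.
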
@@ -4195,6 +4221,10 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                page = ERR_PTR(error);
        else
                unlock_page(page);
+
+       if (!IS_ERR(page) && PageHWPoison(page)) {
+               put_page(page);
+               page = ERR_PTR(-EIO);
+       }
+
        return page;
 #else
        /*
index 874b3f8fe80da2525e8f197a8543f63c9a665543..da132a9ae6f8be6fd9744b0f9f464ad14fcb04ad 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3900,8 +3900,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
        if (err)
                goto end;
 
-       if (limit && shared && batchcount)
-               goto skip_setup;
        /*
         * The head array serves three purposes:
         * - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -3944,7 +3942,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
                limit = 32;
 #endif
        batchcount = (limit + 1) / 2;
-skip_setup:
        err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
 end:
        if (err)
@@ -4207,19 +4204,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
            n <= cachep->useroffset - offset + cachep->usersize)
                return;
 
-       /*
-        * If the copy is still within the allocated object, produce
-        * a warning instead of rejecting the copy. This is intended
-        * to be a temporary method to find any missing usercopy
-        * whitelists.
-        */
-       if (usercopy_fallback &&
-           offset <= cachep->object_size &&
-           n <= cachep->object_size - offset) {
-               usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
-               return;
-       }
-
        usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
 }
 #endif /* CONFIG_HARDENED_USERCOPY */
index ec2bb0beed757cf2b143920b07b43fafeb9169f5..e5d080a9300933cdde0eddb679ab34596825cd54 100644 (file)
@@ -37,14 +37,6 @@ LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
 struct kmem_cache *kmem_cache;
 
-#ifdef CONFIG_HARDENED_USERCOPY
-bool usercopy_fallback __ro_after_init =
-               IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
-module_param(usercopy_fallback, bool, 0400);
-MODULE_PARM_DESC(usercopy_fallback,
-               "WARN instead of reject usercopy whitelist violations");
-#endif
-
 static LIST_HEAD(slab_caches_to_rcu_destroy);
 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
 static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
index e87fd492a65b8cfa09f14dded95b99fac56c6bed..f7368bfffb7afc17cadefa0b75f15d38a6d0609c 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -354,7 +354,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
 
 static void prefetch_freepointer(const struct kmem_cache *s, void *object)
 {
-       prefetch(object + s->offset);
+       prefetchw(object + s->offset);
 }
 
 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
@@ -414,6 +414,29 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
        return x.x & OO_MASK;
 }
 
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
+{
+       unsigned int nr_pages;
+
+       s->cpu_partial = nr_objects;
+
+       /*
+        * We take the number of objects but actually limit the number of
+        * pages on the per cpu partial list, in order to limit excessive
+        * growth of the list. For simplicity we assume that the pages will
+        * be half-full.
+        */
+       nr_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
+       s->cpu_partial_pages = nr_pages;
+}
+#else
+static inline void
+slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
+{
+}
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+
 /*
  * Per slab locking using the pagelock
  */
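slub_set_cpu_partial() converts the historical object count into a page cap under the stated half-full assumption. Worked example with hypothetical numbers, for a cache packing 32 objects into each slab page:

    nr_pages = DIV_ROUND_UP(52 * 2, 32);    /* nr_objects = 52  =>  4 pages per cpu */
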
@@ -2052,7 +2075,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
  */
 static inline void *acquire_slab(struct kmem_cache *s,
                struct kmem_cache_node *n, struct page *page,
-               int mode, int *objects)
+               int mode)
 {
        void *freelist;
        unsigned long counters;
@@ -2068,7 +2091,6 @@ static inline void *acquire_slab(struct kmem_cache *s,
        freelist = page->freelist;
        counters = page->counters;
        new.counters = counters;
-       *objects = new.objects - new.inuse;
        if (mode) {
                new.inuse = page->objects;
                new.freelist = NULL;
@@ -2106,9 +2128,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 {
        struct page *page, *page2;
        void *object = NULL;
-       unsigned int available = 0;
        unsigned long flags;
-       int objects;
+       unsigned int partial_pages = 0;
 
        /*
         * Racy check. If we mistakenly see no partial slabs then we
@@ -2126,11 +2147,10 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
                if (!pfmemalloc_match(page, gfpflags))
                        continue;
 
-               t = acquire_slab(s, n, page, object == NULL, &objects);
+               t = acquire_slab(s, n, page, object == NULL);
                if (!t)
                        break;
 
-               available += objects;
                if (!object) {
                        *ret_page = page;
                        stat(s, ALLOC_FROM_PARTIAL);
@@ -2138,10 +2158,15 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
                } else {
                        put_cpu_partial(s, page, 0);
                        stat(s, CPU_PARTIAL_NODE);
+                       partial_pages++;
                }
+#ifdef CONFIG_SLUB_CPU_PARTIAL
                if (!kmem_cache_has_cpu_partial(s)
-                       || available > slub_cpu_partial(s) / 2)
+                       || partial_pages > s->cpu_partial_pages / 2)
                        break;
+#else
+               break;
+#endif
 
        }
        spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2546,14 +2571,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
        struct page *page_to_unfreeze = NULL;
        unsigned long flags;
        int pages = 0;
-       int pobjects = 0;
 
        local_lock_irqsave(&s->cpu_slab->lock, flags);
 
        oldpage = this_cpu_read(s->cpu_slab->partial);
 
        if (oldpage) {
-               if (drain && oldpage->pobjects > slub_cpu_partial(s)) {
+               if (drain && oldpage->pages >= s->cpu_partial_pages) {
                        /*
                         * Partial array is full. Move the existing set to the
                         * per node partial list. Postpone the actual unfreezing
@@ -2562,16 +2586,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
                        page_to_unfreeze = oldpage;
                        oldpage = NULL;
                } else {
-                       pobjects = oldpage->pobjects;
                        pages = oldpage->pages;
                }
        }
 
        pages++;
-       pobjects += page->objects - page->inuse;
 
        page->pages = pages;
-       page->pobjects = pobjects;
        page->next = oldpage;
 
        this_cpu_write(s->cpu_slab->partial, page);
@@ -3522,7 +3543,9 @@ static inline void free_nonslab_page(struct page *page, void *object)
 {
        unsigned int order = compound_order(page);
 
-       VM_BUG_ON_PAGE(!PageCompound(page), page);
+       if (WARN_ON_ONCE(!PageCompound(page)))
+               pr_warn_once("object pointer: 0x%p\n", object);
+
        kfree_hook(object);
        mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
        __free_pages(page, order);
@@ -3989,6 +4012,8 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
 static void set_cpu_partial(struct kmem_cache *s)
 {
 #ifdef CONFIG_SLUB_CPU_PARTIAL
+       unsigned int nr_objects;
+
        /*
         * cpu_partial determined the maximum number of objects kept in the
         * per cpu partial lists of a processor.
@@ -3998,24 +4023,22 @@ static void set_cpu_partial(struct kmem_cache *s)
         * filled up again with minimal effort. The slab will never hit the
         * per node partial lists and therefore no locking will be required.
         *
-        * This setting also determines
-        *
-        * A) The number of objects from per cpu partial slabs dumped to the
-        *    per node list when we reach the limit.
-        * B) The number of objects in cpu partial slabs to extract from the
-        *    per node list when we run out of per cpu objects. We only fetch
-        *    50% to keep some capacity around for frees.
+        * For backwards compatibility reasons, this is still expressed as a
+        * number of objects, even though we now limit the maximum number of
+        * pages; see slub_set_cpu_partial().
         */
        if (!kmem_cache_has_cpu_partial(s))
-               slub_set_cpu_partial(s, 0);
+               nr_objects = 0;
        else if (s->size >= PAGE_SIZE)
-               slub_set_cpu_partial(s, 2);
+               nr_objects = 6;
        else if (s->size >= 1024)
-               slub_set_cpu_partial(s, 6);
+               nr_objects = 24;
        else if (s->size >= 256)
-               slub_set_cpu_partial(s, 13);
+               nr_objects = 52;
        else
-               slub_set_cpu_partial(s, 30);
+               nr_objects = 120;
+
+       slub_set_cpu_partial(s, nr_objects);
 #endif
 }
 
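With the cap now counted in half-full pages, the hardcoded thresholds above were scaled up (2 to 6, 6 to 24, 13 to 52 and 30 to 120 objects), apparently so the page budgets produced by slub_set_cpu_partial() stay close to the old object budgets.
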
@@ -4466,7 +4489,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 {
        struct kmem_cache *s;
        unsigned int offset;
-       size_t object_size;
        bool is_kfence = is_kfence_address(ptr);
 
        ptr = kasan_reset_tag(ptr);
@@ -4499,19 +4521,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
            n <= s->useroffset - offset + s->usersize)
                return;
 
-       /*
-        * If the copy is still within the allocated object, produce
-        * a warning instead of rejecting the copy. This is intended
-        * to be a temporary method to find any missing usercopy
-        * whitelists.
-        */
-       object_size = slab_ksize(s);
-       if (usercopy_fallback &&
-           offset <= object_size && n <= object_size - offset) {
-               usercopy_warn("SLUB object", s->name, to_user, offset, n);
-               return;
-       }
-
        usercopy_abort("SLUB object", s->name, to_user, offset, n);
 }
 #endif /* CONFIG_HARDENED_USERCOPY */
@@ -5390,7 +5399,12 @@ SLAB_ATTR(min_partial);
 
 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
 {
-       return sysfs_emit(buf, "%u\n", slub_cpu_partial(s));
+       unsigned int nr_partial = 0;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+       nr_partial = s->cpu_partial;
+#endif
+
+       return sysfs_emit(buf, "%u\n", nr_partial);
 }
 
 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -5461,12 +5475,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 
                page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
 
-               if (page) {
+               if (page)
                        pages += page->pages;
-                       objects += page->pobjects;
-               }
        }
 
+       /* Approximate half-full pages, see slub_set_cpu_partial() */
+       objects = (pages * oo_objects(s->oo)) / 2;
        len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages);
 
 #ifdef CONFIG_SMP
@@ -5474,9 +5488,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
                struct page *page;
 
                page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
-               if (page)
+               if (page) {
+                       pages = READ_ONCE(page->pages);
+                       objects = (pages * oo_objects(s->oo)) / 2;
                        len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
-                                            cpu, page->pobjects, page->pages);
+                                            cpu, objects, pages);
+               }
        }
 #endif
        len += sysfs_emit_at(buf, len, "\n");
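With page->pobjects gone, the sysfs slabs_cpu_partial file reports an estimate derived from page counts. Reusing the hypothetical 32-objects-per-page cache, two CPUs holding 3 and 1 partial pages would show up as:

    64(4) C0=48(3) C1=16(1)
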
index bdce883f92863e870c622969f669627cb86006a7..db6df27c852a7e11e341333ee60faee6da12f92c 100644 (file)
@@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
                set_pte_at(&init_mm, addr, pte, entry);
        }
 
-       /* Make pte visible before pmd. See comment in __pte_alloc(). */
+       /* Make pte visible before pmd. See comment in pmd_install(). */
        smp_wmb();
        pmd_populate_kernel(&init_mm, pmd, pgtable);
 
index 120bc8ea5293e60bcb5dd91f22467767dd959892..e5c84b0cf0c9384733477bc3707f61852e07cdb6 100644 (file)
@@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata;
 static inline void __meminit sparse_buffer_free(unsigned long size)
 {
        WARN_ON(!sparsemap_buf || size == 0);
-       memblock_free_early(__pa(sparsemap_buf), size);
+       memblock_free(sparsemap_buf, size);
 }
 
 static void __init sparse_buffer_init(unsigned long size, int nid)
index 8ff9ba7cf2de3e79b134581af87b156105753e24..1841c24682f8f12fbad984830f111fa8cdfb20cb 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -135,18 +135,27 @@ EXPORT_SYMBOL(__put_page);
  * put_pages_list() - release a list of pages
  * @pages: list of pages threaded on page->lru
  *
- * Release a list of pages which are strung together on page.lru.  Currently
- * used by read_cache_pages() and related error recovery code.
+ * Release a list of pages which are strung together on page.lru.
  */
 void put_pages_list(struct list_head *pages)
 {
-       while (!list_empty(pages)) {
-               struct page *victim;
+       struct page *page, *next;
 
-               victim = lru_to_page(pages);
-               list_del(&victim->lru);
-               put_page(victim);
+       list_for_each_entry_safe(page, next, pages, lru) {
+               if (!put_page_testzero(page)) {
+                       list_del(&page->lru);
+                       continue;
+               }
+               if (PageHead(page)) {
+                       list_del(&page->lru);
+                       __put_compound_page(page);
+                       continue;
+               }
+               /* Cannot be PageLRU because it's passed to us using the lru */
+               __ClearPageWaiters(page);
        }
+
+       free_unref_page_list(pages);
 }
 EXPORT_SYMBOL(put_pages_list);
 
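put_pages_list() now drops still-referenced and compound pages from the list individually and batches everything else through free_unref_page_list(), rather than calling put_page() once per page. A hedged usage sketch; the reinitialisation is an assumption, since freed pages leave stale lru links on the list:

    LIST_HEAD(pages);
    /* ... collect page references, each linked through page->lru ... */
    put_pages_list(&pages);
    INIT_LIST_HEAD(&pages);    /* do not trust the old links afterwards */
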
index 41c9e92f1f004fb9acf518ac717607de1ede141d..e59e08ef46e15fb49927d80caf64effc641c5f5e 100644 (file)
@@ -2763,7 +2763,7 @@ static int swap_show(struct seq_file *swap, void *v)
        struct swap_info_struct *si = v;
        struct file *file;
        int len;
-       unsigned int bytes, inuse;
+       unsigned long bytes, inuse;
 
        if (si == SEQ_START_TOKEN) {
                seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
@@ -2775,7 +2775,7 @@ static int swap_show(struct seq_file *swap, void *v)
 
        file = si->swap_file;
        len = seq_file_path(swap, file, " \t\n\\");
-       seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
+       seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
                        len < 40 ? 40 - len : 1, " ",
                        S_ISBLK(file_inode(file)->i_mode) ?
                                "partition" : "file\t",
@@ -3118,7 +3118,7 @@ static bool swap_discardable(struct swap_info_struct *si)
 {
        struct request_queue *q = bdev_get_queue(si->bdev);
 
-       if (!q || !blk_queue_discard(q))
+       if (!blk_queue_discard(q))
                return false;
 
        return true;
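The switch from unsigned int to unsigned long matters for large swap devices: swap_show() prints sizes in KiB and a 32-bit counter wraps at 2^32 KiB = 4 TiB, so a 6 TiB swap device (6442450944 KiB) would previously have been shown as 2147483648 KiB, i.e. 2 TiB. The dropped !q test in swap_discardable() reflects that bdev_get_queue() is not expected to return NULL for an active swap device.
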
index 36e5f6ab976f0d0f965b7d4913956e2b26970754..0780c2a57ff11ba42d4a2974367ddc862578efcd 100644 (file)
@@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
        pgoff_t offset, max_off;
 
        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+       _dst_pte = pte_mkdirty(_dst_pte);
        if (page_in_cache && !vm_shared)
                writable = false;
-       if (writable || !page_in_cache)
-               _dst_pte = pte_mkdirty(_dst_pte);
        if (writable) {
                if (wp_copy)
                        _dst_pte = pte_mkuffd_wp(_dst_pte);
@@ -233,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
                goto out;
        }
 
+       if (PageHWPoison(page)) {
+               ret = -EIO;
+               goto out_release;
+       }
+
        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
                                       page, false, wp_copy);
        if (ret)
index e8a807c7811077414c4efbced29a16ab75f15b22..d2a00ad4e1dd155eb474c797e060f6b85f3d37d4 100644 (file)
@@ -1195,18 +1195,14 @@ find_vmap_lowest_match(unsigned long size,
 {
        struct vmap_area *va;
        struct rb_node *node;
-       unsigned long length;
 
        /* Start from the root. */
        node = free_vmap_area_root.rb_node;
 
-       /* Adjust the search size for alignment overhead. */
-       length = size + align - 1;
-
        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);
 
-               if (get_subtree_max_size(node->rb_left) >= length &&
+               if (get_subtree_max_size(node->rb_left) >= size &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
@@ -1216,9 +1212,9 @@ find_vmap_lowest_match(unsigned long size,
                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
-                        * equal or bigger to the requested search length.
+                        * equal to or bigger than the requested search size.
                         */
-                       if (get_subtree_max_size(node->rb_right) >= length) {
+                       if (get_subtree_max_size(node->rb_right) >= size) {
                                node = node->rb_right;
                                continue;
                        }
@@ -1226,15 +1222,23 @@ find_vmap_lowest_match(unsigned long size,
                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
-                        * only once due to "vstart" restriction.
+                        * due to "vstart" restriction or an alignment overhead
+                        * that is bigger than PAGE_SIZE.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;
 
-                               if (get_subtree_max_size(node->rb_right) >= length &&
+                               if (get_subtree_max_size(node->rb_right) >= size &&
                                                vstart <= va->va_start) {
+                                       /*
+                                        * Shift vstart forward. Note that we update it to the
+                                        * parent's start address plus 1, because we do not want
+                                        * to re-enter a sub-tree that has already been checked
+                                        * and found to contain no suitable free block.
+                                        */
+                                       vstart = va->va_start + 1;
                                        node = node->rb_right;
                                        break;
                                }
@@ -1265,7 +1269,7 @@ find_vmap_lowest_linear_match(unsigned long size,
 }
 
 static void
-find_vmap_lowest_match_check(unsigned long size)
+find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 {
        struct vmap_area *va_1, *va_2;
        unsigned long vstart;
@@ -1274,8 +1278,8 @@ find_vmap_lowest_match_check(unsigned long size)
        get_random_bytes(&rnd, sizeof(rnd));
        vstart = VMALLOC_START + rnd;
 
-       va_1 = find_vmap_lowest_match(size, 1, vstart);
-       va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+       va_1 = find_vmap_lowest_match(size, align, vstart);
+       va_2 = find_vmap_lowest_linear_match(size, align, vstart);
 
        if (va_1 != va_2)
                pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1454,7 +1458,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
                return vend;
 
 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
-       find_vmap_lowest_match_check(size);
+       find_vmap_lowest_match_check(size, align);
 #endif
 
        return nva_start_addr;
@@ -2272,15 +2276,22 @@ void __init vm_area_add_early(struct vm_struct *vm)
  */
 void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 {
-       static size_t vm_init_off __initdata;
-       unsigned long addr;
+       unsigned long addr = ALIGN(VMALLOC_START, align);
+       struct vm_struct *cur, **p;
 
-       addr = ALIGN(VMALLOC_START + vm_init_off, align);
-       vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+       BUG_ON(vmap_initialized);
 
-       vm->addr = (void *)addr;
+       for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
+               if ((unsigned long)cur->addr - addr >= vm->size)
+                       break;
+               addr = ALIGN((unsigned long)cur->addr + cur->size, align);
+       }
 
-       vm_area_add_early(vm);
+       BUG_ON(addr > VMALLOC_END - vm->size);
+       vm->addr = (void *)addr;
+       vm->next = *p;
+       *p = vm;
+       kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
 }
 
 static void vmap_init_free_space(void)
@@ -2743,6 +2754,13 @@ void *vmap(struct page **pages, unsigned int count,
 
        might_sleep();
 
+       /*
+        * Your top guard is someone else's bottom guard. Not having a top
+        * guard compromises someone else's mappings too.
+        */
+       if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+               flags &= ~VM_NO_GUARD;
+
        if (count > totalram_pages())
                return NULL;
 
@@ -2825,7 +2843,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
         * to fails, fallback to a single page allocator that is
         * more permissive.
         */
-       if (!order && nid != NUMA_NO_NODE) {
+       if (!order) {
                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;
 
@@ -2837,8 +2855,20 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                         */
                        nr_pages_request = min(100U, nr_pages - nr_allocated);
 
-                       nr = alloc_pages_bulk_array_node(gfp, nid,
-                               nr_pages_request, pages + nr_allocated);
+                       /*
+                        * Memory allocation should honour the task's mempolicy;
+                        * we can't blindly use the nearest node when
+                        * nid == NUMA_NO_NODE, otherwise memory may come from
+                        * only one node even though the mempolicy asks for
+                        * interleaving.
+                        */
+                       if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
+                               nr = alloc_pages_bulk_array_mempolicy(gfp,
+                                                       nr_pages_request,
+                                                       pages + nr_allocated);
+                       else
+                               nr = alloc_pages_bulk_array_node(gfp, nid,
+                                                       nr_pages_request,
+                                                       pages + nr_allocated);
 
                        nr_allocated += nr;
                        cond_resched();
@@ -2850,7 +2880,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                        if (nr != nr_pages_request)
                                break;
                }
-       } else if (order)
+       } else
                /*
                 * Compound pages required for remap_vmalloc_page if
                 * high-order pages.
@@ -2860,6 +2890,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
        /* High-order pages or fallback path if "bulk" fails. */
 
        while (nr_allocated < nr_pages) {
+               if (fatal_signal_pending(current))
+                       break;
+
                if (nid == NUMA_NO_NODE)
                        page = alloc_pages(gfp, order);
                else
@@ -2887,6 +2920,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 int node)
 {
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+       const gfp_t orig_gfp_mask = gfp_mask;
        unsigned long addr = (unsigned long)area->addr;
        unsigned long size = get_vm_area_size(area);
        unsigned long array_size;
@@ -2907,7 +2941,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
        }
 
        if (!area->pages) {
-               warn_alloc(gfp_mask, NULL,
+               warn_alloc(orig_gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to allocated page array size %lu",
                        nr_small_pages * PAGE_SIZE, array_size);
                free_vm_area(area);
@@ -2927,7 +2961,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
         * allocation request, free them via __vfree() if any.
         */
        if (area->nr_pages != nr_small_pages) {
-               warn_alloc(gfp_mask, NULL,
+               warn_alloc(orig_gfp_mask, NULL,
                        "vmalloc error: size %lu, page order %u, failed to allocate pages",
                        area->nr_pages * PAGE_SIZE, page_order);
                goto fail;
@@ -2935,7 +2969,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
        if (vmap_pages_range(addr, addr + size, prot, area->pages,
                        page_shift) < 0) {
-               warn_alloc(gfp_mask, NULL,
+               warn_alloc(orig_gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to map pages",
                        area->nr_pages * PAGE_SIZE);
                goto fail;
@@ -2961,8 +2995,16 @@ fail:
  * @caller:              caller's return address
  *
  * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags.  Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * allocator with @gfp_mask flags. Please note that the full set of gfp
+ * flags is not supported. GFP_KERNEL is the preferred allocation mode,
+ * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are
+ * not supported. Among the reclaim modifiers, __GFP_DIRECT_RECLAIM is
+ * required (i.e. GFP_NOWAIT is not supported) and only __GFP_NOFAIL is
+ * supported (i.e. __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not).
+ * __GFP_NOWARN can be used to suppress error messages about failures.
+ *
+ * Map them into contiguous kernel virtual space, using a pagetable
+ * protection of @prot.
  *
  * Return: the address of the area or %NULL on failure
  */
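
A hedged usage sketch of the rules above (an illustrative caller, not part of this change; __vmalloc() is the in-tree API, the function name and size are invented):

#include <linux/vmalloc.h>

static void *grab_scratch_buffer(void)
{
	/*
	 * GFP_KERNEL is preferred; GFP_NOFS/GFP_NOIO also work here.
	 * GFP_NOWAIT would violate the __GFP_DIRECT_RECLAIM requirement.
	 * __GFP_NOWARN silences the warn_alloc() messages on failure.
	 */
	return __vmalloc(16 * PAGE_SIZE, GFP_NOFS | __GFP_NOWARN);
}
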
@@ -3856,6 +3898,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 {
        if (IS_ENABLED(CONFIG_NUMA)) {
                unsigned int nr, *counters = m->private;
+               unsigned int step = 1U << vm_area_page_order(v);
 
                if (!counters)
                        return;
@@ -3867,9 +3910,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 
                memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 
-               for (nr = 0; nr < v->nr_pages; nr++)
-                       counters[page_to_nid(v->pages[nr])]++;
-
+               for (nr = 0; nr < v->nr_pages; nr += step)
+                       counters[page_to_nid(v->pages[nr])] += step;
                for_each_node_state(nr, N_HIGH_MEMORY)
                        if (counters[nr])
                                seq_printf(m, " N%u=%u", nr, counters[nr]);
@@ -3905,7 +3947,7 @@ static int s_show(struct seq_file *m, void *p)
                        (void *)va->va_start, (void *)va->va_end,
                        va->va_end - va->va_start);
 
-               return 0;
+               goto final;
        }
 
        v = va->vm;
@@ -3946,6 +3988,7 @@ static int s_show(struct seq_file *m, void *p)
        /*
         * As a final step, dump "unpurged" areas.
         */
+final:
        if (list_is_last(&va->list, &vmap_area_list))
                show_purge_info(m);
 
index 76518e4166dc931acf0b4ae59f80d97609738b74..b52644771cc438f80f032390eaec7c87423c9ae7 100644 (file)
@@ -308,7 +308,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
                         * asserted for a second in which subsequent
                         * pressure events can occur.
                         */
-                       memcg->socket_pressure = jiffies + HZ;
+                       WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
                }
        }
 }
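
The WRITE_ONCE() above is one half of a marked-access pair; a sketch of the matching reader (modelled on mem_cgroup_under_socket_pressure(), with the exact call site treated as an assumption):

static bool socket_pressure_active(struct mem_cgroup *memcg)
{
	/* Marked load, pairing with the WRITE_ONCE() in vmpressure(). */
	return time_before(jiffies, READ_ONCE(memcg->socket_pressure));
}
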
index 71f178f85f5b0b4d301c49df25015a6a84ed3e51..ef4a6dc7f0005f99d4765111168af7563ea5f709 100644 (file)
@@ -1021,6 +1021,91 @@ static void handle_write_error(struct address_space *mapping,
        unlock_page(page);
 }
 
+void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
+{
+       wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
+       long timeout, ret;
+       DEFINE_WAIT(wait);
+
+       /*
+        * Do not throttle IO workers, kthreads other than kswapd or
+        * workqueues. They may be required for reclaim to make
+        * forward progress (e.g. journalling workqueues or kthreads).
+        */
+       if (!current_is_kswapd() &&
+           current->flags & (PF_IO_WORKER|PF_KTHREAD))
+               return;
+
+       /*
+        * These figures are pulled out of thin air.
+        * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
+        * parallel reclaimers which is a short-lived event so the timeout is
+        * short. Failing to make progress or waiting on writeback are
+        * potentially long-lived events so use a longer timeout. This is shaky
+        * logic as a failure to make progress could be due to anything from
+        * writeback to a slow device to excessively referenced pages at the tail
+        * of the inactive LRU.
+        */
+       switch (reason) {
+       case VMSCAN_THROTTLE_WRITEBACK:
+               timeout = HZ/10;
+
+               if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
+                       WRITE_ONCE(pgdat->nr_reclaim_start,
+                               node_page_state(pgdat, NR_THROTTLED_WRITTEN));
+               }
+
+               break;
+       case VMSCAN_THROTTLE_NOPROGRESS:
+               timeout = HZ/2;
+               break;
+       case VMSCAN_THROTTLE_ISOLATED:
+               timeout = HZ/50;
+               break;
+       default:
+               WARN_ON_ONCE(1);
+               timeout = HZ;
+               break;
+       }
+
+       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+       ret = schedule_timeout(timeout);
+       finish_wait(wqh, &wait);
+
+       if (reason == VMSCAN_THROTTLE_WRITEBACK)
+               atomic_dec(&pgdat->nr_writeback_throttled);
+
+       trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
+                               jiffies_to_usecs(timeout - ret),
+                               reason);
+}
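
For concreteness, the timeouts above are expressed in jiffies as fractions of HZ, so the wall-clock budgets are independent of CONFIG_HZ: roughly 100ms for VMSCAN_THROTTLE_WRITEBACK, 500ms for VMSCAN_THROTTLE_NOPROGRESS, 20ms for VMSCAN_THROTTLE_ISOLATED, and a full second for the WARN_ON_ONCE() fallback.
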
+
+/*
+ * Account for pages written if tasks are throttled waiting on dirty
+ * pages to clean. If enough pages have been cleaned since throttling
+ * started then wakeup the throttled tasks.
+ */
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+                                                       int nr_throttled)
+{
+       unsigned long nr_written;
+
+       node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
+
+       /*
+        * This is an inaccurate read as the per-cpu deltas may not
+        * be synchronised. However, given that the system is
+        * writeback throttled, it is not worth taking the penalty
+        * of getting an accurate count. At worst, the throttle
+        * timeout guarantees forward progress.
+        */
+       nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
+               READ_ONCE(pgdat->nr_reclaim_start);
+
+       if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
+               wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
+}
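
A worked example of the wake-up condition: SWAP_CLUSTER_MAX is 32 in this tree, so with nr_throttled == 4 the throttled tasks are woken once more than 128 pages have completed writeback since pgdat->nr_reclaim_start was snapshotted.
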
+
 /* possible outcome of pageout() */
 typedef enum {
        /* failed to write page out, page is locked */
@@ -1352,7 +1437,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
 {
        int target_nid = next_demotion_node(pgdat->node_id);
        unsigned int nr_succeeded;
-       int err;
 
        if (list_empty(demote_pages))
                return 0;
@@ -1361,7 +1445,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
                return 0;
 
        /* Demotion ignores all cpuset and mempolicy settings */
-       err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+       migrate_pages(demote_pages, alloc_demote_page, NULL,
                            target_nid, MIGRATE_ASYNC, MR_DEMOTION,
                            &nr_succeeded);
 
@@ -1427,9 +1511,8 @@ retry:
 
                /*
                 * The number of dirty pages determines if a node is marked
-                * reclaim_congested which affects wait_iff_congested. kswapd
-                * will stall and start writing pages if the tail of the LRU
-                * is all dirty unqueued pages.
+                * reclaim_congested. kswapd will stall and start writing
+                * pages if the tail of the LRU is all dirty unqueued pages.
                 */
                page_check_dirty_writeback(page, &dirty, &writeback);
                if (dirty || writeback)
@@ -2135,6 +2218,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
                struct scan_control *sc)
 {
        unsigned long inactive, isolated;
+       bool too_many;
 
        if (current_is_kswapd())
                return 0;
@@ -2158,7 +2242,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
        if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                inactive >>= 3;
 
-       return isolated > inactive;
+       too_many = isolated > inactive;
+
+       /* Wake up tasks throttled due to too_many_isolated. */
+       if (!too_many)
+               wake_throttle_isolated(pgdat);
+
+       return too_many;
 }
 
 /*
@@ -2267,8 +2357,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                        return 0;
 
                /* wait a bit for the reclaimer. */
-               msleep(100);
                stalled = true;
+               reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
 
                /* We are about to die and free our memory. Return now. */
                if (fatal_signal_pending(current))
@@ -3196,19 +3286,19 @@ again:
                 * If kswapd scans pages marked for immediate
                 * reclaim and under writeback (nr_immediate), it
                 * implies that pages are cycling through the LRU
-                * faster than they are written so also forcibly stall.
+                * faster than they are written so forcibly stall
+                * until some pages complete writeback.
                 */
                if (sc->nr.immediate)
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+                       reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
        }
 
        /*
-        * Tag a node/memcg as congested if all the dirty pages
-        * scanned were backed by a congested BDI and
-        * wait_iff_congested will stall.
+        * Tag a node/memcg as congested if all the dirty pages were marked
+        * for writeback and immediate reclaim (counted in nr.congested).
         *
         * Legacy memcg will stall in page writeback so avoid forcibly
-        * stalling in wait_iff_congested().
+        * stalling in reclaim_throttle().
         */
        if ((current_is_kswapd() ||
             (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@@ -3216,15 +3306,15 @@ again:
                set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
 
        /*
-        * Stall direct reclaim for IO completions if underlying BDIs
-        * and node is congested. Allow kswapd to continue until it
+        * Stall direct reclaim for IO completions if the lruvec
+        * node is congested. Allow kswapd to continue until it
         * starts encountering unqueued dirty pages or cycling through
         * the LRU too quickly.
         */
        if (!current_is_kswapd() && current_may_throttle() &&
            !sc->hibernation_mode &&
            test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
-               wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+               reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
 
        if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                    sc))
@@ -3272,6 +3362,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
        return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
 }
 
+static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+       /*
+        * If reclaim is making progress greater than 12.5% efficiency
+        * (nr_reclaimed > nr_scanned / 8) then wake all the NOPROGRESS
+        * throttled tasks.
+        */
+       if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
+               wait_queue_head_t *wqh;
+
+               wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
+               if (waitqueue_active(wqh))
+                       wake_up(wqh);
+
+               return;
+       }
+
+       /*
+        * Do not throttle kswapd on NOPROGRESS as it will throttle on
+        * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
+        * writeback and marked for immediate reclaim at the tail of
+        * the LRU.
+        */
+       if (current_is_kswapd())
+               return;
+
+       /* Throttle if making no progress at high priorities. */
+       if (sc->priority < DEF_PRIORITY - 2)
+               reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
+}
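
In numbers: nr_reclaimed > (nr_scanned >> 3) fires at better than one page reclaimed per eight scanned, i.e. 12.5% efficiency. Scanning 1024 pages and reclaiming 200 wakes the NOPROGRESS waiters; reclaiming only 100 does not, and a non-kswapd reclaimer whose scan priority has dropped below DEF_PRIORITY - 2 then throttles itself.
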
+
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -3356,6 +3476,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                        continue;
                last_pgdat = zone->zone_pgdat;
                shrink_node(zone->zone_pgdat, sc);
+               consider_reclaim_throttle(zone->zone_pgdat, sc);
        }
 
        /*
@@ -4302,6 +4423,7 @@ static int kswapd(void *p)
 
        WRITE_ONCE(pgdat->kswapd_order, 0);
        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+       atomic_set(&pgdat->nr_writeback_throttled, 0);
        for ( ; ; ) {
                bool ret;
 
index 8ce2620344b2f8b4e673c8f48e92945b874ef36d..d701c335628c86c799bb33c7dba2f42f1c7b6fed 100644 (file)
@@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
 EXPORT_SYMBOL(vm_zone_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
+{
+       unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+       int cpu;
+       enum numa_stat_item item;
+
+       for_each_online_cpu(cpu) {
+               struct per_cpu_zonestat *pzstats;
+
+               pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+               for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+                       zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
+       }
+
+       for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+               zone_numa_event_add(zone_numa_events[item], zone, item);
+}
+
+void fold_vm_numa_events(void)
+{
+       struct zone *zone;
+
+       for_each_populated_zone(zone)
+               fold_vm_zone_numa_events(zone);
+}
+#endif
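
A note on the xchg()-based fold above (the move itself is mechanical): xchg() reads and zeroes each per-cpu counter in a single atomic step, so an event counted concurrently on another CPU is either included in this fold or left in the per-cpu slot for the next one, never lost.
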
+
 #ifdef CONFIG_SMP
 
 int calculate_pressure_threshold(struct zone *zone)
@@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff)
        return changes;
 }
 
-#ifdef CONFIG_NUMA
-static void fold_vm_zone_numa_events(struct zone *zone)
-{
-       unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
-       int cpu;
-       enum numa_stat_item item;
-
-       for_each_online_cpu(cpu) {
-               struct per_cpu_zonestat *pzstats;
-
-               pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
-               for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
-                       zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
-       }
-
-       for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
-               zone_numa_event_add(zone_numa_events[item], zone, item);
-}
-
-void fold_vm_numa_events(void)
-{
-       struct zone *zone;
-
-       for_each_populated_zone(zone)
-               fold_vm_zone_numa_events(zone);
-}
-#endif
-
 /*
  * Update the zone counters for the current cpu.
  *
@@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone,
        for (order = 0; order < MAX_ORDER; order++) {
                unsigned long blocks;
 
-               /* Count number of free blocks */
-               blocks = zone->free_area[order].nr_free;
+               /*
+                * Count number of free blocks.
+                *
+                * Access to nr_free is lockless as nr_free is used only for
+                * diagnostic purposes. Use data_race to avoid KCSAN warning.
+                */
+               blocks = data_race(zone->free_area[order].nr_free);
                info->free_blocks_total += blocks;
 
                /* Count free base pages */
@@ -1225,6 +1230,7 @@ const char * const vmstat_text[] = {
        "nr_vmscan_immediate_reclaim",
        "nr_dirtied",
        "nr_written",
+       "nr_throttled_written",
        "nr_kernel_misc_reclaimable",
        "nr_foll_pin_acquired",
        "nr_foll_pin_released",
@@ -1445,7 +1451,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 
        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
        for (order = 0; order < MAX_ORDER; ++order)
-               seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+               /*
+                * Access to nr_free is lockless as nr_free is used only for
+                * printing purposes. Use data_race to avoid KCSAN warning.
+                */
+               seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
        seq_putc(m, '\n');
 }
 
@@ -1656,6 +1666,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
        }
        seq_printf(m,
                   "\n  pages free     %lu"
+                  "\n        boost    %lu"
                   "\n        min      %lu"
                   "\n        low      %lu"
                   "\n        high     %lu"
@@ -1664,6 +1675,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   "\n        managed  %lu"
                   "\n        cma      %lu",
                   zone_page_state(zone, NR_FREE_PAGES),
+                  zone->watermark_boost,
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),
@@ -2179,7 +2191,7 @@ static void extfrag_show_print(struct seq_file *m,
        for (order = 0; order < MAX_ORDER; ++order) {
                fill_contig_page_info(zone, order, &info);
                index = __fragmentation_index(order, &info);
-               seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+               seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
        }
 
        seq_putc(m, '\n');
index 68e8831068f4b52bb792fb0d5bc29dbee996039d..b897ce3b399a1019a7f62d00fff434c685a2794d 100644 (file)
@@ -1830,10 +1830,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool)
        VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
        atomic_long_dec(&pool->isolated_pages);
        /*
-        * There's no possibility of racing, since wait_for_isolated_drain()
-        * checks the isolated count under &class->lock after enqueuing
-        * on migration_wait.
+        * Checking pool->destroying must happen after atomic_long_dec()
+        * for pool->isolated_pages above. Paired with the smp_mb() in
+        * zs_unregister_migration().
         */
+       smp_mb__after_atomic();
        if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
                wake_up_all(&pool->migration_wait);
 }
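
The pairing noted in the comment follows the usual two-sided barrier pattern: smp_mb__after_atomic() orders the atomic_long_dec() before the pool->destroying load here, while the smp_mb() in zs_unregister_migration() orders the store to destroying before its re-read of isolated_pages, so at least one side is guaranteed to observe the other's update.
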
index bc7f419184aa502f549c25aeb12a09f1067d0a37..3c6498dab6bd5ca1ad40aeec94e0c2d939504e48 100644 (file)
 #include <linux/random.h>
 #include <linux/memblock.h>
 #include <linux/highmem.h>
-#include <linux/swap.h>
 #include <linux/cache.h>
 #include <linux/err.h>
 #include <linux/time.h>
index 2fffcf2b54f3f304c6643d04d2c066d4ea6f5de4..319dd7bbfe33d64d25c36bc3a1cd3bd4caf0a779 100644 (file)
@@ -78,7 +78,6 @@
 #include <asm/ioctls.h>
 #include <linux/memblock.h>
 #include <linux/highmem.h>
-#include <linux/swap.h>
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/module.h>
index e62b40bd349e22910e19a2241afeb97aa5ef97c7..816f74dadfa385b63985960cc9143b989d09c933 100644 (file)
@@ -24,7 +24,6 @@
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/workqueue.h>
-#include <linux/swap.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 
index 896b8f5bc885350980a9547fc921583515dff7ec..04a060ac7fdf6e1f7de8fbf19880870ece8ea520 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/openvswitch.h>
 #include <linux/netlink.h>
 #include <linux/rculist.h>
-#include <linux/swap.h>
 
 #include <net/netlink.h>
 #include <net/genetlink.h>
index ec0f52567c16ffbd372e9901faa370a24c60cf39..35928fefae3327f97688f0857de63bc17e3429d6 100644 (file)
@@ -33,7 +33,6 @@
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
 #include <linux/highmem.h>
-#include <linux/swap.h>
 #include <linux/slab.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
index b0503ef058d3343abf159e7378663979128c997b..bec3528aa2de1369619940999d300b1b9c1849d2 100644 (file)
@@ -26,7 +26,7 @@ config SAMPLE_TRACE_PRINTK
 config SAMPLE_FTRACE_DIRECT
        tristate "Build register_ftrace_direct() example"
        depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m
-       depends on X86_64 # has x86_64 inlined asm
+       depends on HAVE_SAMPLE_FTRACE_DIRECT
        help
          This builds an ftrace direct function example
          that hooks to wake_up_process and prints the parameters.
@@ -120,6 +120,15 @@ config SAMPLE_CONNECTOR
          with it.
          See also Documentation/driver-api/connector.rst
 
+config SAMPLE_FANOTIFY_ERROR
+       bool "Build fanotify error monitoring sample"
+       depends on FANOTIFY && CC_CAN_LINK && HEADERS_INSTALL
+       help
+         When enabled, this builds example code that uses the
+         FAN_FS_ERROR fanotify mechanism to monitor filesystem
+         errors.
+         See also Documentation/admin-guide/filesystem-monitoring.rst.
+
 config SAMPLE_HIDRAW
        bool "hidraw sample"
        depends on CC_CAN_LINK && HEADERS_INSTALL
@@ -224,3 +233,9 @@ config SAMPLE_WATCH_QUEUE
          sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.
 
 endif # SAMPLES
+
+config HAVE_SAMPLE_FTRACE_DIRECT
+       bool
+
+config HAVE_SAMPLE_FTRACE_MULTI_DIRECT
+       bool
index 087e0988ccc569580240bcc719c2593be83f8b46..b7b98307c2b4051cb84e6bd0c38c3b6dc21fb864 100644 (file)
@@ -5,6 +5,7 @@ subdir-$(CONFIG_SAMPLE_AUXDISPLAY)      += auxdisplay
 subdir-$(CONFIG_SAMPLE_ANDROID_BINDERFS) += binderfs
 obj-$(CONFIG_SAMPLE_CONFIGFS)          += configfs/
 obj-$(CONFIG_SAMPLE_CONNECTOR)         += connector/
+obj-$(CONFIG_SAMPLE_FANOTIFY_ERROR)    += fanotify/
 subdir-$(CONFIG_SAMPLE_HIDRAW)         += hidraw
 obj-$(CONFIG_SAMPLE_HW_BREAKPOINT)     += hw_breakpoint/
 obj-$(CONFIG_SAMPLE_KDB)               += kdb/
@@ -21,6 +22,7 @@ subdir-$(CONFIG_SAMPLE_TIMER)         += timers
 obj-$(CONFIG_SAMPLE_TRACE_EVENTS)      += trace_events/
 obj-$(CONFIG_SAMPLE_TRACE_PRINTK)      += trace_printk/
 obj-$(CONFIG_SAMPLE_FTRACE_DIRECT)     += ftrace/
+obj-$(CONFIG_SAMPLE_FTRACE_MULTI_DIRECT) += ftrace/
 obj-$(CONFIG_SAMPLE_TRACE_ARRAY)       += ftrace/
 subdir-$(CONFIG_SAMPLE_UHID)           += uhid
 obj-$(CONFIG_VIDEO_PCI_SKELETON)       += v4l/
diff --git a/samples/fanotify/.gitignore b/samples/fanotify/.gitignore
new file mode 100644 (file)
index 0000000..d74593e
--- /dev/null
@@ -0,0 +1 @@
+fs-monitor
diff --git a/samples/fanotify/Makefile b/samples/fanotify/Makefile
new file mode 100644 (file)
index 0000000..e20db1b
--- /dev/null
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+userprogs-always-y += fs-monitor
+
+userccflags += -I usr/include -Wall
+
diff --git a/samples/fanotify/fs-monitor.c b/samples/fanotify/fs-monitor.c
new file mode 100644 (file)
index 0000000..608db24
--- /dev/null
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021, Collabora Ltd.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <err.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/fanotify.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef FAN_FS_ERROR
+#define FAN_FS_ERROR           0x00008000
+#define FAN_EVENT_INFO_TYPE_ERROR      5
+
+struct fanotify_event_info_error {
+       struct fanotify_event_info_header hdr;
+       __s32 error;
+       __u32 error_count;
+};
+#endif
+
+#ifndef FILEID_INO32_GEN
+#define FILEID_INO32_GEN       1
+#endif
+
+#ifndef FILEID_INVALID
+#define        FILEID_INVALID          0xff
+#endif
+
+static void print_fh(struct file_handle *fh)
+{
+       int i;
+       uint32_t *h = (uint32_t *) fh->f_handle;
+
+       printf("\tfh: ");
+       for (i = 0; i < fh->handle_bytes; i++)
+               printf("%hhx", fh->f_handle[i]);
+       printf("\n");
+
+       printf("\tdecoded fh: ");
+       if (fh->handle_type == FILEID_INO32_GEN)
+               printf("inode=%u gen=%u\n", h[0], h[1]);
+       else if (fh->handle_type == FILEID_INVALID && !fh->handle_bytes)
+               printf("Type %d (Superblock error)\n", fh->handle_type);
+       else
+               printf("Type %d (Unknown)\n", fh->handle_type);
+}
+
+static void handle_notifications(char *buffer, int len)
+{
+       struct fanotify_event_metadata *event =
+               (struct fanotify_event_metadata *) buffer;
+       struct fanotify_event_info_header *info;
+       struct fanotify_event_info_error *err;
+       struct fanotify_event_info_fid *fid;
+       int off;
+
+       for (; FAN_EVENT_OK(event, len); event = FAN_EVENT_NEXT(event, len)) {
+
+               if (event->mask != FAN_FS_ERROR) {
+                       printf("unexpected FAN MARK: %llx\n",
+                                                       (unsigned long long)event->mask);
+                       goto next_event;
+               }
+
+               if (event->fd != FAN_NOFD) {
+                       printf("Unexpected fd (!= FAN_NOFD)\n");
+                       goto next_event;
+               }
+
+               printf("FAN_FS_ERROR (len=%d)\n", event->event_len);
+
+               for (off = sizeof(*event) ; off < event->event_len;
+                    off += info->len) {
+                       info = (struct fanotify_event_info_header *)
+                               ((char *) event + off);
+
+                       switch (info->info_type) {
+                       case FAN_EVENT_INFO_TYPE_ERROR:
+                               err = (struct fanotify_event_info_error *) info;
+
+                               printf("\tGeneric Error Record: len=%d\n",
+                                      err->hdr.len);
+                               printf("\terror: %d\n", err->error);
+                               printf("\terror_count: %d\n", err->error_count);
+                               break;
+
+                       case FAN_EVENT_INFO_TYPE_FID:
+                               fid = (struct fanotify_event_info_fid *) info;
+
+                               printf("\tfsid: %x%x\n",
+                                      fid->fsid.val[0], fid->fsid.val[1]);
+                               print_fh((struct file_handle *) &fid->handle);
+                               break;
+
+                       default:
+                               printf("\tUnknown info type=%d len=%d:\n",
+                                      info->info_type, info->len);
+                       }
+               }
+next_event:
+               printf("---\n\n");
+       }
+}
+
+int main(int argc, char **argv)
+{
+       int fd;
+
+       char buffer[BUFSIZ];
+
+       if (argc < 2) {
+               printf("Missing path argument\n");
+               return 1;
+       }
+
+       fd = fanotify_init(FAN_CLASS_NOTIF|FAN_REPORT_FID, O_RDONLY);
+       if (fd < 0)
+               errx(1, "fanotify_init");
+
+       if (fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM,
+                         FAN_FS_ERROR, AT_FDCWD, argv[1])) {
+               errx(1, "fanotify_mark");
+       }
+
+       while (1) {
+               int n = read(fd, buffer, BUFSIZ);
+
+               if (n < 0)
+                       errx(1, "read");
+
+               handle_notifications(buffer, n);
+       }
+
+       return 0;
+}
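
A hedged usage note (not part of the patch): build the sample with CONFIG_SAMPLE_FANOTIFY_ERROR=y, then run it as root against a mount point, e.g. ./fs-monitor /mnt, and it blocks in read() printing one record per FAN_FS_ERROR event. The monitored filesystem must actually report errors through fanotify; ext4 was the first filesystem wired up for FAN_FS_ERROR, as an assumption about the wider series rather than something shown in this hunk.
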
index ab1d1c05c2886ff2d8f56c4149e33ff04a41d33a..e8a3f8520a44e43eec906daafacb364db27421c3 100644 (file)
@@ -3,7 +3,7 @@
 obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o
 obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o
 obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o
-obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-multi.o
+obj-$(CONFIG_SAMPLE_FTRACE_MULTI_DIRECT) += ftrace-direct-multi.o
 
 CFLAGS_sample-trace-array.o := -I$(src)
 obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o
index 5b9a09957c6e0e653590affbf1b214b7328262f5..690e4a9ff33304f065c00422511b0aa829d3e7df 100644 (file)
@@ -2,6 +2,7 @@
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/ftrace.h>
+#include <asm/asm-offsets.h>
 
 void my_direct_func1(void)
 {
@@ -18,6 +19,8 @@ extern void my_tramp2(void *);
 
 static unsigned long my_ip = (unsigned long)schedule;
 
+#ifdef CONFIG_X86_64
+
 asm (
 "      .pushsection    .text, \"ax\", @progbits\n"
 "      .type           my_tramp1, @function\n"
@@ -41,6 +44,47 @@ asm (
 "      .popsection\n"
 );
 
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (
+"      .pushsection    .text, \"ax\", @progbits\n"
+"      .type           my_tramp1, @function\n"
+"      .globl          my_tramp1\n"
+"   my_tramp1:"
+"      lgr             %r1,%r15\n"
+"      stmg            %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      stg             %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      aghi            %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+"      stg             %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+"      brasl           %r14,my_direct_func1\n"
+"      aghi            %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+"      lmg             %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      lg              %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      lgr             %r1,%r0\n"
+"      br              %r1\n"
+"      .size           my_tramp1, .-my_tramp1\n"
+"      .type           my_tramp2, @function\n"
+"      .globl          my_tramp2\n"
+"   my_tramp2:"
+"      lgr             %r1,%r15\n"
+"      stmg            %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      stg             %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      aghi            %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+"      stg             %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+"      brasl           %r14,my_direct_func2\n"
+"      aghi            %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+"      lmg             %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      lg              %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      lgr             %r1,%r0\n"
+"      br              %r1\n"
+"      .size           my_tramp2, .-my_tramp2\n"
+"      .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
+
 static unsigned long my_tramp = (unsigned long)my_tramp1;
 static unsigned long tramps[2] = {
        (unsigned long)my_tramp1,
index 3f0079c9bd6fa2bb99e303731d34911b4f8f035f..6e0de725bf221f97a60de3d8a3062b2a29dd7b5d 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/mm.h> /* for handle_mm_fault() */
 #include <linux/ftrace.h>
+#include <asm/asm-offsets.h>
 
 void my_direct_func(struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags)
@@ -13,6 +14,8 @@ void my_direct_func(struct vm_area_struct *vma,
 
 extern void my_tramp(void *);
 
+#ifdef CONFIG_X86_64
+
 asm (
 "      .pushsection    .text, \"ax\", @progbits\n"
 "      .type           my_tramp, @function\n"
@@ -33,6 +36,31 @@ asm (
 "      .popsection\n"
 );
 
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (
+"      .pushsection    .text, \"ax\", @progbits\n"
+"      .type           my_tramp, @function\n"
+"      .globl          my_tramp\n"
+"   my_tramp:"
+"      lgr             %r1,%r15\n"
+"      stmg            %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      stg             %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      aghi            %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+"      stg             %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+"      brasl           %r14,my_direct_func\n"
+"      aghi            %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+"      lmg             %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      lg              %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      lgr             %r1,%r0\n"
+"      br              %r1\n"
+"      .size           my_tramp, .-my_tramp\n"
+"      .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
 
 static int __init ftrace_direct_init(void)
 {
index a2729d1ef17f538e66aa2e443b286c0560dee99f..a30aa42ec76a8ff92ee0377264eb8166e7d206b2 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/sched.h> /* for wake_up_process() */
 #include <linux/ftrace.h>
+#include <asm/asm-offsets.h>
 
 void my_direct_func(struct task_struct *p)
 {
@@ -11,6 +12,8 @@ void my_direct_func(struct task_struct *p)
 
 extern void my_tramp(void *);
 
+#ifdef CONFIG_X86_64
+
 asm (
 "      .pushsection    .text, \"ax\", @progbits\n"
 "      .type           my_tramp, @function\n"
@@ -27,6 +30,31 @@ asm (
 "      .popsection\n"
 );
 
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_S390
+
+asm (
+"      .pushsection    .text, \"ax\", @progbits\n"
+"      .type           my_tramp, @function\n"
+"      .globl          my_tramp\n"
+"   my_tramp:"
+"      lgr             %r1,%r15\n"
+"      stmg            %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      stg             %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      aghi            %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n"
+"      stg             %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n"
+"      brasl           %r14,my_direct_func\n"
+"      aghi            %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n"
+"      lmg             %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n"
+"      lg              %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n"
+"      lgr             %r1,%r0\n"
+"      br              %r1\n"
+"      .size           my_tramp, .-my_tramp\n"
+"      .popsection\n"
+);
+
+#endif /* CONFIG_S390 */
 
 static int __init ftrace_direct_init(void)
 {
index 3efc984d4c6905721d448940c0cbd3f88f84a6c2..78656b527fe59a2b944b29ce808d8842f3c87bea 100644 (file)
@@ -155,7 +155,7 @@ $(obj)/%.ll: $(src)/%.c FORCE
 # (See cmd_cc_o_c + relevant part of rule_cc_o_c)
 
 quiet_cmd_cc_o_c = CC $(quiet_modtag)  $@
-      cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $<
+      cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< $(cmd_objtool)
 
 ifdef CONFIG_MODVERSIONS
 # When module versioning is enabled the following steps are executed:
@@ -224,27 +224,38 @@ cmd_record_mcount = $(if $(findstring $(strip $(CC_FLAGS_FTRACE)),$(_c_flags)),
 endif # CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT
 
 ifdef CONFIG_STACK_VALIDATION
-ifndef CONFIG_LTO_CLANG
 
-__objtool_obj := $(objtree)/tools/objtool/objtool
+objtool := $(objtree)/tools/objtool/objtool
+
+objtool_args =                                                         \
+       $(if $(CONFIG_UNWINDER_ORC),orc generate,check)                 \
+       $(if $(part-of-module), --module)                               \
+       $(if $(CONFIG_FRAME_POINTER),, --no-fp)                         \
+       $(if $(CONFIG_GCOV_KERNEL)$(CONFIG_LTO_CLANG), --no-unreachable)\
+       $(if $(CONFIG_RETPOLINE), --retpoline)                          \
+       $(if $(CONFIG_X86_SMAP), --uaccess)                             \
+       $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount)
+
+cmd_objtool = $(if $(objtool-enabled), ; $(objtool) $(objtool_args) $@)
+cmd_gen_objtooldep = $(if $(objtool-enabled), { echo ; echo '$@: $$(wildcard $(objtool))' ; } >> $(dot-target).cmd)
+
+endif # CONFIG_STACK_VALIDATION
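
For orientation, on a hypothetical x86-64 configuration with CONFIG_UNWINDER_ORC=y, CONFIG_RETPOLINE=y, CONFIG_X86_SMAP=y and CONFIG_FRAME_POINTER unset, the appended $(cmd_objtool) expands to roughly '; ./tools/objtool/objtool orc generate --no-fp --retpoline --uaccess foo.o' tacked onto the compile command, and cmd_gen_objtooldep records the objtool binary itself as a prerequisite in the .cmd file so objects rebuild when objtool changes.
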
+
+ifdef CONFIG_LTO_CLANG
+
+# Skip objtool for LLVM bitcode
+$(obj)/%.o: objtool-enabled :=
+
+else
 
 # 'OBJECT_FILES_NON_STANDARD := y': skip objtool checking for a directory
 # 'OBJECT_FILES_NON_STANDARD_foo.o := y': skip objtool checking for a file
 # 'OBJECT_FILES_NON_STANDARD_foo.o := n': override directory skip for a file
-cmd_objtool = $(if $(patsubst y%,, \
-       $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \
-       $(__objtool_obj) $(objtool_args) $@)
-objtool_obj = $(if $(patsubst y%,, \
-       $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \
-       $(__objtool_obj))
-
-endif # CONFIG_LTO_CLANG
-endif # CONFIG_STACK_VALIDATION
 
-# Rebuild all objects when objtool changes, or is enabled/disabled.
-objtool_dep = $(objtool_obj)                                   \
-             $(wildcard include/config/ORC_UNWINDER            \
-                        include/config/STACK_VALIDATION)
+$(obj)/%.o: objtool-enabled = $(if $(filter-out y%, \
+       $(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n),y)
+
+endif
 
 ifdef CONFIG_TRIM_UNUSED_KSYMS
 cmd_gen_ksymdeps = \
@@ -259,7 +270,7 @@ define rule_cc_o_c
        $(call cmd,gen_ksymdeps)
        $(call cmd,checksrc)
        $(call cmd,checkdoc)
-       $(call cmd,objtool)
+       $(call cmd,gen_objtooldep)
        $(call cmd,modversions_c)
        $(call cmd,record_mcount)
 endef
@@ -267,13 +278,12 @@ endef
 define rule_as_o_S
        $(call cmd_and_fixdep,as_o_S)
        $(call cmd,gen_ksymdeps)
-       $(call cmd,objtool)
+       $(call cmd,gen_objtooldep)
        $(call cmd,modversions_S)
 endef
 
 # Built-in and composite module parts
-.SECONDEXPANSION:
-$(obj)/%.o: $(src)/%.c $(recordmcount_source) $$(objtool_dep) FORCE
+$(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
        $(call if_changed_rule,cc_o_c)
        $(call cmd,force_checksrc)
 
@@ -285,14 +295,13 @@ cmd_cc_lto_link_modules =                                         \
        $(LD) $(ld_flags) -r -o $@                                      \
                $(shell [ -s $(@:.lto.o=.o.symversions) ] &&            \
                        echo -T $(@:.lto.o=.o.symversions))             \
-               --whole-archive $(filter-out FORCE,$^)
+               --whole-archive $(filter-out FORCE,$^)                  \
+               $(cmd_objtool)
 
-ifdef CONFIG_STACK_VALIDATION
 # objtool was skipped for LLVM bitcode, run it now that we have compiled
 # modules into native code
-cmd_cc_lto_link_modules += ;                                           \
-       $(objtree)/tools/objtool/objtool $(objtool_args) --module $@
-endif
+$(obj)/%.lto.o: objtool-enabled = y
+$(obj)/%.lto.o: part-of-module := y
 
 $(obj)/%.lto.o: $(obj)/%.o FORCE
        $(call if_changed,cc_lto_link_modules)
@@ -356,7 +365,7 @@ $(obj)/%.s: $(src)/%.S FORCE
        $(call if_changed_dep,cpp_s_S)
 
 quiet_cmd_as_o_S = AS $(quiet_modtag)  $@
-      cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $<
+      cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $< $(cmd_objtool)
 
 ifdef CONFIG_ASM_MODVERSIONS
 
@@ -375,7 +384,7 @@ cmd_modversions_S =                                                         \
        fi
 endif
 
-$(obj)/%.o: $(src)/%.S $$(objtool_dep) FORCE
+$(obj)/%.o: $(src)/%.S FORCE
        $(call if_changed_rule,as_o_S)
 
 targets += $(filter-out $(subdir-builtin), $(real-obj-y))
diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug
new file mode 100644 (file)
index 0000000..9f39b01
--- /dev/null
@@ -0,0 +1,33 @@
+DEBUG_CFLAGS   :=
+
+ifdef CONFIG_DEBUG_INFO_SPLIT
+DEBUG_CFLAGS   += -gsplit-dwarf
+else
+DEBUG_CFLAGS   += -g
+endif
+
+ifndef CONFIG_AS_IS_LLVM
+KBUILD_AFLAGS  += -Wa,-gdwarf-2
+endif
+
+ifndef CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT
+dwarf-version-$(CONFIG_DEBUG_INFO_DWARF4) := 4
+dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5
+DEBUG_CFLAGS   += -gdwarf-$(dwarf-version-y)
+endif
+
+ifdef CONFIG_DEBUG_INFO_REDUCED
+DEBUG_CFLAGS   += -fno-var-tracking
+ifdef CONFIG_CC_IS_GCC
+DEBUG_CFLAGS   += -femit-struct-debug-baseonly
+endif
+endif
+
+ifdef CONFIG_DEBUG_INFO_COMPRESSED
+DEBUG_CFLAGS   += -gz=zlib
+KBUILD_AFLAGS  += -gz=zlib
+KBUILD_LDFLAGS += --compress-debug-sections=zlib
+endif
+
+KBUILD_CFLAGS += $(DEBUG_CFLAGS)
+export DEBUG_CFLAGS
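
Reading the file above with a hypothetical configuration: CONFIG_DEBUG_INFO_DWARF5=y plus CONFIG_DEBUG_INFO_COMPRESSED=y (GCC assembler, no split DWARF, no reduced info) yields DEBUG_CFLAGS = -g -gdwarf-5 -gz=zlib, with -Wa,-gdwarf-2 and -gz=zlib added to KBUILD_AFLAGS and --compress-debug-sections=zlib to KBUILD_LDFLAGS.
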
index 56d50eb0cd8004524e309cab4ffb720ab4972aee..d1f865b8c0cbabce6643118f6b79d08725ac015e 100644 (file)
@@ -232,17 +232,6 @@ ifeq ($(CONFIG_LTO_CLANG),y)
 mod-prelink-ext := .lto
 endif
 
-# Objtool arguments are also needed for modfinal with LTO, so we define
-# then here to avoid duplication.
-objtool_args =                                                         \
-       $(if $(CONFIG_UNWINDER_ORC),orc generate,check)                 \
-       $(if $(part-of-module), --module)                               \
-       $(if $(CONFIG_FRAME_POINTER),, --no-fp)                         \
-       $(if $(CONFIG_GCOV_KERNEL)$(CONFIG_LTO_CLANG), --no-unreachable)\
-       $(if $(CONFIG_RETPOLINE), --retpoline)                          \
-       $(if $(CONFIG_X86_SMAP), --uaccess)                             \
-       $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount)
-
 # Useful for describing the dependency of composite objects
 # Usage:
 #   $(call multi_depend, multi_used_targets, suffix_to_remove, suffix_to_add)
index b74c65284fb2f6b8079282fc01ce2a855df367ac..77b612183c082a958ac35eca5008b28aef16f127 100644 (file)
@@ -103,7 +103,7 @@ snap-pkg:
 
 # tarball targets
 # ---------------------------------------------------------------------------
-tar-pkgs := dir-pkg tar-pkg targz-pkg tarbz2-pkg tarxz-pkg
+tar-pkgs := dir-pkg tar-pkg targz-pkg tarbz2-pkg tarxz-pkg tarzst-pkg
 PHONY += $(tar-pkgs)
 $(tar-pkgs):
        $(MAKE) -f $(srctree)/Makefile
@@ -130,10 +130,12 @@ $(if $(findstring tar-src,$@),,                                     \
 $(if $(findstring bz2,$@),$(KBZIP2),                                 \
 $(if $(findstring gz,$@),$(KGZIP),                                  \
 $(if $(findstring xz,$@),$(XZ),                                     \
-$(error unknown target $@))))                                       \
+$(if $(findstring zst,$@),$(ZSTD),                                  \
+$(error unknown target $@)))))                                      \
        -f -9 $(perf-tar).tar)
 
-perf-tar-pkgs := perf-tar-src-pkg perf-targz-src-pkg perf-tarbz2-src-pkg perf-tarxz-src-pkg
+perf-tar-pkgs := perf-tar-src-pkg perf-targz-src-pkg perf-tarbz2-src-pkg \
+                perf-tarxz-src-pkg perf-tarzst-src-pkg
 PHONY += $(perf-tar-pkgs)
 $(perf-tar-pkgs):
        $(call cmd,perf_tar)
@@ -153,9 +155,11 @@ help:
        @echo '  targz-pkg           - Build the kernel as a gzip compressed tarball'
        @echo '  tarbz2-pkg          - Build the kernel as a bzip2 compressed tarball'
        @echo '  tarxz-pkg           - Build the kernel as an xz compressed tarball'
+       @echo '  tarzst-pkg          - Build the kernel as a zstd compressed tarball'
        @echo '  perf-tar-src-pkg    - Build $(perf-tar).tar source tarball'
        @echo '  perf-targz-src-pkg  - Build $(perf-tar).tar.gz source tarball'
        @echo '  perf-tarbz2-src-pkg - Build $(perf-tar).tar.bz2 source tarball'
        @echo '  perf-tarxz-src-pkg  - Build $(perf-tar).tar.xz source tarball'
+       @echo '  perf-tarzst-src-pkg - Build $(perf-tar).tar.zst source tarball'
 
 .PHONY: $(PHONY)
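
Usage note (an assumption drawn from the targets above): 'make tarzst-pkg' emits a .tar.zst kernel tarball compressed by $(ZSTD), and like the other compressor variables it can be overridden on the command line, for instance 'make ZSTD=zstdmt tarzst-pkg' to use the multithreaded frontend.
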
index c27d2312cfc307f4469ce7322b06e5025e2fb017..88cb294dc4472e24fdd896905d61ba58d5ad834e 100755 (executable)
@@ -489,7 +489,8 @@ our $Attribute      = qr{
                        ____cacheline_aligned|
                        ____cacheline_aligned_in_smp|
                        ____cacheline_internodealigned_in_smp|
-                       __weak
+                       __weak|
+                       __alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\)
                  }x;
 our $Modifier;
 our $Inline    = qr{inline|__always_inline|noinline|__inline|__inline__};
index 31d884e35f2f1526857e4435ce5c7ee097e2a9f2..c711a196511c666edfda6ef9cc0dbe1875c5592a 100755 (executable)
@@ -126,7 +126,7 @@ if [ $marker -ne 0 ]; then
 fi
 echo Code starting with the faulting instruction  > $T.aa
 echo =========================================== >> $T.aa
-code=`echo $code | sed -e 's/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'`
+code=`echo $code | sed -e 's/\r//;s/ [<(]/ /;s/[>)] / /;s/ /,0x/g; s/[>)]$//'`
 echo -n "      .$type 0x" > $T.s
 echo $code >> $T.s
 disas $T 0
index 5d84b44a2a2a7169f644742b0b94abec44f805da..971da3598fe48b95c7c9a1d6710947eda378f54f 100644 (file)
@@ -646,19 +646,8 @@ static void check_conf(struct menu *menu)
 
                switch (input_mode) {
                case listnewconfig:
-                       if (sym->name) {
-                               const char *str;
-
-                               if (sym->type == S_STRING) {
-                                       str = sym_get_string_value(sym);
-                                       str = sym_escape_string_value(str);
-                                       printf("%s%s=%s\n", CONFIG_, sym->name, str);
-                                       free((void *)str);
-                               } else {
-                                       str = sym_get_string_value(sym);
-                                       printf("%s%s=%s\n", CONFIG_, sym->name, str);
-                               }
-                       }
+                       if (sym->name)
+                               print_symbol_for_listconfig(sym);
                        break;
                case helpnewconfig:
                        printf("-----\n");
index cf72680cd76925564be02e77e7fe6829a5a65061..42bc56ee238c8f5f9318f5d6cd9d8f1b57ba2b8d 100644 (file)
@@ -11,6 +11,7 @@
 #include <fcntl.h>
 #include <limits.h>
 #include <stdarg.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -129,41 +130,22 @@ static size_t depfile_prefix_len;
 /* touch depfile for symbol 'name' */
 static int conf_touch_dep(const char *name)
 {
-       int fd, ret;
-       char *d;
+       int fd;
 
        /* check overflow: prefix + name + '\0' must fit in buffer. */
        if (depfile_prefix_len + strlen(name) + 1 > sizeof(depfile_path))
                return -1;
 
-       d = depfile_path + depfile_prefix_len;
-       strcpy(d, name);
+       strcpy(depfile_path + depfile_prefix_len, name);
 
-       /* Assume directory path already exists. */
        fd = open(depfile_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-       if (fd == -1) {
-               if (errno != ENOENT)
-                       return -1;
-
-               ret = make_parent_dir(depfile_path);
-               if (ret)
-                       return ret;
-
-               /* Try it again. */
-               fd = open(depfile_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-               if (fd == -1)
-                       return -1;
-       }
+       if (fd == -1)
+               return -1;
        close(fd);
 
        return 0;
 }
 
-struct conf_printer {
-       void (*print_symbol)(FILE *, struct symbol *, const char *, void *);
-       void (*print_comment)(FILE *, const char *, void *);
-};
-
 static void conf_warning(const char *fmt, ...)
        __attribute__ ((format (printf, 1, 2)));
 
@@ -227,6 +209,13 @@ static const char *conf_get_autoconfig_name(void)
        return name ? name : "include/config/auto.conf";
 }
 
+static const char *conf_get_autoheader_name(void)
+{
+       char *name = getenv("KCONFIG_AUTOHEADER");
+
+       return name ? name : "include/generated/autoconf.h";
+}
+
 static int conf_set_sym_val(struct symbol *sym, int def, int def_flags, char *p)
 {
        char *p2;
@@ -594,169 +583,171 @@ int conf_read(const char *name)
        return 0;
 }
 
-/*
- * Kconfig configuration printer
- *
- * This printer is used when generating the resulting configuration after
- * kconfig invocation and `defconfig' files. Unset symbol might be omitted by
- * passing a non-NULL argument to the printer.
- *
- */
-static void
-kconfig_print_symbol(FILE *fp, struct symbol *sym, const char *value, void *arg)
+struct comment_style {
+       const char *decoration;
+       const char *prefix;
+       const char *postfix;
+};
+
+static const struct comment_style comment_style_pound = {
+       .decoration = "#",
+       .prefix = "#",
+       .postfix = "#",
+};
+
+static const struct comment_style comment_style_c = {
+       .decoration = " *",
+       .prefix = "/*",
+       .postfix = " */",
+};
+
+static void conf_write_heading(FILE *fp, const struct comment_style *cs)
 {
+       fprintf(fp, "%s\n", cs->prefix);
 
-       switch (sym->type) {
-       case S_BOOLEAN:
-       case S_TRISTATE:
-               if (*value == 'n') {
-                       bool skip_unset = (arg != NULL);
+       fprintf(fp, "%s Automatically generated file; DO NOT EDIT.\n",
+               cs->decoration);
 
-                       if (!skip_unset)
-                               fprintf(fp, "# %s%s is not set\n",
-                                   CONFIG_, sym->name);
-                       return;
-               }
-               break;
-       default:
-               break;
-       }
+       fprintf(fp, "%s %s\n", cs->decoration, rootmenu.prompt->text);
 
-       fprintf(fp, "%s%s=%s\n", CONFIG_, sym->name, value);
+       fprintf(fp, "%s\n", cs->postfix);
 }
 
-static void
-kconfig_print_comment(FILE *fp, const char *value, void *arg)
+/* The returned pointer must be freed on the caller side */
+static char *escape_string_value(const char *in)
 {
-       const char *p = value;
-       size_t l;
+       const char *p;
+       char *out;
+       size_t len;
 
-       for (;;) {
-               l = strcspn(p, "\n");
-               fprintf(fp, "#");
-               if (l) {
-                       fprintf(fp, " ");
-                       xfwrite(p, l, 1, fp);
-                       p += l;
-               }
-               fprintf(fp, "\n");
-               if (*p++ == '\0')
+       len = strlen(in) + strlen("\"\"") + 1;
+
+       p = in;
+       while (1) {
+               p += strcspn(p, "\"\\");
+
+               if (p[0] == '\0')
                        break;
+
+               len++;
+               p++;
        }
-}
 
-static struct conf_printer kconfig_printer_cb =
-{
-       .print_symbol = kconfig_print_symbol,
-       .print_comment = kconfig_print_comment,
-};
+       out = xmalloc(len);
+       out[0] = '\0';
+
+       strcat(out, "\"");
+
+       p = in;
+       while (1) {
+               len = strcspn(p, "\"\\");
+               strncat(out, p, len);
+               p += len;
+
+               if (p[0] == '\0')
+                       break;
+
+               strcat(out, "\\");
+               strncat(out, p++, 1);
+       }
+
+       strcat(out, "\"");
+
+       return out;
+}
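
A worked example of escape_string_value() (illustration only, not from the patch):

/*
 * in:  ab"c\          - two escapable bytes, so the first pass grows
 *                       len from 8 (5 bytes + 2 quotes + NUL) to 10
 * out: "ab\"c\\"      - 9 visible bytes plus the terminating NUL
 */
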
 
 /*
- * Header printer
+ * Kconfig configuration printer
  *
- * This printer is used when generating the `include/generated/autoconf.h' file.
+ * This printer is used when generating the resulting configuration after
+ * kconfig invocation and `defconfig' files. Unset symbols may be omitted by
+ * passing a non-NULL argument to the printer.
  */
-static void
-header_print_symbol(FILE *fp, struct symbol *sym, const char *value, void *arg)
+enum output_n { OUTPUT_N, OUTPUT_N_AS_UNSET, OUTPUT_N_NONE };
+
+static void __print_symbol(FILE *fp, struct symbol *sym, enum output_n output_n,
+                          bool escape_string)
 {
+       const char *val;
+       char *escaped = NULL;
 
-       switch (sym->type) {
-       case S_BOOLEAN:
-       case S_TRISTATE: {
-               const char *suffix = "";
+       if (sym->type == S_UNKNOWN)
+               return;
 
-               switch (*value) {
-               case 'n':
-                       break;
-               case 'm':
-                       suffix = "_MODULE";
-                       /* fall through */
-               default:
-                       fprintf(fp, "#define %s%s%s 1\n",
-                           CONFIG_, sym->name, suffix);
-               }
-               break;
-       }
-       case S_HEX: {
-               const char *prefix = "";
+       val = sym_get_string_value(sym);
 
-               if (value[0] != '0' || (value[1] != 'x' && value[1] != 'X'))
-                       prefix = "0x";
-               fprintf(fp, "#define %s%s %s%s\n",
-                   CONFIG_, sym->name, prefix, value);
-               break;
+       if ((sym->type == S_BOOLEAN || sym->type == S_TRISTATE) &&
+           output_n != OUTPUT_N && *val == 'n') {
+               if (output_n == OUTPUT_N_AS_UNSET)
+                       fprintf(fp, "# %s%s is not set\n", CONFIG_, sym->name);
+               return;
        }
-       case S_STRING:
-       case S_INT:
-               fprintf(fp, "#define %s%s %s\n",
-                   CONFIG_, sym->name, value);
-               break;
-       default:
-               break;
+
+       if (sym->type == S_STRING && escape_string) {
+               escaped = escape_string_value(val);
+               val = escaped;
        }
 
+       fprintf(fp, "%s%s=%s\n", CONFIG_, sym->name, val);
+
+       free(escaped);
 }
 
-static void
-header_print_comment(FILE *fp, const char *value, void *arg)
+static void print_symbol_for_dotconfig(FILE *fp, struct symbol *sym)
 {
-       const char *p = value;
-       size_t l;
+       __print_symbol(fp, sym, OUTPUT_N_AS_UNSET, true);
+}
 
-       fprintf(fp, "/*\n");
-       for (;;) {
-               l = strcspn(p, "\n");
-               fprintf(fp, " *");
-               if (l) {
-                       fprintf(fp, " ");
-                       xfwrite(p, l, 1, fp);
-                       p += l;
-               }
-               fprintf(fp, "\n");
-               if (*p++ == '\0')
-                       break;
-       }
-       fprintf(fp, " */\n");
+static void print_symbol_for_autoconf(FILE *fp, struct symbol *sym)
+{
+       __print_symbol(fp, sym, OUTPUT_N_NONE, true);
 }
 
-static struct conf_printer header_printer_cb =
+void print_symbol_for_listconfig(struct symbol *sym)
 {
-       .print_symbol = header_print_symbol,
-       .print_comment = header_print_comment,
-};
+       __print_symbol(stdout, sym, OUTPUT_N, true);
+}
 
-static void conf_write_symbol(FILE *fp, struct symbol *sym,
-                             struct conf_printer *printer, void *printer_arg)
+static void print_symbol_for_c(FILE *fp, struct symbol *sym)
 {
-       const char *str;
+       const char *val;
+       const char *sym_suffix = "";
+       const char *val_prefix = "";
+       char *escaped = NULL;
+
+       if (sym->type == S_UNKNOWN)
+               return;
+
+       val = sym_get_string_value(sym);
 
        switch (sym->type) {
-       case S_UNKNOWN:
+       case S_BOOLEAN:
+       case S_TRISTATE:
+               switch (*val) {
+               case 'n':
+                       return;
+               case 'm':
+                       sym_suffix = "_MODULE";
+                       /* fall through */
+               default:
+                       val = "1";
+               }
                break;
-       case S_STRING:
-               str = sym_get_string_value(sym);
-               str = sym_escape_string_value(str);
-               printer->print_symbol(fp, sym, str, printer_arg);
-               free((void *)str);
+       case S_HEX:
+               if (val[0] != '0' || (val[1] != 'x' && val[1] != 'X'))
+                       val_prefix = "0x";
                break;
+       case S_STRING:
+               escaped = escape_string_value(val);
+               val = escaped;
        default:
-               str = sym_get_string_value(sym);
-               printer->print_symbol(fp, sym, str, printer_arg);
+               break;
        }
-}
-
-static void
-conf_write_heading(FILE *fp, struct conf_printer *printer, void *printer_arg)
-{
-       char buf[256];
 
-       snprintf(buf, sizeof(buf),
-           "\n"
-           "Automatically generated file; DO NOT EDIT.\n"
-           "%s\n",
-           rootmenu.prompt->text);
+       fprintf(fp, "#define %s%s%s %s%s\n", CONFIG_, sym->name, sym_suffix,
+               val_prefix, val);
 
-       printer->print_comment(fp, buf, printer_arg);
+       free(escaped);
 }
 
 /*
@@ -815,7 +806,7 @@ int conf_write_defconfig(const char *filename)
                                                goto next_menu;
                                }
                        }
-                       conf_write_symbol(out, sym, &kconfig_printer_cb, NULL);
+                       print_symbol_for_dotconfig(out, sym);
                }
 next_menu:
                if (menu->list != NULL) {
@@ -875,7 +866,7 @@ int conf_write(const char *name)
        if (!out)
                return 1;
 
-       conf_write_heading(out, &kconfig_printer_cb, NULL);
+       conf_write_heading(out, &comment_style_pound);
 
        if (!conf_get_changed())
                sym_clear_all_valid();
@@ -902,7 +893,7 @@ int conf_write(const char *name)
                                need_newline = false;
                        }
                        sym->flags |= SYMBOL_WRITTEN;
-                       conf_write_symbol(out, sym, &kconfig_printer_cb, NULL);
+                       print_symbol_for_dotconfig(out, sym);
                }
 
 next:
@@ -952,32 +943,50 @@ next:
 }
 
 /* write a dependency file as used by kbuild to track dependencies */
-static int conf_write_dep(const char *name)
+static int conf_write_autoconf_cmd(const char *autoconf_name)
 {
+       char name[PATH_MAX], tmp[PATH_MAX];
        struct file *file;
        FILE *out;
+       int ret;
 
-       out = fopen("..config.tmp", "w");
-       if (!out)
-               return 1;
-       fprintf(out, "deps_config := \\\n");
-       for (file = file_list; file; file = file->next) {
-               if (file->next)
-                       fprintf(out, "\t%s \\\n", file->name);
-               else
-                       fprintf(out, "\t%s\n", file->name);
+       ret = snprintf(name, sizeof(name), "%s.cmd", autoconf_name);
+       if (ret >= sizeof(name)) /* check truncation */
+               return -1;
+
+       if (make_parent_dir(name))
+               return -1;
+
+       ret = snprintf(tmp, sizeof(tmp), "%s.cmd.tmp", autoconf_name);
+       if (ret >= sizeof(tmp)) /* check truncation */
+               return -1;
+
+       out = fopen(tmp, "w");
+       if (!out) {
+               perror("fopen");
+               return -1;
        }
-       fprintf(out, "\n%s: \\\n"
-                    "\t$(deps_config)\n\n", conf_get_autoconfig_name());
 
-       env_write_dep(out, conf_get_autoconfig_name());
+       fprintf(out, "deps_config := \\\n");
+       for (file = file_list; file; file = file->next)
+               fprintf(out, "\t%s \\\n", file->name);
+
+       fprintf(out, "\n%s: $(deps_config)\n\n", autoconf_name);
+
+       env_write_dep(out, autoconf_name);
 
        fprintf(out, "\n$(deps_config): ;\n");
+
+       if (ferror(out)) /* error check for all fprintf() calls */
+               return -1;
+
        fclose(out);
 
-       if (make_parent_dir(name))
-               return 1;
-       rename("..config.tmp", name);
+       if (rename(tmp, name)) {
+               perror("rename");
+               return -1;
+       }
+
        return 0;
 }
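The rewritten helper emits every dependency with a trailing " \"; the final backslash merely continues onto the blank line that follows, which make accepts, so the old special case for the last list element is gone. With a hypothetical two-file Kconfig tree, the generated include/config/auto.conf.cmd comes out roughly as:

    deps_config := \
    	Kconfig \
    	drivers/foo/Kconfig \

    include/config/auto.conf: $(deps_config)

    $(deps_config): ;

(env_write_dep() additionally records any environment variables referenced from Kconfig, between the target line and the empty recipe.)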
 
@@ -1053,63 +1062,83 @@ static int conf_touch_deps(void)
        return 0;
 }
 
+static int __conf_write_autoconf(const char *filename,
+                                void (*print_symbol)(FILE *, struct symbol *),
+                                const struct comment_style *comment_style)
+{
+       char tmp[PATH_MAX];
+       FILE *file;
+       struct symbol *sym;
+       int ret, i;
+
+       if (make_parent_dir(filename))
+               return -1;
+
+       ret = snprintf(tmp, sizeof(tmp), "%s.tmp", filename);
+       if (ret >= sizeof(tmp)) /* check truncation */
+               return -1;
+
+       file = fopen(tmp, "w");
+       if (!file) {
+               perror("fopen");
+               return -1;
+       }
+
+       conf_write_heading(file, comment_style);
+
+       for_all_symbols(i, sym)
+               if ((sym->flags & SYMBOL_WRITE) && sym->name)
+                       print_symbol(file, sym);
+
+       /* check possible errors in conf_write_heading() and print_symbol() */
+       if (ferror(file))
+               return -1;
+
+       fclose(file);
+
+       if (rename(tmp, filename)) {
+               perror("rename");
+               return -1;
+       }
+
+       return 0;
+}
+
 int conf_write_autoconf(int overwrite)
 {
        struct symbol *sym;
-       const char *name;
        const char *autoconf_name = conf_get_autoconfig_name();
-       FILE *out, *out_h;
-       int i;
+       int ret, i;
 
        if (!overwrite && is_present(autoconf_name))
                return 0;
 
-       conf_write_dep("include/config/auto.conf.cmd");
+       ret = conf_write_autoconf_cmd(autoconf_name);
+       if (ret)
+               return -1;
 
        if (conf_touch_deps())
                return 1;
 
-       out = fopen(".tmpconfig", "w");
-       if (!out)
-               return 1;
-
-       out_h = fopen(".tmpconfig.h", "w");
-       if (!out_h) {
-               fclose(out);
-               return 1;
-       }
-
-       conf_write_heading(out, &kconfig_printer_cb, NULL);
-       conf_write_heading(out_h, &header_printer_cb, NULL);
-
-       for_all_symbols(i, sym) {
+       for_all_symbols(i, sym)
                sym_calc_value(sym);
-               if (!(sym->flags & SYMBOL_WRITE) || !sym->name)
-                       continue;
-
-               /* write symbols to auto.conf and autoconf.h */
-               conf_write_symbol(out, sym, &kconfig_printer_cb, (void *)1);
-               conf_write_symbol(out_h, sym, &header_printer_cb, NULL);
-       }
-       fclose(out);
-       fclose(out_h);
 
-       name = getenv("KCONFIG_AUTOHEADER");
-       if (!name)
-               name = "include/generated/autoconf.h";
-       if (make_parent_dir(name))
-               return 1;
-       if (rename(".tmpconfig.h", name))
-               return 1;
+       ret = __conf_write_autoconf(conf_get_autoheader_name(),
+                                   print_symbol_for_c,
+                                   &comment_style_c);
+       if (ret)
+               return ret;
 
-       if (make_parent_dir(autoconf_name))
-               return 1;
        /*
-        * This must be the last step, kbuild has a dependency on auto.conf
-        * and this marks the successful completion of the previous steps.
+        * Create include/config/auto.conf. This must be the last step because
+        * Kbuild has a dependency on auto.conf and this marks the successful
+        * completion of the previous steps.
         */
-       if (rename(".tmpconfig", autoconf_name))
-               return 1;
+       ret = __conf_write_autoconf(conf_get_autoconfig_name(),
+                                   print_symbol_for_autoconf,
+                                   &comment_style_pound);
+       if (ret)
+               return ret;
 
        return 0;
 }
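The only difference between the two outputs at this point is the comment style handed down to conf_write_heading(); assuming a hypothetical x86 tree, the headings come out roughly as

    #
    # Automatically generated file; DO NOT EDIT.
    # Linux/x86 5.16.0 Kernel Configuration
    #

for auto.conf (comment_style_pound), and

    /*
     * Automatically generated file; DO NOT EDIT.
     * Linux/x86 5.16.0 Kernel Configuration
     */

for autoconf.h (comment_style_c).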
index 312cbad2d34d41c494b6441268950a1a968f7141..cc386e44368346c2309d2f60aab3cad10a5cefb0 100644 (file)
@@ -84,8 +84,7 @@ static void warn_ignored_character(char chr)
 n      [A-Za-z0-9_-]
 
 %%
-       int str = 0;
-       int ts, i;
+       char open_quote = 0;
 
 #.*                    /* ignore comment */
 [ \t]*                 /* whitespaces */
@@ -134,7 +133,7 @@ n   [A-Za-z0-9_-]
 ":="                   return T_COLON_EQUAL;
 "+="                   return T_PLUS_EQUAL;
 \"|\'                  {
-                               str = yytext[0];
+                               open_quote = yytext[0];
                                new_string();
                                BEGIN(STRING);
                        }
@@ -171,7 +170,7 @@ n   [A-Za-z0-9_-]
                append_string(yytext + 1, yyleng - 1);
        }
        \'|\"   {
-               if (str == yytext[0]) {
+               if (open_quote == yytext[0]) {
                        BEGIN(INITIAL);
                        yylval.string = text;
                        return T_WORD_QUOTE;
@@ -196,6 +195,8 @@ n   [A-Za-z0-9_-]
 
 <HELP>{
        [ \t]+  {
+               int ts, i;
+
                ts = 0;
                for (i = 0; i < yyleng; i++) {
                        if (yytext[i] == '\t')
index a11626bdc421c4ef215525439300f4a70008c15a..edd1e617b25c5c3683ba4287f108ccb690608d10 100644 (file)
@@ -18,7 +18,7 @@ extern struct symbol * symbol_hash[SYMBOL_HASHSIZE];
 
 struct symbol * sym_lookup(const char *name, int flags);
 struct symbol * sym_find(const char *name);
-const char * sym_escape_string_value(const char *in);
+void print_symbol_for_listconfig(struct symbol *sym);
 struct symbol ** sym_re_search(const char *pattern);
 const char * sym_type_name(enum symbol_type type);
 void sym_calc_value(struct symbol *sym);
index 606ba8a63c24edee28d8e4e99aed1104e42a29c8..3d6f7cba88464b30c21aea07c148424250c23455 100644 (file)
@@ -728,7 +728,7 @@ static void get_prompt_str(struct gstr *r, struct property *prop,
                get_dep_str(r, prop->visible.expr, "  Visible if: ");
 
        menu = prop->menu->parent;
-       for (i = 0; menu != &rootmenu && i < 8; menu = menu->parent) {
+       for (i = 0; menu && i < 8; menu = menu->parent) {
                bool accessible = menu_is_visible(menu);
 
                submenu[i++] = menu;
@@ -758,21 +758,24 @@ static void get_prompt_str(struct gstr *r, struct property *prop,
                list_add_tail(&jump->entries, head);
        }
 
-       if (i > 0) {
-               str_printf(r, "  Location:\n");
-               for (j = 4; --i >= 0; j += 2) {
-                       menu = submenu[i];
-                       if (jump && menu == location)
-                               jump->offset = strlen(r->s);
-                       str_printf(r, "%*c-> %s", j, ' ',
-                                  menu_get_prompt(menu));
-                       if (menu->sym) {
-                               str_printf(r, " (%s [=%s])", menu->sym->name ?
-                                       menu->sym->name : "<choice>",
-                                       sym_get_string_value(menu->sym));
-                       }
-                       str_append(r, "\n");
+       str_printf(r, "  Location:\n");
+       for (j = 4; --i >= 0; j += 2) {
+               menu = submenu[i];
+               if (jump && menu == location)
+                       jump->offset = strlen(r->s);
+
+               if (menu == &rootmenu)
+                       /* The real rootmenu prompt is ugly */
+                       str_printf(r, "%*cMain menu", j, ' ');
+               else
+                       str_printf(r, "%*c-> %s", j, ' ', menu_get_prompt(menu));
+
+               if (menu->sym) {
+                       str_printf(r, " (%s [=%s])", menu->sym->name ?
+                               menu->sym->name : "<choice>",
+                               sym_get_string_value(menu->sym));
                }
+               str_append(r, "\n");
        }
 }
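With the rootmenu special-cased, a search-result location block rendered by get_prompt_str() now reads, for a hypothetical symbol:

    Location:
      Main menu
        -> Networking support (NET [=y])
          -> Networking options
            -> TCP/IP networking (INET [=y])

The first entry is always the synthetic "Main menu" line; the "%*c" format widens the indent by two columns per level, and the "(NAME [=value])" suffix appears only for menus that carry a symbol.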
 
index 5844d636d38f474e2571c0a9f8bd3a734edd7928..0572330bf8a78aed51877dfa7429aab23d55069d 100644 (file)
@@ -871,49 +871,6 @@ struct symbol *sym_find(const char *name)
        return symbol;
 }
 
-const char *sym_escape_string_value(const char *in)
-{
-       const char *p;
-       size_t reslen;
-       char *res;
-       size_t l;
-
-       reslen = strlen(in) + strlen("\"\"") + 1;
-
-       p = in;
-       for (;;) {
-               l = strcspn(p, "\"\\");
-               p += l;
-
-               if (p[0] == '\0')
-                       break;
-
-               reslen++;
-               p++;
-       }
-
-       res = xmalloc(reslen);
-       res[0] = '\0';
-
-       strcat(res, "\"");
-
-       p = in;
-       for (;;) {
-               l = strcspn(p, "\"\\");
-               strncat(res, p, l);
-               p += l;
-
-               if (p[0] == '\0')
-                       break;
-
-               strcat(res, "\\");
-               strncat(res, p++, 1);
-       }
-
-       strcat(res, "\"");
-       return res;
-}
-
 struct sym_match {
        struct symbol   *sym;
        off_t           so, eo;
index 3ea7cece7c97d934821c63db5e7086bde3122460..5cdd9bc5c3852b8972774259a8aba06e7a52ce3c 100755 (executable)
@@ -360,14 +360,14 @@ if [ -n "${CONFIG_KALLSYMS}" ]; then
        # kallsyms support
        # Generate section listing all symbols and add it into vmlinux
        # It's a three step process:
-       # 1)  Link .tmp_vmlinux1 so it has all symbols and sections,
+       # 1)  Link .tmp_vmlinux.kallsyms1 so it has all symbols and sections,
        #     but __kallsyms is empty.
        #     Running kallsyms on that gives us .tmp_kallsyms1.o with
        #     the right size
-       # 2)  Link .tmp_vmlinux2 so it now has a __kallsyms section of
+       # 2)  Link .tmp_vmlinux.kallsyms2 so it now has a __kallsyms section of
        #     the right size, but due to the added section, some
        #     addresses have shifted.
-       #     From here, we generate a correct .tmp_kallsyms2.o
+       #     From here, we generate a correct .tmp_vmlinux.kallsyms2.o
        # 3)  That link may have expanded the kernel image enough that
        #     more linker branch stubs / trampolines had to be added, which
        #     introduces new names, which further expands kallsyms. Do another
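Under the corrected naming, the intermediate artifacts of a kallsyms build are, schematically:

    .tmp_vmlinux.kallsyms1      first link, empty __kallsyms
    .tmp_vmlinux.kallsyms1.o    kallsyms data of the right size
    .tmp_vmlinux.kallsyms2      relink; addresses may have shifted
    .tmp_vmlinux.kallsyms2.o    corrected kallsyms data

with a third pass only when step 3 applies.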
index 221aa7df008d66f7263fb9267825a45010d8454c..cb54c7f1aa80680eb06f7125b24580146d09e461 100755 (executable)
@@ -39,6 +39,10 @@ case "${1}" in
                opts="-I ${XZ}"
                tarball=${tarball}.xz
                ;;
+       tarzst-pkg)
+               opts="-I ${ZSTD}"
+               tarball=${tarball}.zst
+               ;;
        *)
                echo "Unknown tarball target \"${1}\" requested, please add it to ${0}." >&2
                exit 1
index 17fdc620d5488c50de35e67dcbadc243e205b9ad..acf6ea71129921ad77a201d13761382ee032483e 100644 (file)
@@ -178,6 +178,7 @@ assum||assume
 assumtpion||assumption
 asuming||assuming
 asycronous||asynchronous
+asychronous||asynchronous
 asynchnous||asynchronous
 asynchromous||asynchronous
 asymetric||asymmetric
@@ -241,6 +242,7 @@ beter||better
 betweeen||between
 bianries||binaries
 bitmast||bitmask
+bitwiedh||bitwidth
 boardcast||broadcast
 borad||board
 boundry||boundary
@@ -265,7 +267,10 @@ calucate||calculate
 calulate||calculate
 cancelation||cancellation
 cancle||cancel
+cant||can't
+cant'||can't
 canot||cannot
+cann't||can't
 capabilites||capabilities
 capabilties||capabilities
 capabilty||capability
@@ -501,6 +506,7 @@ disble||disable
 disgest||digest
 disired||desired
 dispalying||displaying
+dissable||disable
 diplay||display
 directon||direction
 direcly||directly
@@ -595,6 +601,7 @@ exceded||exceeded
 exceds||exceeds
 exceeed||exceed
 excellant||excellent
+exchnage||exchange
 execeeded||exceeded
 execeeds||exceeds
 exeed||exceed
@@ -938,6 +945,7 @@ migrateable||migratable
 milliseonds||milliseconds
 minium||minimum
 minimam||minimum
+minimun||minimum
 miniumum||minimum
 minumum||minimum
 misalinged||misaligned
@@ -956,6 +964,7 @@ mmnemonic||mnemonic
 mnay||many
 modfiy||modify
 modifer||modifier
+modul||module
 modulues||modules
 momery||memory
 memomry||memory
@@ -1154,6 +1163,7 @@ programable||programmable
 programers||programmers
 programm||program
 programms||programs
+progres||progress
 progresss||progress
 prohibitted||prohibited
 prohibitting||prohibiting
@@ -1328,6 +1338,7 @@ servive||service
 setts||sets
 settting||setting
 shapshot||snapshot
+shoft||shift
 shotdown||shutdown
 shoud||should
 shouldnt||shouldn't
@@ -1439,6 +1450,7 @@ syfs||sysfs
 symetric||symmetric
 synax||syntax
 synchonized||synchronized
+sychronization||synchronization
 synchronuously||synchronously
 syncronize||synchronize
 syncronized||synchronized
@@ -1521,6 +1533,7 @@ unexpexted||unexpected
 unfortunatelly||unfortunately
 unifiy||unify
 uniterrupted||uninterrupted
+uninterruptable||uninterruptible
 unintialized||uninitialized
 unitialized||uninitialized
 unkmown||unknown
@@ -1553,6 +1566,7 @@ unuseful||useless
 unvalid||invalid
 upate||update
 upsupported||unsupported
+useable||usable
 usefule||useful
 usefull||useful
 usege||usage
@@ -1574,6 +1588,7 @@ varient||variant
 vaule||value
 verbse||verbose
 veify||verify
+verfication||verification
 veriosn||version
 verisons||versions
 verison||version
@@ -1586,6 +1601,7 @@ visiters||visitors
 vitual||virtual
 vunerable||vulnerable
 wakeus||wakeups
+was't||wasn't
 wathdog||watchdog
 wating||waiting
 wiat||wait
index fe6c0395fa025115c73f09b911ff0b876f150fbe..0b847f435beb5a31bc2162406658a6927a683154 100644 (file)
@@ -163,20 +163,6 @@ config HARDENED_USERCOPY
          or are part of the kernel text. This kills entire classes
          of heap overflow exploits and similar kernel memory exposures.
 
-config HARDENED_USERCOPY_FALLBACK
-       bool "Allow usercopy whitelist violations to fallback to object size"
-       depends on HARDENED_USERCOPY
-       default y
-       help
-         This is a temporary option that allows missing usercopy whitelists
-         to be discovered via a WARN() to the kernel log, instead of
-         rejecting the copy, falling back to non-whitelisted hardened
-         usercopy that checks the slab allocation size instead of the
-         whitelist size. This option will be removed once it seems like
-         all missing usercopy whitelists have been identified and fixed.
-         Booting with "slab_common.usercopy_fallback=Y/N" can change
-         this setting.
-
 config HARDENED_USERCOPY_PAGESPAN
        bool "Refuse to copy allocations that span multiple pages"
        depends on HARDENED_USERCOPY
index bfabb19dc0d3d21b4ce91b721439a36777e91cb5..196b6640bf3783601833f348447d720f71f9f7ec 100644 (file)
@@ -57,6 +57,19 @@ test_write_fail "$file" "1 2 3 5 4" "$orig_content" \
 test_content "$file" "$orig_content" "1 2 3 4 5" "successfully written"
 echo "$orig_content" > "$file"
 
+# Test schemes file
+# =================
+
+file="$DBGFS/schemes"
+orig_content=$(cat "$file")
+
+test_write_succ "$file" "1 2 3 4 5 6 4 0 0 0 1 2 3 1 100 3 2 1" \
+       "$orig_content" "valid input"
+test_write_fail "$file" "1 2
+3 4 5 6 3 0 0 0 1 2 3 1 100 3 2 1" "$orig_content" "multi lines"
+test_write_succ "$file" "" "$orig_content" "disabling"
+echo "$orig_content" > "$file"
+
 # Test target_ids file
 # ====================
 
index 84285a6f60b079346a5c527ea99a5d400c678c0a..dc7ade19679808abb5cfbfe994beb550b0101881 100644 (file)
@@ -22,6 +22,9 @@ ppc64*)
 ppc*)
   ARG1=%r3
 ;;
+s390*)
+  ARG1=%r2
+;;
 *)
   echo "Please implement other architecture here"
   exit_untested
index 474ca1a9a088528e2f15b5a642fcd775a518e5d2..47d84b5cb6ca4da3a4dc6ace23be42383cb7070e 100644 (file)
@@ -32,6 +32,10 @@ ppc*)
   GOODREG=%r3
   BADREG=%msr
 ;;
+s390*)
+  GOODREG=%r2
+  BADREG=%s2
+;;
 *)
   echo "Please implement other architecture here"
   exit_untested
index a7e8cd5bb265d86cca85e91adc69043e18a3c13a..1eef042a31e1a75cc56c0d70db4f67a89ce4716c 100644 (file)
@@ -1,5 +1,4 @@
 CONFIG_MEMORY_HOTPLUG=y
-CONFIG_MEMORY_HOTPLUG_SPARSE=y
 CONFIG_NOTIFIER_ERROR_INJECTION=y
 CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
 CONFIG_MEMORY_HOTREMOVE=y
index b02eac613fddadc0f23c0d33147e177bf8d1c185..2e7e86e852828075247177264591255668300b80 100644 (file)
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 hugepage-mmap
+hugepage-mremap
 hugepage-shm
 khugepaged
 map_hugetlb
index d9605bd10f2deb4daea9da2eb7ec16e728fa8409..1607322a112c91732f7c020325513b2c8c9c06ab 100644 (file)
@@ -29,6 +29,7 @@ TEST_GEN_FILES = compaction_test
 TEST_GEN_FILES += gup_test
 TEST_GEN_FILES += hmm-tests
 TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-mremap
 TEST_GEN_FILES += hugepage-shm
 TEST_GEN_FILES += khugepaged
 TEST_GEN_FILES += madv_populate
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
new file mode 100644 (file)
index 0000000..257df94
--- /dev/null
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mremap:
+ *
+ * Example of remapping huge page memory in a user application using the
+ * mremap system call.  Code assumes a hugetlbfs filesystem is mounted
+ * at '/huge'.  The code maps 1 GiB (LENGTH) worth of huge pages.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h> /* Definition of O_* constants */
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
+
+#define LENGTH (1UL * 1024 * 1024 * 1024)
+
+#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
+#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+
+static void check_bytes(char *addr)
+{
+       printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+       unsigned long i;
+
+       for (i = 0; i < LENGTH; i++)
+               *(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+       unsigned long i;
+
+       check_bytes(addr);
+       for (i = 0; i < LENGTH; i++)
+               if (*(addr + i) != (char)i) {
+                       printf("Mismatch at %lu\n", i);
+                       return 1;
+               }
+       return 0;
+}
+
+static void register_region_with_uffd(char *addr, size_t len)
+{
+       long uffd; /* userfaultfd file descriptor */
+       struct uffdio_api uffdio_api;
+       struct uffdio_register uffdio_register;
+
+       /* Create and enable userfaultfd object. */
+
+       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+       if (uffd == -1) {
+               perror("userfaultfd");
+               exit(1);
+       }
+
+       uffdio_api.api = UFFD_API;
+       uffdio_api.features = 0;
+       if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+               perror("ioctl-UFFDIO_API");
+               exit(1);
+       }
+
+       /* Create a private anonymous mapping. The memory will be
+        * demand-zero paged--that is, not yet allocated. When we
+        * actually touch the memory, it will be allocated via
+        * the userfaultfd.
+        */
+
+       addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       printf("Address returned by mmap() = %p\n", addr);
+
+       /* Register the memory range of the mapping we just created for
+        * handling by the userfaultfd object. In uffdio_register.mode, we
+        * request to track missing pages (i.e., pages that have not yet
+        * been faulted in).
+        */
+
+       uffdio_register.range.start = (unsigned long)addr;
+       uffdio_register.range.len = len;
+       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+               perror("ioctl-UFFDIO_REGISTER");
+               exit(1);
+       }
+}
+
+int main(void)
+{
+       int ret = 0;
+
+       int fd = open("/huge/test", O_CREAT | O_RDWR, 0755);
+
+       if (fd < 0) {
+               perror("Open failed");
+               exit(1);
+       }
+
+       /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
+       unsigned long suggested_addr = 0x7eaa40000000;
+       void *haddr = mmap((void *)suggested_addr, LENGTH, PROTECTION,
+                          MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+       printf("Map haddr: Returned address is %p\n", haddr);
+       if (haddr == MAP_FAILED) {
+               perror("mmap1");
+               exit(1);
+       }
+
+       /* mmap again to a dummy address to hopefully trigger pmd sharing. */
+       suggested_addr = 0x7daa40000000;
+       void *daddr = mmap((void *)suggested_addr, LENGTH, PROTECTION,
+                          MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+       printf("Map daddr: Returned address is %p\n", daddr);
+       if (daddr == MAP_FAILED) {
+               perror("mmap3");
+               exit(1);
+       }
+
+       suggested_addr = 0x7faa40000000;
+       void *vaddr =
+               mmap((void *)suggested_addr, LENGTH, PROTECTION, FLAGS, -1, 0);
+       printf("Map vaddr: Returned address is %p\n", vaddr);
+       if (vaddr == MAP_FAILED) {
+               perror("mmap2");
+               exit(1);
+       }
+
+       register_region_with_uffd(haddr, LENGTH);
+
+       void *addr = mremap(haddr, LENGTH, LENGTH,
+                           MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
+       if (addr == MAP_FAILED) {
+               perror("mremap");
+               exit(1);
+       }
+
+       printf("Mremap: Returned address is %p\n", addr);
+       check_bytes(addr);
+       write_bytes(addr);
+       ret = read_bytes(addr);
+
+       munmap(addr, LENGTH);
+
+       return ret;
+}
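A usage sketch for the new test, assuming a 2 MiB huge page size; the mount point matches the code and the page count below is illustrative (1 GiB of 2 MiB pages needs at least 512 pages, and both file mappings share the same pool):

    mount -t hugetlbfs none /huge
    echo 512 > /proc/sys/vm/nr_hugepages
    ./hugepage-mremap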
index b61dcdb44c5beca53c91b5fafe68d8e894a06f9a..1436e1a9a3d3888eb4f7a64c9da4b601eeb853b0 100644 (file)
@@ -5,6 +5,10 @@
 #include <time.h>
 #include <string.h>
 #include <numa.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <err.h>
 
 #include "../kselftest.h"
 #include "../../../../include/vdso/time64.h"
 #define KSM_MERGE_ACROSS_NODES_DEFAULT true
 #define MB (1ul << 20)
 
+#define PAGE_SHIFT 12
+#define HPAGE_SHIFT 21
+
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent)       ((ent) & ((1ull << 55) - 1))
+
 struct ksm_sysfs {
        unsigned long max_page_sharing;
        unsigned long merge_across_nodes;
@@ -34,6 +47,7 @@ enum ksm_test_name {
        CHECK_KSM_ZERO_PAGE_MERGE,
        CHECK_KSM_NUMA_MERGE,
        KSM_MERGE_TIME,
+       KSM_MERGE_TIME_HUGE_PAGES,
        KSM_COW_TIME
 };
 
@@ -99,6 +113,9 @@ static void print_help(void)
               " -U (page unmerging)\n"
               " -P evaluate merging time and speed.\n"
               "    For this test, the size of duplicated memory area (in MiB)\n"
+              "    must be provided using -s option\n"
+              " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+              "    For this test, the size of duplicated memory area (in MiB)\n"
               "    must be provided using -s option\n"
               " -C evaluate the time required to break COW of merged pages.\n\n");
 
@@ -354,12 +371,34 @@ err_out:
        return KSFT_FAIL;
 }
 
+static int get_next_mem_node(int node)
+{
+       long node_size;
+       int mem_node = 0;
+       int i, max_node = numa_max_node();
+
+       for (i = node + 1; i <= max_node + node; i++) {
+               mem_node = i % (max_node + 1);
+               node_size = numa_node_size(mem_node, NULL);
+               if (node_size > 0)
+                       break;
+       }
+       return mem_node;
+}
+
+static int get_first_mem_node(void)
+{
+       return get_next_mem_node(numa_max_node());
+}
+
 static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
                                size_t page_size)
 {
        void *numa1_map_ptr, *numa2_map_ptr;
        struct timespec start_time;
        int page_count = 2;
+       int first_node;
 
        if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
                perror("clock_gettime");
@@ -370,7 +409,7 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a
                perror("NUMA support not enabled");
                return KSFT_SKIP;
        }
-       if (numa_max_node() < 1) {
+       if (numa_num_configured_nodes() <= 1) {
                printf("At least 2 NUMA nodes must be available\n");
                return KSFT_SKIP;
        }
@@ -378,8 +417,9 @@ static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_a
                return KSFT_FAIL;
 
        /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
-       numa1_map_ptr = numa_alloc_onnode(page_size, 0);
-       numa2_map_ptr = numa_alloc_onnode(page_size, 1);
+       first_node = get_first_mem_node();
+       numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
+       numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
        if (!numa1_map_ptr || !numa2_map_ptr) {
                perror("numa_alloc_onnode");
                return KSFT_FAIL;
@@ -416,6 +456,101 @@ err_out:
        return KSFT_FAIL;
 }
 
+int64_t allocate_transhuge(void *ptr, int pagemap_fd)
+{
+       uint64_t ent[2];
+
+       /* drop pmd */
+       if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+                               MAP_FIXED | MAP_ANONYMOUS |
+                               MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+               errx(2, "mmap transhuge");
+
+       if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+               err(2, "MADV_HUGEPAGE");
+
+       /* allocate transparent huge page */
+       *(volatile void **)ptr = ptr;
+
+       if (pread(pagemap_fd, ent, sizeof(ent),
+                       (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+               err(2, "read pagemap");
+
+       if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+           PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+           !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+               return PAGEMAP_PFN(ent[0]);
+
+       return -1;
+}
+
+static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
+{
+       void *map_ptr, *map_ptr_orig;
+       struct timespec start_time, end_time;
+       unsigned long scan_time_ns;
+       int pagemap_fd, n_normal_pages, n_huge_pages;
+
+       map_size *= MB;
+       size_t len = map_size;
+
+       len -= len % HPAGE_SIZE;
+       map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+       map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
+
+       if (map_ptr_orig == MAP_FAILED)
+               err(2, "initial mmap");
+
+       if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
+               err(2, "MADV_HUGEPAGE");
+
+       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+       if (pagemap_fd < 0)
+               err(2, "open pagemap");
+
+       n_normal_pages = 0;
+       n_huge_pages = 0;
+       for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
+               if (allocate_transhuge(p, pagemap_fd) < 0)
+                       n_normal_pages++;
+               else
+                       n_huge_pages++;
+       }
+       printf("Number of normal pages:  %d\n", n_normal_pages);
+       printf("Number of huge pages:    %d\n", n_huge_pages);
+
+       memset(map_ptr, '*', len);
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+               goto err_out;
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+
+       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Total size:    %lu MiB\n", map_size / MB);
+       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+              scan_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+                                              ((double)scan_time_ns / NSEC_PER_SEC));
+
+       munmap(map_ptr_orig, len + HPAGE_SIZE);
+       return KSFT_PASS;
+
+err_out:
+       printf("Not OK\n");
+       munmap(map_ptr_orig, len + HPAGE_SIZE);
+       return KSFT_FAIL;
+}
+
 static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
 {
        void *map_ptr;
@@ -541,7 +676,7 @@ int main(int argc, char *argv[])
        bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
        long size_MB = 0;
 
-       while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) {
+       while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) {
                switch (opt) {
                case 'a':
                        prot = str_to_prot(optarg);
@@ -595,6 +730,9 @@ int main(int argc, char *argv[])
                case 'P':
                        test_name = KSM_MERGE_TIME;
                        break;
+               case 'H':
+                       test_name = KSM_MERGE_TIME_HUGE_PAGES;
+                       break;
                case 'C':
                        test_name = KSM_COW_TIME;
                        break;
@@ -647,6 +785,14 @@ int main(int argc, char *argv[])
                ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
                                     size_MB);
                break;
+       case KSM_MERGE_TIME_HUGE_PAGES:
+               if (size_MB == 0) {
+                       printf("Option '-s' is required.\n");
+                       return KSFT_FAIL;
+               }
+               ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+                               ksm_scan_limit_sec, size_MB);
+               break;
        case KSM_COW_TIME:
                ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
                                   page_size);
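The new allocate_transhuge() helper infers THP backing from /proc/self/pagemap, where each 8-byte entry carries bit 63 = page present and bits 0-54 = PFN, so a 2 MiB THP shows up as 512 consecutive PFNs starting on a 512-page-aligned PFN. A minimal sketch of the lookup for one address (hypothetical helper, error handling elided):

    static uint64_t pagemap_entry_of(int pagemap_fd, void *addr)
    {
            /* one 64-bit entry per virtual page, indexed by page number */
            uint64_t ent;

            pread(pagemap_fd, &ent, sizeof(ent),
                  ((uintptr_t)addr / PAGE_SIZE) * sizeof(ent));
            return ent;     /* decode with PAGEMAP_PRESENT()/PAGEMAP_PFN() */
    }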
index b959e4ebdad45cf43af0a7868dfee2469598f516..3ee0e82756002252658e3d74926df72c95166bfd 100644 (file)
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/mman.h>
 #include <sys/mman.h>
 
 #include "../kselftest.h"
 
-#if defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE)
-
 /*
  * For now, we're using 2 MiB of private anonymous memory for all tests.
  */
@@ -328,15 +327,3 @@ int main(int argc, char **argv)
                                   err, ksft_test_num());
        return ksft_exit_pass();
 }
-
-#else /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */
-
-#warning "missing MADV_POPULATE_READ or MADV_POPULATE_WRITE definition"
-
-int main(int argc, char **argv)
-{
-       ksft_print_header();
-       ksft_exit_skip("MADV_POPULATE_READ or MADV_POPULATE_WRITE not defined\n");
-}
-
-#endif /* defined(MADV_POPULATE_READ) && defined(MADV_POPULATE_WRITE) */
index 45e803af7c7754bbb9d40e4b976d4c0f849912b3..a24d30af30949a1a9dbc21a7efe4f6a6d0c01e38 100755 (executable)
@@ -108,6 +108,17 @@ else
        echo "[PASS]"
 fi
 
+echo "-----------------------"
+echo "running hugepage-mremap"
+echo "-----------------------"
+./hugepage-mremap
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+       exitcode=1
+else
+       echo "[PASS]"
+fi
+
 echo "NOTE: The above hugetlb tests provide minimal coverage.  Use"
 echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
 echo "      hugetlb regression testing."
index fd7f1b4a96f949bd21e2febef0f0a5fa6c973e99..5e4c036f6ad384bb4baad09610e1acf357d465ad 100644 (file)
@@ -79,7 +79,7 @@ int main(int argc, char **argv)
 
        warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
              " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
-             len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+             ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
 
        pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
        if (pagemap_fd < 0)
index 60aa1a4fc69b63e165d241b4c14bce5f20c8a13d..8a09057d2f223544302bbb6b1e7cf37ff80c1c9c 100644 (file)
@@ -57,6 +57,7 @@
 #include <assert.h>
 #include <inttypes.h>
 #include <stdint.h>
+#include <sys/random.h>
 
 #include "../kselftest.h"
 
@@ -307,37 +308,24 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 }
 
 struct uffd_test_ops {
-       unsigned long expected_ioctls;
        void (*allocate_area)(void **alloc_area);
        void (*release_pages)(char *rel_area);
        void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
 };
 
-#define SHMEM_EXPECTED_IOCTLS          ((1 << _UFFDIO_WAKE) | \
-                                        (1 << _UFFDIO_COPY) | \
-                                        (1 << _UFFDIO_ZEROPAGE))
-
-#define ANON_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
-                                        (1 << _UFFDIO_COPY) | \
-                                        (1 << _UFFDIO_ZEROPAGE) | \
-                                        (1 << _UFFDIO_WRITEPROTECT))
-
 static struct uffd_test_ops anon_uffd_test_ops = {
-       .expected_ioctls = ANON_EXPECTED_IOCTLS,
        .allocate_area  = anon_allocate_area,
        .release_pages  = anon_release_pages,
        .alias_mapping = noop_alias_mapping,
 };
 
 static struct uffd_test_ops shmem_uffd_test_ops = {
-       .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
        .allocate_area  = shmem_allocate_area,
        .release_pages  = shmem_release_pages,
        .alias_mapping = shmem_alias_mapping,
 };
 
 static struct uffd_test_ops hugetlb_uffd_test_ops = {
-       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
        .allocate_area  = hugetlb_allocate_area,
        .release_pages  = hugetlb_release_pages,
        .alias_mapping = hugetlb_alias_mapping,
@@ -345,6 +333,43 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = {
 
 static struct uffd_test_ops *uffd_test_ops;
 
+static inline uint64_t uffd_minor_feature(void)
+{
+       if (test_type == TEST_HUGETLB && map_shared)
+               return UFFD_FEATURE_MINOR_HUGETLBFS;
+       else if (test_type == TEST_SHMEM)
+               return UFFD_FEATURE_MINOR_SHMEM;
+       else
+               return 0;
+}
+
+static uint64_t get_expected_ioctls(uint64_t mode)
+{
+       uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
+
+       if (test_type == TEST_HUGETLB)
+               ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
+
+       if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
+               ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
+
+       if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
+               ioctls &= ~(1 << _UFFDIO_CONTINUE);
+
+       return ioctls;
+}
+
+static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
+{
+       uint64_t expected = get_expected_ioctls(mode);
+       uint64_t actual = ioctls & expected;
+
+       if (actual != expected) {
+               err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
+                   expected, actual);
+       }
+}
+
 static void userfaultfd_open(uint64_t *features)
 {
        struct uffdio_api uffdio_api;
@@ -405,7 +430,7 @@ static void uffd_test_ctx_clear(void)
        munmap_area((void **)&area_dst_alias);
 }
 
-static void uffd_test_ctx_init_ext(uint64_t *features)
+static void uffd_test_ctx_init(uint64_t features)
 {
        unsigned long nr, cpu;
 
@@ -414,7 +439,7 @@ static void uffd_test_ctx_init_ext(uint64_t *features)
        uffd_test_ops->allocate_area((void **)&area_src);
        uffd_test_ops->allocate_area((void **)&area_dst);
 
-       userfaultfd_open(features);
+       userfaultfd_open(&features);
 
        count_verify = malloc(nr_pages * sizeof(unsigned long long));
        if (!count_verify)
@@ -462,11 +487,6 @@ static void uffd_test_ctx_init_ext(uint64_t *features)
                        err("pipe");
 }
 
-static inline void uffd_test_ctx_init(uint64_t features)
-{
-       uffd_test_ctx_init_ext(&features);
-}
-
 static int my_bcmp(char *str1, char *str2, size_t n)
 {
        unsigned long i;
@@ -518,22 +538,10 @@ static void continue_range(int ufd, __u64 start, __u64 len)
 static void *locking_thread(void *arg)
 {
        unsigned long cpu = (unsigned long) arg;
-       struct random_data rand;
        unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
-       int32_t rand_nr;
        unsigned long long count;
-       char randstate[64];
-       unsigned int seed;
 
-       if (bounces & BOUNCE_RANDOM) {
-               seed = (unsigned int) time(NULL) - bounces;
-               if (!(bounces & BOUNCE_RACINGFAULTS))
-                       seed += cpu;
-               bzero(&rand, sizeof(rand));
-               bzero(&randstate, sizeof(randstate));
-               if (initstate_r(seed, randstate, sizeof(randstate), &rand))
-                       err("initstate_r failed");
-       } else {
+       if (!(bounces & BOUNCE_RANDOM)) {
                page_nr = -bounces;
                if (!(bounces & BOUNCE_RACINGFAULTS))
                        page_nr += cpu * nr_pages_per_cpu;
@@ -541,15 +549,8 @@ static void *locking_thread(void *arg)
 
        while (!finished) {
                if (bounces & BOUNCE_RANDOM) {
-                       if (random_r(&rand, &rand_nr))
-                               err("random_r failed");
-                       page_nr = rand_nr;
-                       if (sizeof(page_nr) > sizeof(rand_nr)) {
-                               if (random_r(&rand, &rand_nr))
-                                       err("random_r failed");
-                               page_nr |= (((unsigned long) rand_nr) << 16) <<
-                                          16;
-                       }
+                       if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
+                               err("getrandom failed");
                } else
                        page_nr += 1;
                page_nr %= nr_pages;
@@ -1030,11 +1031,9 @@ static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
 {
        struct uffdio_zeropage uffdio_zeropage;
        int ret;
-       unsigned long has_zeropage;
+       bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
        __s64 res;
 
-       has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
-
        if (offset >= nr_pages * page_size)
                err("unexpected offset %lu", offset);
        uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
@@ -1074,7 +1073,6 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
 static int userfaultfd_zeropage_test(void)
 {
        struct uffdio_register uffdio_register;
-       unsigned long expected_ioctls;
 
        printf("testing UFFDIO_ZEROPAGE: ");
        fflush(stdout);
@@ -1089,9 +1087,8 @@ static int userfaultfd_zeropage_test(void)
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                err("register failure");
 
-       expected_ioctls = uffd_test_ops->expected_ioctls;
-       if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
-               err("unexpected missing ioctl for anon memory");
+       assert_expected_ioctls_present(
+               uffdio_register.mode, uffdio_register.ioctls);
 
        if (uffdio_zeropage(uffd, 0))
                if (my_bcmp(area_dst, zeropage, page_size))
@@ -1104,7 +1101,6 @@ static int userfaultfd_zeropage_test(void)
 static int userfaultfd_events_test(void)
 {
        struct uffdio_register uffdio_register;
-       unsigned long expected_ioctls;
        pthread_t uffd_mon;
        int err, features;
        pid_t pid;
@@ -1128,9 +1124,8 @@ static int userfaultfd_events_test(void)
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                err("register failure");
 
-       expected_ioctls = uffd_test_ops->expected_ioctls;
-       if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
-               err("unexpected missing ioctl for anon memory");
+       assert_expected_ioctls_present(
+               uffdio_register.mode, uffdio_register.ioctls);
 
        if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
                err("uffd_poll_thread create");
@@ -1158,7 +1153,6 @@ static int userfaultfd_events_test(void)
 static int userfaultfd_sig_test(void)
 {
        struct uffdio_register uffdio_register;
-       unsigned long expected_ioctls;
        unsigned long userfaults;
        pthread_t uffd_mon;
        int err, features;
@@ -1182,9 +1176,8 @@ static int userfaultfd_sig_test(void)
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                err("register failure");
 
-       expected_ioctls = uffd_test_ops->expected_ioctls;
-       if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
-               err("unexpected missing ioctl for anon memory");
+       assert_expected_ioctls_present(
+               uffdio_register.mode, uffdio_register.ioctls);
 
        if (faulting_process(1))
                err("faulting process failed");
@@ -1219,14 +1212,12 @@ static int userfaultfd_sig_test(void)
 static int userfaultfd_minor_test(void)
 {
        struct uffdio_register uffdio_register;
-       unsigned long expected_ioctls;
        unsigned long p;
        pthread_t uffd_mon;
        uint8_t expected_byte;
        void *expected_page;
        char c;
        struct uffd_stats stats = { 0 };
-       uint64_t req_features, features_out;
 
        if (!test_uffdio_minor)
                return 0;
@@ -1234,21 +1225,7 @@ static int userfaultfd_minor_test(void)
        printf("testing minor faults: ");
        fflush(stdout);
 
-       if (test_type == TEST_HUGETLB)
-               req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
-       else if (test_type == TEST_SHMEM)
-               req_features = UFFD_FEATURE_MINOR_SHMEM;
-       else
-               return 1;
-
-       features_out = req_features;
-       uffd_test_ctx_init_ext(&features_out);
-       /* If kernel reports required features aren't supported, skip test. */
-       if ((features_out & req_features) != req_features) {
-               printf("skipping test due to lack of feature support\n");
-               fflush(stdout);
-               return 0;
-       }
+       uffd_test_ctx_init(uffd_minor_feature());
 
        uffdio_register.range.start = (unsigned long)area_dst_alias;
        uffdio_register.range.len = nr_pages * page_size;
@@ -1256,10 +1233,8 @@ static int userfaultfd_minor_test(void)
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                err("register failure");
 
-       expected_ioctls = uffd_test_ops->expected_ioctls;
-       expected_ioctls |= 1 << _UFFDIO_CONTINUE;
-       if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
-               err("unexpected missing ioctl(s)");
+       assert_expected_ioctls_present(
+               uffdio_register.mode, uffdio_register.ioctls);
 
        /*
         * After registering with UFFD, populate the non-UFFD-registered side of
@@ -1456,8 +1431,6 @@ static int userfaultfd_stress(void)
        pthread_attr_setstacksize(&attr, 16*1024*1024);
 
        while (bounces--) {
-               unsigned long expected_ioctls;
-
                printf("bounces: %d, mode:", bounces);
                if (bounces & BOUNCE_RANDOM)
                        printf(" rnd");
@@ -1485,10 +1458,8 @@ static int userfaultfd_stress(void)
                        uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
                if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                        err("register failure");
-               expected_ioctls = uffd_test_ops->expected_ioctls;
-               if ((uffdio_register.ioctls & expected_ioctls) !=
-                   expected_ioctls)
-                       err("unexpected missing ioctl for anon memory");
+               assert_expected_ioctls_present(
+                       uffdio_register.mode, uffdio_register.ioctls);
 
                if (area_dst_alias) {
                        uffdio_register.range.start = (unsigned long)
@@ -1609,6 +1580,8 @@ unsigned long default_huge_page_size(void)
 
 static void set_test_type(const char *type)
 {
+       uint64_t features = UFFD_API_FEATURES;
+
        if (!strcmp(type, "anon")) {
                test_type = TEST_ANON;
                uffd_test_ops = &anon_uffd_test_ops;
@@ -1642,6 +1615,22 @@ static void set_test_type(const char *type)
        if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
            > page_size)
                err("Impossible to run this test");
+
+       /*
+        * Whether we can test certain features depends not just on test type,
+        * but also on whether or not this particular kernel supports the
+        * feature.
+        */
+
+       userfaultfd_open(&features);
+
+       test_uffdio_wp = test_uffdio_wp &&
+               (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+       test_uffdio_minor = test_uffdio_minor &&
+               (features & uffd_minor_feature());
+
+       close(uffd);
+       uffd = -1;
 }
 
 static void sigalrm(int sig)
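set_test_type() now performs a throwaway UFFDIO_API handshake purely to learn what the running kernel supports before deciding which sub-tests to keep. The probing pattern, as a standalone sketch (error handling elided; passing features = 0 makes the kernel report its full supported set):

    struct uffdio_api api = { .api = UFFD_API };    /* features = 0: probe only */
    int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    if (ioctl(ufd, UFFDIO_API, &api) == 0 &&
        (api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP))
            ;       /* write-protect faults are testable on this kernel */
    close(ufd);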
index f62f10c988db107c5aba893e31972f0e64a3b0f4..b1ed76d9a97942cce403657f91b6521bbb84a0e1 100644 (file)
@@ -390,7 +390,7 @@ static void show_page_range(unsigned long voffset, unsigned long offset,
                if (opt_pid)
                        printf("%lx\t", voff);
                if (opt_file)
-                       printf("%lu\t", voff);
+                       printf("%lx\t", voff);
                if (opt_list_cgroup)
                        printf("@%llu\t", (unsigned long long)cgroup0);
                if (opt_list_mapcnt)
@@ -418,7 +418,7 @@ static void show_page(unsigned long voffset, unsigned long offset,
        if (opt_pid)
                printf("%lx\t", voffset);
        if (opt_file)
-               printf("%lu\t", voffset);
+               printf("%lx\t", voffset);
        if (opt_list_cgroup)
                printf("@%llu\t", (unsigned long long)cgroup);
        if (opt_list_mapcnt)
@@ -967,22 +967,19 @@ static struct sigaction sigbus_action = {
        .sa_flags = SA_SIGINFO,
 };
 
-static void walk_file(const char *name, const struct stat *st)
+static void walk_file_range(const char *name, int fd,
+                           unsigned long off, unsigned long end)
 {
        uint8_t vec[PAGEMAP_BATCH];
        uint64_t buf[PAGEMAP_BATCH], flags;
        uint64_t cgroup = 0;
        uint64_t mapcnt = 0;
        unsigned long nr_pages, pfn, i;
-       off_t off, end = st->st_size;
-       int fd;
        ssize_t len;
        void *ptr;
        int first = 1;
 
-       fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
-
-       for (off = 0; off < end; off += len) {
+       for (; off < end; off += len) {
                nr_pages = (end - off + page_size - 1) / page_size;
                if (nr_pages > PAGEMAP_BATCH)
                        nr_pages = PAGEMAP_BATCH;
@@ -1037,12 +1034,26 @@ got_sigbus:
                        if (first && opt_list) {
                                first = 0;
                                flush_page_range();
-                               show_file(name, st);
                        }
                        add_page(off / page_size + i, pfn,
                                 flags, cgroup, mapcnt, buf[i]);
                }
        }
+}
+
+static void walk_file(const char *name, const struct stat *st)
+{
+       int i;
+       int fd;
+
+       fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
+
+       if (!nr_addr_ranges)
+               add_addr_range(0, st->st_size / page_size);
+
+       for (i = 0; i < nr_addr_ranges; i++)
+               walk_file_range(name, fd, opt_offset[i] * page_size,
+                               (opt_offset[i] + opt_size[i]) * page_size);
 
        close(fd);
 }
@@ -1062,10 +1073,10 @@ int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
        return 0;
 }
 
+struct stat st;
+
 static void walk_page_cache(void)
 {
-       struct stat st;
-
        kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
        pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
        sigaction(SIGBUS, &sigbus_action, NULL);
@@ -1362,6 +1373,11 @@ int main(int argc, char *argv[])
        if (opt_list)
                printf("\n\n");
 
+       if (opt_file) {
+               show_file(opt_file, &st);
+               printf("\n");
+       }
+
        show_summary();
 
        if (opt_list_mapcnt)
index 0e75f22c94750b8e375f81c13e3b2851d1f7e88a..9ebb84a9c731031bc0e07f2d52a885702a1faac2 100644 (file)
@@ -5,6 +5,8 @@
  * Example use:
  * cat /sys/kernel/debug/page_owner > page_owner_full.txt
  * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
+ * Or sort by total memory:
+ * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt
  *
  * See Documentation/vm/page_owner.rst
 */
 #include <fcntl.h>
 #include <unistd.h>
 #include <string.h>
+#include <regex.h>
+#include <errno.h>
 
 struct block_list {
        char *txt;
        int len;
        int num;
+       int page_num;
 };
 
-
+static int sort_by_memory;
+static regex_t order_pattern;
 static struct block_list *list;
 static int list_size;
 static int max_size;
@@ -59,12 +65,50 @@ static int compare_num(const void *p1, const void *p2)
        return l2->num - l1->num;
 }
 
+static int compare_page_num(const void *p1, const void *p2)
+{
+       const struct block_list *l1 = p1, *l2 = p2;
+
+       return l2->page_num - l1->page_num;
+}
+
+static int get_page_num(char *buf)
+{
+       int err, val_len, order_val;
+       char order_str[4] = {0};
+       char *endptr;
+       regmatch_t pmatch[2];
+
+       err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL);
+       if (err != 0 || pmatch[1].rm_so == -1) {
+               printf("no order pattern in %s\n", buf);
+               return 0;
+       }
+       val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
+       if (val_len > 2) /* max_order should not exceed 2 digits */
+               goto wrong_order;
+
+       memcpy(order_str, buf + pmatch[1].rm_so, val_len);
+
+       errno = 0;
+       order_val = strtol(order_str, &endptr, 10);
+       if (errno != 0 || endptr == order_str || *endptr != '\0')
+               goto wrong_order;
+
+       return 1 << order_val;
+
+wrong_order:
+       printf("wrong order in the following buf:\n%s\n", buf);
+       return 0;
+}
+
 static void add_list(char *buf, int len)
 {
        if (list_size != 0 &&
            len == list[list_size-1].len &&
            memcmp(buf, list[list_size-1].txt, len) == 0) {
                list[list_size-1].num++;
+               list[list_size-1].page_num += get_page_num(buf);
                return;
        }
        if (list_size == max_size) {
@@ -74,6 +118,7 @@ static void add_list(char *buf, int len)
        list[list_size].txt = malloc(len+1);
        list[list_size].len = len;
        list[list_size].num = 1;
+       list[list_size].page_num = get_page_num(buf);
        memcpy(list[list_size].txt, buf, len);
        list[list_size].txt[len] = 0;
        list_size++;
@@ -85,6 +130,13 @@ static void add_list(char *buf, int len)
 
 #define BUF_SIZE       (128 * 1024)
 
+static void usage(void)
+{
+       printf("Usage: ./page_owner_sort [-m] <input> <output>\n"
+               "-m     Sort by total memory. Without this option, sort by the number of times a record appears\n"
+       );
+}
+
 int main(int argc, char **argv)
 {
        FILE *fin, *fout;
@@ -92,21 +144,39 @@ int main(int argc, char **argv)
        int ret, i, count;
        struct block_list *list2;
        struct stat st;
+       int err;
+       int opt;
 
-       if (argc < 3) {
-               printf("Usage: ./program <input> <output>\n");
-               perror("open: ");
+       while ((opt = getopt(argc, argv, "m")) != -1)
+               switch (opt) {
+               case 'm':
+                       sort_by_memory = 1;
+                       break;
+               default:
+                       usage();
+                       exit(1);
+               }
+
+       if (optind >= (argc - 1)) {
+               usage();
                exit(1);
        }
 
-       fin = fopen(argv[1], "r");
-       fout = fopen(argv[2], "w");
+       fin = fopen(argv[optind], "r");
+       fout = fopen(argv[optind + 1], "w");
        if (!fin || !fout) {
-               printf("Usage: ./program <input> <output>\n");
+               usage();
                perror("open: ");
                exit(1);
        }
 
+       err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE);
+       if (err != 0 || order_pattern.re_nsub != 1) {
+               printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n",
+                       argv[0], err);
+               exit(1);
+       }
+
        fstat(fileno(fin), &st);
        max_size = st.st_size / 100; /* hack ... */
 
@@ -145,13 +215,19 @@ int main(int argc, char **argv)
                        list2[count++] = list[i];
                } else {
                        list2[count-1].num += list[i].num;
+                       list2[count-1].page_num += list[i].page_num;
                }
        }
 
-       qsort(list2, count, sizeof(list[0]), compare_num);
+       if (sort_by_memory)
+               qsort(list2, count, sizeof(list[0]), compare_page_num);
+       else
+               qsort(list2, count, sizeof(list[0]), compare_num);
 
        for (i = 0; i < count; i++)
-               fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt);
+               fprintf(fout, "%d times, %d pages:\n%s\n",
+                               list2[i].num, list2[i].page_num, list2[i].txt);
 
+       regfree(&order_pattern);
        return 0;
 }
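As a worked example of the new accounting: a block whose first line is "Page allocated via order 2, mask ..." matches the order regex, so it contributes 1 << 2 = 4 pages; three occurrences of that identical block collapse into one record printed as

    3 times, 12 pages:

and with -m the records are ordered by that page total rather than by occurrence count.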
index 03b21189d58be5e81f817f10f4b9c46965f12c82..0e2c8a5838b142ae14c7ed25fde5613b4a06f89c 100644 (file)
@@ -188,7 +188,7 @@ struct generic_type {
        mode_t mode;
 };
 
-static struct generic_type generic_type_table[] = {
+static const struct generic_type generic_type_table[] = {
        [GT_DIR] = {
                .type = "dir",
                .mode = S_IFDIR
@@ -320,6 +320,12 @@ static int cpio_mkfile(const char *name, const char *location,
                goto error;
        }
 
+       if (buf.st_mtime > 0xffffffff) {
+               fprintf(stderr, "%s: Timestamp exceeds maximum cpio timestamp, clipping.\n",
+                       location);
+               buf.st_mtime = 0xffffffff;
+       }
+
        filebuf = malloc(buf.st_size);
        if (!filebuf) {
                fprintf (stderr, "out of memory\n");
@@ -491,7 +497,7 @@ static void usage(const char *prog)
                prog);
 }
 
-struct file_handler file_handler_table[] = {
+static const struct file_handler file_handler_table[] = {
        {
                .type    = "file",
                .handler = cpio_mkfile_line,
@@ -551,6 +557,16 @@ int main (int argc, char *argv[])
                }
        }
 
+       /*
+        * Timestamps after 2106-02-07 06:28:15 UTC have an ascii hex time_t
+        * representation that exceeds 8 chars and breaks the cpio header
+        * specification.
+        */
+       if (default_mtime > 0xffffffff) {
+               fprintf(stderr, "ERROR: Timestamp too large for cpio format\n");
+               exit(1);
+       }
+
        if (argc - optind != 1) {
                usage(argv[0]);
                exit(1);
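The 0xffffffff ceiling follows from the newc archive format: every header field, c_mtime included, is a fixed-width 8-character ASCII hex number, so the largest encodable time is 0xffffffff = 4294967295 seconds past the epoch (2106-02-07 06:28:15 UTC); one second more would need a ninth hex digit and overflow the fixed-width header.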