git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/commitdiff
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 14 Dec 2016 17:17:42 +0000 (09:17 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 14 Dec 2016 17:17:42 +0000 (09:17 -0800)
Pull ext4 updates from Ted Ts'o:
 "This merge request includes the dax-4.0-iomap-pmd branch which is
  needed for both ext4 and xfs dax changes to use iomap for DAX. It also
  includes the fscrypt branch which is needed for ubifs encryption work
  as well as ext4 encryption and fscrypt cleanups.

  Lots of cleanups and bug fixes, especially making sure ext4 is robust
  against maliciously corrupted file systems --- especially maliciously
  corrupted xattr blocks and a maliciously corrupted superblock. Also
  fix ext4 support for 64k block sizes so it works well on ppcle. Fixed
  mbcache so we don't miss some common xattr blocks that can be merged"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (86 commits)
  dax: Fix sleep in atomic contex in grab_mapping_entry()
  fscrypt: Rename FS_WRITE_PATH_FL to FS_CTX_HAS_BOUNCE_BUFFER_FL
  fscrypt: Delay bounce page pool allocation until needed
  fscrypt: Cleanup page locking requirements for fscrypt_{decrypt,encrypt}_page()
  fscrypt: Cleanup fscrypt_{decrypt,encrypt}_page()
  fscrypt: Never allocate fscrypt_ctx on in-place encryption
  fscrypt: Use correct index in decrypt path.
  fscrypt: move the policy flags and encryption mode definitions to uapi header
  fscrypt: move non-public structures and constants to fscrypt_private.h
  fscrypt: unexport fscrypt_initialize()
  fscrypt: rename get_crypt_info() to fscrypt_get_crypt_info()
  fscrypto: move ioctl processing more fully into common code
  fscrypto: remove unneeded Kconfig dependencies
  MAINTAINERS: fscrypto: recommend linux-fsdevel for fscrypto patches
  ext4: do not perform data journaling when data is encrypted
  ext4: return -ENOMEM instead of success
  ext4: reject inodes with negative size
  ext4: remove another test in ext4_alloc_file_blocks()
  Documentation: fix description of ext4's block_validity mount option
  ext4: fix checks for data=ordered and journal_async_commit options
  ...

MAINTAINERS
fs/dax.c
fs/ext4/page-io.c
fs/ext4/super.c
fs/f2fs/data.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/xfs/xfs_aops.c
include/uapi/linux/fs.h
mm/filemap.c

diff --combined MAINTAINERS
index 1174508ee59755896d62df095a6bbc787b629b7a,0774714c4e67c74e69ccc62214ccc9862ff5141c..8007e2811264205bd76ad7b39f632cae74ea90c1
@@@ -35,13 -35,13 +35,13 @@@ trivial patch so apply some common sens
  
        PLEASE check your patch with the automated style checker
        (scripts/checkpatch.pl) to catch trivial style violations.
 -      See Documentation/CodingStyle for guidance here.
 +      See Documentation/process/coding-style.rst for guidance here.
  
        PLEASE CC: the maintainers and mailing lists that are generated
        by scripts/get_maintainer.pl.  The results returned by the
        script will be best if you have git installed and are making
        your changes in a branch derived from Linus' latest git tree.
 -      See Documentation/SubmittingPatches for details.
 +      See Documentation/process/submitting-patches.rst for details.
  
        PLEASE try to include any credit lines you want added with the
        patch. It avoids people being missed off by mistake and makes
@@@ -54,7 -54,7 +54,7 @@@
        of the Linux Foundation certificate of contribution and should
        include a Signed-off-by: line.  The current version of this
        "Developer's Certificate of Origin" (DCO) is listed in the file
 -      Documentation/SubmittingPatches.
 +      Documentation/process/submitting-patches.rst.
  
  6.    Make sure you have the right to send any changes you make. If you
        do changes at work you may find your employer owns the patch
@@@ -74,14 -74,9 +74,14 @@@ Descriptions of section entries
           These reviewers should be CCed on patches.
        L: Mailing list that is relevant to this area
        W: Web-page with status/info
 +      B: URI for where to file bugs. A web-page with detailed bug
 +         filing info, a direct bug tracker link, or a mailto: URI.
 +      C: URI for chat protocol, server and channel where developers
 +         usually hang out, for example irc://server/channel.
        Q: Patchwork web based patch tracking system site
        T: SCM tree type and location.
           Type is one of: git, hg, quilt, stgit, topgit
 +      B: Bug tracking system location.
        S: Status, one of the following:
           Supported:   Someone is actually paid to look after this.
           Maintained:  Someone actually looks after it.
@@@ -260,12 -255,6 +260,12 @@@ L:       linux-gpio@vger.kernel.or
  S:    Maintained
  F:    drivers/gpio/gpio-104-idio-16.c
  
 +ACCES 104-QUAD-8 IIO DRIVER
 +M:    William Breathitt Gray <vilhelm.gray@gmail.com>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/iio/counter/104-quad-8.c
 +
  ACENIC DRIVER
  M:    Jes Sorensen <jes@trained-monkey.org>
  L:    linux-acenic@sunsite.dk
@@@ -292,7 -281,6 +292,7 @@@ L: linux-acpi@vger.kernel.or
  W:    https://01.org/linux-acpi
  Q:    https://patchwork.kernel.org/project/linux-acpi/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    drivers/acpi/
  F:    drivers/pnp/pnpacpi/
@@@ -316,8 -304,6 +316,8 @@@ W: https://acpica.org
  W:    https://github.com/acpica/acpica/
  Q:    https://patchwork.kernel.org/project/linux-acpi/list/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
 +B:    https://bugzilla.kernel.org
 +B:    https://bugs.acpica.org
  S:    Supported
  F:    drivers/acpi/acpica/
  F:    include/acpi/
@@@ -327,7 -313,6 +327,7 @@@ ACPI FAN DRIVE
  M:    Zhang Rui <rui.zhang@intel.com>
  L:    linux-acpi@vger.kernel.org
  W:    https://01.org/linux-acpi
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    drivers/acpi/fan.c
  
@@@ -343,7 -328,6 +343,7 @@@ ACPI THERMAL DRIVE
  M:    Zhang Rui <rui.zhang@intel.com>
  L:    linux-acpi@vger.kernel.org
  W:    https://01.org/linux-acpi
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    drivers/acpi/*thermal*
  
@@@ -351,7 -335,6 +351,7 @@@ ACPI VIDEO DRIVE
  M:    Zhang Rui <rui.zhang@intel.com>
  L:    linux-acpi@vger.kernel.org
  W:    https://01.org/linux-acpi
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    drivers/acpi/acpi_video.c
  
@@@ -587,11 -570,6 +587,11 @@@ T:       git git://linuxtv.org/anttip/media_t
  S:    Maintained
  F:    drivers/media/usb/airspy/
  
 +ALACRITECH GIGABIT ETHERNET DRIVER
 +M:    Lino Sanfilippo <LinoSanfilippo@gmx.de>
 +S:    Maintained
 +F:    drivers/net/ethernet/alacritech/*
 +
  ALCATEL SPEEDTOUCH USB DRIVER
  M:    Duncan Sands <duncan.sands@free.fr>
  L:    linux-usb@vger.kernel.org
@@@ -809,7 -787,7 +809,7 @@@ S: Supporte
  F:    drivers/iio/*/ad*
  X:    drivers/iio/*/adjd*
  F:    drivers/staging/iio/*/ad*
 -F:    staging/iio/trigger/iio-trig-bfin-timer.c
 +F:    drivers/staging/iio/trigger/iio-trig-bfin-timer.c
  
  ANALOG DEVICES INC DMA DRIVERS
  M:    Lars-Peter Clausen <lars@metafoo.de>
@@@ -1058,7 -1036,6 +1058,7 @@@ F:      arch/arm/mach-meson
  F:    arch/arm/boot/dts/meson*
  F:    arch/arm64/boot/dts/amlogic/
  F:    drivers/pinctrl/meson/
 +F:    drivers/mmc/host/meson*
  N:    meson
  
  ARM/Annapurna Labs ALPINE ARCHITECTURE
@@@ -1798,7 -1775,6 +1798,7 @@@ F:      drivers/char/hw_random/st-rng.
  F:    drivers/clocksource/arm_global_timer.c
  F:    drivers/clocksource/clksrc_st_lpc.c
  F:    drivers/cpufreq/sti-cpufreq.c
 +F:    drivers/dma/st_fdma*
  F:    drivers/i2c/busses/i2c-st.c
  F:    drivers/media/rc/st_rc.c
  F:    drivers/media/platform/sti/c8sectpfe/
@@@ -1809,7 -1785,6 +1809,7 @@@ F:      drivers/phy/phy-stih407-usb.
  F:    drivers/phy/phy-stih41x-usb.c
  F:    drivers/pinctrl/pinctrl-st.c
  F:    drivers/remoteproc/st_remoteproc.c
 +F:    drivers/remoteproc/st_slim_rproc.c
  F:    drivers/reset/sti/
  F:    drivers/rtc/rtc-st-lpc.c
  F:    drivers/tty/serial/st-asc.c
@@@ -1818,7 -1793,6 +1818,7 @@@ F:      drivers/usb/host/ehci-st.
  F:    drivers/usb/host/ohci-st.c
  F:    drivers/watchdog/st_lpc_wdt.c
  F:    drivers/ata/ahci_st.c
 +F:    include/linux/remoteproc/st_slim_rproc.h
  
  ARM/STM32 ARCHITECTURE
  M:    Maxime Coquelin <mcoquelin.stm32@gmail.com>
@@@ -2556,8 -2530,6 +2556,8 @@@ L:      netdev@vger.kernel.or
  L:    linux-kernel@vger.kernel.org
  S:    Supported
  F:    kernel/bpf/
 +F:    tools/testing/selftests/bpf/
 +F:    lib/test_bpf.c
  
  BROADCOM B44 10/100 ETHERNET DRIVER
  M:    Michael Chan <michael.chan@broadcom.com>
@@@ -2618,7 -2590,6 +2618,7 @@@ L:      linux-arm-kernel@lists.infradead.or
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rpi/linux-rpi.git
  S:    Maintained
  N:    bcm2835
 +F:    drivers/staging/vc04_services
  
  BROADCOM BCM47XX MIPS ARCHITECTURE
  M:    Hauke Mehrtens <hauke@hauke-m.de>
@@@ -2771,14 -2742,6 +2771,14 @@@ L:    bcm-kernel-feedback-list@broadcom.co
  S:    Maintained
  F:    drivers/mtd/nand/brcmnand/
  
 +BROADCOM STB AVS CPUFREQ DRIVER
 +M:    Markus Mayer <mmayer@broadcom.com>
 +M:    bcm-kernel-feedback-list@broadcom.com
 +L:    linux-pm@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
 +F:    drivers/cpufreq/brcmstb*
 +
  BROADCOM SPECIFIC AMBA DRIVER (BCMA)
  M:    Rafał Miłecki <zajec5@gmail.com>
  L:    linux-wireless@vger.kernel.org
@@@ -2967,7 -2930,7 +2967,7 @@@ CAPELLA MICROSYSTEMS LIGHT SENSOR DRIVE
  M:    Kevin Tsai <ktsai@capellamicro.com>
  S:    Maintained
  F:    drivers/iio/light/cm*
 -F:    Documentation/devicetree/bindings/i2c/trivial-devices.txt
 +F:    Documentation/devicetree/bindings/i2c/trivial-admin-guide/devices.rst
  
  CAVIUM I2C DRIVER
  M:    Jan Glauber <jglauber@cavium.com>
@@@ -3067,12 -3030,6 +3067,12 @@@ F:    drivers/usb/host/whci
  F:    drivers/usb/wusbcore/
  F:    include/linux/usb/wusb*
  
 +HT16K33 LED CONTROLLER DRIVER
 +M:    Robin van der Gracht <robin@protonic.nl>
 +S:    Maintained
 +F:    drivers/auxdisplay/ht16k33.c
 +F:    Documentation/devicetree/bindings/display/ht16k33.txt
 +
  CFAG12864B LCD DRIVER
  M:    Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
  W:    http://miguelojeda.es/auxdisplay.htm
@@@ -3121,7 -3078,7 +3121,7 @@@ M:      Harry Wei <harryxiyou@gmail.com
  L:    xiyoulinuxkernelgroup@googlegroups.com (subscribers-only)
  L:    linux-kernel@zh-kernel.org (moderated for non-subscribers)
  S:    Maintained
 -F:    Documentation/zh_CN/
 +F:    Documentation/translations/zh_CN/
  
  CHIPIDEA USB HIGH SPEED DUAL ROLE CONTROLLER
  M:    Peter Chen <Peter.Chen@nxp.com>
@@@ -3377,7 -3334,6 +3377,7 @@@ L:      linux-pm@vger.kernel.or
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
  T:    git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates)
 +B:    https://bugzilla.kernel.org
  F:    Documentation/cpu-freq/
  F:    drivers/cpufreq/
  F:    include/linux/cpufreq.h
@@@ -3417,7 -3373,6 +3417,7 @@@ M:      Daniel Lezcano <daniel.lezcano@linar
  L:    linux-pm@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
 +B:    https://bugzilla.kernel.org
  F:    drivers/cpuidle/*
  F:    include/linux/cpuidle.h
  
@@@ -3956,7 -3911,7 +3956,7 @@@ F:      include/linux/dma-buf
  F:    include/linux/reservation.h
  F:    include/linux/*fence.h
  F:    Documentation/dma-buf-sharing.txt
 -T:    git git://git.linaro.org/people/sumitsemwal/linux-dma-buf.git
 +T:    git git://anongit.freedesktop.org/drm/drm-misc
  
  SYNC FILE FRAMEWORK
  M:    Sumit Semwal <sumit.semwal@linaro.org>
@@@ -3964,12 -3919,10 +3964,12 @@@ R:   Gustavo Padovan <gustavo@padovan.org
  S:    Maintained
  L:    linux-media@vger.kernel.org
  L:    dri-devel@lists.freedesktop.org
 -F:    drivers/dma-buf/sync_file.c
 +F:    drivers/dma-buf/sync_*
 +F:    drivers/dma-buf/sw_sync.c
  F:    include/linux/sync_file.h
 +F:    include/uapi/linux/sync_file.h
  F:    Documentation/sync_file.txt
 -T:    git git://git.linaro.org/people/sumitsemwal/linux-dma-buf.git
 +T:    git git://anongit.freedesktop.org/drm/drm-misc
  
  DMA GENERIC OFFLOAD ENGINE SUBSYSTEM
  M:    Vinod Koul <vinod.koul@intel.com>
@@@ -4057,8 -4010,6 +4057,8 @@@ DRM DRIVER
  M:    David Airlie <airlied@linux.ie>
  L:    dri-devel@lists.freedesktop.org
  T:    git git://people.freedesktop.org/~airlied/linux
 +B:    https://bugs.freedesktop.org/
 +C:    irc://chat.freenode.net/dri-devel
  S:    Maintained
  F:    drivers/gpu/drm/
  F:    drivers/gpu/vga/
@@@ -4069,30 -4020,11 +4069,30 @@@ F:   Documentation/gpu
  F:    include/drm/
  F:    include/uapi/drm/
  
 +DRM DRIVERS AND MISC GPU PATCHES
 +M:    Daniel Vetter <daniel.vetter@intel.com>
 +M:    Jani Nikula <jani.nikula@linux.intel.com>
 +M:    Sean Paul <seanpaul@chromium.org>
 +W:    https://01.org/linuxgraphics/gfx-docs/maintainer-tools/drm-misc.html
 +S:    Maintained
 +T:    git git://anongit.freedesktop.org/drm/drm-misc
 +F:    Documentation/gpu/
 +F:    drivers/gpu/vga/
 +F:    drivers/gpu/drm/*
 +F:    include/drm/drm*
 +F:    include/uapi/drm/drm*
 +
  DRM DRIVER FOR AST SERVER GRAPHICS CHIPS
  M:    Dave Airlie <airlied@redhat.com>
  S:    Odd Fixes
  F:    drivers/gpu/drm/ast/
  
 +DRM DRIVERS FOR BRIDGE CHIPS
 +M:    Archit Taneja <architt@codeaurora.org>
 +S:    Maintained
 +T:    git git://anongit.freedesktop.org/drm/drm-misc
 +F:    drivers/gpu/drm/bridge/
 +
  DRM DRIVER FOR BOCHS VIRTUAL GPU
  M:    Gerd Hoffmann <kraxel@redhat.com>
  S:    Odd Fixes
@@@ -4128,9 -4060,8 +4128,9 @@@ INTEL DRM DRIVERS (excluding Poulsbo, M
  M:    Daniel Vetter <daniel.vetter@intel.com>
  M:    Jani Nikula <jani.nikula@linux.intel.com>
  L:    intel-gfx@lists.freedesktop.org
 -L:    dri-devel@lists.freedesktop.org
  W:    https://01.org/linuxgraphics/
 +B:    https://01.org/linuxgraphics/documentation/how-report-bugs
 +C:    irc://chat.freenode.net/intel-gfx
  Q:    http://patchwork.freedesktop.org/project/intel-gfx/
  T:    git git://anongit.freedesktop.org/drm-intel
  S:    Supported
@@@ -4139,16 -4070,6 +4139,16 @@@ F:    include/drm/i915
  F:    include/uapi/drm/i915_drm.h
  F:    Documentation/gpu/i915.rst
  
 +INTEL GVT-g DRIVERS (Intel GPU Virtualization)
 +M:      Zhenyu Wang <zhenyuw@linux.intel.com>
 +M:      Zhi Wang <zhi.a.wang@intel.com>
 +L:      igvt-g-dev@lists.01.org
 +L:      intel-gfx@lists.freedesktop.org
 +W:      https://01.org/igvt-g
 +T:      git https://github.com/01org/gvt-linux.git
 +S:      Supported
 +F:      drivers/gpu/drm/i915/gvt/
 +
  DRM DRIVERS FOR ATMEL HLCDC
  M:    Boris Brezillon <boris.brezillon@free-electrons.com>
  L:    dri-devel@lists.freedesktop.org
@@@ -4163,15 -4084,6 +4163,15 @@@ S:    Supporte
  F:    drivers/gpu/drm/sun4i/
  F:    Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt
  
 +DRM DRIVERS FOR AMLOGIC SOCS
 +M:    Neil Armstrong <narmstrong@baylibre.com>
 +L:    dri-devel@lists.freedesktop.org
 +L:    linux-amlogic@lists.infradead.org
 +W:    http://linux-meson.com/
 +S:    Supported
 +F:    drivers/gpu/drm/meson/
 +F:    Documentation/devicetree/bindings/display/amlogic,meson-vpu.txt
 +
  DRM DRIVERS FOR EXYNOS
  M:    Inki Dae <inki.dae@samsung.com>
  M:    Joonyoung Shim <jy0922.shim@samsung.com>
@@@ -4211,7 -4123,6 +4211,7 @@@ F:      drivers/gpu/drm/gma500
  
  DRM DRIVERS FOR HISILICON
  M:    Xinliang Liu <z.liuxinliang@hisilicon.com>
 +M:    Rongrong Zou <zourongrong@gmail.com>
  R:    Xinwei Kong <kong.kongxinwei@hisilicon.com>
  R:    Chen Feng <puck.chen@hisilicon.com>
  L:    dri-devel@lists.freedesktop.org
@@@ -4336,7 -4247,6 +4336,7 @@@ DRM DRIVERS FOR VIVANTE GPU I
  M:    Lucas Stach <l.stach@pengutronix.de>
  R:    Russell King <linux+etnaviv@armlinux.org.uk>
  R:    Christian Gmeiner <christian.gmeiner@gmail.com>
 +L:    etnaviv@lists.freedesktop.org
  L:    dri-devel@lists.freedesktop.org
  S:    Maintained
  F:    drivers/gpu/drm/etnaviv/
@@@ -4377,13 -4287,6 +4377,13 @@@ S:    Maintaine
  F:    drivers/gpu/drm/tilcdc/
  F:    Documentation/devicetree/bindings/display/tilcdc/
  
 +DRM DRIVERS FOR ZTE ZX
 +M:    Shawn Guo <shawnguo@kernel.org>
 +L:    dri-devel@lists.freedesktop.org
 +S:    Maintained
 +F:    drivers/gpu/drm/zte/
 +F:    Documentation/devicetree/bindings/display/zte,vou.txt
 +
  DSBR100 USB FM RADIO DRIVER
  M:    Alexey Klimov <klimov.linux@gmail.com>
  L:    linux-media@vger.kernel.org
@@@ -4728,14 -4631,12 +4728,14 @@@ L:   linux-efi@vger.kernel.or
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git
  S:    Maintained
  F:    Documentation/efi-stub.txt
 -F:    arch/ia64/kernel/efi.c
 +F:    arch/*/kernel/efi.c
  F:    arch/x86/boot/compressed/eboot.[ch]
 -F:    arch/x86/include/asm/efi.h
 +F:    arch/*/include/asm/efi.h
  F:    arch/x86/platform/efi/
  F:    drivers/firmware/efi/
  F:    include/linux/efi*.h
 +F:    arch/arm/boot/compressed/efi-header.S
 +F:    arch/arm64/kernel/efi-entry.S
  
  EFI VARIABLE FILESYSTEM
  M:    Matthew Garrett <matthew.garrett@nebula.com>
@@@ -5049,9 -4950,7 +5049,9 @@@ K:      fmc_d.*registe
  FPGA MANAGER FRAMEWORK
  M:    Alan Tull <atull@opensource.altera.com>
  R:    Moritz Fischer <moritz.fischer@ettus.com>
 +L:    linux-fpga@vger.kernel.org
  S:    Maintained
 +T:    git git://git.kernel.org/pub/scm/linux/kernel/git/atull/linux-fpga.git
  F:    drivers/fpga/
  F:    include/linux/fpga/fpga-mgr.h
  W:    http://www.rocketboards.org
@@@ -5069,9 -4968,10 +5069,9 @@@ F:     drivers/net/wan/dlci.
  F:    drivers/net/wan/sdla.c
  
  FRAMEBUFFER LAYER
 -M:    Tomi Valkeinen <tomi.valkeinen@ti.com>
  L:    linux-fbdev@vger.kernel.org
  Q:    http://patchwork.kernel.org/project/linux-fbdev/list/
 -S:    Maintained
 +S:    Orphan
  F:    Documentation/fb/
  F:    drivers/video/
  F:    include/video/
@@@ -5144,18 -5044,9 +5144,18 @@@ S:    Maintaine
  F:    drivers/net/ethernet/freescale/fman
  F:    Documentation/devicetree/bindings/powerpc/fsl/fman.txt
  
 +FREESCALE SOC DRIVERS
 +M:    Scott Wood <oss@buserror.net>
 +L:    linuxppc-dev@lists.ozlabs.org
 +L:    linux-arm-kernel@lists.infradead.org
 +S:    Maintained
 +F:    drivers/soc/fsl/
 +F:    include/linux/fsl/
 +
  FREESCALE QUICC ENGINE LIBRARY
 +M:    Qiang Zhao <qiang.zhao@nxp.com>
  L:    linuxppc-dev@lists.ozlabs.org
 -S:    Orphan
 +S:    Maintained
  F:    drivers/soc/fsl/qe/
  F:    include/soc/fsl/*qe*.h
  F:    include/soc/fsl/*ucc*.h
@@@ -5207,6 -5098,13 +5207,6 @@@ F:     sound/soc/fsl/fsl
  F:    sound/soc/fsl/imx*
  F:    sound/soc/fsl/mpc8610_hpcd.c
  
 -FREESCALE QORIQ MANAGEMENT COMPLEX DRIVER
 -M:    "J. German Rivera" <German.Rivera@freescale.com>
 -M:    Stuart Yoder <stuart.yoder@nxp.com>
 -L:    linux-kernel@vger.kernel.org
 -S:    Maintained
 -F:    drivers/staging/fsl-mc/
 -
  FREEVXFS FILESYSTEM
  M:    Christoph Hellwig <hch@infradead.org>
  W:    ftp://ftp.openlinux.org/pub/people/hch/vxfs
@@@ -5240,6 -5138,7 +5240,7 @@@ F:      include/linux/fscache*.
  FS-CRYPTO: FILE SYSTEM LEVEL ENCRYPTION SUPPORT
  M:    Theodore Y. Ts'o <tytso@mit.edu>
  M:    Jaegeuk Kim <jaegeuk@kernel.org>
+ L:    linux-fsdevel@vger.kernel.org
  S:    Supported
  F:    fs/crypto/
  F:    include/linux/fscrypto.h
@@@ -5304,7 -5203,6 +5305,7 @@@ L:      kernel-hardening@lists.openwall.co
  S:    Maintained
  F:    scripts/gcc-plugins/
  F:    scripts/gcc-plugin.sh
 +F:    scripts/Makefile.gcc-plugins
  F:    Documentation/gcc-plugins.txt
  
  GCOV BASED KERNEL PROFILING
@@@ -5766,7 -5664,6 +5767,7 @@@ HIBERNATION (aka Software Suspend, aka 
  M:    "Rafael J. Wysocki" <rjw@rjwysocki.net>
  M:    Pavel Machek <pavel@ucw.cz>
  L:    linux-pm@vger.kernel.org
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    arch/x86/power/
  F:    drivers/base/power/
@@@ -5948,7 -5845,6 +5949,7 @@@ F:      drivers/input/serio/hyperv-keyboard.
  F:    drivers/pci/host/pci-hyperv.c
  F:    drivers/net/hyperv/
  F:    drivers/scsi/storvsc_drv.c
 +F:    drivers/uio/uio_hv_generic.c
  F:    drivers/video/fbdev/hyperv_fb.c
  F:    include/linux/hyperv.h
  F:    tools/hv/
@@@ -6192,9 -6088,14 +6193,9 @@@ S:     Maintaine
  F:    Documentation/cdrom/ide-cd
  F:    drivers/ide/ide-cd*
  
 -IDLE-I7300
 -M:    Andy Henroid <andrew.d.henroid@intel.com>
 -L:    linux-pm@vger.kernel.org
 -S:    Supported
 -F:    drivers/idle/i7300_idle.c
 -
  IEEE 802.15.4 SUBSYSTEM
  M:    Alexander Aring <aar@pengutronix.de>
 +M:    Stefan Schmidt <stefan@osg.samsung.com>
  L:    linux-wpan@vger.kernel.org
  W:    http://wpan.cakelab.org/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git
@@@ -6224,22 -6125,6 +6225,22 @@@ L:    linux-media@vger.kernel.or
  S:    Maintained
  F:    drivers/media/rc/iguanair.c
  
 +IIO DIGITAL POTENTIOMETER DAC
 +M:    Peter Rosin <peda@axentia.se>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/ABI/testing/sysfs-bus-iio-dac-dpot-dac
 +F:    Documentation/devicetree/bindings/iio/dac/dpot-dac.txt
 +F:    drivers/iio/dac/dpot-dac.c
 +
 +IIO ENVELOPE DETECTOR
 +M:    Peter Rosin <peda@axentia.se>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/ABI/testing/sysfs-bus-iio-adc-envelope-detector
 +F:    Documentation/devicetree/bindings/iio/adc/envelope-detector.txt
 +F:    drivers/iio/adc/envelope-detector.c
 +
  IIO SUBSYSTEM AND DRIVERS
  M:    Jonathan Cameron <jic23@kernel.org>
  R:    Hartmut Knaack <knaack.h@gmx.de>
@@@ -6397,11 -6282,9 +6398,11 @@@ S:    Maintaine
  F:    drivers/platform/x86/intel-vbtn.c
  
  INTEL IDLE DRIVER
 +M:    Jacob Pan <jacob.jun.pan@linux.intel.com>
  M:    Len Brown <lenb@kernel.org>
  L:    linux-pm@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    drivers/idle/intel_idle.c
  
@@@ -6621,13 -6504,6 +6622,13 @@@ S:    Maintaine
  F:    arch/x86/include/asm/pmc_core.h
  F:    drivers/platform/x86/intel_pmc_core*
  
 +INVENSENSE MPU-3050 GYROSCOPE DRIVER
 +M:    Linus Walleij <linus.walleij@linaro.org>
 +L:    linux-iio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/iio/gyro/mpu3050*
 +F:    Documentation/devicetree/bindings/iio/gyroscope/inv,mpu3050.txt
 +
  IOC3 ETHERNET DRIVER
  M:    Ralf Baechle <ralf@linux-mips.org>
  L:    linux-mips@linux-mips.org
@@@ -7209,7 -7085,6 +7210,7 @@@ F:      drivers/scsi/53c700
  LED SUBSYSTEM
  M:    Richard Purdie <rpurdie@rpsys.net>
  M:    Jacek Anaszewski <j.anaszewski@samsung.com>
 +M:    Pavel Machek <pavel@ucw.cz>
  L:    linux-leds@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/j.anaszewski/linux-leds.git
  S:    Maintained
@@@ -7682,10 -7557,8 +7683,10 @@@ S:    Maintaine
  MARVELL 88E6XXX ETHERNET SWITCH FABRIC DRIVER
  M:    Andrew Lunn <andrew@lunn.ch>
  M:    Vivien Didelot <vivien.didelot@savoirfairelinux.com>
 +L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/dsa/mv88e6xxx/
 +F:    Documentation/devicetree/bindings/net/dsa/marvell.txt
  
  MARVELL ARMADA DRM SUPPORT
  M:    Russell King <rmk+kernel@armlinux.org.uk>
@@@ -7835,7 -7708,6 +7836,7 @@@ MCP4531 MICROCHIP DIGITAL POTENTIOMETE
  M:    Peter Rosin <peda@axentia.se>
  L:    linux-iio@vger.kernel.org
  S:    Maintained
 +F:    Documentation/ABI/testing/sysfs-bus-iio-potentiometer-mcp4531
  F:    drivers/iio/potentiometer/mcp4531.c
  
  MEASUREMENT COMPUTING CIO-DAC IIO DRIVER
@@@ -8186,7 -8058,6 +8187,7 @@@ F:      drivers/infiniband/hw/mlx4
  F:    include/linux/mlx4/
  
  MELLANOX MLX5 core VPI driver
 +M:    Saeed Mahameed <saeedm@mellanox.com>
  M:    Matan Barak <matanb@mellanox.com>
  M:    Leon Romanovsky <leonro@mellanox.com>
  L:    netdev@vger.kernel.org
@@@ -8400,12 -8271,6 +8401,12 @@@ T:    git git://linuxtv.org/mkrufky/tuners
  S:    Maintained
  F:    drivers/media/tuners/mxl5007t.*
  
 +MXSFB DRM DRIVER
 +M:    Marek Vasut <marex@denx.de>
 +S:    Supported
 +F:    drivers/gpu/drm/mxsfb/
 +F:    Documentation/devicetree/bindings/display/mxsfb-drm.txt
 +
  MYRICOM MYRI-10G 10GbE DRIVER (MYRI10GE)
  M:    Hyong-Youb Kim <hykim@myri.com>
  L:    netdev@vger.kernel.org
@@@ -8583,6 -8448,7 +8584,6 @@@ F:      include/uapi/linux/net_namespace.
  F:    tools/net/
  F:    tools/testing/selftests/net/
  F:    lib/random32.c
 -F:    lib/test_bpf.c
  
  NETWORKING [IPv4/IPv6]
  M:    "David S. Miller" <davem@davemloft.net>
@@@ -8811,16 -8677,6 +8812,16 @@@ L:    linux-nvme@lists.infradead.or
  S:    Supported
  F:    drivers/nvme/target/
  
 +NVM EXPRESS FC TRANSPORT DRIVERS
 +M:    James Smart <james.smart@broadcom.com>
 +L:    linux-nvme@lists.infradead.org
 +S:    Supported
 +F:    include/linux/nvme-fc.h
 +F:    include/linux/nvme-fc-driver.h
 +F:    drivers/nvme/host/fc.c
 +F:    drivers/nvme/target/fc.c
 +F:    drivers/nvme/target/fcloop.c
 +
  NVMEM FRAMEWORK
  M:    Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
  M:    Maxime Ripard <maxime.ripard@free-electrons.com>
@@@ -8883,7 -8739,6 +8884,7 @@@ F:      drivers/regulator/tps65217-regulator
  F:    drivers/regulator/tps65218-regulator.c
  F:    drivers/regulator/tps65910-regulator.c
  F:    drivers/regulator/twl-regulator.c
 +F:    drivers/regulator/twl6030-regulator.c
  F:    include/linux/i2c-omap.h
  
  OMAP DEVICE TREE SUPPORT
@@@ -9104,11 -8959,9 +9105,11 @@@ F:    drivers/of/resolver.
  
  OPENRISC ARCHITECTURE
  M:    Jonas Bonn <jonas@southpole.se>
 -W:    http://openrisc.net
 +M:    Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
 +M:    Stafford Horne <shorne@gmail.com>
 +L:    openrisc@lists.librecores.org
 +W:    http://openrisc.io
  S:    Maintained
 -T:    git git://openrisc.net/~jonas/linux
  F:    arch/openrisc/
  
  OPENVSWITCH
@@@ -9240,7 -9093,7 +9241,7 @@@ F:      drivers/misc/panel.
  
  PARALLEL PORT SUBSYSTEM
  M:    Sudip Mukherjee <sudipm.mukherjee@gmail.com>
 -M:    Sudip Mukherjee <sudip@vectorindia.org>
 +M:    Sudip Mukherjee <sudip.mukherjee@codethink.co.uk>
  L:    linux-parport@lists.infradead.org (subscribers-only)
  S:    Maintained
  F:    drivers/parport/
@@@ -9395,12 -9248,11 +9396,12 @@@ S:   Maintaine
  F:    drivers/pci/host/*layerscape*
  
  PCI DRIVER FOR IMX6
 -M:    Richard Zhu <Richard.Zhu@freescale.com>
 +M:    Richard Zhu <hongxing.zhu@nxp.com>
  M:    Lucas Stach <l.stach@pengutronix.de>
  L:    linux-pci@vger.kernel.org
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
 +F:    Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
  F:    drivers/pci/host/*imx6*
  
  PCI DRIVER FOR TI KEYSTONE
@@@ -9459,11 -9311,17 +9460,11 @@@ F:   drivers/pci/host/pci-exynos.
  
  PCI DRIVER FOR SYNOPSIS DESIGNWARE
  M:    Jingoo Han <jingoohan1@gmail.com>
 -M:    Pratyush Anand <pratyush.anand@gmail.com>
 -L:    linux-pci@vger.kernel.org
 -S:    Maintained
 -F:    drivers/pci/host/*designware*
 -
 -PCI DRIVER FOR SYNOPSYS PROTOTYPING DEVICE
 -M:    Jose Abreu <Jose.Abreu@synopsys.com>
 +M:    Joao Pinto <Joao.Pinto@synopsys.com>
  L:    linux-pci@vger.kernel.org
  S:    Maintained
  F:    Documentation/devicetree/bindings/pci/designware-pcie.txt
 -F:    drivers/pci/host/pcie-designware-plat.c
 +F:    drivers/pci/host/*designware*
  
  PCI DRIVER FOR GENERIC OF HOSTS
  M:    Will Deacon <will.deacon@arm.com>
@@@ -9478,7 -9336,7 +9479,7 @@@ PCI DRIVER FOR INTEL VOLUME MANAGEMENT 
  M:    Keith Busch <keith.busch@intel.com>
  L:    linux-pci@vger.kernel.org
  S:    Supported
 -F:    arch/x86/pci/vmd.c
 +F:    drivers/pci/host/vmd.c
  
  PCIE DRIVER FOR ST SPEAR13XX
  M:    Pratyush Anand <pratyush.anand@gmail.com>
@@@ -9711,8 -9569,8 +9712,8 @@@ F:      arch/mips/boot/dts/pistachio
  F:      arch/mips/configs/pistachio*_defconfig
  
  PKTCDVD DRIVER
 -M:    Jiri Kosina <jikos@kernel.org>
 -S:    Maintained
 +S:    Orphan
 +M:    linux-block@vger.kernel.org
  F:    drivers/block/pktcdvd.c
  F:    include/linux/pktcdvd.h
  F:    include/uapi/linux/pktcdvd.h
@@@ -9765,7 -9623,6 +9766,7 @@@ POWER MANAGEMENT COR
  M:    "Rafael J. Wysocki" <rjw@rjwysocki.net>
  L:    linux-pm@vger.kernel.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    drivers/base/power/
  F:    include/linux/pm.h
@@@ -9947,7 -9804,7 +9948,7 @@@ F:      drivers/media/usb/pwc/
  
  PWM FAN DRIVER
  M:    Kamil Debski <kamil@wypas.org>
 -M:    Lukasz Majewski <l.majewski@samsung.com>
 +M:    Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
  L:    linux-hwmon@vger.kernel.org
  S:    Supported
  F:    Documentation/devicetree/bindings/hwmon/pwm-fan.txt
@@@ -10089,12 -9946,6 +10090,12 @@@ F:  fs/qnx4
  F:    include/uapi/linux/qnx4_fs.h
  F:    include/uapi/linux/qnxtypes.h
  
 +QORIQ DPAA2 FSL-MC BUS DRIVER
 +M:    Stuart Yoder <stuart.yoder@nxp.com>
 +L:    linux-kernel@vger.kernel.org
 +S:    Maintained
 +F:    drivers/staging/fsl-mc/
 +
  QT1010 MEDIA DRIVER
  M:    Antti Palosaari <crope@iki.fi>
  L:    linux-media@vger.kernel.org
@@@ -10557,7 -10408,7 +10558,7 @@@ F:   arch/s390/pci
  F:    drivers/pci/hotplug/s390_pci_hpc.c
  
  S390 ZCRYPT DRIVER
 -M:    Ingo Tuchscherer <ingo.tuchscherer@de.ibm.com>
 +M:    Harald Freudenberger <freude@de.ibm.com>
  L:    linux-s390@vger.kernel.org
  W:    http://www.ibm.com/developerworks/linux/linux390/
  S:    Supported
@@@ -10724,7 -10575,7 +10725,7 @@@ L:   netdev@vger.kernel.or
  F:    drivers/net/ethernet/samsung/sxgbe/
  
  SAMSUNG THERMAL DRIVER
 -M:    Lukasz Majewski <l.majewski@samsung.com>
 +M:    Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
  L:    linux-pm@vger.kernel.org
  L:    linux-samsung-soc@vger.kernel.org
  S:    Supported
@@@ -10850,11 -10701,6 +10851,11 @@@ W: http://www.sunplus.co
  S:    Supported
  F:    arch/score/
  
 +SCR24X CHIP CARD INTERFACE DRIVER
 +M:    Lubomir Rintel <lkundrak@v3.sk>
 +S:    Supported
 +F:    drivers/char/pcmcia/scr24x_cs.c
 +
  SYSTEM CONTROL & POWER INTERFACE (SCPI) Message Protocol drivers
  M:    Sudeep Holla <sudeep.holla@arm.com>
  L:    linux-arm-kernel@lists.infradead.org
@@@ -11258,7 -11104,7 +11259,7 @@@ F:   include/media/i2c/ov2659.
  SILICON MOTION SM712 FRAME BUFFER DRIVER
  M:    Sudip Mukherjee <sudipm.mukherjee@gmail.com>
  M:    Teddy Wang <teddy.wang@siliconmotion.com>
 -M:    Sudip Mukherjee <sudip@vectorindia.org>
 +M:    Sudip Mukherjee <sudip.mukherjee@codethink.co.uk>
  L:    linux-fbdev@vger.kernel.org
  S:    Maintained
  F:    drivers/video/fbdev/sm712*
@@@ -11620,7 -11466,7 +11621,7 @@@ STABLE BRANC
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  L:    stable@vger.kernel.org
  S:    Supported
 -F:    Documentation/stable_kernel_rules.txt
 +F:    Documentation/process/stable-kernel-rules.rst
  
  STAGING SUBSYSTEM
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
@@@ -11686,11 -11532,17 +11687,11 @@@ F:        drivers/staging/rtl8712
  STAGING - SILICON MOTION SM750 FRAME BUFFER DRIVER
  M:    Sudip Mukherjee <sudipm.mukherjee@gmail.com>
  M:    Teddy Wang <teddy.wang@siliconmotion.com>
 -M:    Sudip Mukherjee <sudip@vectorindia.org>
 +M:    Sudip Mukherjee <sudip.mukherjee@codethink.co.uk>
  L:    linux-fbdev@vger.kernel.org
  S:    Maintained
  F:    drivers/staging/sm750fb/
  
 -STAGING - SLICOSS
 -M:    Lior Dotan <liodot@gmail.com>
 -M:    Christopher Harrer <charrer@alacritech.com>
 -S:    Odd Fixes
 -F:    drivers/staging/slicoss/
 -
  STAGING - SPEAKUP CONSOLE SPEECH DRIVER
  M:    William Hubbs <w.d.hubbs@gmail.com>
  M:    Chris Brannon <chris@the-brannons.com>
@@@ -11760,7 -11612,6 +11761,7 @@@ M:   "Rafael J. Wysocki" <rjw@rjwysocki.n
  M:    Len Brown <len.brown@intel.com>
  M:    Pavel Machek <pavel@ucw.cz>
  L:    linux-pm@vger.kernel.org
 +B:    https://bugzilla.kernel.org
  S:    Supported
  F:    Documentation/power/
  F:    arch/x86/kernel/acpi/
@@@ -12904,15 -12755,6 +12905,15 @@@ F: drivers/vfio
  F:    include/linux/vfio.h
  F:    include/uapi/linux/vfio.h
  
 +VFIO MEDIATED DEVICE DRIVERS
 +M:    Kirti Wankhede <kwankhede@nvidia.com>
 +L:    kvm@vger.kernel.org
 +S:    Maintained
 +F:    Documentation/vfio-mediated-device.txt
 +F:    drivers/vfio/mdev/
 +F:    include/linux/mdev.h
 +F:    samples/vfio-mdev/
 +
  VFIO PLATFORM DRIVER
  M:    Baptiste Reynal <b.reynal@virtualopensystems.com>
  L:    kvm@vger.kernel.org
@@@ -13065,7 -12907,7 +13066,7 @@@ M:   Greg Kroah-Hartman <gregkh@linuxfoun
  L:    devel@driverdev.osuosl.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git
 -F:    Documentation/vme_api.txt
 +F:    Documentation/driver-api/vme.rst
  F:    drivers/staging/vme/
  F:    drivers/vme/
  F:    include/linux/vme*
@@@ -13289,7 -13131,7 +13290,7 @@@ T:   git git://git.kernel.org/pub/scm/lin
  S:    Maintained
  F:    include/linux/workqueue.h
  F:    kernel/workqueue.c
 -F:    Documentation/workqueue.txt
 +F:    Documentation/core-api/workqueue.rst
  
  X-POWERS MULTIFUNCTION PMIC DEVICE DRIVERS
  M:    Chen-Yu Tsai <wens@csie.org>
@@@ -13354,6 -13196,7 +13355,6 @@@ F:   drivers/media/tuners/tuner-xc2028.
  
  XEN HYPERVISOR INTERFACE
  M:    Boris Ostrovsky <boris.ostrovsky@oracle.com>
 -M:    David Vrabel <david.vrabel@citrix.com>
  M:    Juergen Gross <jgross@suse.com>
  L:    xen-devel@lists.xenproject.org (moderated for non-subscribers)
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git
diff --combined fs/dax.c
index 6916ed37d4631846a7478cfa3e3dd14c8808d373,5bfd27b4a69c684346a7cd9d755178b7cbdf6228..5ae8e11ad78677ef3103fc569d0959742a380370
+++ b/fs/dax.c
  #include <linux/iomap.h>
  #include "internal.h"
  
- /*
-  * We use lowest available bit in exceptional entry for locking, other two
-  * bits to determine entry type. In total 3 special bits.
-  */
- #define RADIX_DAX_SHIFT       (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
- #define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
- #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
- #define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
- #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
- #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
- #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
-               RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
-               RADIX_TREE_EXCEPTIONAL_ENTRY))
  /* We choose 4096 entries - same as per-zone page wait tables */
  #define DAX_WAIT_TABLE_BITS 12
  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  
- wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+ static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  
  static int __init init_dax_wait_table(void)
  {
  }
  fs_initcall(init_dax_wait_table);
  
- static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
-                                             pgoff_t index)
- {
-       unsigned long hash = hash_long((unsigned long)mapping ^ index,
-                                      DAX_WAIT_TABLE_BITS);
-       return wait_table + hash;
- }
  static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
  {
        struct request_queue *q = bdev->bd_queue;
@@@ -98,209 -76,52 +76,52 @@@ static void dax_unmap_atomic(struct blo
        blk_queue_exit(bdev->bd_queue);
  }
  
- struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+ static int dax_is_pmd_entry(void *entry)
  {
-       struct page *page = alloc_pages(GFP_KERNEL, 0);
-       struct blk_dax_ctl dax = {
-               .size = PAGE_SIZE,
-               .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
-       };
-       long rc;
-       if (!page)
-               return ERR_PTR(-ENOMEM);
-       rc = dax_map_atomic(bdev, &dax);
-       if (rc < 0)
-               return ERR_PTR(rc);
-       memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
-       dax_unmap_atomic(bdev, &dax);
-       return page;
+       return (unsigned long)entry & RADIX_DAX_PMD;
  }
  
- static bool buffer_written(struct buffer_head *bh)
+ static int dax_is_pte_entry(void *entry)
  {
-       return buffer_mapped(bh) && !buffer_unwritten(bh);
+       return !((unsigned long)entry & RADIX_DAX_PMD);
  }
  
- /*
-  * When ext4 encounters a hole, it returns without modifying the buffer_head
-  * which means that we can't trust b_size.  To cope with this, we set b_state
-  * to 0 before calling get_block and, if any bit is set, we know we can trust
-  * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
-  * and would save us time calling get_block repeatedly.
-  */
- static bool buffer_size_valid(struct buffer_head *bh)
+ static int dax_is_zero_entry(void *entry)
  {
-       return bh->b_state != 0;
+       return (unsigned long)entry & RADIX_DAX_HZP;
  }
  
- static sector_t to_sector(const struct buffer_head *bh,
-               const struct inode *inode)
+ static int dax_is_empty_entry(void *entry)
  {
-       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-       return sector;
+       return (unsigned long)entry & RADIX_DAX_EMPTY;
  }
  
- static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-                     loff_t start, loff_t end, get_block_t get_block,
-                     struct buffer_head *bh)
+ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
  {
-       loff_t pos = start, max = start, bh_max = start;
-       bool hole = false;
-       struct block_device *bdev = NULL;
-       int rw = iov_iter_rw(iter), rc;
-       long map_len = 0;
+       struct page *page = alloc_pages(GFP_KERNEL, 0);
        struct blk_dax_ctl dax = {
-               .addr = ERR_PTR(-EIO),
+               .size = PAGE_SIZE,
+               .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
        };
-       unsigned blkbits = inode->i_blkbits;
-       sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
-                                                               >> blkbits;
-       if (rw == READ)
-               end = min(end, i_size_read(inode));
-       while (pos < end) {
-               size_t len;
-               if (pos == max) {
-                       long page = pos >> PAGE_SHIFT;
-                       sector_t block = page << (PAGE_SHIFT - blkbits);
-                       unsigned first = pos - (block << blkbits);
-                       long size;
-                       if (pos == bh_max) {
-                               bh->b_size = PAGE_ALIGN(end - pos);
-                               bh->b_state = 0;
-                               rc = get_block(inode, block, bh, rw == WRITE);
-                               if (rc)
-                                       break;
-                               if (!buffer_size_valid(bh))
-                                       bh->b_size = 1 << blkbits;
-                               bh_max = pos - first + bh->b_size;
-                               bdev = bh->b_bdev;
-                               /*
-                                * We allow uninitialized buffers for writes
-                                * beyond EOF as those cannot race with faults
-                                */
-                               WARN_ON_ONCE(
-                                       (buffer_new(bh) && block < file_blks) ||
-                                       (rw == WRITE && buffer_unwritten(bh)));
-                       } else {
-                               unsigned done = bh->b_size -
-                                               (bh_max - (pos - first));
-                               bh->b_blocknr += done >> blkbits;
-                               bh->b_size -= done;
-                       }
-                       hole = rw == READ && !buffer_written(bh);
-                       if (hole) {
-                               size = bh->b_size - first;
-                       } else {
-                               dax_unmap_atomic(bdev, &dax);
-                               dax.sector = to_sector(bh, inode);
-                               dax.size = bh->b_size;
-                               map_len = dax_map_atomic(bdev, &dax);
-                               if (map_len < 0) {
-                                       rc = map_len;
-                                       break;
-                               }
-                               dax.addr += first;
-                               size = map_len - first;
-                       }
-                       /*
-                        * pos + size is one past the last offset for IO,
-                        * so pos + size can overflow loff_t at extreme offsets.
-                        * Cast to u64 to catch this and get the true minimum.
-                        */
-                       max = min_t(u64, pos + size, end);
-               }
-               if (iov_iter_rw(iter) == WRITE) {
-                       len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-               } else if (!hole)
-                       len = copy_to_iter((void __force *) dax.addr, max - pos,
-                                       iter);
-               else
-                       len = iov_iter_zero(max - pos, iter);
-               if (!len) {
-                       rc = -EFAULT;
-                       break;
-               }
+       long rc;
  
-               pos += len;
-               if (!IS_ERR(dax.addr))
-                       dax.addr += len;
-       }
+       if (!page)
+               return ERR_PTR(-ENOMEM);
  
+       rc = dax_map_atomic(bdev, &dax);
+       if (rc < 0)
+               return ERR_PTR(rc);
+       memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
        dax_unmap_atomic(bdev, &dax);
-       return (pos == start) ? rc : pos - start;
- }
- /**
-  * dax_do_io - Perform I/O to a DAX file
-  * @iocb: The control block for this I/O
-  * @inode: The file which the I/O is directed at
-  * @iter: The addresses to do I/O from or to
-  * @get_block: The filesystem method used to translate file offsets to blocks
-  * @end_io: A filesystem callback for I/O completion
-  * @flags: See below
-  *
-  * This function uses the same locking scheme as do_blockdev_direct_IO:
-  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
-  * caller for writes.  For reads, we take and release the i_mutex ourselves.
-  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
-  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
-  * is in progress.
-  */
- ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-                 struct iov_iter *iter, get_block_t get_block,
-                 dio_iodone_t end_io, int flags)
- {
-       struct buffer_head bh;
-       ssize_t retval = -EINVAL;
-       loff_t pos = iocb->ki_pos;
-       loff_t end = pos + iov_iter_count(iter);
-       memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
-       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-               inode_lock(inode);
-       /* Protects against truncate */
-       if (!(flags & DIO_SKIP_DIO_COUNT))
-               inode_dio_begin(inode);
-       retval = dax_io(inode, iter, pos, end, get_block, &bh);
-       if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-               inode_unlock(inode);
-       if (end_io) {
-               int err;
-               err = end_io(iocb, pos, retval, bh.b_private);
-               if (err)
-                       retval = err;
-       }
-       if (!(flags & DIO_SKIP_DIO_COUNT))
-               inode_dio_end(inode);
-       return retval;
+       return page;
  }
- EXPORT_SYMBOL_GPL(dax_do_io);
  
  /*
   * DAX radix tree locking
   */
  struct exceptional_entry_key {
        struct address_space *mapping;
-       unsigned long index;
+       pgoff_t entry_start;
  };
  
  struct wait_exceptional_entry_queue {
        struct exceptional_entry_key key;
  };
  
+ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+               pgoff_t index, void *entry, struct exceptional_entry_key *key)
+ {
+       unsigned long hash;
+       /*
+        * If 'entry' is a PMD, align the 'index' that we use for the wait
+        * queue to the start of that PMD.  This ensures that all offsets in
+        * the range covered by the PMD map to the same bit lock.
+        */
+       if (dax_is_pmd_entry(entry))
+               index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+       key->mapping = mapping;
+       key->entry_start = index;
+       hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+       return wait_table + hash;
+ }
  static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
                                       int sync, void *keyp)
  {
                container_of(wait, struct wait_exceptional_entry_queue, wait);
  
        if (key->mapping != ewait->key.mapping ||
-           key->index != ewait->key.index)
+           key->entry_start != ewait->key.entry_start)
                return 0;
        return autoremove_wake_function(wait, mode, sync, NULL);
  }
@@@ -342,7 -183,7 +183,7 @@@ static inline void *lock_slot(struct ad
                radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  
        entry |= RADIX_DAX_ENTRY_LOCK;
 -      radix_tree_replace_slot(slot, (void *)entry);
 +      radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
        return (void *)entry;
  }
  
@@@ -356,7 -197,7 +197,7 @@@ static inline void *unlock_slot(struct 
                radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
  
        entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
 -      radix_tree_replace_slot(slot, (void *)entry);
 +      radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
        return (void *)entry;
  }
  
  static void *get_unlocked_mapping_entry(struct address_space *mapping,
                                        pgoff_t index, void ***slotp)
  {
-       void *ret, **slot;
+       void *entry, **slot;
        struct wait_exceptional_entry_queue ewait;
-       wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+       wait_queue_head_t *wq;
  
        init_wait(&ewait.wait);
        ewait.wait.func = wake_exceptional_entry_func;
-       ewait.key.mapping = mapping;
-       ewait.key.index = index;
  
        for (;;) {
-               ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+               entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
                                          &slot);
-               if (!ret || !radix_tree_exceptional_entry(ret) ||
+               if (!entry || !radix_tree_exceptional_entry(entry) ||
                    !slot_locked(mapping, slot)) {
                        if (slotp)
                                *slotp = slot;
-                       return ret;
+                       return entry;
                }
+               wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
                prepare_to_wait_exclusive(wq, &ewait.wait,
                                          TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&mapping->tree_lock);
        }
  }
  
+ static void put_locked_mapping_entry(struct address_space *mapping,
+                                    pgoff_t index, void *entry)
+ {
+       if (!radix_tree_exceptional_entry(entry)) {
+               unlock_page(entry);
+               put_page(entry);
+       } else {
+               dax_unlock_mapping_entry(mapping, index);
+       }
+ }
+ /*
+  * Called when we are done with radix tree entry we looked up via
+  * get_unlocked_mapping_entry() and which we didn't lock in the end.
+  */
+ static void put_unlocked_mapping_entry(struct address_space *mapping,
+                                      pgoff_t index, void *entry)
+ {
+       if (!radix_tree_exceptional_entry(entry))
+               return;
+       /* We have to wake up next waiter for the radix tree entry lock */
+       dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+ }
  /*
   * Find radix tree entry at given index. If it points to a page, return with
   * the page locked. If it points to the exceptional entry, return with the
   * radix tree entry locked. If the radix tree doesn't contain given index,
   * create empty exceptional entry for the index and return with it locked.
   *
+  * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+  * either return that locked entry or will return an error.  This error will
+  * happen if there are any 4k entries (either zero pages or DAX entries)
+  * within the 2MiB range that we are requesting.
+  *
+  * We always favor 4k entries over 2MiB entries. There isn't a flow where we
+  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
+  * insertion will fail if it finds any 4k entries already in the tree, and a
+  * 4k insertion will cause an existing 2MiB entry to be unmapped and
+  * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
+  * well as 2MiB empty entries.
+  *
+  * The exception to this downgrade path is for 2MiB DAX PMD entries that have
+  * real storage backing them.  We will leave these real 2MiB DAX entries in
+  * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+  *
   * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
   * persistent memory the benefit is doubtful. We can add that later if we can
   * show it helps.
   */
- static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
+               unsigned long size_flag)
  {
-       void *ret, **slot;
+       bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
+       void *entry, **slot;
  
  restart:
        spin_lock_irq(&mapping->tree_lock);
-       ret = get_unlocked_mapping_entry(mapping, index, &slot);
+       entry = get_unlocked_mapping_entry(mapping, index, &slot);
+       if (entry) {
+               if (size_flag & RADIX_DAX_PMD) {
+                       if (!radix_tree_exceptional_entry(entry) ||
+                           dax_is_pte_entry(entry)) {
+                               put_unlocked_mapping_entry(mapping, index,
+                                               entry);
+                               entry = ERR_PTR(-EEXIST);
+                               goto out_unlock;
+                       }
+               } else { /* trying to grab a PTE entry */
+                       if (radix_tree_exceptional_entry(entry) &&
+                           dax_is_pmd_entry(entry) &&
+                           (dax_is_zero_entry(entry) ||
+                            dax_is_empty_entry(entry))) {
+                               pmd_downgrade = true;
+                       }
+               }
+       }
        /* No entry for given index? Make sure radix tree is big enough. */
-       if (!ret) {
+       if (!entry || pmd_downgrade) {
                int err;
  
+               if (pmd_downgrade) {
+                       /*
+                        * Make sure 'entry' remains valid while we drop
+                        * mapping->tree_lock.
+                        */
+                       entry = lock_slot(mapping, slot);
+               }
                spin_unlock_irq(&mapping->tree_lock);
+               /*
+                * Besides huge zero pages the only other thing that gets
+                * downgraded are empty entries which don't need to be
+                * unmapped.
+                */
+               if (pmd_downgrade && dax_is_zero_entry(entry))
+                       unmap_mapping_range(mapping,
+                               (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
                err = radix_tree_preload(
                                mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-               if (err)
+               if (err) {
+                       if (pmd_downgrade)
+                               put_locked_mapping_entry(mapping, index, entry);
                        return ERR_PTR(err);
-               ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-                              RADIX_DAX_ENTRY_LOCK);
+               }
                spin_lock_irq(&mapping->tree_lock);
-               err = radix_tree_insert(&mapping->page_tree, index, ret);
+               if (pmd_downgrade) {
+                       radix_tree_delete(&mapping->page_tree, index);
+                       mapping->nrexceptional--;
+                       dax_wake_mapping_entry_waiter(mapping, index, entry,
+                                       true);
+               }
+               entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+               err = __radix_tree_insert(&mapping->page_tree, index,
+                               dax_radix_order(entry), entry);
                radix_tree_preload_end();
                if (err) {
                        spin_unlock_irq(&mapping->tree_lock);
-                       /* Someone already created the entry? */
-                       if (err == -EEXIST)
+                       /*
+                        * Someone already created the entry?  This is a
+                        * normal failure when inserting PMDs in a range
+                        * that already contains PTEs.  In that case we want
+                        * to return -EEXIST immediately.
+                        */
+                       if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
                                goto restart;
+                       /*
+                        * Our insertion of a DAX PMD entry failed, most
+                        * likely because it collided with a PTE sized entry
+                        * at a different index in the PMD range.  We haven't
+                        * inserted anything into the radix tree and have no
+                        * waiters to wake.
+                        */
                        return ERR_PTR(err);
                }
                /* Good, we have inserted empty locked entry into the tree. */
                mapping->nrexceptional++;
                spin_unlock_irq(&mapping->tree_lock);
-               return ret;
+               return entry;
        }
        /* Normal page in radix tree? */
-       if (!radix_tree_exceptional_entry(ret)) {
-               struct page *page = ret;
+       if (!radix_tree_exceptional_entry(entry)) {
+               struct page *page = entry;
  
                get_page(page);
                spin_unlock_irq(&mapping->tree_lock);
                }
                return page;
        }
-       ret = lock_slot(mapping, slot);
+       entry = lock_slot(mapping, slot);
+  out_unlock:
        spin_unlock_irq(&mapping->tree_lock);
-       return ret;
+       return entry;
  }
  
+ /*
+  * We do not necessarily hold the mapping->tree_lock when we call this
+  * function so it is possible that 'entry' is no longer a valid item in the
+  * radix tree.  This is okay because all we really need to do is to find the
+  * correct waitqueue where tasks might be waiting for that old 'entry' and
+  * wake them.
+  */
  void dax_wake_mapping_entry_waiter(struct address_space *mapping,
-                                  pgoff_t index, bool wake_all)
+               pgoff_t index, void *entry, bool wake_all)
  {
-       wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+       struct exceptional_entry_key key;
+       wait_queue_head_t *wq;
+       wq = dax_entry_waitqueue(mapping, index, entry, &key);
  
        /*
         * Checking for locked entry and prepare_to_wait_exclusive() happens
         * So at this point all tasks that could have seen our entry locked
         * must be in the waitqueue and the following check will see them.
         */
-       if (waitqueue_active(wq)) {
-               struct exceptional_entry_key key;
-               key.mapping = mapping;
-               key.index = index;
+       if (waitqueue_active(wq))
                __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-       }
  }
  
  void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
-       void *ret, **slot;
+       void *entry, **slot;
  
        spin_lock_irq(&mapping->tree_lock);
-       ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
-       if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+       entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+       if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
                         !slot_locked(mapping, slot))) {
                spin_unlock_irq(&mapping->tree_lock);
                return;
        }
        unlock_slot(mapping, slot);
        spin_unlock_irq(&mapping->tree_lock);
-       dax_wake_mapping_entry_waiter(mapping, index, false);
- }
- static void put_locked_mapping_entry(struct address_space *mapping,
-                                    pgoff_t index, void *entry)
- {
-       if (!radix_tree_exceptional_entry(entry)) {
-               unlock_page(entry);
-               put_page(entry);
-       } else {
-               dax_unlock_mapping_entry(mapping, index);
-       }
- }
- /*
-  * Called when we are done with radix tree entry we looked up via
-  * get_unlocked_mapping_entry() and which we didn't lock in the end.
-  */
- static void put_unlocked_mapping_entry(struct address_space *mapping,
-                                      pgoff_t index, void *entry)
- {
-       if (!radix_tree_exceptional_entry(entry))
-               return;
-       /* We have to wake up next waiter for the radix tree entry lock */
-       dax_wake_mapping_entry_waiter(mapping, index, false);
+       dax_wake_mapping_entry_waiter(mapping, index, entry, false);
  }
  
  /*
@@@ -547,7 -473,7 +473,7 @@@ int dax_delete_mapping_entry(struct add
        radix_tree_delete(&mapping->page_tree, index);
        mapping->nrexceptional--;
        spin_unlock_irq(&mapping->tree_lock);
-       dax_wake_mapping_entry_waiter(mapping, index, true);
+       dax_wake_mapping_entry_waiter(mapping, index, entry, true);
  
        return 1;
  }
@@@ -600,11 -526,17 +526,17 @@@ static int copy_user_dax(struct block_d
        return 0;
  }
  
- #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
+ /*
+  * By this point grab_mapping_entry() has ensured that we have a locked entry
+  * of the appropriate size so we don't have to worry about downgrading PMDs to
+  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
+  * already in the tree, we will skip the insertion and just dirty the PMD as
+  * appropriate.
+  */
  static void *dax_insert_mapping_entry(struct address_space *mapping,
                                      struct vm_fault *vmf,
-                                     void *entry, sector_t sector)
+                                     void *entry, sector_t sector,
+                                     unsigned long flags)
  {
        struct radix_tree_root *page_tree = &mapping->page_tree;
        int error = 0;
                error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
                if (error)
                        return ERR_PTR(error);
+       } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
+               /* replacing huge zero page with PMD block mapping */
+               unmap_mapping_range(mapping,
+                       (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
        }
  
        spin_lock_irq(&mapping->tree_lock);
-       new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
-                      RADIX_DAX_ENTRY_LOCK);
+       new_entry = dax_radix_locked_entry(sector, flags);
        if (hole_fill) {
                __delete_from_page_cache(entry, NULL);
                /* Drop pagecache reference */
                put_page(entry);
-               error = radix_tree_insert(page_tree, index, new_entry);
+               error = __radix_tree_insert(page_tree, index,
+                               dax_radix_order(new_entry), new_entry);
                if (error) {
                        new_entry = ERR_PTR(error);
                        goto unlock;
                }
                mapping->nrexceptional++;
-       } else {
+       } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+               /*
+                * Only swap our new entry into the radix tree if the current
+                * entry is a zero page or an empty entry.  If a normal PTE or
+                * PMD entry is already in the tree, we leave it alone.  This
+                * means that if we are trying to insert a PTE and the
+                * existing entry is a PMD, we will just leave the PMD in the
+                * tree and dirty it if necessary.
+                */
 +              struct radix_tree_node *node;
                void **slot;
                void *ret;
  
 -              ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
 +              ret = __radix_tree_lookup(page_tree, index, &node, &slot);
                WARN_ON_ONCE(ret != entry);
 -              radix_tree_replace_slot(slot, new_entry);
 +              __radix_tree_replace(page_tree, node, slot,
 +                                   new_entry, NULL, NULL);
        }
        if (vmf->flags & FAULT_FLAG_WRITE)
                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
@@@ -674,7 -617,6 +619,6 @@@ static int dax_writeback_one(struct blo
                struct address_space *mapping, pgoff_t index, void *entry)
  {
        struct radix_tree_root *page_tree = &mapping->page_tree;
-       int type = RADIX_DAX_TYPE(entry);
        struct radix_tree_node *node;
        struct blk_dax_ctl dax;
        void **slot;
        if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
                goto unlock;
  
-       if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+       if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+                               dax_is_zero_entry(entry))) {
                ret = -EIO;
                goto unlock;
        }
  
-       dax.sector = RADIX_DAX_SECTOR(entry);
-       dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+       /*
+        * Even if dax_writeback_mapping_range() was given a wbc->range_start
+        * in the middle of a PMD, the 'index' we are given will be aligned to
+        * the start index of the PMD, as will the sector we pull from
+        * 'entry'.  This allows us to flush for PMD_SIZE and not have to
+        * worry about partial PMD writebacks.
+        */
+       dax.sector = dax_radix_sector(entry);
+       dax.size = PAGE_SIZE << dax_radix_order(entry);
        spin_unlock_irq(&mapping->tree_lock);
  
        /*
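
Illustration only, not part of the commit: the new dax.size computation scales the flush size with the entry's order, so a PMD entry is flushed as one 2 MiB range. A stand-alone check of the arithmetic, assuming the usual 4 KiB page / 2 MiB PMD geometry (an assumption, not stated in the diff):

/* dax.size = PAGE_SIZE << order, for order 0 (PTE) and order 9 (PMD). */
#include <stdio.h>

#define PAGE_SHIFT	12UL			/* assumed 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SHIFT	21UL			/* assumed 2 MiB PMDs */
#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)

int main(void)
{
	printf("PTE flush size: %lu bytes\n", PAGE_SIZE << 0);		/* 4096 */
	printf("PMD flush size: %lu bytes\n", PAGE_SIZE << PMD_ORDER);	/* 2097152 */
	return 0;
}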
@@@ -740,12 -690,11 +692,11 @@@ int dax_writeback_mapping_range(struct 
                struct block_device *bdev, struct writeback_control *wbc)
  {
        struct inode *inode = mapping->host;
-       pgoff_t start_index, end_index, pmd_index;
+       pgoff_t start_index, end_index;
        pgoff_t indices[PAGEVEC_SIZE];
        struct pagevec pvec;
        bool done = false;
        int i, ret = 0;
-       void *entry;
  
        if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
                return -EIO;
  
        start_index = wbc->range_start >> PAGE_SHIFT;
        end_index = wbc->range_end >> PAGE_SHIFT;
-       pmd_index = DAX_PMD_INDEX(start_index);
-       rcu_read_lock();
-       entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
-       rcu_read_unlock();
-       /* see if the start of our range is covered by a PMD entry */
-       if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
-               start_index = pmd_index;
  
        tag_pages_for_writeback(mapping, start_index, end_index);
  
@@@ -808,7 -748,7 +750,7 @@@ static int dax_insert_mapping(struct ad
                return PTR_ERR(dax.addr);
        dax_unmap_atomic(bdev, &dax);
  
-       ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+       ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
        if (IS_ERR(ret))
                return PTR_ERR(ret);
        *entryp = ret;
        return vm_insert_mixed(vma, vaddr, dax.pfn);
  }
  
- /**
-  * dax_fault - handle a page fault on a DAX file
-  * @vma: The virtual memory area where the fault occurred
-  * @vmf: The description of the fault
-  * @get_block: The filesystem method used to translate file offsets to blocks
-  *
-  * When a page fault occurs, filesystems may call this helper in their
-  * fault handler for DAX files. dax_fault() assumes the caller has done all
-  * the necessary locking for the page fault to proceed successfully.
-  */
- int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-                       get_block_t get_block)
- {
-       struct file *file = vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       void *entry;
-       struct buffer_head bh;
-       unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       unsigned blkbits = inode->i_blkbits;
-       sector_t block;
-       pgoff_t size;
-       int error;
-       int major = 0;
-       /*
-        * Check whether offset isn't beyond end of file now. Caller is supposed
-        * to hold locks serializing us with truncate / punch hole so this is
-        * a reliable test.
-        */
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (vmf->pgoff >= size)
-               return VM_FAULT_SIGBUS;
-       memset(&bh, 0, sizeof(bh));
-       block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-       bh.b_bdev = inode->i_sb->s_bdev;
-       bh.b_size = PAGE_SIZE;
-       entry = grab_mapping_entry(mapping, vmf->pgoff);
-       if (IS_ERR(entry)) {
-               error = PTR_ERR(entry);
-               goto out;
-       }
-       error = get_block(inode, block, &bh, 0);
-       if (!error && (bh.b_size < PAGE_SIZE))
-               error = -EIO;           /* fs corruption? */
-       if (error)
-               goto unlock_entry;
-       if (vmf->cow_page) {
-               struct page *new_page = vmf->cow_page;
-               if (buffer_written(&bh))
-                       error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
-                                       bh.b_size, new_page, vaddr);
-               else
-                       clear_user_highpage(new_page, vaddr);
-               if (error)
-                       goto unlock_entry;
-               if (!radix_tree_exceptional_entry(entry)) {
-                       vmf->page = entry;
-                       return VM_FAULT_LOCKED;
-               }
-               vmf->entry = entry;
-               return VM_FAULT_DAX_LOCKED;
-       }
-       if (!buffer_mapped(&bh)) {
-               if (vmf->flags & FAULT_FLAG_WRITE) {
-                       error = get_block(inode, block, &bh, 1);
-                       count_vm_event(PGMAJFAULT);
-                       mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       major = VM_FAULT_MAJOR;
-                       if (!error && (bh.b_size < PAGE_SIZE))
-                               error = -EIO;
-                       if (error)
-                               goto unlock_entry;
-               } else {
-                       return dax_load_hole(mapping, entry, vmf);
-               }
-       }
-       /* Filesystem should not return unwritten buffers to us! */
-       WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-       error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
-                       bh.b_size, &entry, vma, vmf);
-  unlock_entry:
-       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
-  out:
-       if (error == -ENOMEM)
-               return VM_FAULT_OOM | major;
-       /* -EBUSY is fine, somebody else faulted on the same PTE */
-       if ((error < 0) && (error != -EBUSY))
-               return VM_FAULT_SIGBUS | major;
-       return VM_FAULT_NOPAGE | major;
- }
- EXPORT_SYMBOL_GPL(dax_fault);
- #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
- /*
-  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
-  * more often than one might expect in the below function.
-  */
- #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
- static void __dax_dbg(struct buffer_head *bh, unsigned long address,
-               const char *reason, const char *fn)
- {
-       if (bh) {
-               char bname[BDEVNAME_SIZE];
-               bdevname(bh->b_bdev, bname);
-               pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
-                       "length %zd fallback: %s\n", fn, current->comm,
-                       address, bname, bh->b_state, (u64)bh->b_blocknr,
-                       bh->b_size, reason);
-       } else {
-               pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
-                       current->comm, address, reason);
-       }
- }
- #define dax_pmd_dbg(bh, address, reason)      __dax_dbg(bh, address, reason, "dax_pmd")
- /**
-  * dax_pmd_fault - handle a PMD fault on a DAX file
-  * @vma: The virtual memory area where the fault occurred
-  * @vmf: The description of the fault
-  * @get_block: The filesystem method used to translate file offsets to blocks
-  *
-  * When a page fault occurs, filesystems may call this helper in their
-  * pmd_fault handler for DAX files.
-  */
- int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-               pmd_t *pmd, unsigned int flags, get_block_t get_block)
- {
-       struct file *file = vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct inode *inode = mapping->host;
-       struct buffer_head bh;
-       unsigned blkbits = inode->i_blkbits;
-       unsigned long pmd_addr = address & PMD_MASK;
-       bool write = flags & FAULT_FLAG_WRITE;
-       struct block_device *bdev;
-       pgoff_t size, pgoff;
-       sector_t block;
-       int result = 0;
-       bool alloc = false;
-       /* dax pmd mappings require pfn_t_devmap() */
-       if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
-               return VM_FAULT_FALLBACK;
-       /* Fall back to PTEs if we're going to COW */
-       if (write && !(vma->vm_flags & VM_SHARED)) {
-               split_huge_pmd(vma, pmd, address);
-               dax_pmd_dbg(NULL, address, "cow write");
-               return VM_FAULT_FALLBACK;
-       }
-       /* If the PMD would extend outside the VMA */
-       if (pmd_addr < vma->vm_start) {
-               dax_pmd_dbg(NULL, address, "vma start unaligned");
-               return VM_FAULT_FALLBACK;
-       }
-       if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
-               dax_pmd_dbg(NULL, address, "vma end unaligned");
-               return VM_FAULT_FALLBACK;
-       }
-       pgoff = linear_page_index(vma, pmd_addr);
-       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       if (pgoff >= size)
-               return VM_FAULT_SIGBUS;
-       /* If the PMD would cover blocks out of the file */
-       if ((pgoff | PG_PMD_COLOUR) >= size) {
-               dax_pmd_dbg(NULL, address,
-                               "offset + huge page size > file size");
-               return VM_FAULT_FALLBACK;
-       }
-       memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
-       block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
-       bh.b_size = PMD_SIZE;
-       if (get_block(inode, block, &bh, 0) != 0)
-               return VM_FAULT_SIGBUS;
-       if (!buffer_mapped(&bh) && write) {
-               if (get_block(inode, block, &bh, 1) != 0)
-                       return VM_FAULT_SIGBUS;
-               alloc = true;
-               WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-       }
-       bdev = bh.b_bdev;
-       /*
-        * If the filesystem isn't willing to tell us the length of a hole,
-        * just fall back to PTEs.  Calling get_block 512 times in a loop
-        * would be silly.
-        */
-       if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
-               dax_pmd_dbg(&bh, address, "allocated block too small");
-               return VM_FAULT_FALLBACK;
-       }
-       /*
-        * If we allocated new storage, make sure no process has any
-        * zero pages covering this hole
-        */
-       if (alloc) {
-               loff_t lstart = pgoff << PAGE_SHIFT;
-               loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
-               truncate_pagecache_range(inode, lstart, lend);
-       }
-       if (!write && !buffer_mapped(&bh)) {
-               spinlock_t *ptl;
-               pmd_t entry;
-               struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
-               if (unlikely(!zero_page)) {
-                       dax_pmd_dbg(&bh, address, "no zero page");
-                       goto fallback;
-               }
-               ptl = pmd_lock(vma->vm_mm, pmd);
-               if (!pmd_none(*pmd)) {
-                       spin_unlock(ptl);
-                       dax_pmd_dbg(&bh, address, "pmd already present");
-                       goto fallback;
-               }
-               dev_dbg(part_to_dev(bdev->bd_part),
-                               "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
-                               __func__, current->comm, address,
-                               (unsigned long long) to_sector(&bh, inode));
-               entry = mk_pmd(zero_page, vma->vm_page_prot);
-               entry = pmd_mkhuge(entry);
-               set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
-               result = VM_FAULT_NOPAGE;
-               spin_unlock(ptl);
-       } else {
-               struct blk_dax_ctl dax = {
-                       .sector = to_sector(&bh, inode),
-                       .size = PMD_SIZE,
-               };
-               long length = dax_map_atomic(bdev, &dax);
-               if (length < 0) {
-                       dax_pmd_dbg(&bh, address, "dax-error fallback");
-                       goto fallback;
-               }
-               if (length < PMD_SIZE) {
-                       dax_pmd_dbg(&bh, address, "dax-length too small");
-                       dax_unmap_atomic(bdev, &dax);
-                       goto fallback;
-               }
-               if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
-                       dax_pmd_dbg(&bh, address, "pfn unaligned");
-                       dax_unmap_atomic(bdev, &dax);
-                       goto fallback;
-               }
-               if (!pfn_t_devmap(dax.pfn)) {
-                       dax_unmap_atomic(bdev, &dax);
-                       dax_pmd_dbg(&bh, address, "pfn not in memmap");
-                       goto fallback;
-               }
-               dax_unmap_atomic(bdev, &dax);
-               /*
-                * For PTE faults we insert a radix tree entry for reads, and
-                * leave it clean.  Then on the first write we dirty the radix
-                * tree entry via the dax_pfn_mkwrite() path.  This sequence
-                * allows the dax_pfn_mkwrite() call to be simpler and avoid a
-                * call into get_block() to translate the pgoff to a sector in
-                * order to be able to create a new radix tree entry.
-                *
-                * The PMD path doesn't have an equivalent to
-                * dax_pfn_mkwrite(), though, so for a read followed by a
-                * write we traverse all the way through dax_pmd_fault()
-                * twice.  This means we can just skip inserting a radix tree
-                * entry completely on the initial read and just wait until
-                * the write to insert a dirty entry.
-                */
-               if (write) {
-                       /*
-                        * We should insert radix-tree entry and dirty it here.
-                        * For now this is broken...
-                        */
-               }
-               dev_dbg(part_to_dev(bdev->bd_part),
-                               "%s: %s addr: %lx pfn: %lx sect: %llx\n",
-                               __func__, current->comm, address,
-                               pfn_t_to_pfn(dax.pfn),
-                               (unsigned long long) dax.sector);
-               result |= vmf_insert_pfn_pmd(vma, address, pmd,
-                               dax.pfn, write);
-       }
-  out:
-       return result;
-  fallback:
-       count_vm_event(THP_FAULT_FALLBACK);
-       result = VM_FAULT_FALLBACK;
-       goto out;
- }
- EXPORT_SYMBOL_GPL(dax_pmd_fault);
- #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  /**
   * dax_pfn_mkwrite - handle first write to DAX page
   * @vma: The virtual memory area where the fault occurred
@@@ -1193,62 -816,14 +818,14 @@@ int __dax_zero_page_range(struct block_
  }
  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
  
- /**
-  * dax_zero_page_range - zero a range within a page of a DAX file
-  * @inode: The file being truncated
-  * @from: The file offset that is being truncated to
-  * @length: The number of bytes to zero
-  * @get_block: The filesystem method used to translate file offsets to blocks
-  *
-  * This function can be called by a filesystem when it is zeroing part of a
-  * page in a DAX file.  This is intended for hole-punch operations.  If
-  * you are truncating a file, the helper function dax_truncate_page() may be
-  * more convenient.
-  */
- int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-                                                       get_block_t get_block)
- {
-       struct buffer_head bh;
-       pgoff_t index = from >> PAGE_SHIFT;
-       unsigned offset = from & (PAGE_SIZE-1);
-       int err;
-       /* Block boundary? Nothing to do */
-       if (!length)
-               return 0;
-       BUG_ON((offset + length) > PAGE_SIZE);
-       memset(&bh, 0, sizeof(bh));
-       bh.b_bdev = inode->i_sb->s_bdev;
-       bh.b_size = PAGE_SIZE;
-       err = get_block(inode, index, &bh, 0);
-       if (err < 0 || !buffer_written(&bh))
-               return err;
-       return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
-                       offset, length);
- }
- EXPORT_SYMBOL_GPL(dax_zero_page_range);
- /**
-  * dax_truncate_page - handle a partial page being truncated in a DAX file
-  * @inode: The file being truncated
-  * @from: The file offset that is being truncated to
-  * @get_block: The filesystem method used to translate file offsets to blocks
-  *
-  * Similar to block_truncate_page(), this function can be called by a
-  * filesystem when it is truncating a DAX file to handle the partial page.
-  */
- int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+ #ifdef CONFIG_FS_IOMAP
+ static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
  {
-       unsigned length = PAGE_ALIGN(from) - from;
-       return dax_zero_page_range(inode, from, length, get_block);
+       return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
  }
- EXPORT_SYMBOL_GPL(dax_truncate_page);
  
- #ifdef CONFIG_FS_IOMAP
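
Illustration only, not part of the commit: dax_iomap_sector() turns a file position into a 512-byte sector using the extent the filesystem returned. A stand-alone sketch of that arithmetic with made-up numbers:

/* sector = blkno + (page-aligned offset into the extent, in 512-byte units) */
#include <stdio.h>

#define PAGE_MASK	(~((1ULL << 12) - 1))	/* assuming 4 KiB pages */

struct extent {
	unsigned long long blkno;	/* first 512-byte sector of the extent */
	unsigned long long offset;	/* file offset where the extent starts */
};

static unsigned long long pos_to_sector(const struct extent *ex,
					unsigned long long pos)
{
	return ex->blkno + (((pos & PAGE_MASK) - ex->offset) >> 9);
}

int main(void)
{
	/* Hypothetical extent: file offset 1 MiB starts at sector 20480. */
	struct extent ex = { .blkno = 20480, .offset = 1ULL << 20 };
	unsigned long long pos = ex.offset + 3 * 4096 + 123;

	/* 3 pages past the extent start = 24 sectors -> 20504 */
	printf("sector = %llu\n", pos_to_sector(&ex, pos));
	return 0;
}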
  static loff_t
- iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                struct iomap *iomap)
  {
        struct iov_iter *iter = data;
                struct blk_dax_ctl dax = { 0 };
                ssize_t map_len;
  
-               dax.sector = iomap->blkno +
-                       (((pos & PAGE_MASK) - iomap->offset) >> 9);
+               dax.sector = dax_iomap_sector(iomap, pos);
                dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
                map_len = dax_map_atomic(iomap->bdev, &dax);
                if (map_len < 0) {
  }
  
  /**
-  * iomap_dax_rw - Perform I/O to a DAX file
+  * dax_iomap_rw - Perform I/O to a DAX file
   * @iocb:     The control block for this I/O
   * @iter:     The addresses to do I/O from or to
   * @ops:      iomap ops passed from the file system
   * and evicting any page cache pages in the region under I/O.
   */
  ssize_t
- iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                struct iomap_ops *ops)
  {
        struct address_space *mapping = iocb->ki_filp->f_mapping;
  
        while (iov_iter_count(iter)) {
                ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
-                               iter, iomap_dax_actor);
+                               iter, dax_iomap_actor);
                if (ret <= 0)
                        break;
                pos += ret;
        iocb->ki_pos += done;
        return done ? done : ret;
  }
- EXPORT_SYMBOL_GPL(iomap_dax_rw);
+ EXPORT_SYMBOL_GPL(dax_iomap_rw);
  
  /**
-  * iomap_dax_fault - handle a page fault on a DAX file
+  * dax_iomap_fault - handle a page fault on a DAX file
   * @vma: The virtual memory area where the fault occurred
   * @vmf: The description of the fault
   * @ops: iomap ops passed from the file system
   * or mkwrite handler for DAX files. Assumes the caller has done all the
   * necessary locking for the page fault to proceed successfully.
   */
- int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        struct iomap_ops *ops)
  {
        struct address_space *mapping = vma->vm_file->f_mapping;
        loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
        sector_t sector;
        struct iomap iomap = { 0 };
-       unsigned flags = 0;
+       unsigned flags = IOMAP_FAULT;
        int error, major = 0;
+       int locked_status = 0;
        void *entry;
  
        /*
        if (pos >= i_size_read(inode))
                return VM_FAULT_SIGBUS;
  
-       entry = grab_mapping_entry(mapping, vmf->pgoff);
+       entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
        if (IS_ERR(entry)) {
                error = PTR_ERR(entry);
                goto out;
                goto unlock_entry;
        if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
                error = -EIO;           /* fs corruption? */
-               goto unlock_entry;
+               goto finish_iomap;
        }
  
-       sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+       sector = dax_iomap_sector(&iomap, pos);
  
        if (vmf->cow_page) {
                switch (iomap.type) {
                }
  
                if (error)
-                       goto unlock_entry;
+                       goto finish_iomap;
                if (!radix_tree_exceptional_entry(entry)) {
                        vmf->page = entry;
-                       return VM_FAULT_LOCKED;
+                       locked_status = VM_FAULT_LOCKED;
+               } else {
+                       vmf->entry = entry;
+                       locked_status = VM_FAULT_DAX_LOCKED;
                }
-               vmf->entry = entry;
-               return VM_FAULT_DAX_LOCKED;
+               goto finish_iomap;
        }
  
        switch (iomap.type) {
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
-               if (!(vmf->flags & FAULT_FLAG_WRITE))
-                       return dax_load_hole(mapping, entry, vmf);
+               if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+                       locked_status = dax_load_hole(mapping, entry, vmf);
+                       break;
+               }
                /*FALLTHRU*/
        default:
                WARN_ON_ONCE(1);
                break;
        }
  
+  finish_iomap:
+       if (ops->iomap_end) {
+               if (error) {
+                       /* keep previous error */
+                       ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
+                                       &iomap);
+               } else {
+                       error = ops->iomap_end(inode, pos, PAGE_SIZE,
+                                       PAGE_SIZE, flags, &iomap);
+               }
+       }
   unlock_entry:
-       put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+       if (!locked_status || error)
+               put_locked_mapping_entry(mapping, vmf->pgoff, entry);
   out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
        /* -EBUSY is fine, somebody else faulted on the same PTE */
        if (error < 0 && error != -EBUSY)
                return VM_FAULT_SIGBUS | major;
+       if (locked_status) {
+               WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
+               return locked_status;
+       }
        return VM_FAULT_NOPAGE | major;
  }
- EXPORT_SYMBOL_GPL(iomap_dax_fault);
+ EXPORT_SYMBOL_GPL(dax_iomap_fault);
+ #ifdef CONFIG_FS_DAX_PMD
+ /*
+  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+  * more often than one might expect in the below functions.
+  */
+ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
+               struct vm_fault *vmf, unsigned long address,
+               struct iomap *iomap, loff_t pos, bool write, void **entryp)
+ {
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct block_device *bdev = iomap->bdev;
+       struct blk_dax_ctl dax = {
+               .sector = dax_iomap_sector(iomap, pos),
+               .size = PMD_SIZE,
+       };
+       long length = dax_map_atomic(bdev, &dax);
+       void *ret;
+       if (length < 0) /* dax_map_atomic() failed */
+               return VM_FAULT_FALLBACK;
+       if (length < PMD_SIZE)
+               goto unmap_fallback;
+       if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
+               goto unmap_fallback;
+       if (!pfn_t_devmap(dax.pfn))
+               goto unmap_fallback;
+       dax_unmap_atomic(bdev, &dax);
+       ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+                       RADIX_DAX_PMD);
+       if (IS_ERR(ret))
+               return VM_FAULT_FALLBACK;
+       *entryp = ret;
+       return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+  unmap_fallback:
+       dax_unmap_atomic(bdev, &dax);
+       return VM_FAULT_FALLBACK;
+ }
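
Illustration only, not part of the commit: PG_PMD_COLOUR is the low-bit mask of a page offset within a PMD (511 with 4 KiB pages and 2 MiB PMDs), and the fallback checks above use it to reject misaligned pfns and PMDs that would run past EOF. A stand-alone sketch with invented numbers:

/* The two PG_PMD_COLOUR checks, assuming 4 KiB pages and 2 MiB PMDs. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12UL
#define PMD_SIZE	(1UL << 21)
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)	/* 511 */

/* A PMD mapping needs a pfn that starts on a 512-page boundary... */
static bool pfn_pmd_aligned(unsigned long pfn)
{
	return (pfn & PG_PMD_COLOUR) == 0;
}

/* ...and the last page the PMD covers must still be inside the file. */
static bool pmd_fits_in_file(unsigned long pgoff, unsigned long max_pgoff)
{
	return (pgoff | PG_PMD_COLOUR) <= max_pgoff;
}

int main(void)
{
	printf("colour mask: %lu\n", PG_PMD_COLOUR);
	printf("pfn 0x200000: %d, pfn 0x200001: %d\n",
	       pfn_pmd_aligned(0x200000), pfn_pmd_aligned(0x200001));
	/* pgoff 512 covers pages 512..1023, too big for a 1000-page file. */
	printf("fits in 1000 pages: %d, in 1024 pages: %d\n",
	       pmd_fits_in_file(512, 999), pmd_fits_in_file(512, 1023));
	return 0;
}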
+ static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
+               struct vm_fault *vmf, unsigned long address,
+               struct iomap *iomap, void **entryp)
+ {
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       unsigned long pmd_addr = address & PMD_MASK;
+       struct page *zero_page;
+       spinlock_t *ptl;
+       pmd_t pmd_entry;
+       void *ret;
+       zero_page = mm_get_huge_zero_page(vma->vm_mm);
+       if (unlikely(!zero_page))
+               return VM_FAULT_FALLBACK;
+       ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
+                       RADIX_DAX_PMD | RADIX_DAX_HZP);
+       if (IS_ERR(ret))
+               return VM_FAULT_FALLBACK;
+       *entryp = ret;
+       ptl = pmd_lock(vma->vm_mm, pmd);
+       if (!pmd_none(*pmd)) {
+               spin_unlock(ptl);
+               return VM_FAULT_FALLBACK;
+       }
+       pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+       pmd_entry = pmd_mkhuge(pmd_entry);
+       set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+       spin_unlock(ptl);
+       return VM_FAULT_NOPAGE;
+ }
+ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+               pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+ {
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       unsigned long pmd_addr = address & PMD_MASK;
+       bool write = flags & FAULT_FLAG_WRITE;
+       unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
+       struct inode *inode = mapping->host;
+       int result = VM_FAULT_FALLBACK;
+       struct iomap iomap = { 0 };
+       pgoff_t max_pgoff, pgoff;
+       struct vm_fault vmf;
+       void *entry;
+       loff_t pos;
+       int error;
+       /* Fall back to PTEs if we're going to COW */
+       if (write && !(vma->vm_flags & VM_SHARED))
+               goto fallback;
+       /* If the PMD would extend outside the VMA */
+       if (pmd_addr < vma->vm_start)
+               goto fallback;
+       if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+               goto fallback;
+       /*
+        * Check whether offset isn't beyond end of file now. Caller is
+        * supposed to hold locks serializing us with truncate / punch hole so
+        * this is a reliable test.
+        */
+       pgoff = linear_page_index(vma, pmd_addr);
+       max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+       if (pgoff > max_pgoff)
+               return VM_FAULT_SIGBUS;
+       /* If the PMD would extend beyond the file size */
+       if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+               goto fallback;
+       /*
+        * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+        * PMD or a HZP entry.  If it can't (because a 4k page is already in
+        * the tree, for instance), it will return -EEXIST and we just fall
+        * back to 4k entries.
+        */
+       entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+       if (IS_ERR(entry))
+               goto fallback;
+       /*
+        * Note that we don't use iomap_apply here.  We aren't doing I/O, only
+        * setting up a mapping, so really we're using iomap_begin() as a way
+        * to look up our filesystem block.
+        */
+       pos = (loff_t)pgoff << PAGE_SHIFT;
+       error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+       if (error)
+               goto unlock_entry;
+       if (iomap.offset + iomap.length < pos + PMD_SIZE)
+               goto finish_iomap;
+       vmf.pgoff = pgoff;
+       vmf.flags = flags;
+       vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+       switch (iomap.type) {
+       case IOMAP_MAPPED:
+               result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
+                               &iomap, pos, write, &entry);
+               break;
+       case IOMAP_UNWRITTEN:
+       case IOMAP_HOLE:
+               if (WARN_ON_ONCE(write))
+                       goto finish_iomap;
+               result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
+                               &entry);
+               break;
+       default:
+               WARN_ON_ONCE(1);
+               break;
+       }
+  finish_iomap:
+       if (ops->iomap_end) {
+               if (result == VM_FAULT_FALLBACK) {
+                       ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
+                                       &iomap);
+               } else {
+                       error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
+                                       iomap_flags, &iomap);
+                       if (error)
+                               result = VM_FAULT_FALLBACK;
+               }
+       }
+  unlock_entry:
+       put_locked_mapping_entry(mapping, pgoff, entry);
+  fallback:
+       if (result == VM_FAULT_FALLBACK) {
+               split_huge_pmd(vma, pmd, address);
+               count_vm_event(THP_FAULT_FALLBACK);
+       }
+       return result;
+ }
+ EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+ #endif /* CONFIG_FS_DAX_PMD */
  #endif /* CONFIG_FS_IOMAP */
diff --combined fs/ext4/page-io.c
index e0b3b54cdef32651d32685bc6cfe56ae23602dcf,902a3e3059b3442b1d47ffb939f68ab5df8950f8..e2332a65e8fbb0d12ef754f8f70c5ee453013525
@@@ -340,7 -340,7 +340,7 @@@ void ext4_io_submit(struct ext4_io_subm
  
        if (bio) {
                int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
 -                                WRITE_SYNC : 0;
 +                                REQ_SYNC : 0;
                bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
                submit_bio(io->io_bio);
        }
@@@ -470,7 -470,8 +470,8 @@@ int ext4_bio_write_page(struct ext4_io_
                gfp_t gfp_flags = GFP_NOFS;
  
        retry_encrypt:
-               data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
+               data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
+                                               page->index, gfp_flags);
                if (IS_ERR(data_page)) {
                        ret = PTR_ERR(data_page);
                        if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --combined fs/ext4/super.c
index caa4147cda47b599e84a301f0acc13aa006f519c,79af71d4fccd8870f866706777d4d66034df1795..dfc8309d7755d55a6e7e73814ede13f204d60cae
@@@ -863,7 -863,6 +863,6 @@@ static void ext4_put_super(struct super
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
        percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
-       brelse(sbi->s_sbh);
  #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(sbi->s_qf_names[i]);
        }
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
+       brelse(sbi->s_sbh);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
@@@ -1114,37 -1114,55 +1114,55 @@@ static int ext4_prepare_context(struct 
  static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
                                                        void *fs_data)
  {
-       handle_t *handle;
-       int res, res2;
+       handle_t *handle = fs_data;
+       int res, res2, retries = 0;
+       /*
+        * If a journal handle was specified, then the encryption context is
+        * being set on a new inode via inheritance and is part of a larger
+        * transaction to create the inode.  Otherwise the encryption context is
+        * being set on an existing inode in its own transaction.  Only in the
+        * latter case should the "retry on ENOSPC" logic be used.
+        */
  
-       /* fs_data is null when internally used. */
-       if (fs_data) {
-               res  = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
-                               EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
-                               len, 0);
+       if (handle) {
+               res = ext4_xattr_set_handle(handle, inode,
+                                           EXT4_XATTR_INDEX_ENCRYPTION,
+                                           EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+                                           ctx, len, 0);
                if (!res) {
                        ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                        ext4_clear_inode_state(inode,
                                        EXT4_STATE_MAY_INLINE_DATA);
+                       /*
+                        * Update inode->i_flags - e.g. S_DAX may get disabled
+                        */
+                       ext4_set_inode_flags(inode);
                }
                return res;
        }
  
+ retry:
        handle = ext4_journal_start(inode, EXT4_HT_MISC,
                        ext4_jbd2_credits_xattr(inode));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
  
-       res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
-                       EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
-                       len, 0);
+       res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+                                   EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+                                   ctx, len, 0);
        if (!res) {
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+               /* Update inode->i_flags - e.g. S_DAX may get disabled */
+               ext4_set_inode_flags(inode);
                res = ext4_mark_inode_dirty(handle, inode);
                if (res)
                        EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
        }
        res2 = ext4_journal_stop(handle);
+       if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
        if (!res)
                res = res2;
        return res;
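
Illustration only, not part of the commit: the new retry logic follows the usual ext4 pattern of running the xattr update in its own transaction and retrying while ext4_should_retry_alloc() reports that committing the journal may free space. A user-space caricature of that loop (everything here is simulated):

/* Caricature of "retry on ENOSPC after a journal commit frees space". */
#include <errno.h>
#include <stdio.h>

#define MAX_RETRIES	3

static int free_blocks;		/* pretend free-space counter */

/* Pretend transactional operation: needs one free block. */
static int set_xattr_in_transaction(void)
{
	if (free_blocks <= 0)
		return -ENOSPC;
	free_blocks--;
	return 0;
}

/* Stand-in for ext4_should_retry_alloc(): a commit frees space, up to a limit. */
static int should_retry_alloc(int *retries)
{
	if (*retries >= MAX_RETRIES)
		return 0;
	(*retries)++;
	free_blocks++;
	return 1;
}

int main(void)
{
	int res, retries = 0;

retry:
	res = set_xattr_in_transaction();
	if (res == -ENOSPC && should_retry_alloc(&retries))
		goto retry;

	printf("result %d after %d retries\n", res, retries);
	return 0;
}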
@@@ -1883,12 -1901,6 +1901,6 @@@ static int parse_options(char *options
                        return 0;
                }
        }
-       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
-           test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-               ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
-                        "in data=ordered mode");
-               return 0;
-       }
        return 1;
  }
  
@@@ -2330,7 -2342,7 +2342,7 @@@ static void ext4_orphan_cleanup(struct 
                                struct ext4_super_block *es)
  {
        unsigned int s_flags = sb->s_flags;
-       int nr_orphans = 0, nr_truncates = 0;
+       int ret, nr_orphans = 0, nr_truncates = 0;
  #ifdef CONFIG_QUOTA
        int i;
  #endif
                                  inode->i_ino, inode->i_size);
                        inode_lock(inode);
                        truncate_inode_pages(inode->i_mapping, inode->i_size);
-                       ext4_truncate(inode);
+                       ret = ext4_truncate(inode);
+                       if (ret)
+                               ext4_std_error(inode->i_sb, ret);
                        inode_unlock(inode);
                        nr_truncates++;
                } else {
@@@ -3193,10 -3207,15 +3207,15 @@@ static int count_overhead(struct super_
                        ext4_set_bit(s++, buf);
                        count++;
                }
-               for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
-                       ext4_set_bit(EXT4_B2C(sbi, s++), buf);
-                       count++;
+               j = ext4_bg_num_gdb(sb, grp);
+               if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
+                       ext4_error(sb, "Invalid number of block group "
+                                  "descriptor blocks: %d", j);
+                       j = EXT4_BLOCKS_PER_GROUP(sb) - s;
                }
+               count += j;
+               for (; j > 0; j--)
+                       ext4_set_bit(EXT4_B2C(sbi, s++), buf);
        }
        if (!count)
                return 0;
@@@ -3301,7 -3320,7 +3320,7 @@@ static int ext4_fill_super(struct super
        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
-       struct ext4_sb_info *sbi;
+       struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        ext4_fsblk_t block;
        ext4_fsblk_t sb_block = get_sb_block(&data);
        ext4_fsblk_t logical_sb_block;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        ext4_group_t first_not_zeroed;
  
-       sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-       if (!sbi)
-               goto out_free_orig;
+       if ((data && !orig_data) || !sbi)
+               goto out_free_base;
  
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
-       if (!sbi->s_blockgroup_lock) {
-               kfree(sbi);
-               goto out_free_orig;
-       }
+       if (!sbi->s_blockgroup_lock)
+               goto out_free_base;
        sb->s_fs_info = sbi;
        sbi->s_sb = sb;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
  
-       if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
-                          &journal_devnum, &journal_ioprio, 0)) {
-               ext4_msg(sb, KERN_WARNING,
-                        "failed to parse options in superblock: %s",
-                        sbi->s_es->s_mount_opts);
+       if (sbi->s_es->s_mount_opts[0]) {
+               char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+                                             sizeof(sbi->s_es->s_mount_opts),
+                                             GFP_KERNEL);
+               if (!s_mount_opts)
+                       goto failed_mount;
+               if (!parse_options(s_mount_opts, sb, &journal_devnum,
+                                  &journal_ioprio, 0)) {
+                       ext4_msg(sb, KERN_WARNING,
+                                "failed to parse options in superblock: %s",
+                                s_mount_opts);
+               }
+               kfree(s_mount_opts);
        }
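
Illustration only, not part of the commit: s_mount_opts is a fixed-size on-disk field, so a corrupted superblock can leave it without a terminating NUL; kstrndup() (and the "%.*s" print further down) bound every access to sizeof(). A user-space sketch of the same defensive copy:

/* Safely copying a fixed-size field that may lack a terminating NUL. */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_super {		/* invented layout, for illustration only */
	char s_mount_opts[8];	/* fixed-size field, possibly unterminated */
};

int main(void)
{
	struct fake_super es;
	char *opts;

	/* Fill the whole field: no NUL anywhere inside it. */
	memset(es.s_mount_opts, 'x', sizeof(es.s_mount_opts));

	/* strndup() stops at sizeof() even without a NUL, like kstrndup() above. */
	opts = strndup(es.s_mount_opts, sizeof(es.s_mount_opts));
	if (!opts)
		return 1;
	printf("copied %zu bytes safely\n", strlen(opts));
	free(opts);
	return 0;
}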
        sbi->s_def_mount_opt = sbi->s_mount_opt;
        if (!parse_options((char *) data, sb, &journal_devnum,
                                 "both data=journal and dax");
                        goto failed_mount;
                }
+               if (ext4_has_feature_encrypt(sb)) {
+                       ext4_msg(sb, KERN_WARNING,
+                                "encrypted files will use data=ordered "
+                                "instead of data journaling mode");
+               }
                if (test_opt(sb, DELALLOC))
                        clear_opt(sb, DELALLOC);
        } else {
  
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
-       if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
-               goto cantfind_ext4;
  
        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
                goto cantfind_ext4;
+       if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+           sbi->s_inodes_per_group > blocksize * 8) {
+               ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
+                        sbi->s_inodes_per_group);
+               goto failed_mount;
+       }
        sbi->s_itb_per_group = sbi->s_inodes_per_group /
                                        sbi->s_inodes_per_block;
        sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
        }
        sbi->s_cluster_ratio = clustersize / blocksize;
  
-       if (sbi->s_inodes_per_group > blocksize * 8) {
-               ext4_msg(sb, KERN_ERR,
-                      "#inodes per group too big: %lu",
-                      sbi->s_inodes_per_group);
-               goto failed_mount;
-       }
        /* Do we have standard group size of clustersize * 8 blocks ? */
        if (sbi->s_blocks_per_group == clustersize << 3)
                set_opt2(sb, STD_GROUP_SIZE);
                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
+       if (ext4_has_feature_meta_bg(sb)) {
+               if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+                       ext4_msg(sb, KERN_WARNING,
+                                "first meta block group too large: %u "
+                                "(group descriptor block count %u)",
+                                le32_to_cpu(es->s_first_meta_bg), db_count);
+                       goto failed_mount;
+               }
+       }
        sbi->s_group_desc = ext4_kvmalloc(db_count *
                                          sizeof(struct buffer_head *),
                                          GFP_KERNEL);
        default:
                break;
        }
+       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+           test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+               ext4_msg(sb, KERN_ERR, "can't mount with "
+                       "journal_async_commit in data=ordered mode");
+               goto failed_mount_wq;
+       }
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
  
        sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
@@@ -4160,7 -4204,9 +4204,9 @@@ no_journal
  
        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
-                        "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+                        "Opts: %.*s%s%s", descr,
+                        (int) sizeof(sbi->s_es->s_mount_opts),
+                        sbi->s_es->s_mount_opts,
                         *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
  
        if (es->s_error_count)
@@@ -4239,8 -4285,8 +4285,8 @@@ failed_mount
  out_fail:
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
+ out_free_base:
        kfree(sbi);
- out_free_orig:
        kfree(orig_data);
        return err ? err : ret;
  }
@@@ -4550,7 -4596,8 +4596,8 @@@ static int ext4_commit_super(struct sup
                                &EXT4_SB(sb)->s_freeinodes_counter));
        BUFFER_TRACE(sbh, "marking dirty");
        ext4_superblock_csum_set(sb);
-       lock_buffer(sbh);
+       if (sync)
+               lock_buffer(sbh);
        if (buffer_write_io_error(sbh)) {
                /*
                 * Oh, dear.  A previous attempt to write the
                set_buffer_uptodate(sbh);
        }
        mark_buffer_dirty(sbh);
-       unlock_buffer(sbh);
        if (sync) {
+               unlock_buffer(sbh);
                error = __sync_dirty_buffer(sbh,
 -                      test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
 +                      test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
                if (error)
                        return error;
  
@@@ -4857,6 -4904,13 +4904,13 @@@ static int ext4_remount(struct super_bl
                        err = -EINVAL;
                        goto restore_opts;
                }
+       } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
+               if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                               "journal_async_commit in data=ordered mode");
+                       err = -EINVAL;
+                       goto restore_opts;
+               }
        }
  
        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
@@@ -5366,7 -5420,7 +5420,7 @@@ static int ext4_quota_off(struct super_
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (IS_ERR(handle))
                goto out;
-       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       inode->i_mtime = inode->i_ctime = current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);
  
diff --combined fs/f2fs/data.c
index 7c344b3ad70faf87b65acdfb5f4c915f96176f03,9f0ba90b92e4aee60622322973ef80d14b24e1e6..9ac262564fa6b5934b8ec4987425ea3205bba1cf
  #include "trace.h"
  #include <trace/events/f2fs.h>
  
 +static bool __is_cp_guaranteed(struct page *page)
 +{
 +      struct address_space *mapping = page->mapping;
 +      struct inode *inode;
 +      struct f2fs_sb_info *sbi;
 +
 +      if (!mapping)
 +              return false;
 +
 +      inode = mapping->host;
 +      sbi = F2FS_I_SB(inode);
 +
 +      if (inode->i_ino == F2FS_META_INO(sbi) ||
 +                      inode->i_ino ==  F2FS_NODE_INO(sbi) ||
 +                      S_ISDIR(inode->i_mode) ||
 +                      is_cold_data(page))
 +              return true;
 +      return false;
 +}
 +
  static void f2fs_read_end_io(struct bio *bio)
  {
        struct bio_vec *bvec;
@@@ -91,7 -71,6 +91,7 @@@ static void f2fs_write_end_io(struct bi
  
        bio_for_each_segment_all(bvec, bio, i) {
                struct page *page = bvec->bv_page;
 +              enum count_type type = WB_DATA_TYPE(page);
  
                fscrypt_pullback_bio_page(&page, true);
  
                        mapping_set_error(page->mapping, -EIO);
                        f2fs_stop_checkpoint(sbi, true);
                }
 +              dec_page_count(sbi, type);
 +              clear_cold_data(page);
                end_page_writeback(page);
        }
 -      if (atomic_dec_and_test(&sbi->nr_wb_bios) &&
 +      if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
                                wq_has_sleeper(&sbi->cp_wait))
                wake_up(&sbi->cp_wait);
  
        bio_put(bio);
  }
  
 +/*
 + * Return the block device backing @blk_addr and, if @bio is given,
 + * point the bio at that device with @blk_addr rebased to it.
 + */
 +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
 +                              block_t blk_addr, struct bio *bio)
 +{
 +      struct block_device *bdev = sbi->sb->s_bdev;
 +      int i;
 +
 +      for (i = 0; i < sbi->s_ndevs; i++) {
 +              if (FDEV(i).start_blk <= blk_addr &&
 +                                      FDEV(i).end_blk >= blk_addr) {
 +                      blk_addr -= FDEV(i).start_blk;
 +                      bdev = FDEV(i).bdev;
 +                      break;
 +              }
 +      }
 +      if (bio) {
 +              bio->bi_bdev = bdev;
 +              bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
 +      }
 +      return bdev;
 +}
 +
 +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
 +{
 +      int i;
 +
 +      for (i = 0; i < sbi->s_ndevs; i++)
 +              if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
 +                      return i;
 +      return 0;
 +}
 +
 +static bool __same_bdev(struct f2fs_sb_info *sbi,
 +                              block_t blk_addr, struct bio *bio)
 +{
 +      return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
 +}
 +
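
Illustration only, not part of the commit: f2fs_target_device() maps a filesystem-global block address onto one of several devices using per-device [start_blk, end_blk] ranges and rebases the address for that device. A stand-alone sketch of the lookup (the device table is invented):

/* Map a global block address to (device, local block) via per-device ranges. */
#include <stdio.h>

struct dev_range {
	const char *name;	/* invented device names */
	unsigned long start_blk;
	unsigned long end_blk;
};

static const struct dev_range devs[] = {
	{ "dev0",     0,  9999 },
	{ "dev1", 10000, 19999 },
	{ "dev2", 20000, 29999 },
};

/* Find the device whose range contains blk and rebase blk to that device. */
static int target_device(unsigned long blk, unsigned long *local_blk)
{
	for (unsigned int i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		if (devs[i].start_blk <= blk && blk <= devs[i].end_blk) {
			*local_blk = blk - devs[i].start_blk;
			return i;
		}
	}
	*local_blk = blk;
	return 0;		/* fall back to the first device */
}

int main(void)
{
	unsigned long local;
	int i = target_device(23456, &local);

	printf("block 23456 -> %s, local block %lu\n", devs[i].name, local);
	return 0;
}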
  /*
   * Low-level block read/write IO operations.
   */
@@@ -160,7 -97,8 +160,7 @@@ static struct bio *__bio_alloc(struct f
  
        bio = f2fs_bio_alloc(npages);
  
 -      bio->bi_bdev = sbi->sb->s_bdev;
 -      bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
 +      f2fs_target_device(sbi, blk_addr, bio);
        bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
        bio->bi_private = is_read ? NULL : sbi;
  
@@@ -171,7 -109,8 +171,7 @@@ static inline void __submit_bio(struct 
                                struct bio *bio, enum page_type type)
  {
        if (!is_read_io(bio_op(bio))) {
 -              atomic_inc(&sbi->nr_wb_bios);
 -              if (f2fs_sb_mounted_hmsmr(sbi->sb) &&
 +              if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
                        current->plug && (type == DATA || type == NODE))
                        blk_finish_plug(current->plug);
        }
@@@ -259,9 -198,11 +259,9 @@@ static void __f2fs_submit_merged_bio(st
        if (type >= META_FLUSH) {
                io->fio.type = META_FLUSH;
                io->fio.op = REQ_OP_WRITE;
 -              if (test_opt(sbi, NOBARRIER))
 -                      io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO;
 -              else
 -                      io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META |
 -                                                              REQ_PRIO;
 +              io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO;
 +              if (!test_opt(sbi, NOBARRIER))
 +                      io->fio.op_flags |= REQ_FUA;
        }
        __submit_merged_bio(io);
  out:
@@@ -329,24 -270,22 +329,24 @@@ void f2fs_submit_page_mbio(struct f2fs_
                verify_block_addr(sbi, fio->old_blkaddr);
        verify_block_addr(sbi, fio->new_blkaddr);
  
 +      bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 +
 +      if (!is_read)
 +              inc_page_count(sbi, WB_DATA_TYPE(bio_page));
 +
        down_write(&io->io_rwsem);
  
        if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
 -          (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
 +          (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
 +                      !__same_bdev(sbi, fio->new_blkaddr, io->bio)))
                __submit_merged_bio(io);
  alloc_new:
        if (io->bio == NULL) {
 -              int bio_blocks = MAX_BIO_BLOCKS(sbi);
 -
                io->bio = __bio_alloc(sbi, fio->new_blkaddr,
 -                                              bio_blocks, is_read);
 +                                              BIO_MAX_PAGES, is_read);
                io->fio = *fio;
        }
  
 -      bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 -
        if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
                                                        PAGE_SIZE) {
                __submit_merged_bio(io);
@@@ -544,7 -483,7 +544,7 @@@ struct page *find_data_page(struct inod
                return page;
        f2fs_put_page(page, 0);
  
 -      page = get_read_data_page(inode, index, READ_SYNC, false);
 +      page = get_read_data_page(inode, index, 0, false);
        if (IS_ERR(page))
                return page;
  
@@@ -570,7 -509,7 +570,7 @@@ struct page *get_lock_data_page(struct 
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
  repeat:
 -      page = get_read_data_page(inode, index, READ_SYNC, for_write);
 +      page = get_read_data_page(inode, index, 0, for_write);
        if (IS_ERR(page))
                return page;
  
@@@ -651,6 -590,7 +651,6 @@@ static int __allocate_data_block(struc
        struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
        struct f2fs_summary sum;
        struct node_info ni;
 -      int seg = CURSEG_WARM_DATA;
        pgoff_t fofs;
        blkcnt_t count = 1;
  
@@@ -668,8 -608,11 +668,8 @@@ alloc
        get_node_info(sbi, dn->nid, &ni);
        set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
  
 -      if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
 -              seg = CURSEG_DIRECT_IO;
 -
        allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
 -                                                              &sum, seg);
 +                                              &sum, CURSEG_WARM_DATA);
        set_data_blkaddr(dn);
  
        /* update i_size */
        return 0;
  }
  
 -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 +static inline bool __force_buffered_io(struct inode *inode, int rw)
 +{
 +      return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) ||
 +                      (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
 +                      F2FS_I_SB(inode)->s_ndevs);
 +}
 +
 +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
  {
        struct inode *inode = file_inode(iocb->ki_filp);
        struct f2fs_map_blocks map;
 -      ssize_t ret = 0;
 +      int err = 0;
  
        map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
        map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
        map.m_next_pgofs = NULL;
  
        if (iocb->ki_flags & IOCB_DIRECT) {
 -              ret = f2fs_convert_inline_inode(inode);
 -              if (ret)
 -                      return ret;
 -              return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
 +              err = f2fs_convert_inline_inode(inode);
 +              if (err)
 +                      return err;
 +              return f2fs_map_blocks(inode, &map, 1,
 +                      __force_buffered_io(inode, WRITE) ?
 +                              F2FS_GET_BLOCK_PRE_AIO :
 +                              F2FS_GET_BLOCK_PRE_DIO);
        }
        if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
 -              ret = f2fs_convert_inline_inode(inode);
 -              if (ret)
 -                      return ret;
 +              err = f2fs_convert_inline_inode(inode);
 +              if (err)
 +                      return err;
        }
        if (!f2fs_has_inline_data(inode))
                return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
 -      return ret;
 +      return err;
  }
  
  /*
@@@ -743,6 -676,7 +743,6 @@@ int f2fs_map_blocks(struct inode *inode
        unsigned int ofs_in_node, last_ofs_in_node;
        blkcnt_t prealloc;
        struct extent_info ei;
 -      bool allocated = false;
        block_t blkaddr;
  
        if (!maxblocks)
@@@ -782,7 -716,7 +782,7 @@@ next_dnode
        }
  
        prealloc = 0;
 -      ofs_in_node = dn.ofs_in_node;
 +      last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
        end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
  
  next_block:
                                }
                        } else {
                                err = __allocate_data_block(&dn);
 -                              if (!err) {
 +                              if (!err)
                                        set_inode_flag(inode, FI_APPEND_WRITE);
 -                                      allocated = true;
 -                              }
                        }
                        if (err)
                                goto sync_out;
@@@ -857,6 -793,7 +857,6 @@@ skip
                err = reserve_new_blocks(&dn, prealloc);
                if (err)
                        goto sync_out;
 -              allocated = dn.node_changed;
  
                map->m_len += dn.ofs_in_node - ofs_in_node;
                if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
  
        if (create) {
                f2fs_unlock_op(sbi);
 -              f2fs_balance_fs(sbi, allocated);
 +              f2fs_balance_fs(sbi, dn.node_changed);
        }
 -      allocated = false;
        goto next_dnode;
  
  sync_out:
  unlock_out:
        if (create) {
                f2fs_unlock_op(sbi);
 -              f2fs_balance_fs(sbi, allocated);
 +              f2fs_balance_fs(sbi, dn.node_changed);
        }
  out:
        trace_f2fs_map_blocks(inode, map, err);
@@@ -896,19 -834,19 +896,19 @@@ static int __get_data_block(struct inod
                        pgoff_t *next_pgofs)
  {
        struct f2fs_map_blocks map;
 -      int ret;
 +      int err;
  
        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;
        map.m_next_pgofs = next_pgofs;
  
 -      ret = f2fs_map_blocks(inode, &map, create, flag);
 -      if (!ret) {
 +      err = f2fs_map_blocks(inode, &map, create, flag);
 +      if (!err) {
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
                bh->b_size = map.m_len << inode->i_blkbits;
        }
 -      return ret;
 +      return err;
  }
  
  static int get_data_block(struct inode *inode, sector_t iblock,
@@@ -953,6 -891,7 +953,6 @@@ int f2fs_fiemap(struct inode *inode, st
        struct buffer_head map_bh;
        sector_t start_blk, last_blk;
        pgoff_t next_pgofs;
 -      loff_t isize;
        u64 logical = 0, phys = 0, size = 0;
        u32 flags = 0;
        int ret = 0;
  
        inode_lock(inode);
  
 -      isize = i_size_read(inode);
 -      if (start >= isize)
 -              goto out;
 -
 -      if (start + len > isize)
 -              len = isize - start;
 -
        if (logical_to_blk(inode, len) == 0)
                len = blk_to_logical(inode, 1);
  
@@@ -987,11 -933,13 +987,11 @@@ next
        /* HOLE */
        if (!buffer_mapped(&map_bh)) {
                start_blk = next_pgofs;
 -              /* Go through holes util pass the EOF */
 -              if (blk_to_logical(inode, start_blk) < isize)
 +
 +              if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
 +                                      F2FS_I_SB(inode)->max_file_blocks))
                        goto prep_next;
 -              /* Found a hole beyond isize means no more extents.
 -               * Note that the premise is that filesystems don't
 -               * punch holes beyond isize and keep size unchanged.
 -               */
 +
                flags |= FIEMAP_EXTENT_LAST;
        }
  
@@@ -1034,6 -982,7 +1034,6 @@@ static struct bio *f2fs_grab_bio(struc
  {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct fscrypt_ctx *ctx = NULL;
 -      struct block_device *bdev = sbi->sb->s_bdev;
        struct bio *bio;
  
        if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
                        fscrypt_release_ctx(ctx);
                return ERR_PTR(-ENOMEM);
        }
 -      bio->bi_bdev = bdev;
 -      bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
 +      f2fs_target_device(sbi, blkaddr, bio);
        bio->bi_end_io = f2fs_read_end_io;
        bio->bi_private = ctx;
  
@@@ -1146,8 -1096,7 +1146,8 @@@ got_it
                 * This page will go to BIO.  Do we need to send this
                 * BIO off first?
                 */
 -              if (bio && (last_block_in_bio != block_nr - 1)) {
 +              if (bio && (last_block_in_bio != block_nr - 1 ||
 +                      !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
  submit_and_realloc:
                        __submit_bio(F2FS_I_SB(inode), bio, DATA);
                        bio = NULL;
@@@ -1246,7 -1195,9 +1246,9 @@@ int do_write_data_page(struct f2fs_io_i
                                                        fio->old_blkaddr);
  retry_encrypt:
                fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
-                                                               gfp_flags);
+                                                       PAGE_SIZE, 0,
+                                                       fio->page->index,
+                                                       gfp_flags);
                if (IS_ERR(fio->encrypted_page)) {
                        err = PTR_ERR(fio->encrypted_page);
                        if (err == -ENOMEM) {
@@@ -1302,7 -1253,7 +1304,7 @@@ static int f2fs_write_data_page(struct 
                .sbi = sbi,
                .type = DATA,
                .op = REQ_OP_WRITE,
 -              .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
 +              .op_flags = wbc_to_write_flags(wbc),
                .page = page,
                .encrypted_page = NULL,
        };
@@@ -1362,6 -1313,7 +1364,6 @@@ done
        if (err && err != -ENOENT)
                goto redirty_out;
  
 -      clear_cold_data(page);
  out:
        inode_dec_dirty_pages(inode);
        if (err)
  
  redirty_out:
        redirty_page_for_writepage(wbc, page);
 +      if (!err)
 +              return AOP_WRITEPAGE_ACTIVATE;
        unlock_page(page);
        return err;
  }
@@@ -1479,15 -1429,6 +1481,15 @@@ continue_unlock
  
                        ret = mapping->a_ops->writepage(page, wbc);
                        if (unlikely(ret)) {
 +                              /*
 +                               * keep nr_to_write, since vfs uses this to
 +                               * get # of written pages.
 +                               */
 +                              if (ret == AOP_WRITEPAGE_ACTIVATE) {
 +                                      unlock_page(page);
 +                                      ret = 0;
 +                                      continue;
 +                              }
                                done_index = page->index + 1;
                                done = 1;
                                break;
@@@ -1724,7 -1665,7 +1726,7 @@@ repeat
                        err = PTR_ERR(bio);
                        goto fail;
                }
 -              bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
 +              bio->bi_opf = REQ_OP_READ;
                if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
                        bio_put(bio);
                        err = -EFAULT;
@@@ -1775,6 -1716,7 +1777,6 @@@ static int f2fs_write_end(struct file *
                goto unlock_out;
  
        set_page_dirty(page);
 -      clear_cold_data(page);
  
        if (pos + copied > i_size_read(inode))
                f2fs_i_size_write(inode, pos + copied);
@@@ -1811,7 -1753,9 +1813,7 @@@ static ssize_t f2fs_direct_IO(struct ki
        if (err)
                return err;
  
 -      if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 -              return 0;
 -      if (test_opt(F2FS_I_SB(inode), LFS))
 +      if (__force_buffered_io(inode, rw))
                return 0;
  
        trace_f2fs_direct_IO_enter(inode, offset, count, rw);
@@@ -1843,14 -1787,12 +1845,14 @@@ void f2fs_invalidate_page(struct page *
                return;
  
        if (PageDirty(page)) {
 -              if (inode->i_ino == F2FS_META_INO(sbi))
 +              if (inode->i_ino == F2FS_META_INO(sbi)) {
                        dec_page_count(sbi, F2FS_DIRTY_META);
 -              else if (inode->i_ino == F2FS_NODE_INO(sbi))
 +              } else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
                        dec_page_count(sbi, F2FS_DIRTY_NODES);
 -              else
 +              } else {
                        inode_dec_dirty_pages(inode);
 +                      remove_dirty_inode(inode);
 +              }
        }
  
        /* This is atomic written page, keep Private */
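
Taken together, the data.c hunks above converge the direct-I/O policy into one helper: __force_buffered_io() falls back to buffered I/O when the inode is an encrypted regular file, when a write targets an LFS-mode mount, or when the filesystem spans multiple devices, and both f2fs_preallocate_blocks() and f2fs_direct_IO() now consult it. A minimal standalone sketch of that decision (plain booleans stand in for the in-kernel inode and superblock state; this is illustrative, not f2fs code):

    /* Illustrative only: mirrors the __force_buffered_io() policy above. */
    #include <stdbool.h>
    #include <stdio.h>

    struct io_ctx {
            bool encrypted_regular; /* f2fs_encrypted_inode() && S_ISREG() */
            bool lfs_mode;          /* test_opt(sbi, LFS)                  */
            int  ndevs;             /* sbi->s_ndevs (multi-device mount)   */
            bool is_write;
    };

    static bool force_buffered_io(const struct io_ctx *c)
    {
            return c->encrypted_regular ||
                   (c->is_write && c->lfs_mode) ||
                   c->ndevs;
    }

    int main(void)
    {
            struct io_ctx c = { .lfs_mode = true, .is_write = true };

            printf("fall back to buffered I/O: %d\n", force_buffered_io(&c));
            return 0;
    }
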
diff --combined fs/f2fs/f2fs.h
index 23c86e8cf5237e55bd375cb57ec13de10f264dbb,8e94b7bda42b0ba9a4628b92c598ba77a5e722dd..2da8c3aa0ce5db222ed1c60aa6d394140d564aae
@@@ -103,7 -103,7 +103,7 @@@ struct f2fs_mount_info 
  };
  
  #define F2FS_FEATURE_ENCRYPT  0x0001
 -#define F2FS_FEATURE_HMSMR    0x0002
 +#define F2FS_FEATURE_BLKZONED 0x0002
  
  #define F2FS_HAS_FEATURE(sb, mask)                                    \
        ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@@ -401,7 -401,6 +401,7 @@@ struct f2fs_map_blocks 
  #define FADVISE_LOST_PINO_BIT 0x02
  #define FADVISE_ENCRYPT_BIT   0x04
  #define FADVISE_ENC_NAME_BIT  0x08
 +#define FADVISE_KEEP_SIZE_BIT 0x10
  
  #define file_is_cold(inode)   is_file(inode, FADVISE_COLD_BIT)
  #define file_wrong_pino(inode)        is_file(inode, FADVISE_LOST_PINO_BIT)
  #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
  #define file_enc_name(inode)  is_file(inode, FADVISE_ENC_NAME_BIT)
  #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
 +#define file_keep_isize(inode)        is_file(inode, FADVISE_KEEP_SIZE_BIT)
 +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
  
  #define DEF_DIR_LEVEL         0
  
@@@ -431,7 -428,7 +431,7 @@@ struct f2fs_inode_info 
        /* Use below internally in f2fs*/
        unsigned long flags;            /* use to pass per-file flags */
        struct rw_semaphore i_sem;      /* protect fi info */
 -      struct percpu_counter dirty_pages;      /* # of dirty pages */
 +      atomic_t dirty_pages;           /* # of dirty pages */
        f2fs_hash_t chash;              /* hash value of given file name */
        unsigned int clevel;            /* maximum level of given file name */
        nid_t i_xattr_nid;              /* node id that contains xattrs */
@@@ -496,26 -493,20 +496,26 @@@ static inline bool __is_front_mergeable
        return __is_extent_mergeable(cur, front);
  }
  
 -extern void f2fs_mark_inode_dirty_sync(struct inode *);
 +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool);
  static inline void __try_update_largest_extent(struct inode *inode,
                        struct extent_tree *et, struct extent_node *en)
  {
        if (en->ei.len > et->largest.len) {
                et->largest = en->ei;
 -              f2fs_mark_inode_dirty_sync(inode);
 +              f2fs_mark_inode_dirty_sync(inode, true);
        }
  }
  
 +enum nid_list {
 +      FREE_NID_LIST,
 +      ALLOC_NID_LIST,
 +      MAX_NID_LIST,
 +};
 +
  struct f2fs_nm_info {
        block_t nat_blkaddr;            /* base disk address of NAT */
        nid_t max_nid;                  /* maximum possible node ids */
 -      nid_t available_nids;           /* maximum available node ids */
 +      nid_t available_nids;           /* # of available node ids */
        nid_t next_scan_nid;            /* the next nid to be scanned */
        unsigned int ram_thresh;        /* control the memory footprint */
        unsigned int ra_nid_pages;      /* # of nid pages to be readaheaded */
  
        /* free node ids management */
        struct radix_tree_root free_nid_root;/* root of the free_nid cache */
 -      struct list_head free_nid_list; /* a list for free nids */
 -      spinlock_t free_nid_list_lock;  /* protect free nid list */
 -      unsigned int fcnt;              /* the number of free node id */
 +      struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
 +      unsigned int nid_cnt[MAX_NID_LIST];     /* the number of free node id */
 +      spinlock_t nid_list_lock;       /* protect nid lists ops */
        struct mutex build_lock;        /* lock for build free nids */
  
        /* for checkpoint */
@@@ -594,6 -585,7 +594,6 @@@ enum 
        CURSEG_WARM_NODE,       /* direct node blocks of normal files */
        CURSEG_COLD_NODE,       /* indirect node blocks */
        NO_CHECK_TYPE,
 -      CURSEG_DIRECT_IO,       /* to use for the direct IO path */
  };
  
  struct flush_cmd {
@@@ -657,7 -649,6 +657,7 @@@ struct f2fs_sm_info 
   * f2fs monitors the number of several block types such as on-writeback,
   * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
   */
 +#define WB_DATA_TYPE(p)       (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
  enum count_type {
        F2FS_DIRTY_DENTS,
        F2FS_DIRTY_DATA,
        F2FS_DIRTY_META,
        F2FS_INMEM_PAGES,
        F2FS_DIRTY_IMETA,
 +      F2FS_WB_CP_DATA,
 +      F2FS_WB_DATA,
        NR_COUNT_TYPE,
  };
  
@@@ -699,7 -688,7 +699,7 @@@ struct f2fs_io_info 
        struct f2fs_sb_info *sbi;       /* f2fs_sb_info pointer */
        enum page_type type;    /* contains DATA/NODE/META/META_FLUSH */
        int op;                 /* contains REQ_OP_ */
 -      int op_flags;           /* rq_flag_bits */
 +      int op_flags;           /* req_flag_bits */
        block_t new_blkaddr;    /* new block address to be written */
        block_t old_blkaddr;    /* old block address before Cow */
        struct page *page;      /* page to be written */
@@@ -715,20 -704,6 +715,20 @@@ struct f2fs_bio_info 
        struct rw_semaphore io_rwsem;   /* blocking op for bio */
  };
  
 +#define FDEV(i)                               (sbi->devs[i])
 +#define RDEV(i)                               (raw_super->devs[i])
 +struct f2fs_dev_info {
 +      struct block_device *bdev;
 +      char path[MAX_PATH_LEN];
 +      unsigned int total_segments;
 +      block_t start_blk;
 +      block_t end_blk;
 +#ifdef CONFIG_BLK_DEV_ZONED
 +      unsigned int nr_blkz;                   /* Total number of zones */
 +      u8 *blkz_type;                          /* Array of zones type */
 +#endif
 +};
 +
  enum inode_type {
        DIR_INODE,                      /* for dirty dir inode */
        FILE_INODE,                     /* for dirty regular/symlink inode */
@@@ -775,12 -750,6 +775,12 @@@ struct f2fs_sb_info 
        u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
        u8 key_prefix_size;
  #endif
 +
 +#ifdef CONFIG_BLK_DEV_ZONED
 +      unsigned int blocks_per_blkz;           /* F2FS blocks per zone */
 +      unsigned int log_blocks_per_blkz;       /* log2 F2FS blocks per zone */
 +#endif
 +
        /* for node-related operations */
        struct f2fs_nm_info *nm_info;           /* node manager */
        struct inode *node_inode;               /* cache node blocks */
  
        /* for checkpoint */
        struct f2fs_checkpoint *ckpt;           /* raw checkpoint pointer */
 +      int cur_cp_pack;                        /* remain current cp pack */
        spinlock_t cp_lock;                     /* for flag in ckpt */
        struct inode *meta_inode;               /* cache meta blocks */
        struct mutex cp_mutex;                  /* checkpoint procedure lock */
        block_t discard_blks;                   /* discard command candidates */
        block_t last_valid_block_count;         /* for recovery */
        u32 s_next_generation;                  /* for NFS support */
 -      atomic_t nr_wb_bios;                    /* # of writeback bios */
  
        /* # of pages, see count_type */
 -      struct percpu_counter nr_pages[NR_COUNT_TYPE];
 +      atomic_t nr_pages[NR_COUNT_TYPE];
        /* # of allocated blocks */
        struct percpu_counter alloc_valid_block_count;
  
  
        /* For shrinker support */
        struct list_head s_list;
 +      int s_ndevs;                            /* number of devices */
 +      struct f2fs_dev_info *devs;             /* for device list */
        struct mutex umount_mutex;
        unsigned int shrinker_run_no;
  
@@@ -1138,6 -1105,13 +1138,6 @@@ static inline void clear_ckpt_flags(str
        spin_unlock(&sbi->cp_lock);
  }
  
 -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
 -{
 -      struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
 -
 -      return blk_queue_discard(q);
 -}
 -
  static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
  {
        down_read(&sbi->cp_rwsem);
@@@ -1258,10 -1232,9 +1258,10 @@@ static inline void dec_valid_block_coun
  
  static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
  {
 -      percpu_counter_inc(&sbi->nr_pages[count_type]);
 +      atomic_inc(&sbi->nr_pages[count_type]);
  
 -      if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES)
 +      if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
 +              count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
                return;
  
        set_sbi_flag(sbi, SBI_IS_DIRTY);
  
  static inline void inode_inc_dirty_pages(struct inode *inode)
  {
 -      percpu_counter_inc(&F2FS_I(inode)->dirty_pages);
 +      atomic_inc(&F2FS_I(inode)->dirty_pages);
        inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
                                F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
  }
  
  static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
  {
 -      percpu_counter_dec(&sbi->nr_pages[count_type]);
 +      atomic_dec(&sbi->nr_pages[count_type]);
  }
  
  static inline void inode_dec_dirty_pages(struct inode *inode)
                        !S_ISLNK(inode->i_mode))
                return;
  
 -      percpu_counter_dec(&F2FS_I(inode)->dirty_pages);
 +      atomic_dec(&F2FS_I(inode)->dirty_pages);
        dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
                                F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
  }
  
  static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
  {
 -      return percpu_counter_sum_positive(&sbi->nr_pages[count_type]);
 +      return atomic_read(&sbi->nr_pages[count_type]);
  }
  
 -static inline s64 get_dirty_pages(struct inode *inode)
 +static inline int get_dirty_pages(struct inode *inode)
  {
 -      return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages);
 +      return atomic_read(&F2FS_I(inode)->dirty_pages);
  }
  
  static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@@ -1356,27 -1329,22 +1356,27 @@@ static inline void *__bitmap_ptr(struc
  
  static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
  {
 -      block_t start_addr;
 -      struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 -      unsigned long long ckpt_version = cur_cp_version(ckpt);
 -
 -      start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
 +      block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
  
 -      /*
 -       * odd numbered checkpoint should at cp segment 0
 -       * and even segment must be at cp segment 1
 -       */
 -      if (!(ckpt_version & 1))
 +      if (sbi->cur_cp_pack == 2)
                start_addr += sbi->blocks_per_seg;
 +      return start_addr;
 +}
 +
 +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
 +{
 +      block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
  
 +      if (sbi->cur_cp_pack == 1)
 +              start_addr += sbi->blocks_per_seg;
        return start_addr;
  }
  
 +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi)
 +{
 +      sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1;
 +}
 +
  static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
  {
        return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
@@@ -1653,7 -1621,7 +1653,7 @@@ static inline void __mark_inode_dirty_f
                        return;
        case FI_DATA_EXIST:
        case FI_INLINE_DOTS:
 -              f2fs_mark_inode_dirty_sync(inode);
 +              f2fs_mark_inode_dirty_sync(inode, true);
        }
  }
  
@@@ -1680,7 -1648,7 +1680,7 @@@ static inline void set_acl_inode(struc
  {
        F2FS_I(inode)->i_acl_mode = mode;
        set_inode_flag(inode, FI_ACL_MODE);
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, false);
  }
  
  static inline void f2fs_i_links_write(struct inode *inode, bool inc)
                inc_nlink(inode);
        else
                drop_nlink(inode);
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
  }
  
  static inline void f2fs_i_blocks_write(struct inode *inode,
  
        inode->i_blocks = add ? inode->i_blocks + diff :
                                inode->i_blocks - diff;
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
        if (clean || recover)
                set_inode_flag(inode, FI_AUTO_RECOVER);
  }
@@@ -1714,27 -1682,34 +1714,27 @@@ static inline void f2fs_i_size_write(st
                return;
  
        i_size_write(inode, i_size);
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
        if (clean || recover)
                set_inode_flag(inode, FI_AUTO_RECOVER);
  }
  
 -static inline bool f2fs_skip_inode_update(struct inode *inode)
 -{
 -      if (!is_inode_flag_set(inode, FI_AUTO_RECOVER))
 -              return false;
 -      return F2FS_I(inode)->last_disk_size == i_size_read(inode);
 -}
 -
  static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
  {
        F2FS_I(inode)->i_current_depth = depth;
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
  }
  
  static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
  {
        F2FS_I(inode)->i_xattr_nid = xnid;
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
  }
  
  static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
  {
        F2FS_I(inode)->i_pino = pino;
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
  }
  
  static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
@@@ -1862,31 -1837,13 +1862,31 @@@ static inline int is_file(struct inode 
  static inline void set_file(struct inode *inode, int type)
  {
        F2FS_I(inode)->i_advise |= type;
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
  }
  
  static inline void clear_file(struct inode *inode, int type)
  {
        F2FS_I(inode)->i_advise &= ~type;
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, true);
 +}
 +
 +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
 +{
 +      if (dsync) {
 +              struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 +              bool ret;
 +
 +              spin_lock(&sbi->inode_lock[DIRTY_META]);
 +              ret = list_empty(&F2FS_I(inode)->gdirty_list);
 +              spin_unlock(&sbi->inode_lock[DIRTY_META]);
 +              return ret;
 +      }
 +      if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) ||
 +                      file_keep_isize(inode) ||
 +                      i_size_read(inode) & PAGE_MASK)
 +              return false;
 +      return F2FS_I(inode)->last_disk_size == i_size_read(inode);
  }
  
  static inline int f2fs_readonly(struct super_block *sb)
@@@ -1998,7 -1955,7 +1998,7 @@@ void set_de_type(struct f2fs_dir_entry 
  unsigned char get_de_type(struct f2fs_dir_entry *);
  struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
                        f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
 -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
 +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
                        unsigned int, struct fscrypt_str *);
  void do_make_empty_dir(struct inode *, struct inode *,
                        struct f2fs_dentry_ptr *);
@@@ -2038,7 -1995,7 +2038,7 @@@ static inline int f2fs_add_link(struct 
  /*
   * super.c
   */
 -int f2fs_inode_dirtied(struct inode *);
 +int f2fs_inode_dirtied(struct inode *, bool);
  void f2fs_inode_synced(struct inode *);
  int f2fs_commit_super(struct f2fs_sb_info *, bool);
  int f2fs_sync_fs(struct super_block *, int);
@@@ -2077,7 -2034,7 +2077,7 @@@ void move_node_page(struct page *, int)
  int fsync_node_pages(struct f2fs_sb_info *, struct inode *,
                        struct writeback_control *, bool);
  int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
 -void build_free_nids(struct f2fs_sb_info *);
 +void build_free_nids(struct f2fs_sb_info *, bool);
  bool alloc_nid(struct f2fs_sb_info *, nid_t *);
  void alloc_nid_done(struct f2fs_sb_info *, nid_t);
  void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
@@@ -2103,7 -2060,7 +2103,7 @@@ void f2fs_balance_fs(struct f2fs_sb_inf
  void f2fs_balance_fs_bg(struct f2fs_sb_info *);
  int f2fs_issue_flush(struct f2fs_sb_info *);
  int create_flush_cmd_control(struct f2fs_sb_info *);
 -void destroy_flush_cmd_control(struct f2fs_sb_info *);
 +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool);
  void invalidate_blocks(struct f2fs_sb_info *, block_t);
  bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
  void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
@@@ -2175,15 -2132,12 +2175,15 @@@ void f2fs_submit_merged_bio_cond(struc
  void f2fs_flush_merged_bios(struct f2fs_sb_info *);
  int f2fs_submit_page_bio(struct f2fs_io_info *);
  void f2fs_submit_page_mbio(struct f2fs_io_info *);
 +struct block_device *f2fs_target_device(struct f2fs_sb_info *,
 +                              block_t, struct bio *);
 +int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
  void set_data_blkaddr(struct dnode_of_data *);
  void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
  int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
  int reserve_new_block(struct dnode_of_data *);
  int f2fs_get_block(struct dnode_of_data *, pgoff_t);
 -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
 +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
  int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
  struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
  struct page *find_data_page(struct inode *, pgoff_t);
@@@ -2206,7 -2160,7 +2206,7 @@@ int f2fs_migrate_page(struct address_sp
  int start_gc_thread(struct f2fs_sb_info *);
  void stop_gc_thread(struct f2fs_sb_info *);
  block_t start_bidx_of_node(unsigned int, struct inode *);
 -int f2fs_gc(struct f2fs_sb_info *, bool);
 +int f2fs_gc(struct f2fs_sb_info *, bool, bool);
  void build_gc_manager(struct f2fs_sb_info *);
  
  /*
@@@ -2227,12 -2181,12 +2227,12 @@@ struct f2fs_stat_info 
        unsigned long long hit_largest, hit_cached, hit_rbtree;
        unsigned long long hit_total, total_ext;
        int ext_tree, zombie_tree, ext_node;
 -      s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
 -      s64 inmem_pages;
 +      int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
 +      int inmem_pages;
        unsigned int ndirty_dirs, ndirty_files, ndirty_all;
 -      int nats, dirty_nats, sits, dirty_sits, fnids;
 +      int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids;
        int total_count, utilization;
 -      int bg_gc, wb_bios;
 +      int bg_gc, nr_wb_cp_data, nr_wb_data;
        int inline_xattr, inline_inode, inline_dir, orphans;
        unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
        unsigned int bimodal, avg_vblocks;
@@@ -2458,30 -2412,9 +2458,30 @@@ static inline int f2fs_sb_has_crypto(st
        return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
  }
  
 -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb)
 +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
 +{
 +      return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED);
 +}
 +
 +#ifdef CONFIG_BLK_DEV_ZONED
 +static inline int get_blkz_type(struct f2fs_sb_info *sbi,
 +                      struct block_device *bdev, block_t blkaddr)
 +{
 +      unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
 +      int i;
 +
 +      for (i = 0; i < sbi->s_ndevs; i++)
 +              if (FDEV(i).bdev == bdev)
 +                      return FDEV(i).blkz_type[zno];
 +      return -EINVAL;
 +}
 +#endif
 +
 +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
  {
 -      return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR);
 +      struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
 +
 +      return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb);
  }
  
  static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@@ -2520,8 -2453,8 +2520,8 @@@ static inline bool f2fs_may_encrypt(str
  #define fscrypt_pullback_bio_page     fscrypt_notsupp_pullback_bio_page
  #define fscrypt_restore_control_page  fscrypt_notsupp_restore_control_page
  #define fscrypt_zeroout_range         fscrypt_notsupp_zeroout_range
- #define fscrypt_process_policy                fscrypt_notsupp_process_policy
- #define fscrypt_get_policy            fscrypt_notsupp_get_policy
+ #define fscrypt_ioctl_set_policy      fscrypt_notsupp_ioctl_set_policy
+ #define fscrypt_ioctl_get_policy      fscrypt_notsupp_ioctl_get_policy
  #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
  #define fscrypt_inherit_context               fscrypt_notsupp_inherit_context
  #define fscrypt_get_encryption_info   fscrypt_notsupp_get_encryption_info
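
Among the f2fs.h changes, the checkpoint start-address helpers stop deriving the active pack from the parity of the checkpoint version and instead track an explicit sbi->cur_cp_pack that __set_cp_next_pack() toggles between pack 1 and pack 2. A toy standalone sketch of that alternation (the struct and field names here are illustrative, not the real f2fs_sb_info layout):

    /* Two checkpoint packs laid out back to back; the "current" one moves
     * by one segment's worth of blocks whenever the packs are swapped. */
    #include <stdio.h>

    struct cp_state {
            unsigned int cur_pack;          /* 1 or 2 */
            unsigned long cp_blkaddr;       /* base address of pack 1 */
            unsigned long blocks_per_seg;
    };

    static unsigned long cp_cur_addr(const struct cp_state *s)
    {
            return s->cp_blkaddr + (s->cur_pack == 2 ? s->blocks_per_seg : 0);
    }

    static unsigned long cp_next_addr(const struct cp_state *s)
    {
            return s->cp_blkaddr + (s->cur_pack == 1 ? s->blocks_per_seg : 0);
    }

    static void cp_switch_pack(struct cp_state *s)
    {
            s->cur_pack = (s->cur_pack == 1) ? 2 : 1;
    }

    int main(void)
    {
            struct cp_state s = { .cur_pack = 1, .cp_blkaddr = 512,
                                  .blocks_per_seg = 512 };

            printf("cur=%lu next=%lu\n", cp_cur_addr(&s), cp_next_addr(&s));
            cp_switch_pack(&s);
            printf("cur=%lu next=%lu\n", cp_cur_addr(&s), cp_next_addr(&s));
            return 0;
    }
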
diff --combined fs/f2fs/file.c
index 383b5c29f46b7718393ec0011b956a4208300940,f0c83f74557d04498b41c93b67adac59a3aa703b..49f10dce817dc9e4806b6a417b96391a1c794fd1
@@@ -94,6 -94,8 +94,6 @@@ mapped
        if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
                f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
  
 -      /* if gced page is attached, don't write to cold segment */
 -      clear_cold_data(page);
  out:
        sb_end_pagefault(inode->i_sb);
        f2fs_update_time(sbi, REQ_TIME);
@@@ -208,7 -210,7 +208,7 @@@ static int f2fs_do_sync_file(struct fil
        }
  
        /* if the inode is dirty, let's recover all the time */
 -      if (!datasync && !f2fs_skip_inode_update(inode)) {
 +      if (!f2fs_skip_inode_update(inode, datasync)) {
                f2fs_write_inode(inode, NULL);
                goto go_write;
        }
@@@ -262,7 -264,7 +262,7 @@@ sync_nodes
        }
  
        if (need_inode_block_update(sbi, ino)) {
 -              f2fs_mark_inode_dirty_sync(inode);
 +              f2fs_mark_inode_dirty_sync(inode, true);
                f2fs_write_inode(inode, NULL);
                goto sync_nodes;
        }
@@@ -630,7 -632,7 +630,7 @@@ int f2fs_truncate(struct inode *inode
                return err;
  
        inode->i_mtime = inode->i_ctime = current_time(inode);
 -      f2fs_mark_inode_dirty_sync(inode);
 +      f2fs_mark_inode_dirty_sync(inode, false);
        return 0;
  }
  
@@@ -677,7 -679,6 +677,7 @@@ int f2fs_setattr(struct dentry *dentry
  {
        struct inode *inode = d_inode(dentry);
        int err;
 +      bool size_changed = false;
  
        err = setattr_prepare(dentry, attr);
        if (err)
                        err = f2fs_truncate(inode);
                        if (err)
                                return err;
 -                      f2fs_balance_fs(F2FS_I_SB(inode), true);
                } else {
                        /*
                         * do not trim all blocks after i_size if target size is
                        }
                        inode->i_mtime = inode->i_ctime = current_time(inode);
                }
 +
 +              size_changed = true;
        }
  
        __setattr_copy(inode, attr);
                }
        }
  
 -      f2fs_mark_inode_dirty_sync(inode);
 +      /* file size may have changed here */
 +      f2fs_mark_inode_dirty_sync(inode, size_changed);
 +
 +      /* inode change will produce dirty node pages flushed by checkpoint */
 +      f2fs_balance_fs(F2FS_I_SB(inode), true);
 +
        return err;
  }
  
@@@ -972,7 -967,7 +972,7 @@@ static int __clone_blkaddrs(struct inod
                                new_size = (dst + i) << PAGE_SHIFT;
                                if (dst_inode->i_size < new_size)
                                        f2fs_i_size_write(dst_inode, new_size);
 -                      } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
 +                      } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
  
                        f2fs_put_dnode(&dn);
                } else {
@@@ -1223,9 -1218,6 +1223,9 @@@ static int f2fs_zero_range(struct inod
                        ret = f2fs_do_zero_range(&dn, index, end);
                        f2fs_put_dnode(&dn);
                        f2fs_unlock_op(sbi);
 +
 +                      f2fs_balance_fs(sbi, dn.node_changed);
 +
                        if (ret)
                                goto out;
  
@@@ -1321,15 -1313,15 +1321,15 @@@ static int expand_inode_data(struct ino
        pgoff_t pg_end;
        loff_t new_size = i_size_read(inode);
        loff_t off_end;
 -      int ret;
 +      int err;
  
 -      ret = inode_newsize_ok(inode, (len + offset));
 -      if (ret)
 -              return ret;
 +      err = inode_newsize_ok(inode, (len + offset));
 +      if (err)
 +              return err;
  
 -      ret = f2fs_convert_inline_inode(inode);
 -      if (ret)
 -              return ret;
 +      err = f2fs_convert_inline_inode(inode);
 +      if (err)
 +              return err;
  
        f2fs_balance_fs(sbi, true);
  
        if (off_end)
                map.m_len++;
  
 -      ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
 -      if (ret) {
 +      err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
 +      if (err) {
                pgoff_t last_off;
  
                if (!map.m_len)
 -                      return ret;
 +                      return err;
  
                last_off = map.m_lblk + map.m_len - 1;
  
        if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
                f2fs_i_size_write(inode, new_size);
  
 -      return ret;
 +      return err;
  }
  
  static long f2fs_fallocate(struct file *file, int mode,
  
        if (!ret) {
                inode->i_mtime = inode->i_ctime = current_time(inode);
 -              f2fs_mark_inode_dirty_sync(inode);
 +              f2fs_mark_inode_dirty_sync(inode, false);
 +              if (mode & FALLOC_FL_KEEP_SIZE)
 +                      file_set_keep_isize(inode);
                f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
        }
  
@@@ -1536,7 -1526,7 +1536,7 @@@ static int f2fs_ioc_start_atomic_write(
                goto out;
  
        f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
 -              "Unexpected flush for atomic writes: ino=%lu, npages=%lld",
 +              "Unexpected flush for atomic writes: ino=%lu, npages=%u",
                                        inode->i_ino, get_dirty_pages(inode));
        ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
        if (ret)
@@@ -1762,31 -1752,16 +1762,16 @@@ static bool uuid_is_nonzero(__u8 u[16]
  
  static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
  {
-       struct fscrypt_policy policy;
        struct inode *inode = file_inode(filp);
  
-       if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
-                                                       sizeof(policy)))
-               return -EFAULT;
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
  
-       return fscrypt_process_policy(filp, &policy);
+       return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
  }
  
  static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
  {
-       struct fscrypt_policy policy;
-       struct inode *inode = file_inode(filp);
-       int err;
-       err = fscrypt_get_policy(inode, &policy);
-       if (err)
-               return err;
-       if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
-               return -EFAULT;
-       return 0;
+       return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
  }
  
  static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@@ -1852,7 -1827,7 +1837,7 @@@ static int f2fs_ioc_gc(struct file *fil
                mutex_lock(&sbi->gc_mutex);
        }
  
 -      ret = f2fs_gc(sbi, sync);
 +      ret = f2fs_gc(sbi, sync, true);
  out:
        mnt_drop_write_file(filp);
        return ret;
@@@ -2266,15 -2241,12 +2251,15 @@@ static ssize_t f2fs_file_write_iter(str
        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0) {
 -              ret = f2fs_preallocate_blocks(iocb, from);
 -              if (!ret) {
 -                      blk_start_plug(&plug);
 -                      ret = __generic_file_write_iter(iocb, from);
 -                      blk_finish_plug(&plug);
 +              int err = f2fs_preallocate_blocks(iocb, from);
 +
 +              if (err) {
 +                      inode_unlock(inode);
 +                      return err;
                }
 +              blk_start_plug(&plug);
 +              ret = __generic_file_write_iter(iocb, from);
 +              blk_finish_plug(&plug);
        }
        inode_unlock(inode);
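
The fallocate path above now records FALLOC_FL_KEEP_SIZE in the inode's fadvise bits via file_set_keep_isize(), and the reworked f2fs_skip_inode_update() earlier checks that bit, so a preallocation past EOF no longer relies on i_size alone to decide whether the inode must be rewritten. The userspace contract for the flag is the usual one: the reported size stays put while block usage grows. A small sketch of that contract (error handling trimmed; the path is a placeholder, any fallocate-capable mount will do):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
            /* Placeholder path; assumes a writable f2fs (or similar) mount. */
            int fd = open("/mnt/f2fs/prealloc.dat", O_CREAT | O_RDWR, 0644);
            struct stat st;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* Reserve 1 MiB beyond EOF without changing i_size. */
            if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
                    perror("fallocate");

            fstat(fd, &st);
            printf("st_size=%lld st_blocks=%lld\n",
                   (long long)st.st_size, (long long)st.st_blocks);
            close(fd);
            return 0;
    }
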
  
diff --combined fs/xfs/xfs_aops.c
index 6be5204a06d3ac1fc8da7e486b92ac00cf0a45bb,561cf1456c6ca1ed07484e5daf455c6f40015959..38755ca96c7a6d884c0c13421ab1d0b08fbc1f4b
@@@ -495,8 -495,8 +495,8 @@@ xfs_submit_ioend
  
        ioend->io_bio->bi_private = ioend;
        ioend->io_bio->bi_end_io = xfs_end_bio;
 -      bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
 -                       (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
 +      ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 +
        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
@@@ -567,7 -567,8 +567,7 @@@ xfs_chain_bio
  
        bio_chain(ioend->io_bio, new);
        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 -      bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
 -                        (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
 +      ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
        submit_bio(ioend->io_bio);
        ioend->io_bio = new;
  }
@@@ -1297,8 -1298,7 +1297,7 @@@ __xfs_get_blocks
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create,
-       bool                    direct,
-       bool                    dax_fault)
+       bool                    direct)
  {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
                if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
-               if (create) {
-                       if (dax_fault)
-                               ASSERT(!ISUNWRITTEN(&imap));
-                       else
-                               xfs_map_direct(inode, bh_result, &imap, offset,
-                                               is_cow);
-               }
+               if (create)
+                       xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
        }
  
        /*
@@@ -1465,7 -1460,7 +1459,7 @@@ xfs_get_blocks
        struct buffer_head      *bh_result,
        int                     create)
  {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, false);
  }
  
  int
@@@ -1475,17 -1470,7 +1469,7 @@@ xfs_get_blocks_direct
        struct buffer_head      *bh_result,
        int                     create)
  {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
- }
- int
- xfs_get_blocks_dax_fault(
-       struct inode            *inode,
-       sector_t                iblock,
-       struct buffer_head      *bh_result,
-       int                     create)
- {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true);
  }
  
  /*
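
The xfs_aops.c hunks (like the f2fs_write_data_page hunk earlier) drop the open-coded WRITE_SYNC/WRITE_FLUSH_FUA selection in favour of wbc_to_write_flags(), which derives the request flags from the writeback_control. For these call sites the mapping that matters is that data-integrity writeback (WB_SYNC_ALL) is marked synchronous. A rough standalone sketch of that mapping (the real helper lives in include/linux/writeback.h and may handle more cases; the flag value below is a placeholder):

    #include <stdio.h>

    enum wb_sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

    #define REQ_SYNC_SKETCH (1u << 3)   /* placeholder bit, not the real REQ_SYNC */

    /* Rough approximation: integrity writeback gets the sync flag,
     * everything else carries no extra flag. */
    static unsigned int sketch_wbc_to_write_flags(enum wb_sync_mode mode)
    {
            return mode == WB_SYNC_ALL ? REQ_SYNC_SKETCH : 0;
    }

    int main(void)
    {
            printf("WB_SYNC_ALL  -> %#x\n", sketch_wbc_to_write_flags(WB_SYNC_ALL));
            printf("WB_SYNC_NONE -> %#x\n", sketch_wbc_to_write_flags(WB_SYNC_NONE));
            return 0;
    }
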
diff --combined include/uapi/linux/fs.h
index c1d11df07b289fe47af000de9cf94e7e84cc120f,0496d37abe289884310add3545a2ede4d7de04a8..36da93fbf18860a08e75590e54d34caa6967d9ec
@@@ -225,10 -225,6 +225,10 @@@ struct fsxattr 
  #define BLKSECDISCARD _IO(0x12,125)
  #define BLKROTATIONAL _IO(0x12,126)
  #define BLKZEROOUT _IO(0x12,127)
 +/*
 + * A jump here: 130-131 are reserved for zoned block devices
 + * (see uapi/linux/blkzoned.h)
 + */
  
  #define BMAP_IOCTL 1          /* obsolete - kept for compatibility */
  #define FIBMAP           _IO(0x00,1)  /* bmap access */
  /* Policy provided via an ioctl on the topmost directory */
  #define FS_KEY_DESCRIPTOR_SIZE        8
  
+ #define FS_POLICY_FLAGS_PAD_4         0x00
+ #define FS_POLICY_FLAGS_PAD_8         0x01
+ #define FS_POLICY_FLAGS_PAD_16                0x02
+ #define FS_POLICY_FLAGS_PAD_32                0x03
+ #define FS_POLICY_FLAGS_PAD_MASK      0x03
+ #define FS_POLICY_FLAGS_VALID         0x03
+ /* Encryption algorithms */
+ #define FS_ENCRYPTION_MODE_INVALID            0
+ #define FS_ENCRYPTION_MODE_AES_256_XTS                1
+ #define FS_ENCRYPTION_MODE_AES_256_GCM                2
+ #define FS_ENCRYPTION_MODE_AES_256_CBC                3
+ #define FS_ENCRYPTION_MODE_AES_256_CTS                4
  struct fscrypt_policy {
        __u8 version;
        __u8 contents_encryption_mode;
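
The FS_POLICY_FLAGS_PAD_* values moved into this uapi header control how fscrypt pads encrypted filenames: the two low bits select the padding amount, which fscrypt derives as 4 << (flags & FS_POLICY_FLAGS_PAD_MASK), so PAD_4 through PAD_32 map to 4, 8, 16 and 32 bytes. A quick sketch of that mapping (illustrative; only the uapi constant reused below is taken from the header):

    #include <stdio.h>

    #define FS_POLICY_FLAGS_PAD_MASK        0x03

    /* PAD_4..PAD_32 (values 0..3) select the filename padding in bytes. */
    static unsigned int fname_padding(unsigned int policy_flags)
    {
            return 4u << (policy_flags & FS_POLICY_FLAGS_PAD_MASK);
    }

    int main(void)
    {
            for (unsigned int f = 0; f <= FS_POLICY_FLAGS_PAD_MASK; f++)
                    printf("flags %#x -> %u-byte padding\n", f, fname_padding(f));
            return 0;
    }
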
diff --combined mm/filemap.c
index 5b4dd03130da33b91cca705e6d202fffed177b55,db26ebc6c62f0c9a966f5ab8033f6442527f0eb5..69568388c699493ac694a960ca7c24b90b13e080
@@@ -132,29 -132,43 +132,28 @@@ static int page_cache_tree_insert(struc
                if (!dax_mapping(mapping)) {
                        if (shadowp)
                                *shadowp = p;
 -                      if (node)
 -                              workingset_node_shadows_dec(node);
                } else {
                        /* DAX can replace empty locked entry with a hole */
                        WARN_ON_ONCE(p !=
-                               (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
-                                        RADIX_DAX_ENTRY_LOCK));
+                               dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
 -                      /* DAX accounts exceptional entries as normal pages */
 -                      if (node)
 -                              workingset_node_pages_dec(node);
                        /* Wakeup waiters for exceptional entry lock */
-                       dax_wake_mapping_entry_waiter(mapping, page->index,
+                       dax_wake_mapping_entry_waiter(mapping, page->index, p,
                                                      false);
                }
        }
 -      radix_tree_replace_slot(slot, page);
 +      __radix_tree_replace(&mapping->page_tree, node, slot, page,
 +                           workingset_update_node, mapping);
        mapping->nrpages++;
 -      if (node) {
 -              workingset_node_pages_inc(node);
 -              /*
 -               * Don't track node that contains actual pages.
 -               *
 -               * Avoid acquiring the list_lru lock if already
 -               * untracked.  The list_empty() test is safe as
 -               * node->private_list is protected by
 -               * mapping->tree_lock.
 -               */
 -              if (!list_empty(&node->private_list))
 -                      list_lru_del(&workingset_shadow_nodes,
 -                                   &node->private_list);
 -      }
        return 0;
  }
  
  static void page_cache_tree_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
  {
 -      int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
 +      int i, nr;
 +
 +      /* hugetlb pages are represented by one entry in the radix tree */
 +      nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
  
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
                __radix_tree_lookup(&mapping->page_tree, page->index + i,
                                    &node, &slot);
  
 -              radix_tree_clear_tags(&mapping->page_tree, node, slot);
 -
 -              if (!node) {
 -                      VM_BUG_ON_PAGE(nr != 1, page);
 -                      /*
 -                       * We need a node to properly account shadow
 -                       * entries. Don't plant any without. XXX
 -                       */
 -                      shadow = NULL;
 -              }
 -
 -              radix_tree_replace_slot(slot, shadow);
 +              VM_BUG_ON_PAGE(!node && nr != 1, page);
  
 -              if (!node)
 -                      break;
 -
 -              workingset_node_pages_dec(node);
 -              if (shadow)
 -                      workingset_node_shadows_inc(node);
 -              else
 -                      if (__radix_tree_delete_node(&mapping->page_tree, node))
 -                              continue;
 -
 -              /*
 -               * Track node that only contains shadow entries. DAX mappings
 -               * contain no shadow entries and may contain other exceptional
 -               * entries so skip those.
 -               *
 -               * Avoid acquiring the list_lru lock if already tracked.
 -               * The list_empty() test is safe as node->private_list is
 -               * protected by mapping->tree_lock.
 -               */
 -              if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
 -                              list_empty(&node->private_list)) {
 -                      node->private_data = mapping;
 -                      list_lru_add(&workingset_shadow_nodes,
 -                                      &node->private_list);
 -              }
 +              radix_tree_clear_tags(&mapping->page_tree, node, slot);
 +              __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
 +                                   workingset_update_node, mapping);
        }
  
        if (shadow) {
@@@ -1684,9 -1731,6 +1683,9 @@@ find_page
                        if (inode->i_blkbits == PAGE_SHIFT ||
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
 +                      /* pipes can't handle partially uptodate pages */
 +                      if (unlikely(iter->type & ITER_PIPE))
 +                              goto page_not_up_to_date;
                        if (!trylock_page(page))
                                goto page_not_up_to_date;
                        /* Did it get truncated before we got the lock? */