]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/commitdiff
Merge branch 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 May 2019 02:01:23 +0000 (19:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 May 2019 02:01:23 +0000 (19:01 -0700)
Pull cgroup fix from Tejun Heo:
 "The cgroup2 freezer pulled in this cycle broke strace. This pull
  request includes a workaround for the problem.

  It's not a complete fix in that it may cause spurious frozen state
  flip-flops which is fairly minor. Will push a full fix once it's
  ready"

* 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  signal: unconditionally leave the frozen state in ptrace_stop()

205 files changed:
Documentation/device-mapper/dm-dust.txt [new file with mode: 0644]
Documentation/device-mapper/dm-integrity.txt
Documentation/devicetree/bindings/vendor-prefixes.txt [deleted file]
Documentation/devicetree/bindings/vendor-prefixes.yaml [new file with mode: 0644]
Documentation/media/uapi/v4l/field-order.rst
Documentation/networking/rxrpc.txt
Documentation/x86/mds.rst
arch/Kconfig
arch/alpha/include/asm/segment.h [deleted file]
arch/alpha/kernel/smc37c669.c
arch/alpha/kernel/smc37c93x.c
arch/arc/include/asm/uaccess.h
arch/arm/configs/mini2440_defconfig
arch/arm/configs/pxa_defconfig
arch/arm/include/asm/Kbuild
arch/arm64/include/asm/Kbuild
arch/c6x/include/asm/Kbuild
arch/h8300/Kconfig
arch/h8300/include/asm/Kbuild
arch/h8300/include/asm/uaccess.h [deleted file]
arch/hexagon/include/asm/Kbuild
arch/hexagon/include/asm/uaccess.h
arch/ia64/include/asm/segment.h [deleted file]
arch/mips/configs/ip22_defconfig
arch/mips/configs/ip27_defconfig
arch/mips/include/asm/Kbuild
arch/nds32/include/asm/Kbuild
arch/nios2/include/asm/Kbuild
arch/openrisc/include/asm/Kbuild
arch/openrisc/kernel/ptrace.c
arch/openrisc/kernel/setup.c
arch/openrisc/kernel/traps.c
arch/openrisc/mm/init.c
arch/openrisc/mm/tlb.c
arch/parisc/include/asm/Kbuild
arch/s390/include/asm/segment.h [deleted file]
arch/s390/kernel/ptrace.c
arch/unicore32/configs/unicore32_defconfig
arch/unicore32/include/asm/Kbuild
arch/x86/Kconfig
arch/x86/entry/vdso/vdso2c.c
arch/x86/events/amd/iommu.c
arch/x86/events/intel/core.c
arch/x86/events/perf_event.h
arch/x86/include/asm/arch_hweight.h
arch/x86/include/asm/vdso.h
arch/x86/kernel/kprobes/core.c
arch/x86/kernel/traps.c
arch/x86/mm/init_64.c
arch/x86/mm/mem_encrypt.c
arch/x86/mm/mm_internal.h
arch/xtensa/include/asm/segment.h [deleted file]
drivers/block/rbd.c
drivers/edac/Kconfig
drivers/edac/edac_mc.c
drivers/hwmon/hwmon.c
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-cache-metadata.c
drivers/md/dm-crypt.c
drivers/md/dm-delay.c
drivers/md/dm-dust.c [new file with mode: 0644]
drivers/md/dm-exception-store.h
drivers/md/dm-init.c
drivers/md/dm-integrity.c
drivers/md/dm-ioctl.c
drivers/md/dm-mpath.c
drivers/md/dm-rq.c
drivers/md/dm-snap.c
drivers/md/dm-target.c
drivers/md/dm-thin-metadata.c
drivers/md/dm-writecache.c
drivers/md/dm-zoned-metadata.c
drivers/md/dm-zoned-target.c
drivers/md/dm.c
drivers/md/persistent-data/dm-space-map-common.c
drivers/media/platform/atmel/atmel-isc-regs.h
drivers/media/platform/atmel/atmel-isc.c
drivers/media/platform/coda/coda-common.c
drivers/media/platform/davinci/vpbe.c
drivers/media/platform/omap/omap_vout.c
drivers/media/platform/rcar-vin/rcar-csi2.c
drivers/media/platform/tegra-cec/tegra_cec.c
drivers/net/ethernet/mellanox/mlxsw/Kconfig
drivers/staging/media/imx/imx-ic-prpencvf.c
drivers/staging/media/imx/imx-media-capture.c
drivers/staging/media/imx/imx-media-csi.c
drivers/staging/media/imx/imx-media.h
drivers/staging/media/imx/imx7-media-csi.c
drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c
drivers/staging/media/rockchip/vpu/rockchip_vpu_enc.c
drivers/thermal/Kconfig
drivers/thermal/intel/Kconfig
drivers/thermal/intel/int340x_thermal/int3403_thermal.c
drivers/thermal/intel/int340x_thermal/processor_thermal_device.c
drivers/thermal/qcom/Kconfig
drivers/thermal/thermal_core.c
fs/afs/addr_list.c
fs/afs/afs.h
fs/afs/callback.c
fs/afs/cell.c
fs/afs/cmservice.c
fs/afs/dir.c
fs/afs/dir_silly.c
fs/afs/dynroot.c
fs/afs/file.c
fs/afs/flock.c
fs/afs/fs_probe.c
fs/afs/fsclient.c
fs/afs/inode.c
fs/afs/internal.h
fs/afs/proc.c
fs/afs/rotate.c
fs/afs/rxrpc.c
fs/afs/security.c
fs/afs/server.c
fs/afs/super.c
fs/afs/vl_list.c
fs/afs/vl_probe.c
fs/afs/vl_rotate.c
fs/afs/vlclient.c
fs/afs/write.c
fs/afs/xattr.c
fs/afs/yfsclient.c
fs/ceph/caps.c
fs/ceph/debugfs.c
fs/ceph/export.c
fs/ceph/file.c
fs/ceph/inode.c
fs/ceph/locks.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/mdsmap.c
fs/ceph/quota.c
fs/ceph/super.c
fs/ceph/super.h
fs/cifs/dns_resolve.c
fs/configfs/dir.c
fs/nfs/dns_resolve.c
include/asm-generic/segment.h [deleted file]
include/asm-generic/uaccess.h
include/linux/ceph/ceph_fs.h
include/linux/ceph/messenger.h
include/linux/ceph/osdmap.h
include/linux/device-mapper.h
include/linux/dns_resolver.h
include/linux/list.h
include/linux/list_bl.h
include/linux/overflow.h
include/linux/slab_def.h
include/linux/thermal.h
include/linux/wait_bit.h
include/media/davinci/vpbe.h
include/net/af_rxrpc.h
kernel/locking/rwsem-xadd.c
kernel/time/ntp.c
lib/Kconfig
lib/Kconfig.debug
lib/hweight.c
mm/slab.c
net/ceph/cls_lock_client.c
net/ceph/debugfs.c
net/ceph/messenger.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/dns_resolver/dns_query.c
net/rxrpc/af_rxrpc.c
net/rxrpc/ar-internal.h
net/rxrpc/call_object.c
net/rxrpc/conn_client.c
net/rxrpc/sendmsg.c
tools/objtool/Documentation/stack-validation.txt
tools/objtool/check.c
tools/testing/selftests/.gitignore
tools/testing/selftests/Makefile
tools/testing/selftests/breakpoints/breakpoint_test.c
tools/testing/selftests/breakpoints/breakpoint_test_arm64.c
tools/testing/selftests/breakpoints/step_after_suspend_test.c
tools/testing/selftests/capabilities/test_execve.c
tools/testing/selftests/drivers/.gitignore [new file with mode: 0644]
tools/testing/selftests/futex/functional/futex_requeue_pi.c
tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c
tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c
tools/testing/selftests/futex/functional/futex_wait_timeout.c
tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c
tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
tools/testing/selftests/kselftest.h
tools/testing/selftests/kselftest/prefix.pl [new file with mode: 0755]
tools/testing/selftests/kselftest/runner.sh [new file with mode: 0644]
tools/testing/selftests/lib.mk
tools/testing/selftests/membarrier/membarrier_test.c
tools/testing/selftests/pidfd/.gitignore [new file with mode: 0644]
tools/testing/selftests/pidfd/pidfd_test.c
tools/testing/selftests/rseq/Makefile
tools/testing/selftests/rseq/rseq-arm.h
tools/testing/selftests/rseq/rseq-arm64.h
tools/testing/selftests/rseq/rseq-mips.h
tools/testing/selftests/rseq/rseq-ppc.h
tools/testing/selftests/rseq/rseq-s390.h
tools/testing/selftests/rseq/rseq-x86.h
tools/testing/selftests/rseq/rseq.c
tools/testing/selftests/rseq/rseq.h
tools/testing/selftests/sigaltstack/sas.c
tools/testing/selftests/sync/sync_test.c

diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/device-mapper/dm-dust.txt
new file mode 100644 (file)
index 0000000..954d402
--- /dev/null
@@ -0,0 +1,272 @@
+dm-dust
+=======
+
+This target emulates the behavior of bad sectors at arbitrary
+locations, and the ability to enable the emulation of the failures
+at an arbitrary time.
+
+This target behaves similarly to a linear target.  At a given time,
+the user can send a message to the target to start failing read
+requests on specific blocks (to emulate the behavior of a hard disk
+drive with bad sectors).
+
+When the failure behavior is enabled (i.e.: when the output of
+"dmsetup status" displays "fail_read_on_bad_block"), reads of blocks
+in the "bad block list" will fail with EIO ("Input/output error").
+
+Writes of blocks in the "bad block list" will result in the following:
+
+1. Remove the block from the "bad block list".
+2. Successfully complete the write.
+
+This emulates the "remapped sector" behavior of a drive with bad
+sectors.
+
+Normally, a drive that is encountering bad sectors will most likely
+encounter more bad sectors, at an unknown time or location.
+With dm-dust, the user can use the "addbadblock" and "removebadblock"
+messages to add arbitrary bad blocks at new locations, and the
+"enable" and "disable" messages to modulate the state of whether the
+configured "bad blocks" will be treated as bad, or bypassed.
+This allows the pre-writing of test data and metadata prior to
+simulating a "failure" event where bad sectors start to appear.
+
+Table parameters:
+-----------------
+<device_path> <offset> <blksz>
+
+Mandatory parameters:
+    <device_path>: path to the block device.
+    <offset>: offset to data area from start of device_path
+    <blksz>: block size in bytes
+            (minimum 512, maximum 1073741824, must be a power of 2)
+
+Usage instructions:
+-------------------
+
+First, find the size (in 512-byte sectors) of the device to be used:
+
+$ sudo blockdev --getsz /dev/vdb1
+33552384
+
+Create the dm-dust device:
+(For a device with a block size of 512 bytes)
+$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'
+
+(For a device with a block size of 4096 bytes)
+$ sudo dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 4096'
+
+Check the status of the read behavior ("bypass" indicates that all I/O
+will be passed through to the underlying device):
+$ sudo dmsetup status dust1
+0 33552384 dust 252:17 bypass
+
+$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=128 iflag=direct
+128+0 records in
+128+0 records out
+
+$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
+128+0 records in
+128+0 records out
+
+Adding and removing bad blocks:
+-------------------------------
+
+At any time (i.e.: whether the device has the "bad block" emulation
+enabled or disabled), bad blocks may be added or removed from the
+device via the "addbadblock" and "removebadblock" messages:
+
+$ sudo dmsetup message dust1 0 addbadblock 60
+kernel: device-mapper: dust: badblock added at block 60
+
+$ sudo dmsetup message dust1 0 addbadblock 67
+kernel: device-mapper: dust: badblock added at block 67
+
+$ sudo dmsetup message dust1 0 addbadblock 72
+kernel: device-mapper: dust: badblock added at block 72
+
+These bad blocks will be stored in the "bad block list".
+While the device is in "bypass" mode, reads and writes will succeed:
+
+$ sudo dmsetup status dust1
+0 33552384 dust 252:17 bypass
+
+Enabling block read failures:
+-----------------------------
+
+To enable the "fail read on bad block" behavior, send the "enable" message:
+
+$ sudo dmsetup message dust1 0 enable
+kernel: device-mapper: dust: enabling read failures on bad sectors
+
+$ sudo dmsetup status dust1
+0 33552384 dust 252:17 fail_read_on_bad_block
+
+With the device in "fail read on bad block" mode, attempting to read a
+bad block will encounter an "Input/output error":
+
+$ sudo dd if=/dev/mapper/dust1 of=/dev/null bs=512 count=1 skip=67 iflag=direct
+dd: error reading '/dev/mapper/dust1': Input/output error
+0+0 records in
+0+0 records out
+0 bytes copied, 0.00040651 s, 0.0 kB/s
+
+...and writing to the bad blocks will remove the blocks from the list,
+therefore emulating the "remap" behavior of hard disk drives:
+
+$ sudo dd if=/dev/zero of=/dev/mapper/dust1 bs=512 count=128 oflag=direct
+128+0 records in
+128+0 records out
+
+kernel: device-mapper: dust: block 60 removed from badblocklist by write
+kernel: device-mapper: dust: block 67 removed from badblocklist by write
+kernel: device-mapper: dust: block 72 removed from badblocklist by write
+kernel: device-mapper: dust: block 87 removed from badblocklist by write
+
+Bad block add/remove error handling:
+------------------------------------
+
+Attempting to add a bad block that already exists in the list will
+result in an "Invalid argument" error, as well as a helpful message:
+
+$ sudo dmsetup message dust1 0 addbadblock 88
+device-mapper: message ioctl on dust1  failed: Invalid argument
+kernel: device-mapper: dust: block 88 already in badblocklist
+
+Attempting to remove a bad block that doesn't exist in the list will
+result in an "Invalid argument" error, as well as a helpful message:
+
+$ sudo dmsetup message dust1 0 removebadblock 87
+device-mapper: message ioctl on dust1  failed: Invalid argument
+kernel: device-mapper: dust: block 87 not found in badblocklist
+
+Counting the number of bad blocks in the bad block list:
+--------------------------------------------------------
+
+To count the number of bad blocks configured in the device, run the
+following message command:
+
+$ sudo dmsetup message dust1 0 countbadblocks
+
+A message will print with the number of bad blocks currently
+configured on the device:
+
+kernel: device-mapper: dust: countbadblocks: 895 badblock(s) found
+
+Querying for specific bad blocks:
+---------------------------------
+
+To find out if a specific block is in the bad block list, run the
+following message command:
+
+$ sudo dmsetup message dust1 0 queryblock 72
+
+The following message will print if the block is in the list:
+device-mapper: dust: queryblock: block 72 found in badblocklist
+
+The following message will print if the block is not in the list:
+device-mapper: dust: queryblock: block 72 not found in badblocklist
+
+The "queryblock" message command will work in both the "enabled"
+and "disabled" modes, allowing the verification of whether a block
+will be treated as "bad" without having to issue I/O to the device,
+or having to "enable" the bad block emulation.
+
+Clearing the bad block list:
+----------------------------
+
+To clear the bad block list (without needing to individually run
+a "removebadblock" message command for every block), run the
+following message command:
+
+$ sudo dmsetup message dust1 0 clearbadblocks
+
+After clearing the bad block list, the following message will appear:
+
+kernel: device-mapper: dust: clearbadblocks: badblocks cleared
+
+If there were no bad blocks to clear, the following message will
+appear:
+
+kernel: device-mapper: dust: clearbadblocks: no badblocks found
+
+Message commands list:
+----------------------
+
+Below is a list of the messages that can be sent to a dust device:
+
+Operations on blocks (requires a <blknum> argument):
+
+addbadblock <blknum>
+queryblock <blknum>
+removebadblock <blknum>
+
+...where <blknum> is a block number within range of the device
+  (corresponding to the block size of the device.)
+
+Single argument message commands:
+
+countbadblocks
+clearbadblocks
+disable
+enable
+quiet
+
+Device removal:
+---------------
+
+When finished, remove the device via the "dmsetup remove" command:
+
+$ sudo dmsetup remove dust1
+
+Quiet mode:
+-----------
+
+On test runs with many bad blocks, it may be desirable to avoid
+excessive logging (from bad blocks added, removed, or "remapped").
+This can be done by enabling "quiet mode" via the following message:
+
+$ sudo dmsetup message dust1 0 quiet
+
+This will suppress log messages from add / remove / removed by write
+operations.  Log messages from "countbadblocks" or "queryblock"
+message commands will still print in quiet mode.
+
+The status of quiet mode can be seen by running "dmsetup status":
+
+$ sudo dmsetup status dust1
+0 33552384 dust 252:17 fail_read_on_bad_block quiet
+
+To disable quiet mode, send the "quiet" message again:
+
+$ sudo dmsetup message dust1 0 quiet
+
+$ sudo dmsetup status dust1
+0 33552384 dust 252:17 fail_read_on_bad_block verbose
+
+(The presence of "verbose" indicates normal logging.)
+
+"Why not...?"
+-------------
+
+scsi_debug has a "medium error" mode that can fail reads on one
+specified sector (sector 0x1234, hardcoded in the source code), but
+it uses RAM for the persistent storage, which drastically decreases
+the potential device size.
+
+dm-flakey fails all I/O from all block locations at a specified time
+frequency, and not a given point in time.
+
+When a bad sector occurs on a hard disk drive, reads to that sector
+are failed by the device, usually resulting in an error code of EIO
+("I/O error") or ENODATA ("No data available").  However, a write to
+the sector may succeed, and result in the sector becoming readable
+after the device controller no longer experiences errors reading the
+sector (or after a reallocation of the sector).  However, there may
+be bad sectors that occur on the device in the future, in a different,
+unpredictable location.
+
+This target seeks to provide a device that can exhibit the behavior
+of a bad sector at a known sector location, at a known time, based
+on a large storage device (at least tens of gigabytes, not occupying
+system memory).
index 297251b0d2d5715872449d0b4c556a0fb72cd4dc..d63d78ffeb7308fc8f8bf0c9eabbe89b1c028fb4 100644 (file)
@@ -21,6 +21,13 @@ mode it calculates and verifies the integrity tag internally. In this
 mode, the dm-integrity target can be used to detect silent data
 corruption on the disk or in the I/O path.
 
+There's an alternate mode of operation where dm-integrity uses a bitmap
+instead of a journal. If a bit in the bitmap is 1, the corresponding
+region's data and integrity tags are not synchronized - if the machine
+crashes, the unsynchronized regions will be recalculated. The bitmap mode
+is faster than the journal mode, because we don't have to write the data
+twice, but it is also less reliable, because if data corruption happens
+when the machine crashes, it may not be detected.
 
 When loading the target for the first time, the kernel driver will format
 the device. But it will only format the device if the superblock contains
@@ -59,6 +66,10 @@ Target arguments:
                either both data and tag or none of them are written. The
                journaled mode degrades write throughput twice because the
                data have to be written twice.
+       B - bitmap mode - data and metadata are written without any
+               synchronization, the driver maintains a bitmap of dirty
+               regions where data and metadata don't match. This mode can
+               only be used with internal hash.
        R - recovery mode - in this mode, journal is not replayed,
                checksums are not checked and writes to the device are not
                allowed. This mode is useful for data recovery if the
@@ -79,6 +90,10 @@ interleave_sectors:number
        a power of two. If the device is already formatted, the value from
        the superblock is used.
 
+meta_device:device
+       Don't interleave the data and metadata on the device. Use a
+       separate device for metadata.
+
 buffer_sectors:number
        The number of sectors in one buffer. The value is rounded down to
        a power of two.
@@ -146,6 +161,15 @@ block_size:number
        Supported values are 512, 1024, 2048 and 4096 bytes.  If not
        specified the default block size is 512 bytes.
 
+sectors_per_bit:number
+       In the bitmap mode, this parameter specifies the number of
+       512-byte sectors that corresponds to one bitmap bit.
+
+bitmap_flush_interval:number
+       The bitmap flush interval in milliseconds. The metadata buffers
+       are synchronized when this interval expires.
+
+
 The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
 be changed when reloading the target (load an inactive table and swap the
 tables with suspend and resume). The other arguments should not be changed
@@ -167,7 +191,13 @@ The layout of the formatted block device:
          provides (i.e. the size of the device minus the size of all
          metadata and padding). The user of this target should not send
          bios that access data beyond the "provided data sectors" limit.
-       * flags - a flag is set if journal_mac is used
+       * flags
+         SB_FLAG_HAVE_JOURNAL_MAC - a flag is set if journal_mac is used
+         SB_FLAG_RECALCULATING - recalculating is in progress
+         SB_FLAG_DIRTY_BITMAP - journal area contains the bitmap of dirty
+               blocks
+       * log2(sectors per block)
+       * a position where recalculating finished
 * journal
        The journal is divided into sections, each section contains:
        * metadata area (4kiB), it contains journal entries
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
deleted file mode 100644 (file)
index e9034a6..0000000
+++ /dev/null
@@ -1,476 +0,0 @@
-Device tree binding vendor prefix registry.  Keep list in alphabetical order.
-
-This isn't an exhaustive list, but you should add new prefixes to it before
-using them to avoid name-space collisions.
-
-abilis Abilis Systems
-abracon        Abracon Corporation
-actions        Actions Semiconductor Co., Ltd.
-active-semi    Active-Semi International Inc
-ad     Avionic Design GmbH
-adafruit       Adafruit Industries, LLC
-adapteva       Adapteva, Inc.
-adaptrum       Adaptrum, Inc.
-adh    AD Holdings Plc.
-adi    Analog Devices, Inc.
-advantech      Advantech Corporation
-aeroflexgaisler        Aeroflex Gaisler AB
-al     Annapurna Labs
-allo   Allo.com
-allwinner      Allwinner Technology Co., Ltd.
-alphascale     AlphaScale Integrated Circuits Systems, Inc.
-altr   Altera Corp.
-amarula        Amarula Solutions
-amazon Amazon.com, Inc.
-amcc   Applied Micro Circuits Corporation (APM, formally AMCC)
-amd    Advanced Micro Devices (AMD), Inc.
-amediatech     Shenzhen Amediatech Technology Co., Ltd
-amlogic        Amlogic, Inc.
-ampire Ampire Co., Ltd.
-ams    AMS AG
-amstaos        AMS-Taos Inc.
-analogix       Analogix Semiconductor, Inc.
-andestech      Andes Technology Corporation
-apm    Applied Micro Circuits Corporation (APM)
-aptina Aptina Imaging
-arasan Arasan Chip Systems
-archermind ArcherMind Technology (Nanjing) Co., Ltd.
-arctic Arctic Sand
-arcx   arcx Inc. / Archronix Inc.
-aries  Aries Embedded GmbH
-arm    ARM Ltd.
-armadeus       ARMadeus Systems SARL
-arrow  Arrow Electronics
-artesyn        Artesyn Embedded Technologies Inc.
-asahi-kasei    Asahi Kasei Corp.
-aspeed ASPEED Technology Inc.
-asus   AsusTek Computer Inc.
-atlas  Atlas Scientific LLC
-atmel  Atmel Corporation
-auo    AU Optronics Corporation
-auvidea Auvidea GmbH
-avago  Avago Technologies
-avia   avia semiconductor
-avic   Shanghai AVIC Optoelectronics Co., Ltd.
-avnet  Avnet, Inc.
-axentia        Axentia Technologies AB
-axis   Axis Communications AB
-azoteq Azoteq (Pty) Ltd
-azw     Shenzhen AZW Technology Co., Ltd.
-bananapi BIPAI KEJI LIMITED
-bhf    Beckhoff Automation GmbH & Co. KG
-bitmain        Bitmain Technologies
-boe    BOE Technology Group Co., Ltd.
-bosch  Bosch Sensortec GmbH
-boundary       Boundary Devices Inc.
-brcm   Broadcom Corporation
-buffalo        Buffalo, Inc.
-bticino Bticino International
-calxeda        Calxeda
-capella        Capella Microsystems, Inc
-cascoda        Cascoda, Ltd.
-catalyst       Catalyst Semiconductor, Inc.
-cavium Cavium, Inc.
-cdns   Cadence Design Systems Inc.
-cdtech CDTech(H.K.) Electronics Limited
-ceva   Ceva, Inc.
-chipidea       Chipidea, Inc
-chipone                ChipOne
-chipspark      ChipSPARK
-chrp   Common Hardware Reference Platform
-chunghwa       Chunghwa Picture Tubes Ltd.
-ciaa   Computadora Industrial Abierta Argentina
-cirrus Cirrus Logic, Inc.
-cloudengines   Cloud Engines, Inc.
-cnm    Chips&Media, Inc.
-cnxt   Conexant Systems, Inc.
-compulab       CompuLab Ltd.
-cortina        Cortina Systems, Inc.
-cosmic Cosmic Circuits
-crane  Crane Connectivity Solutions
-creative       Creative Technology Ltd
-crystalfontz   Crystalfontz America, Inc.
-csky   Hangzhou C-SKY Microsystems Co., Ltd
-cubietech      Cubietech, Ltd.
-cypress        Cypress Semiconductor Corporation
-cznic  CZ.NIC, z.s.p.o.
-dallas Maxim Integrated Products (formerly Dallas Semiconductor)
-dataimage      DataImage, Inc.
-davicom        DAVICOM Semiconductor, Inc.
-delta  Delta Electronics, Inc.
-denx   Denx Software Engineering
-devantech      Devantech, Ltd.
-dh     DH electronics GmbH
-digi   Digi International Inc.
-digilent       Diglent, Inc.
-dioo   Dioo Microcircuit Co., Ltd
-dlc    DLC Display Co., Ltd.
-dlg    Dialog Semiconductor
-dlink  D-Link Corporation
-dmo    Data Modul AG
-domintech      Domintech Co., Ltd.
-dongwoon       Dongwoon Anatech
-dptechnics     DPTechnics
-dragino        Dragino Technology Co., Limited
-ea     Embedded Artists AB
-ebs-systart EBS-SYSTART GmbH
-ebv    EBV Elektronik
-eckelmann      Eckelmann AG
-edt    Emerging Display Technologies
-eeti   eGalax_eMPIA Technology Inc
-elan   Elan Microelectronic Corp.
-elgin  Elgin S/A.
-embest Shenzhen Embest Technology Co., Ltd.
-emlid  Emlid, Ltd.
-emmicro        EM Microelectronic
-emtrion        emtrion GmbH
-endless        Endless Mobile, Inc.
-energymicro    Silicon Laboratories (formerly Energy Micro AS)
-engicam        Engicam S.r.l.
-epcos  EPCOS AG
-epfl   Ecole Polytechnique Fédérale de Lausanne
-epson  Seiko Epson Corp.
-est    ESTeem Wireless Modems
-ettus  NI Ettus Research
-eukrea  Eukréa Electromatique
-everest        Everest Semiconductor Co. Ltd.
-everspin       Everspin Technologies, Inc.
-exar   Exar Corporation
-excito Excito
-ezchip EZchip Semiconductor
-facebook       Facebook
-fairphone      Fairphone B.V.
-faraday        Faraday Technology Corporation
-fastrax        Fastrax Oy
-fcs    Fairchild Semiconductor
-feiyang        Shenzhen Fly Young Technology Co.,LTD.
-firefly        Firefly
-focaltech      FocalTech Systems Co.,Ltd
-friendlyarm    Guangzhou FriendlyARM Computer Tech Co., Ltd
-fsl    Freescale Semiconductor
-fujitsu        Fujitsu Ltd.
-gateworks      Gateworks Corporation
-gcw Game Consoles Worldwide
-ge     General Electric Company
-geekbuying     GeekBuying
-gef    GE Fanuc Intelligent Platforms Embedded Systems, Inc.
-GEFanuc        GE Fanuc Intelligent Platforms Embedded Systems, Inc.
-geniatech      Geniatech, Inc.
-giantec        Giantec Semiconductor, Inc.
-giantplus      Giantplus Technology Co., Ltd.
-globalscale    Globalscale Technologies, Inc.
-globaltop      GlobalTop Technology, Inc.
-gmt    Global Mixed-mode Technology, Inc.
-goodix Shenzhen Huiding Technology Co., Ltd.
-google Google, Inc.
-grinn  Grinn
-grmn   Garmin Limited
-gumstix        Gumstix, Inc.
-gw     Gateworks Corporation
-hannstar       HannStar Display Corporation
-haoyu  Haoyu Microelectronic Co. Ltd.
-hardkernel     Hardkernel Co., Ltd
-hideep HiDeep Inc.
-himax  Himax Technologies, Inc.
-hisilicon      Hisilicon Limited.
-hit    Hitachi Ltd.
-hitex  Hitex Development Tools
-holt   Holt Integrated Circuits, Inc.
-honeywell      Honeywell
-hp     Hewlett Packard
-holtek Holtek Semiconductor, Inc.
-hwacom HwaCom Systems Inc.
-i2se   I2SE GmbH
-ibm    International Business Machines (IBM)
-icplus IC Plus Corp.
-idt    Integrated Device Technologies, Inc.
-ifi    Ingenieurburo Fur Ic-Technologie (I/F/I)
-ilitek ILI Technology Corporation (ILITEK)
-img    Imagination Technologies Ltd.
-infineon Infineon Technologies
-inforce        Inforce Computing
-ingenic        Ingenic Semiconductor
-innolux        Innolux Corporation
-inside-secure  INSIDE Secure
-intel  Intel Corporation
-intercontrol   Inter Control Group
-invensense     InvenSense Inc.
-inversepath    Inverse Path
-iom    Iomega Corporation
-isee   ISEE 2007 S.L.
-isil   Intersil
-issi   Integrated Silicon Solutions Inc.
-itead  ITEAD Intelligent Systems Co.Ltd
-iwave  iWave Systems Technologies Pvt. Ltd.
-jdi    Japan Display Inc.
-jedec  JEDEC Solid State Technology Association
-jianda Jiandangjing Technology Co., Ltd.
-karo   Ka-Ro electronics GmbH
-keithkoep      Keith & Koep GmbH
-keymile        Keymile GmbH
-khadas Khadas
-kiebackpeter    Kieback & Peter GmbH
-kinetic Kinetic Technologies
-kingdisplay    King & Display Technology Co., Ltd.
-kingnovel      Kingnovel Technology Co., Ltd.
-kionix Kionix, Inc.
-kobo   Rakuten Kobo Inc.
-koe    Kaohsiung Opto-Electronics Inc.
-kosagi Sutajio Ko-Usagi PTE Ltd.
-kyo    Kyocera Corporation
-lacie  LaCie
-laird  Laird PLC
-lantiq Lantiq Semiconductor
-lattice        Lattice Semiconductor
-lego   LEGO Systems A/S
-lemaker        Shenzhen LeMaker Technology Co., Ltd.
-lenovo Lenovo Group Ltd.
-lg     LG Corporation
-libretech      Shenzhen Libre Technology Co., Ltd
-licheepi       Lichee Pi
-linaro Linaro Limited
-linksys        Belkin International, Inc. (Linksys)
-linux  Linux-specific binding
-linx   Linx Technologies
-lltc   Linear Technology Corporation
-logicpd        Logic PD, Inc.
-lsi    LSI Corp. (LSI Logic)
-lwn    Liebherr-Werk Nenzing GmbH
-macnica        Macnica Americas
-marvell        Marvell Technology Group Ltd.
-maxbotix       MaxBotix Inc.
-maxim  Maxim Integrated Products
-mbvl   Mobiveil Inc.
-mcube  mCube
-meas   Measurement Specialties
-mediatek       MediaTek Inc.
-megachips      MegaChips
-mele   Shenzhen MeLE Digital Technology Ltd.
-melexis        Melexis N.V.
-melfas MELFAS Inc.
-mellanox       Mellanox Technologies
-memsic MEMSIC Inc.
-menlo  Menlo Systems GmbH
-merrii Merrii Technology Co., Ltd.
-micrel Micrel Inc.
-microchip      Microchip Technology Inc.
-microcrystal   Micro Crystal AG
-micron Micron Technology Inc.
-mikroe         MikroElektronika d.o.o.
-minix  MINIX Technology Ltd.
-miramems       MiraMEMS Sensing Technology Co., Ltd.
-mitsubishi     Mitsubishi Electric Corporation
-mosaixtech     Mosaix Technologies, Inc.
-motorola       Motorola, Inc.
-moxa   Moxa Inc.
-mpl    MPL AG
-mqmaker        mqmaker Inc.
-mscc   Microsemi Corporation
-msi    Micro-Star International Co. Ltd.
-mti    Imagination Technologies Ltd. (formerly MIPS Technologies Inc.)
-multi-inno     Multi-Inno Technology Co.,Ltd
-mundoreader    Mundo Reader S.L.
-murata Murata Manufacturing Co., Ltd.
-mxicy  Macronix International Co., Ltd.
-myir   MYIR Tech Limited
-national       National Semiconductor
-nec    NEC LCD Technologies, Ltd.
-neonode                Neonode Inc.
-netgear        NETGEAR
-netlogic       Broadcom Corporation (formerly NetLogic Microsystems)
-netron-dy      Netron DY
-netxeon                Shenzhen Netxeon Technology CO., LTD
-nexbox Nexbox
-nextthing      Next Thing Co.
-newhaven       Newhaven Display International
-ni     National Instruments
-nintendo       Nintendo
-nlt    NLT Technologies, Ltd.
-nokia  Nokia
-nordic Nordic Semiconductor
-novtech NovTech, Inc.
-nutsboard      NutsBoard
-nuvoton        Nuvoton Technology Corporation
-nvd    New Vision Display
-nvidia NVIDIA
-nxp    NXP Semiconductors
-oceanic        Oceanic Systems (UK) Ltd.
-okaya  Okaya Electric America, Inc.
-oki    Oki Electric Industry Co., Ltd.
-olimex OLIMEX Ltd.
-olpc   One Laptop Per Child
-onion  Onion Corporation
-onnn   ON Semiconductor Corp.
-ontat  On Tat Industrial Company
-opalkelly      Opal Kelly Incorporated
-opencores      OpenCores.org
-openrisc       OpenRISC.io
-option Option NV
-oranth Shenzhen Oranth Technology Co., Ltd.
-ORCL   Oracle Corporation
-orisetech      Orise Technology
-ortustech      Ortus Technology Co., Ltd.
-osddisplays    OSD Displays
-ovti   OmniVision Technologies
-oxsemi Oxford Semiconductor, Ltd.
-panasonic      Panasonic Corporation
-parade Parade Technologies Inc.
-pda    Precision Design Associates, Inc.
-pericom        Pericom Technology Inc.
-pervasive      Pervasive Displays, Inc.
-phicomm PHICOMM Co., Ltd.
-phytec PHYTEC Messtechnik GmbH
-picochip       Picochip Ltd
-pine64 Pine64
-pixcir  PIXCIR MICROELECTRONICS Co., Ltd
-plantower Plantower Co., Ltd
-plathome       Plat'Home Co., Ltd.
-plda   PLDA
-plx    Broadcom Corporation (formerly PLX Technology)
-pni    PNI Sensor Corporation
-portwell       Portwell Inc.
-poslab Poslab Technology Co., Ltd.
-powervr        PowerVR (deprecated, use img)
-probox2        PROBOX2 (by W2COMP Co., Ltd.)
-pulsedlight    PulsedLight, Inc
-qca    Qualcomm Atheros, Inc.
-qcom   Qualcomm Technologies, Inc
-qemu   QEMU, a generic and open source machine emulator and virtualizer
-qi     Qi Hardware
-qiaodian       QiaoDian XianShi Corporation
-qnap   QNAP Systems, Inc.
-radxa  Radxa
-raidsonic      RaidSonic Technology GmbH
-ralink Mediatek/Ralink Technology Corp.
-ramtron        Ramtron International
-raspberrypi    Raspberry Pi Foundation
-raydium        Raydium Semiconductor Corp.
-rda    Unisoc Communications, Inc.
-realtek Realtek Semiconductor Corp.
-renesas        Renesas Electronics Corporation
-richtek        Richtek Technology Corporation
-ricoh  Ricoh Co. Ltd.
-rikomagic      Rikomagic Tech Corp. Ltd
-riscv  RISC-V Foundation
-rockchip       Fuzhou Rockchip Electronics Co., Ltd
-rocktech       ROCKTECH DISPLAYS LIMITED
-rohm   ROHM Semiconductor Co., Ltd
-ronbo   Ronbo Electronics
-roofull        Shenzhen Roofull Technology Co, Ltd
-samsung        Samsung Semiconductor
-samtec Samtec/Softing company
-sancloud       Sancloud Ltd
-sandisk        Sandisk Corporation
-sbs    Smart Battery System
-schindler      Schindler
-seagate        Seagate Technology PLC
-seirobotics    Shenzhen SEI Robotics Co., Ltd
-semtech        Semtech Corporation
-sensirion      Sensirion AG
-sff    Small Form Factor Committee
-sgd    Solomon Goldentek Display Corporation
-sgx    SGX Sensortech
-sharp  Sharp Corporation
-shimafuji      Shimafuji Electric, Inc.
-si-en  Si-En Technology Ltd.
-si-linux       Silicon Linux Corporation
-sifive SiFive, Inc.
-sigma  Sigma Designs, Inc.
-sii    Seiko Instruments, Inc.
-sil    Silicon Image
-silabs Silicon Laboratories
-silead Silead Inc.
-silergy        Silergy Corp.
-siliconmitus   Silicon Mitus, Inc.
-simtek
-sirf   SiRF Technology, Inc.
-sis    Silicon Integrated Systems Corp.
-sitronix       Sitronix Technology Corporation
-skyworks       Skyworks Solutions, Inc.
-smsc   Standard Microsystems Corporation
-snps   Synopsys, Inc.
-socionext      Socionext Inc.
-solidrun       SolidRun
-solomon        Solomon Systech Limited
-sony   Sony Corporation
-spansion       Spansion Inc.
-sprd   Spreadtrum Communications Inc.
-sst    Silicon Storage Technology, Inc.
-st     STMicroelectronics
-starry Starry Electronic Technology (ShenZhen) Co., LTD
-startek        Startek
-ste    ST-Ericsson
-stericsson     ST-Ericsson
-summit Summit microelectronics
-sunchip        Shenzhen Sunchip Technology Co., Ltd
-SUNW   Sun Microsystems, Inc
-swir   Sierra Wireless
-syna   Synaptics Inc.
-synology       Synology, Inc.
-tbs    TBS Technologies
-tbs-biometrics Touchless Biometric Systems AG
-tcg    Trusted Computing Group
-tcl    Toby Churchill Ltd.
-technexion     TechNexion
-technologic    Technologic Systems
-tempo  Tempo Semiconductor
-techstar       Shenzhen Techstar Electronics Co., Ltd.
-terasic        Terasic Inc.
-thine  THine Electronics, Inc.
-ti     Texas Instruments
-tianma Tianma Micro-electronics Co., Ltd.
-tlm    Trusted Logic Mobility
-tmt    Tecon Microprocessor Technologies, LLC.
-topeet  Topeet
-toradex        Toradex AG
-toshiba        Toshiba Corporation
-toumaz Toumaz
-tpk    TPK U.S.A. LLC
-tplink TP-LINK Technologies Co., Ltd.
-tpo    TPO
-tq     TQ Systems GmbH
-tronfy Tronfy
-tronsmart      Tronsmart
-truly  Truly Semiconductors Limited
-tsd    Theobroma Systems Design und Consulting GmbH
-tyan   Tyan Computer Corporation
-u-blox u-blox
-ucrobotics     uCRobotics
-ubnt   Ubiquiti Networks
-udoo   Udoo
-uniwest        United Western Technologies Corp (UniWest)
-upisemi        uPI Semiconductor Corp.
-urt    United Radiant Technology Corporation
-usi    Universal Scientific Industrial Co., Ltd.
-v3     V3 Semiconductor
-vamrs  Vamrs Ltd.
-variscite      Variscite Ltd.
-via    VIA Technologies, Inc.
-virtio Virtual I/O Device Specification, developed by the OASIS consortium
-vishay Vishay Intertechnology, Inc
-vitesse        Vitesse Semiconductor Corporation
-vivante        Vivante Corporation
-vocore VoCore Studio
-voipac Voipac Technologies s.r.o.
-vot    Vision Optical Technology Co., Ltd.
-wd     Western Digital Corp.
-wetek  WeTek Electronics, limited.
-wexler Wexler
-whwave  Shenzhen whwave Electronics, Inc.
-wi2wi  Wi2Wi, Inc.
-winbond Winbond Electronics corp.
-winstar        Winstar Display Corp.
-wlf    Wolfson Microelectronics
-wm     Wondermedia Technologies, Inc.
-x-powers       X-Powers
-xes    Extreme Engineering Solutions (X-ES)
-xillybus       Xillybus Ltd.
-xlnx   Xilinx
-xunlong        Shenzhen Xunlong Software CO.,Limited
-ysoft  Y Soft Corporation a.s.
-zarlink        Zarlink Semiconductor
-zeitec ZEITEC Semiconductor Co., LTD.
-zidoo  Shenzhen Zidoo Technology Co., Ltd.
-zii    Zodiac Inflight Innovations
-zte    ZTE Corp.
-zyxel  ZyXEL Communications Corp.
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml
new file mode 100644 (file)
index 0000000..33a65a4
--- /dev/null
@@ -0,0 +1,977 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/vendor-prefixes.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Devicetree Vendor Prefix Registry
+
+maintainers:
+  - Rob Herring <robh@kernel.org>
+
+select: true
+
+properties: {}
+
+patternProperties:
+  # Prefixes which are not vendors, but followed the pattern
+  # DO NOT ADD NEW PROPERTIES TO THIS LIST
+  "^(at25|devbus|dmacap|dsa|exynos|gpio-fan|gpio|gpmc|hdmi|i2c-gpio),.*": true
+  "^(keypad|m25p|max8952|max8997|max8998|mpmc),.*": true
+  "^(pinctrl-single|#pinctrl-single|PowerPC),.*": true
+  "^(pl022|pxa-mmc|rcar_sound|rotary-encoder|s5m8767|sdhci),.*": true
+  "^(simple-audio-card|simple-graph-card|st-plgpio|st-spics|ts),.*": true
+
+  # Keep list in alphabetical order.
+  "^abilis,.*":
+    description: Abilis Systems
+  "^abracon,.*":
+    description: Abracon Corporation
+  "^actions,.*":
+    description: Actions Semiconductor Co., Ltd.
+  "^active-semi,.*":
+    description: Active-Semi International Inc
+  "^ad,.*":
+    description: Avionic Design GmbH
+  "^adafruit,.*":
+    description: Adafruit Industries, LLC
+  "^adapteva,.*":
+    description: Adapteva, Inc.
+  "^adaptrum,.*":
+    description: Adaptrum, Inc.
+  "^adh,.*":
+    description: AD Holdings Plc.
+  "^adi,.*":
+    description: Analog Devices, Inc.
+  "^advantech,.*":
+    description: Advantech Corporation
+  "^aeroflexgaisler,.*":
+    description: Aeroflex Gaisler AB
+  "^al,.*":
+    description: Annapurna Labs
+  "^allo,.*":
+    description: Allo.com
+  "^allwinner,.*":
+    description: Allwinner Technology Co., Ltd.
+  "^alphascale,.*":
+    description: AlphaScale Integrated Circuits Systems, Inc.
+  "^altr,.*":
+    description: Altera Corp.
+  "^amarula,.*":
+    description: Amarula Solutions
+  "^amazon,.*":
+    description: Amazon.com, Inc.
+  "^amcc,.*":
+    description: Applied Micro Circuits Corporation (APM, formerly AMCC)
+  "^amd,.*":
+    description: Advanced Micro Devices (AMD), Inc.
+  "^amediatech,.*":
+    description: Shenzhen Amediatech Technology Co., Ltd
+  "^amlogic,.*":
+    description: Amlogic, Inc.
+  "^ampire,.*":
+    description: Ampire Co., Ltd.
+  "^ams,.*":
+    description: AMS AG
+  "^amstaos,.*":
+    description: AMS-Taos Inc.
+  "^analogix,.*":
+    description: Analogix Semiconductor, Inc.
+  "^andestech,.*":
+    description: Andes Technology Corporation
+  "^apm,.*":
+    description: Applied Micro Circuits Corporation (APM)
+  "^aptina,.*":
+    description: Aptina Imaging
+  "^arasan,.*":
+    description: Arasan Chip Systems
+  "^archermind,.*":
+    description: ArcherMind Technology (Nanjing) Co., Ltd.
+  "^arctic,.*":
+    description: Arctic Sand
+  "^arcx,.*":
+    description: arcx Inc. / Archronix Inc.
+  "^aries,.*":
+    description: Aries Embedded GmbH
+  "^arm,.*":
+    description: ARM Ltd.
+  "^armadeus,.*":
+    description: ARMadeus Systems SARL
+  "^arrow,.*":
+    description: Arrow Electronics
+  "^artesyn,.*":
+    description: Artesyn Embedded Technologies Inc.
+  "^asahi-kasei,.*":
+    description: Asahi Kasei Corp.
+  "^aspeed,.*":
+    description: ASPEED Technology Inc.
+  "^asus,.*":
+    description: AsusTek Computer Inc.
+  "^atlas,.*":
+    description: Atlas Scientific LLC
+  "^atmel,.*":
+    description: Atmel Corporation
+  "^auo,.*":
+    description: AU Optronics Corporation
+  "^auvidea,.*":
+    description: Auvidea GmbH
+  "^avago,.*":
+    description: Avago Technologies
+  "^avia,.*":
+    description: avia semiconductor
+  "^avic,.*":
+    description: Shanghai AVIC Optoelectronics Co., Ltd.
+  "^avnet,.*":
+    description: Avnet, Inc.
+  "^axentia,.*":
+    description: Axentia Technologies AB
+  "^axis,.*":
+    description: Axis Communications AB
+  "^azoteq,.*":
+    description: Azoteq (Pty) Ltd
+  "^azw,.*":
+    description: Shenzhen AZW Technology Co., Ltd.
+  "^bananapi,.*":
+    description: BIPAI KEJI LIMITED
+  "^bhf,.*":
+    description: Beckhoff Automation GmbH & Co. KG
+  "^bitmain,.*":
+    description: Bitmain Technologies
+  "^boe,.*":
+    description: BOE Technology Group Co., Ltd.
+  "^bosch,.*":
+    description: Bosch Sensortec GmbH
+  "^boundary,.*":
+    description: Boundary Devices Inc.
+  "^brcm,.*":
+    description: Broadcom Corporation
+  "^buffalo,.*":
+    description: Buffalo, Inc.
+  "^bticino,.*":
+    description: Bticino International
+  "^calxeda,.*":
+    description: Calxeda
+  "^capella,.*":
+    description: Capella Microsystems, Inc
+  "^cascoda,.*":
+    description: Cascoda, Ltd.
+  "^catalyst,.*":
+    description: Catalyst Semiconductor, Inc.
+  "^cavium,.*":
+    description: Cavium, Inc.
+  "^cdns,.*":
+    description: Cadence Design Systems Inc.
+  "^cdtech,.*":
+    description: CDTech(H.K.) Electronics Limited
+  "^ceva,.*":
+    description: Ceva, Inc.
+  "^chipidea,.*":
+    description: Chipidea, Inc
+  "^chipone,.*":
+    description: ChipOne
+  "^chipspark,.*":
+    description: ChipSPARK
+  "^chrp,.*":
+    description: Common Hardware Reference Platform
+  "^chunghwa,.*":
+    description: Chunghwa Picture Tubes Ltd.
+  "^ciaa,.*":
+    description: Computadora Industrial Abierta Argentina
+  "^cirrus,.*":
+    description: Cirrus Logic, Inc.
+  "^cloudengines,.*":
+    description: Cloud Engines, Inc.
+  "^cnm,.*":
+    description: Chips&Media, Inc.
+  "^cnxt,.*":
+    description: Conexant Systems, Inc.
+  "^compulab,.*":
+    description: CompuLab Ltd.
+  "^cortina,.*":
+    description: Cortina Systems, Inc.
+  "^cosmic,.*":
+    description: Cosmic Circuits
+  "^crane,.*":
+    description: Crane Connectivity Solutions
+  "^creative,.*":
+    description: Creative Technology Ltd
+  "^crystalfontz,.*":
+    description: Crystalfontz America, Inc.
+  "^csky,.*":
+    description: Hangzhou C-SKY Microsystems Co., Ltd
+  "^cubietech,.*":
+    description: Cubietech, Ltd.
+  "^cypress,.*":
+    description: Cypress Semiconductor Corporation
+  "^cznic,.*":
+    description: CZ.NIC, z.s.p.o.
+  "^dallas,.*":
+    description: Maxim Integrated Products (formerly Dallas Semiconductor)
+  "^dataimage,.*":
+    description: DataImage, Inc.
+  "^davicom,.*":
+    description: DAVICOM Semiconductor, Inc.
+  "^delta,.*":
+    description: Delta Electronics, Inc.
+  "^denx,.*":
+    description: Denx Software Engineering
+  "^devantech,.*":
+    description: Devantech, Ltd.
+  "^dh,.*":
+    description: DH electronics GmbH
+  "^digi,.*":
+    description: Digi International Inc.
+  "^digilent,.*":
+    description: Digilent, Inc.
+  "^dioo,.*":
+    description: Dioo Microcircuit Co., Ltd
+  "^dlc,.*":
+    description: DLC Display Co., Ltd.
+  "^dlg,.*":
+    description: Dialog Semiconductor
+  "^dlink,.*":
+    description: D-Link Corporation
+  "^dmo,.*":
+    description: Data Modul AG
+  "^domintech,.*":
+    description: Domintech Co., Ltd.
+  "^dongwoon,.*":
+    description: Dongwoon Anatech
+  "^dptechnics,.*":
+    description: DPTechnics
+  "^dragino,.*":
+    description: Dragino Technology Co., Limited
+  "^ea,.*":
+    description: Embedded Artists AB
+  "^ebs-systart,.*":
+    description: EBS-SYSTART GmbH
+  "^ebv,.*":
+    description: EBV Elektronik
+  "^eckelmann,.*":
+    description: Eckelmann AG
+  "^edt,.*":
+    description: Emerging Display Technologies
+  "^eeti,.*":
+    description: eGalax_eMPIA Technology Inc
+  "^elan,.*":
+    description: Elan Microelectronic Corp.
+  "^elgin,.*":
+    description: Elgin S/A.
+  "^embest,.*":
+    description: Shenzhen Embest Technology Co., Ltd.
+  "^emlid,.*":
+    description: Emlid, Ltd.
+  "^emmicro,.*":
+    description: EM Microelectronic
+  "^emtrion,.*":
+    description: emtrion GmbH
+  "^endless,.*":
+    description: Endless Mobile, Inc.
+  "^energymicro,.*":
+    description: Silicon Laboratories (formerly Energy Micro AS)
+  "^engicam,.*":
+    description: Engicam S.r.l.
+  "^epcos,.*":
+    description: EPCOS AG
+  "^epfl,.*":
+    description: Ecole Polytechnique Fédérale de Lausanne
+  "^epson,.*":
+    description: Seiko Epson Corp.
+  "^est,.*":
+    description: ESTeem Wireless Modems
+  "^ettus,.*":
+    description: NI Ettus Research
+  "^eukrea,.*":
+    description: Eukréa Electromatique
+  "^everest,.*":
+    description: Everest Semiconductor Co. Ltd.
+  "^everspin,.*":
+    description: Everspin Technologies, Inc.
+  "^exar,.*":
+    description: Exar Corporation
+  "^excito,.*":
+    description: Excito
+  "^ezchip,.*":
+    description: EZchip Semiconductor
+  "^facebook,.*":
+    description: Facebook
+  "^fairphone,.*":
+    description: Fairphone B.V.
+  "^faraday,.*":
+    description: Faraday Technology Corporation
+  "^fastrax,.*":
+    description: Fastrax Oy
+  "^fcs,.*":
+    description: Fairchild Semiconductor
+  "^feiyang,.*":
+    description: Shenzhen Fly Young Technology Co.,LTD.
+  "^firefly,.*":
+    description: Firefly
+  "^focaltech,.*":
+    description: FocalTech Systems Co.,Ltd
+  "^friendlyarm,.*":
+    description: Guangzhou FriendlyARM Computer Tech Co., Ltd
+  "^fsl,.*":
+    description: Freescale Semiconductor
+  "^fujitsu,.*":
+    description: Fujitsu Ltd.
+  "^gateworks,.*":
+    description: Gateworks Corporation
+  "^gcw,.*":
+    description: Game Consoles Worldwide
+  "^ge,.*":
+    description: General Electric Company
+  "^geekbuying,.*":
+    description: GeekBuying
+  "^gef,.*":
+    description: GE Fanuc Intelligent Platforms Embedded Systems, Inc.
+  "^GEFanuc,.*":
+    description: GE Fanuc Intelligent Platforms Embedded Systems, Inc.
+  "^geniatech,.*":
+    description: Geniatech, Inc.
+  "^giantec,.*":
+    description: Giantec Semiconductor, Inc.
+  "^giantplus,.*":
+    description: Giantplus Technology Co., Ltd.
+  "^globalscale,.*":
+    description: Globalscale Technologies, Inc.
+  "^globaltop,.*":
+    description: GlobalTop Technology, Inc.
+  "^gmt,.*":
+    description: Global Mixed-mode Technology, Inc.
+  "^goodix,.*":
+    description: Shenzhen Huiding Technology Co., Ltd.
+  "^google,.*":
+    description: Google, Inc.
+  "^grinn,.*":
+    description: Grinn
+  "^grmn,.*":
+    description: Garmin Limited
+  "^gumstix,.*":
+    description: Gumstix, Inc.
+  "^gw,.*":
+    description: Gateworks Corporation
+  "^hannstar,.*":
+    description: HannStar Display Corporation
+  "^haoyu,.*":
+    description: Haoyu Microelectronic Co. Ltd.
+  "^hardkernel,.*":
+    description: Hardkernel Co., Ltd
+  "^hideep,.*":
+    description: HiDeep Inc.
+  "^himax,.*":
+    description: Himax Technologies, Inc.
+  "^hisilicon,.*":
+    description: Hisilicon Limited.
+  "^hit,.*":
+    description: Hitachi Ltd.
+  "^hitex,.*":
+    description: Hitex Development Tools
+  "^holt,.*":
+    description: Holt Integrated Circuits, Inc.
+  "^honeywell,.*":
+    description: Honeywell
+  "^hp,.*":
+    description: Hewlett Packard
+  "^holtek,.*":
+    description: Holtek Semiconductor, Inc.
+  "^hwacom,.*":
+    description: HwaCom Systems Inc.
+  "^i2se,.*":
+    description: I2SE GmbH
+  "^ibm,.*":
+    description: International Business Machines (IBM)
+  "^icplus,.*":
+    description: IC Plus Corp.
+  "^idt,.*":
+    description: Integrated Device Technologies, Inc.
+  "^ifi,.*":
+    description: Ingenieurburo Fur Ic-Technologie (I/F/I)
+  "^ilitek,.*":
+    description: ILI Technology Corporation (ILITEK)
+  "^img,.*":
+    description: Imagination Technologies Ltd.
+  "^infineon,.*":
+    description: Infineon Technologies
+  "^inforce,.*":
+    description: Inforce Computing
+  "^ingenic,.*":
+    description: Ingenic Semiconductor
+  "^innolux,.*":
+    description: Innolux Corporation
+  "^inside-secure,.*":
+    description: INSIDE Secure
+  "^intel,.*":
+    description: Intel Corporation
+  "^intercontrol,.*":
+    description: Inter Control Group
+  "^invensense,.*":
+    description: InvenSense Inc.
+  "^inversepath,.*":
+    description: Inverse Path
+  "^iom,.*":
+    description: Iomega Corporation
+  "^isee,.*":
+    description: ISEE 2007 S.L.
+  "^isil,.*":
+    description: Intersil
+  "^issi,.*":
+    description: Integrated Silicon Solutions Inc.
+  "^itead,.*":
+    description: ITEAD Intelligent Systems Co.Ltd
+  "^iwave,.*":
+    description: iWave Systems Technologies Pvt. Ltd.
+  "^jdi,.*":
+    description: Japan Display Inc.
+  "^jedec,.*":
+    description: JEDEC Solid State Technology Association
+  "^jianda,.*":
+    description: Jiandangjing Technology Co., Ltd.
+  "^karo,.*":
+    description: Ka-Ro electronics GmbH
+  "^keithkoep,.*":
+    description: Keith & Koep GmbH
+  "^keymile,.*":
+    description: Keymile GmbH
+  "^khadas,.*":
+    description: Khadas
+  "^kiebackpeter,.*":
+    description: Kieback & Peter GmbH
+  "^kinetic,.*":
+    description: Kinetic Technologies
+  "^kingdisplay,.*":
+    description: King & Display Technology Co., Ltd.
+  "^kingnovel,.*":
+    description: Kingnovel Technology Co., Ltd.
+  "^kionix,.*":
+    description: Kionix, Inc.
+  "^kobo,.*":
+    description: Rakuten Kobo Inc.
+  "^koe,.*":
+    description: Kaohsiung Opto-Electronics Inc.
+  "^kosagi,.*":
+    description: Sutajio Ko-Usagi PTE Ltd.
+  "^kyo,.*":
+    description: Kyocera Corporation
+  "^lacie,.*":
+    description: LaCie
+  "^laird,.*":
+    description: Laird PLC
+  "^lantiq,.*":
+    description: Lantiq Semiconductor
+  "^lattice,.*":
+    description: Lattice Semiconductor
+  "^lego,.*":
+    description: LEGO Systems A/S
+  "^lemaker,.*":
+    description: Shenzhen LeMaker Technology Co., Ltd.
+  "^lenovo,.*":
+    description: Lenovo Group Ltd.
+  "^lg,.*":
+    description: LG Corporation
+  "^libretech,.*":
+    description: Shenzhen Libre Technology Co., Ltd
+  "^licheepi,.*":
+    description: Lichee Pi
+  "^linaro,.*":
+    description: Linaro Limited
+  "^linksys,.*":
+    description: Belkin International, Inc. (Linksys)
+  "^linux,.*":
+    description: Linux-specific binding
+  "^linx,.*":
+    description: Linx Technologies
+  "^lltc,.*":
+    description: Linear Technology Corporation
+  "^logicpd,.*":
+    description: Logic PD, Inc.
+  "^lsi,.*":
+    description: LSI Corp. (LSI Logic)
+  "^lwn,.*":
+    description: Liebherr-Werk Nenzing GmbH
+  "^macnica,.*":
+    description: Macnica Americas
+  "^marvell,.*":
+    description: Marvell Technology Group Ltd.
+  "^maxbotix,.*":
+    description: MaxBotix Inc.
+  "^maxim,.*":
+    description: Maxim Integrated Products
+  "^mbvl,.*":
+    description: Mobiveil Inc.
+  "^mcube,.*":
+    description: mCube
+  "^meas,.*":
+    description: Measurement Specialties
+  "^mediatek,.*":
+    description: MediaTek Inc.
+  "^megachips,.*":
+    description: MegaChips
+  "^mele,.*":
+    description: Shenzhen MeLE Digital Technology Ltd.
+  "^melexis,.*":
+    description: Melexis N.V.
+  "^melfas,.*":
+    description: MELFAS Inc.
+  "^mellanox,.*":
+    description: Mellanox Technologies
+  "^memsic,.*":
+    description: MEMSIC Inc.
+  "^menlo,.*":
+    description: Menlo Systems GmbH
+  "^merrii,.*":
+    description: Merrii Technology Co., Ltd.
+  "^micrel,.*":
+    description: Micrel Inc.
+  "^microchip,.*":
+    description: Microchip Technology Inc.
+  "^microcrystal,.*":
+    description: Micro Crystal AG
+  "^micron,.*":
+    description: Micron Technology Inc.
+  "^mikroe,.*":
+    description: MikroElektronika d.o.o.
+  "^minix,.*":
+    description: MINIX Technology Ltd.
+  "^miramems,.*":
+    description: MiraMEMS Sensing Technology Co., Ltd.
+  "^mitsubishi,.*":
+    description: Mitsubishi Electric Corporation
+  "^mosaixtech,.*":
+    description: Mosaix Technologies, Inc.
+  "^motorola,.*":
+    description: Motorola, Inc.
+  "^moxa,.*":
+    description: Moxa Inc.
+  "^mpl,.*":
+    description: MPL AG
+  "^mqmaker,.*":
+    description: mqmaker Inc.
+  "^mscc,.*":
+    description: Microsemi Corporation
+  "^msi,.*":
+    description: Micro-Star International Co. Ltd.
+  "^mti,.*":
+    description: Imagination Technologies Ltd. (formerly MIPS Technologies Inc.)
+  "^multi-inno,.*":
+    description: Multi-Inno Technology Co.,Ltd
+  "^mundoreader,.*":
+    description: Mundo Reader S.L.
+  "^murata,.*":
+    description: Murata Manufacturing Co., Ltd.
+  "^mxicy,.*":
+    description: Macronix International Co., Ltd.
+  "^myir,.*":
+    description: MYIR Tech Limited
+  "^national,.*":
+    description: National Semiconductor
+  "^nec,.*":
+    description: NEC LCD Technologies, Ltd.
+  "^neonode,.*":
+    description: Neonode Inc.
+  "^netgear,.*":
+    description: NETGEAR
+  "^netlogic,.*":
+    description: Broadcom Corporation (formerly NetLogic Microsystems)
+  "^netron-dy,.*":
+    description: Netron DY
+  "^netxeon,.*":
+    description: Shenzhen Netxeon Technology CO., LTD
+  "^nexbox,.*":
+    description: Nexbox
+  "^nextthing,.*":
+    description: Next Thing Co.
+  "^newhaven,.*":
+    description: Newhaven Display International
+  "^ni,.*":
+    description: National Instruments
+  "^nintendo,.*":
+    description: Nintendo
+  "^nlt,.*":
+    description: NLT Technologies, Ltd.
+  "^nokia,.*":
+    description: Nokia
+  "^nordic,.*":
+    description: Nordic Semiconductor
+  "^novtech,.*":
+    description: NovTech, Inc.
+  "^nutsboard,.*":
+    description: NutsBoard
+  "^nuvoton,.*":
+    description: Nuvoton Technology Corporation
+  "^nvd,.*":
+    description: New Vision Display
+  "^nvidia,.*":
+    description: NVIDIA
+  "^nxp,.*":
+    description: NXP Semiconductors
+  "^oceanic,.*":
+    description: Oceanic Systems (UK) Ltd.
+  "^okaya,.*":
+    description: Okaya Electric America, Inc.
+  "^oki,.*":
+    description: Oki Electric Industry Co., Ltd.
+  "^olimex,.*":
+    description: OLIMEX Ltd.
+  "^olpc,.*":
+    description: One Laptop Per Child
+  "^onion,.*":
+    description: Onion Corporation
+  "^onnn,.*":
+    description: ON Semiconductor Corp.
+  "^ontat,.*":
+    description: On Tat Industrial Company
+  "^opalkelly,.*":
+    description: Opal Kelly Incorporated
+  "^opencores,.*":
+    description: OpenCores.org
+  "^openrisc,.*":
+    description: OpenRISC.io
+  "^option,.*":
+    description: Option NV
+  "^oranth,.*":
+    description: Shenzhen Oranth Technology Co., Ltd.
+  "^ORCL,.*":
+    description: Oracle Corporation
+  "^orisetech,.*":
+    description: Orise Technology
+  "^ortustech,.*":
+    description: Ortus Technology Co., Ltd.
+  "^osddisplays,.*":
+    description: OSD Displays
+  "^ovti,.*":
+    description: OmniVision Technologies
+  "^oxsemi,.*":
+    description: Oxford Semiconductor, Ltd.
+  "^panasonic,.*":
+    description: Panasonic Corporation
+  "^parade,.*":
+    description: Parade Technologies Inc.
+  "^pda,.*":
+    description: Precision Design Associates, Inc.
+  "^pericom,.*":
+    description: Pericom Technology Inc.
+  "^pervasive,.*":
+    description: Pervasive Displays, Inc.
+  "^phicomm,.*":
+    description: PHICOMM Co., Ltd.
+  "^phytec,.*":
+    description: PHYTEC Messtechnik GmbH
+  "^picochip,.*":
+    description: Picochip Ltd
+  "^pine64,.*":
+    description: Pine64
+  "^pixcir,.*":
+    description: PIXCIR MICROELECTRONICS Co., Ltd
+  "^plantower,.*":
+    description: Plantower Co., Ltd
+  "^plathome,.*":
+    description: Plat'Home Co., Ltd.
+  "^plda,.*":
+    description: PLDA
+  "^plx,.*":
+    description: Broadcom Corporation (formerly PLX Technology)
+  "^pni,.*":
+    description: PNI Sensor Corporation
+  "^portwell,.*":
+    description: Portwell Inc.
+  "^poslab,.*":
+    description: Poslab Technology Co., Ltd.
+  "^powervr,.*":
+    description: PowerVR (deprecated, use img)
+  "^probox2,.*":
+    description: PROBOX2 (by W2COMP Co., Ltd.)
+  "^pulsedlight,.*":
+    description: PulsedLight, Inc
+  "^qca,.*":
+    description: Qualcomm Atheros, Inc.
+  "^qcom,.*":
+    description: Qualcomm Technologies, Inc
+  "^qemu,.*":
+    description: QEMU, a generic and open source machine emulator and virtualizer
+  "^qi,.*":
+    description: Qi Hardware
+  "^qiaodian,.*":
+    description: QiaoDian XianShi Corporation
+  "^qnap,.*":
+    description: QNAP Systems, Inc.
+  "^radxa,.*":
+    description: Radxa
+  "^raidsonic,.*":
+    description: RaidSonic Technology GmbH
+  "^ralink,.*":
+    description: Mediatek/Ralink Technology Corp.
+  "^ramtron,.*":
+    description: Ramtron International
+  "^raspberrypi,.*":
+    description: Raspberry Pi Foundation
+  "^raydium,.*":
+    description: Raydium Semiconductor Corp.
+  "^rda,.*":
+    description: Unisoc Communications, Inc.
+  "^realtek,.*":
+    description: Realtek Semiconductor Corp.
+  "^renesas,.*":
+    description: Renesas Electronics Corporation
+  "^richtek,.*":
+    description: Richtek Technology Corporation
+  "^ricoh,.*":
+    description: Ricoh Co. Ltd.
+  "^rikomagic,.*":
+    description: Rikomagic Tech Corp. Ltd
+  "^riscv,.*":
+    description: RISC-V Foundation
+  "^rockchip,.*":
+    description: Fuzhou Rockchip Electronics Co., Ltd
+  "^rocktech,.*":
+    description: ROCKTECH DISPLAYS LIMITED
+  "^rohm,.*":
+    description: ROHM Semiconductor Co., Ltd
+  "^ronbo,.*":
+    description: Ronbo Electronics
+  "^roofull,.*":
+    description: Shenzhen Roofull Technology Co, Ltd
+  "^samsung,.*":
+    description: Samsung Semiconductor
+  "^samtec,.*":
+    description: Samtec/Softing company
+  "^sancloud,.*":
+    description: Sancloud Ltd
+  "^sandisk,.*":
+    description: Sandisk Corporation
+  "^sbs,.*":
+    description: Smart Battery System
+  "^schindler,.*":
+    description: Schindler
+  "^seagate,.*":
+    description: Seagate Technology PLC
+  "^seirobotics,.*":
+    description: Shenzhen SEI Robotics Co., Ltd
+  "^semtech,.*":
+    description: Semtech Corporation
+  "^sensirion,.*":
+    description: Sensirion AG
+  "^sff,.*":
+    description: Small Form Factor Committee
+  "^sgd,.*":
+    description: Solomon Goldentek Display Corporation
+  "^sgx,.*":
+    description: SGX Sensortech
+  "^sharp,.*":
+    description: Sharp Corporation
+  "^shimafuji,.*":
+    description: Shimafuji Electric, Inc.
+  "^si-en,.*":
+    description: Si-En Technology Ltd.
+  "^si-linux,.*":
+    description: Silicon Linux Corporation
+  "^sifive,.*":
+    description: SiFive, Inc.
+  "^sigma,.*":
+    description: Sigma Designs, Inc.
+  "^sii,.*":
+    description: Seiko Instruments, Inc.
+  "^sil,.*":
+    description: Silicon Image
+  "^silabs,.*":
+    description: Silicon Laboratories
+  "^silead,.*":
+    description: Silead Inc.
+  "^silergy,.*":
+    description: Silergy Corp.
+  "^siliconmitus,.*":
+    description: Silicon Mitus, Inc.
+  "^simtek,.*":
+    description: Simtek Corporation
+  "^sirf,.*":
+    description: SiRF Technology, Inc.
+  "^sis,.*":
+    description: Silicon Integrated Systems Corp.
+  "^sitronix,.*":
+    description: Sitronix Technology Corporation
+  "^skyworks,.*":
+    description: Skyworks Solutions, Inc.
+  "^smsc,.*":
+    description: Standard Microsystems Corporation
+  "^snps,.*":
+    description: Synopsys, Inc.
+  "^socionext,.*":
+    description: Socionext Inc.
+  "^solidrun,.*":
+    description: SolidRun
+  "^solomon,.*":
+    description: Solomon Systech Limited
+  "^sony,.*":
+    description: Sony Corporation
+  "^spansion,.*":
+    description: Spansion Inc.
+  "^sprd,.*":
+    description: Spreadtrum Communications Inc.
+  "^sst,.*":
+    description: Silicon Storage Technology, Inc.
+  "^st,.*":
+    description: STMicroelectronics
+  "^starry,.*":
+    description: Starry Electronic Technology (ShenZhen) Co., LTD
+  "^startek,.*":
+    description: Startek
+  "^ste,.*":
+    description: ST-Ericsson
+  "^stericsson,.*":
+    description: ST-Ericsson
+  "^summit,.*":
+    description: Summit microelectronics
+  "^sunchip,.*":
+    description: Shenzhen Sunchip Technology Co., Ltd
+  "^SUNW,.*":
+    description: Sun Microsystems, Inc
+  "^swir,.*":
+    description: Sierra Wireless
+  "^syna,.*":
+    description: Synaptics Inc.
+  "^synology,.*":
+    description: Synology, Inc.
+  "^tbs,.*":
+    description: TBS Technologies
+  "^tbs-biometrics,.*":
+    description: Touchless Biometric Systems AG
+  "^tcg,.*":
+    description: Trusted Computing Group
+  "^tcl,.*":
+    description: Toby Churchill Ltd.
+  "^technexion,.*":
+    description: TechNexion
+  "^technologic,.*":
+    description: Technologic Systems
+  "^tempo,.*":
+    description: Tempo Semiconductor
+  "^techstar,.*":
+    description: Shenzhen Techstar Electronics Co., Ltd.
+  "^terasic,.*":
+    description: Terasic Inc.
+  "^thine,.*":
+    description: THine Electronics, Inc.
+  "^ti,.*":
+    description: Texas Instruments
+  "^tianma,.*":
+    description: Tianma Micro-electronics Co., Ltd.
+  "^tlm,.*":
+    description: Trusted Logic Mobility
+  "^tmt,.*":
+    description: Tecon Microprocessor Technologies, LLC.
+  "^topeet,.*":
+    description: Topeet
+  "^toradex,.*":
+    description: Toradex AG
+  "^toshiba,.*":
+    description: Toshiba Corporation
+  "^toumaz,.*":
+    description: Toumaz
+  "^tpk,.*":
+    description: TPK U.S.A. LLC
+  "^tplink,.*":
+    description: TP-LINK Technologies Co., Ltd.
+  "^tpo,.*":
+    description: TPO
+  "^tq,.*":
+    description: TQ Systems GmbH
+  "^tronfy,.*":
+    description: Tronfy
+  "^tronsmart,.*":
+    description: Tronsmart
+  "^truly,.*":
+    description: Truly Semiconductors Limited
+  "^tsd,.*":
+    description: Theobroma Systems Design und Consulting GmbH
+  "^tyan,.*":
+    description: Tyan Computer Corporation
+  "^u-blox,.*":
+    description: u-blox
+  "^ucrobotics,.*":
+    description: uCRobotics
+  "^ubnt,.*":
+    description: Ubiquiti Networks
+  "^udoo,.*":
+    description: Udoo
+  "^uniwest,.*":
+    description: United Western Technologies Corp (UniWest)
+  "^upisemi,.*":
+    description: uPI Semiconductor Corp.
+  "^urt,.*":
+    description: United Radiant Technology Corporation
+  "^usi,.*":
+    description: Universal Scientific Industrial Co., Ltd.
+  "^v3,.*":
+    description: V3 Semiconductor
+  "^vamrs,.*":
+    description: Vamrs Ltd.
+  "^variscite,.*":
+    description: Variscite Ltd.
+  "^via,.*":
+    description: VIA Technologies, Inc.
+  "^virtio,.*":
+    description: Virtual I/O Device Specification, developed by the OASIS consortium
+  "^vishay,.*":
+    description: Vishay Intertechnology, Inc
+  "^vitesse,.*":
+    description: Vitesse Semiconductor Corporation
+  "^vivante,.*":
+    description: Vivante Corporation
+  "^vocore,.*":
+    description: VoCore Studio
+  "^voipac,.*":
+    description: Voipac Technologies s.r.o.
+  "^vot,.*":
+    description: Vision Optical Technology Co., Ltd.
+  "^wd,.*":
+    description: Western Digital Corp.
+  "^wetek,.*":
+    description: WeTek Electronics, limited.
+  "^wexler,.*":
+    description: Wexler
+  "^whwave,.*":
+    description: Shenzhen whwave Electronics, Inc.
+  "^wi2wi,.*":
+    description: Wi2Wi, Inc.
+  "^winbond,.*":
+    description: Winbond Electronics corp.
+  "^winstar,.*":
+    description: Winstar Display Corp.
+  "^wlf,.*":
+    description: Wolfson Microelectronics
+  "^wm,.*":
+    description: Wondermedia Technologies, Inc.
+  "^x-powers,.*":
+    description: X-Powers
+  "^xes,.*":
+    description: Extreme Engineering Solutions (X-ES)
+  "^xillybus,.*":
+    description: Xillybus Ltd.
+  "^xlnx,.*":
+    description: Xilinx
+  "^xunlong,.*":
+    description: Shenzhen Xunlong Software CO.,Limited
+  "^ysoft,.*":
+    description: Y Soft Corporation a.s.
+  "^zarlink,.*":
+    description: Zarlink Semiconductor
+  "^zeitec,.*":
+    description: ZEITEC Semiconductor Co., LTD.
+  "^zidoo,.*":
+    description: Shenzhen Zidoo Technology Co., Ltd.
+  "^zii,.*":
+    description: Zodiac Inflight Innovations
+  "^zte,.*":
+    description: ZTE Corp.
+  "^zyxel,.*":
+    description: ZyXEL Communications Corp.
+
+  # Normal property name match without a comma
+  # These should catch all node/property names without a prefix
+  "^[a-zA-Z0-9#][a-zA-Z0-9+\\-._@]{0,63}$": true
+  "^[a-zA-Z0-9+\\-._]*@[0-9a-zA-Z,]*$": true
+  "^#.*": true
+
+additionalProperties: false
+
+...
index 3fb473e3b8e2e205d4e0d6692a891d560eec3781..d640e922a9748c795ec670f4375098372052f6dd 100644 (file)
@@ -75,12 +75,11 @@ enum v4l2_field
 
     * - ``V4L2_FIELD_ANY``
       - 0
-      - Applications request this field order when any one of the
-       ``V4L2_FIELD_NONE``, ``V4L2_FIELD_TOP``, ``V4L2_FIELD_BOTTOM``, or
-       ``V4L2_FIELD_INTERLACED`` formats is acceptable. Drivers choose
-       depending on hardware capabilities or e. g. the requested image
-       size, and return the actual field order. Drivers must never return
-       ``V4L2_FIELD_ANY``. If multiple field orders are possible the
+      - Applications request this field order when any field format
+       is acceptable. Drivers choose depending on hardware capabilities or
+       e.g. the requested image size, and return the actual field order.
+       Drivers must never return ``V4L2_FIELD_ANY``.
+       If multiple field orders are possible the
        driver must choose one of the possible field orders during
        :ref:`VIDIOC_S_FMT <VIDIOC_G_FMT>` or
        :ref:`VIDIOC_TRY_FMT <VIDIOC_G_FMT>`. struct
@@ -88,9 +87,8 @@ enum v4l2_field
        ``V4L2_FIELD_ANY``.
     * - ``V4L2_FIELD_NONE``
       - 1
-      - Images are in progressive format, not interlaced. The driver may
-       also indicate this order when it cannot distinguish between
-       ``V4L2_FIELD_TOP`` and ``V4L2_FIELD_BOTTOM``.
+      - Images are in progressive (frame-based) format, not interlaced
+        (field-based).
     * - ``V4L2_FIELD_TOP``
       - 2
       - Images consist of the top (aka odd) field only.
index cd7303d7fa25dac9ae38d0e73186f3687b7872a7..180e07d956a7049f119a8ebac7dfa2241b6c0a56 100644 (file)
@@ -796,7 +796,9 @@ The kernel interface functions are as follows:
                                s64 tx_total_len,
                                gfp_t gfp,
                                rxrpc_notify_rx_t notify_rx,
-                               bool upgrade);
+                               bool upgrade,
+                               bool intr,
+                               unsigned int debug_id);
 
      This allocates the infrastructure to make a new RxRPC call and assigns
      call and connection numbers.  The call will be made on the UDP port that
@@ -824,6 +826,13 @@ The kernel interface functions are as follows:
      the server upgrade the service to a better one.  The resultant service ID
      is returned by rxrpc_kernel_recv_data().
 
+     intr should be set to true if the call should be interruptible.  If this
+     is not set, this function may not return until a channel has been
+     allocated; if it is set, the function may return -ERESTARTSYS.
+
+     debug_id is the call debugging ID to be used for tracing.  This can be
+     obtained by atomically incrementing rxrpc_debug_id.
+
      If this function is successful, an opaque reference to the RxRPC call is
      returned.  The caller now holds a reference on this and it must be
      properly ended.
@@ -1056,6 +1065,16 @@ The kernel interface functions are as follows:
      This value can be used to determine if the remote client has been
      restarted as it shouldn't change otherwise.
 
+ (*) Set the maximum lifespan on a call.
+
+       void rxrpc_kernel_set_max_life(struct socket *sock,
+                                      struct rxrpc_call *call,
+                                      unsigned long hard_timeout);
+
+     This sets the maximum lifespan on a call to hard_timeout (which is in
+     jiffies).  In the event of the timeout occurring, the call will be
+     aborted and -ETIME or -ETIMEDOUT will be returned.
+
 
 =======================
 CONFIGURABLE PARAMETERS
index 534e9baa4e1d19bd53f4c7b62ce12c3735c2d89c..5d4330be200f980cf4ea7a8e17b9e89012dcc381 100644 (file)
@@ -142,45 +142,13 @@ Mitigation points
    mds_user_clear.
 
    The mitigation is invoked in prepare_exit_to_usermode() which covers
-   most of the kernel to user space transitions. There are a few exceptions
-   which are not invoking prepare_exit_to_usermode() on return to user
-   space. These exceptions use the paranoid exit code.
+   all but one of the kernel to user space transitions.  The exception
+   is when we return from a Non Maskable Interrupt (NMI), which is
+   handled directly in do_nmi().
 
-   - Non Maskable Interrupt (NMI):
-
-     Access to sensible data like keys, credentials in the NMI context is
-     mostly theoretical: The CPU can do prefetching or execute a
-     misspeculated code path and thereby fetching data which might end up
-     leaking through a buffer.
-
-     But for mounting other attacks the kernel stack address of the task is
-     already valuable information. So in full mitigation mode, the NMI is
-     mitigated on the return from do_nmi() to provide almost complete
-     coverage.
-
-   - Double fault (#DF):
-
-     A double fault is usually fatal, but the ESPFIX workaround, which can
-     be triggered from user space through modify_ldt(2) is a recoverable
-     double fault. #DF uses the paranoid exit path, so explicit mitigation
-     in the double fault handler is required.
-
-   - Machine Check Exception (#MC):
-
-     Another corner case is a #MC which hits between the CPU buffer clear
-     invocation and the actual return to user. As this still is in kernel
-     space it takes the paranoid exit path which does not clear the CPU
-     buffers. So the #MC handler repopulates the buffers to some
-     extent. Machine checks are not reliably controllable and the window is
-     extremly small so mitigation would just tick a checkbox that this
-     theoretical corner case is covered. To keep the amount of special
-     cases small, ignore #MC.
-
-   - Debug Exception (#DB):
-
-     This takes the paranoid exit path only when the INT1 breakpoint is in
-     kernel space. #DB on a user space address takes the regular exit path,
-     so no extra mitigation required.
+   (The reason that NMI is special is that prepare_exit_to_usermode() can
+    enable IRQs.  In NMI context, NMIs are blocked, and we don't want to
+    enable IRQs with NMIs blocked.)
 
 
 2. C-State transition
index f11f0698b148c05924c5ab7528179611861e7ea4..c47b328eada033257e30c89a0f1678d030b5b95f 100644 (file)
@@ -781,7 +781,7 @@ config COMPAT_OLD_SIGACTION
        bool
 
 config 64BIT_TIME
-       def_bool ARCH_HAS_64BIT_TIME
+       def_bool y
        help
          This should be selected by all architectures that need to support
          new system calls with a 64-bit time_t. This is relevant on all 32-bit
diff --git a/arch/alpha/include/asm/segment.h b/arch/alpha/include/asm/segment.h
deleted file mode 100644 (file)
index 0453d97..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __ALPHA_SEGMENT_H
-#define __ALPHA_SEGMENT_H
-
-/* Only here because we have some old header files that expect it.. */
-
-#endif
index 4dbd4e41504159322ff13dbc335a7e1daea6efec..bbbd34586de01e1f11fe4ba1d6f21f1f8b0a2119 100644 (file)
@@ -10,7 +10,6 @@
 
 #include <asm/hwrpb.h>
 #include <asm/io.h>
-#include <asm/segment.h>
 
 #if 0
 # define DBG_DEVS(args)         printk args
index 733f08966fd217806593c03cbb91c994a3cc9425..71cd7aca38ce2464be5a0e2dad8a21a57a800ee7 100644 (file)
@@ -11,7 +11,6 @@
 
 #include <asm/hwrpb.h>
 #include <asm/io.h>
-#include <asm/segment.h>
 
 #define SMC_DEBUG 0
 
index eabc3efa6c6ddf9ba97a3f1ca7cd7379d144e2d9..526418543379b9272edba4eb757070fba5ff6996 100644 (file)
@@ -742,6 +742,7 @@ extern long arc_strnlen_user_noinline(const char __user *src, long n);
 
 #endif
 
+#include <asm/segment.h>
 #include <asm-generic/uaccess.h>
 
 #endif
index 8b0f7c4c3f09b98e524c19d5c4d045979a5491dc..7d26ca0b13025283ab3b5fd7cfb6ce2fa342b482 100644 (file)
@@ -152,7 +152,7 @@ CONFIG_SPI_S3C24XX=y
 CONFIG_SPI_SPIDEV=y
 CONFIG_GPIO_SYSFS=y
 CONFIG_SENSORS_LM75=y
-CONFIG_THERMAL=m
+CONFIG_THERMAL=y
 CONFIG_WATCHDOG=y
 CONFIG_S3C2410_WATCHDOG=y
 CONFIG_FB=y
index f6d24d762a7fde6857ac526976f894e76dee7665..07ebbdce36451c85e0a868d501821583f7c35315 100644 (file)
@@ -387,7 +387,7 @@ CONFIG_SENSORS_LM75=m
 CONFIG_SENSORS_LM90=m
 CONFIG_SENSORS_LM95245=m
 CONFIG_SENSORS_NTC_THERMISTOR=m
-CONFIG_THERMAL=m
+CONFIG_THERMAL=y
 CONFIG_WATCHDOG=y
 CONFIG_XILINX_WATCHDOG=m
 CONFIG_SA1100_WATCHDOG=m
index 0b2ecc98e0861faeedc8c1f32fb68324ba662fb6..60de9d13181ad07e0449bfdd69a45336d5294f27 100644 (file)
@@ -14,7 +14,6 @@ generic-y += msi.h
 generic-y += parport.h
 generic-y += preempt.h
 generic-y += seccomp.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += simd.h
 generic-y += trace_clock.h
index 9e977dedf193b53c817428c5f8e151250c44b9c9..1de6e05ce48b2a079c4ed3a31eebca91f294a87b 100644 (file)
@@ -17,7 +17,6 @@ generic-y += mmiowb.h
 generic-y += msi.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += set_memory.h
 generic-y += switch_to.h
index 6b168d32fbffe63a1acf325352d5fc21ad2c4ab2..2162eb32dcec828f89b9eac57a49b0cf864b3676 100644 (file)
@@ -30,7 +30,6 @@ generic-y += pci.h
 generic-y += percpu.h
 generic-y += pgalloc.h
 generic-y += preempt.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
 generic-y += tlbflush.h
index 61c01db6c29230ca8b60ffc64d00179ccc579b24..ecfc4b4b6373cf4a89b8e92ad62b686963fe7000 100644 (file)
@@ -23,6 +23,7 @@ config H8300
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_HASH
        select CPU_NO_EFFICIENT_FFS
+       select UACCESS_MEMCPY
 
 config CPU_BIG_ENDIAN
        def_bool y
index f2e22058e48823f1f7a682cff8c4c66b931784fe..79cd1e605ec4d1823822c1b806713a66fdb0e1b9 100644 (file)
@@ -47,6 +47,7 @@ generic-y += timex.h
 generic-y += tlbflush.h
 generic-y += topology.h
 generic-y += trace_clock.h
+generic-y += uaccess.h
 generic-y += unaligned.h
 generic-y += vga.h
 generic-y += word-at-a-time.h
diff --git a/arch/h8300/include/asm/uaccess.h b/arch/h8300/include/asm/uaccess.h
deleted file mode 100644 (file)
index bc80319..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_UACCESS_H
-#define _ASM_UACCESS_H
-
-#include <linux/string.h>
-
-static inline __must_check unsigned long
-raw_copy_from_user(void *to, const void __user * from, unsigned long n)
-{
-       if (__builtin_constant_p(n)) {
-               switch(n) {
-               case 1:
-                       *(u8 *)to = *(u8 __force *)from;
-                       return 0;
-               case 2:
-                       *(u16 *)to = *(u16 __force *)from;
-                       return 0;
-               case 4:
-                       *(u32 *)to = *(u32 __force *)from;
-                       return 0;
-               }
-       }
-
-       memcpy(to, (const void __force *)from, n);
-       return 0;
-}
-
-static inline __must_check unsigned long
-raw_copy_to_user(void __user *to, const void *from, unsigned long n)
-{
-       if (__builtin_constant_p(n)) {
-               switch(n) {
-               case 1:
-                       *(u8 __force *)to = *(u8 *)from;
-                       return 0;
-               case 2:
-                       *(u16 __force *)to = *(u16 *)from;
-                       return 0;
-               case 4:
-                       *(u32 __force *)to = *(u32 *)from;
-                       return 0;
-               default:
-                       break;
-               }
-       }
-
-       memcpy((void __force *)to, from, n);
-       return 0;
-}
-#define INLINE_COPY_FROM_USER
-#define INLINE_COPY_TO_USER
-
-#include <asm-generic/uaccess.h>
-
-#endif
index 4a3d72f76ea27be7520414bc503dddb5b4f1def8..84bb1ed1b9311078c793f64dd4c9c7abc477bfbf 100644 (file)
@@ -29,7 +29,6 @@ generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
 generic-y += topology.h
index a30e58d5f3516cce39fd35caec420603e508a45b..7a34092e8b58fe604ff58fc266e85f07bfe4c647 100644 (file)
@@ -24,7 +24,6 @@
  * User space memory access functions
  */
 #include <linux/mm.h>
-#include <asm/segment.h>
 #include <asm/sections.h>
 
 /*
diff --git a/arch/ia64/include/asm/segment.h b/arch/ia64/include/asm/segment.h
deleted file mode 100644 (file)
index b89e2b3..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_IA64_SEGMENT_H
-#define _ASM_IA64_SEGMENT_H
-
-/* Only here because we have some old header files that expect it.. */
-
-#endif /* _ASM_IA64_SEGMENT_H */
index ff40fbc2f4399dc2a51b76d39b022a993df5a7e8..21a1168ae301f6d1519990afce524d8b365a32de 100644 (file)
@@ -228,7 +228,7 @@ CONFIG_SERIAL_IP22_ZILOG=m
 # CONFIG_HW_RANDOM is not set
 CONFIG_RAW_DRIVER=m
 # CONFIG_HWMON is not set
-CONFIG_THERMAL=m
+CONFIG_THERMAL=y
 CONFIG_WATCHDOG=y
 CONFIG_INDYDOG=m
 # CONFIG_VGA_CONSOLE is not set
index 81c47e18131bd46422143e86d8ef1a12ed7b1260..54db5dedf7764594b473a2312581dde6801b0ee2 100644 (file)
@@ -271,7 +271,7 @@ CONFIG_I2C_PARPORT_LIGHT=m
 CONFIG_I2C_TAOS_EVM=m
 CONFIG_I2C_STUB=m
 # CONFIG_HWMON is not set
-CONFIG_THERMAL=m
+CONFIG_THERMAL=y
 CONFIG_MFD_PCF50633=m
 CONFIG_PCF50633_ADC=m
 CONFIG_PCF50633_GPIO=m
index 87b86cdf126a99245d8309263906e20e748edb89..a03cd4e24f3789169978a5818733d377dc7ef4b5 100644 (file)
@@ -19,7 +19,6 @@ generic-y += preempt.h
 generic-y += qrwlock.h
 generic-y += qspinlock.h
 generic-y += sections.h
-generic-y += segment.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += user.h
index f67a327777b579bde48b07dc83bd7812cfef8d4f..d8ce778d0640065d0c24f2a4ac4407bdc6583dca 100644 (file)
@@ -37,7 +37,6 @@ generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += switch_to.h
 generic-y += timex.h
index d7ef3512504a6be516b7fe81c2a74d49f81e9cbe..a8ffdd007f6ca7b06a349dec829053d261adc4b4 100644 (file)
@@ -33,7 +33,6 @@ generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += spinlock.h
 generic-y += topology.h
index 1919cc5e0f11d4af523998bf2fac993a4e932547..164be10062bc33d969e5dffbd43d1640b3606f63 100644 (file)
@@ -34,7 +34,6 @@ generic-y += qspinlock.h
 generic-y += qrwlock_types.h
 generic-y += qrwlock.h
 generic-y += sections.h
-generic-y += segment.h
 generic-y += shmparam.h
 generic-y += switch_to.h
 generic-y += topology.h
index eb97a8e7c8aa79d1d1e4ccf3329f7e2281d37e5e..e8fb2a764f4697e1f7a707cb021b883e4a158a5d 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/elf.h>
 
 #include <asm/thread_info.h>
-#include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
index c605bdad1746ad25288b6b2e06de9641704e1f3f..17c00d06d91bbd421c0829c913b4135c93e91902 100644 (file)
@@ -39,7 +39,6 @@
 #include <linux/device.h>
 
 #include <asm/sections.h>
-#include <asm/segment.h>
 #include <asm/pgtable.h>
 #include <asm/types.h>
 #include <asm/setup.h>
index d8981cbb852a5f1fc1ea80667df3ed451579d13c..6ed7293ef007f4ae068622ff73ed178aae94c07e 100644 (file)
@@ -35,7 +35,6 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 
-#include <asm/segment.h>
 #include <asm/io.h>
 #include <asm/pgtable.h>
 #include <asm/unwinder.h>
index abe87e54e231cf73bf4d30581c6f34e5e0dcc896..e63cb4a91a3ea7d4ba591dbe8e14b8af959b578e 100644 (file)
@@ -32,7 +32,6 @@
 #include <linux/blkdev.h>      /* for initrd_* */
 #include <linux/pagemap.h>
 
-#include <asm/segment.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/dma.h>
index 6c253a2e86bc4e1a2cba5e509f09fdbe5c148635..7f9f50161dfedda1dc7809e51da8aed85bef7563 100644 (file)
@@ -26,7 +26,6 @@
 #include <linux/mm.h>
 #include <linux/init.h>
 
-#include <asm/segment.h>
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
index ed2d8cc9490969cab10137383d770202ade70a0a..005ee8ad0446a78b6febaa45960eb56c17a337b7 100644 (file)
@@ -19,7 +19,6 @@ generic-y += mmiowb.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += seccomp.h
-generic-y += segment.h
 generic-y += trace_clock.h
 generic-y += user.h
 generic-y += vga.h
diff --git a/arch/s390/include/asm/segment.h b/arch/s390/include/asm/segment.h
deleted file mode 100644 (file)
index 97a0582..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_SEGMENT_H
-#define _ASM_SEGMENT_H
-
-#endif
index cd3df5514552cc262dee1d1b99260d0ee089668b..ad71132374f0c7eecb9efe9c923e82d2e67b9872 100644 (file)
@@ -24,7 +24,6 @@
 #include <linux/seccomp.h>
 #include <linux/compat.h>
 #include <trace/syscall.h>
-#include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
index aebd01fc28e59a3b5c530c8a596a4fcccd3e15df..360cc9abcdb0933ef3234df5fd647bb383c9bd0f 100644 (file)
@@ -119,7 +119,7 @@ CONFIG_I2C_PUV3=y
 #      Hardware Monitoring support
 #CONFIG_SENSORS_LM75=m
 #      Generic Thermal sysfs driver
-#CONFIG_THERMAL=m
+#CONFIG_THERMAL=y
 #CONFIG_THERMAL_HWMON=y
 
 #      Multimedia support
index c93dc6478cb2287c45859a8ae6ea21bb525b80e9..5fe2426bb7a579abfcd234e4379e4a1d6f3efe38 100644 (file)
@@ -28,7 +28,6 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += sections.h
-generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
 generic-y += syscalls.h
index 21e9f2fac04b545c4d3f8c59691bbd868474779a..2bbbd4d1ba31de5c0431393247ac4279878eb13d 100644 (file)
@@ -270,9 +270,6 @@ config GENERIC_BUG
 config GENERIC_BUG_RELATIVE_POINTERS
        bool
 
-config GENERIC_HWEIGHT
-       def_bool y
-
 config ARCH_MAY_HAVE_PC_FDC
        def_bool y
        depends on ISA_DMA_API
index 8e470b018512c29f41d0adc7dd8717b155a07229..3a4d8d4d39f87bb073c4c8b7504a293344ef3cfe 100644 (file)
@@ -73,14 +73,12 @@ const char *outfilename;
 enum {
        sym_vvar_start,
        sym_vvar_page,
-       sym_hpet_page,
        sym_pvclock_page,
        sym_hvclock_page,
 };
 
 const int special_pages[] = {
        sym_vvar_page,
-       sym_hpet_page,
        sym_pvclock_page,
        sym_hvclock_page,
 };
@@ -93,7 +91,6 @@ struct vdso_sym {
 struct vdso_sym required_syms[] = {
        [sym_vvar_start] = {"vvar_start", true},
        [sym_vvar_page] = {"vvar_page", true},
-       [sym_hpet_page] = {"hpet_page", true},
        [sym_pvclock_page] = {"pvclock_page", true},
        [sym_hvclock_page] = {"hvclock_page", true},
        {"VDSO32_NOTE_MASK", true},
index 7635c23f7d82e9839ba306fbb6721333d532d11f..58a6993d7eb3f6aff39f0842b25cb5e4ada0eec9 100644 (file)
@@ -393,7 +393,7 @@ static __init int _init_events_attrs(void)
        return 0;
 }
 
-const struct attribute_group *amd_iommu_attr_groups[] = {
+static const struct attribute_group *amd_iommu_attr_groups[] = {
        &amd_iommu_format_group,
        &amd_iommu_cpumask_group,
        &amd_iommu_events_group,
index ef763f535e3abbd034857ad48a678c1281a358c4..12ec402f41144119bf04fb3beef885f6f7673593 100644 (file)
@@ -3265,7 +3265,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
                return ret;
 
        if (event->attr.precise_ip) {
-               if (!(event->attr.freq || event->attr.wakeup_events)) {
+               if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
                        event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
                        if (!(event->attr.sample_type &
                              ~intel_pmu_large_pebs_flags(event)))
index 07fc84bb85c1e9e85138cd9205045215e1d0d528..a6ac2f4f76fc97ef2409e31fb3f8a3ac54a35bd4 100644 (file)
@@ -394,10 +394,10 @@ struct cpu_hw_events {
 
 /* Event constraint, but match on all event flags too. */
 #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
-       EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+       EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
 
 #define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n)                    \
-       EVENT_CONSTRAINT_RANGE(c, e, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+       EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
 
 /* Check only flags, but allow all event/umask */
 #define INTEL_ALL_EVENT_CONSTRAINT(code, n)    \
index fc0693569f7aae7baebcf8c6606ce66c09af701e..ba88edd0d58b1dde141899aee64ce0e33edc4449 100644 (file)
@@ -12,8 +12,6 @@
 #define REG_OUT "a"
 #endif
 
-#define __HAVE_ARCH_SW_HWEIGHT
-
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
        unsigned int res;
index 27566e57e87d999c386d3ade383045e921747d6f..230474e2ddb5b3eb687cdd647a15f750a65e5f32 100644 (file)
@@ -19,7 +19,6 @@ struct vdso_image {
        long sym_vvar_start;  /* Negative offset to the vvar area */
 
        long sym_vvar_page;
-       long sym_hpet_page;
        long sym_pvclock_page;
        long sym_hvclock_page;
        long sym_VDSO32_NOTE_MASK;
index cf52ee0d87111c13e0eef8a2429db889ed585962..9e4fa2484d10dd276d4804017466d7b191c07b12 100644 (file)
@@ -768,7 +768,7 @@ static struct kprobe kretprobe_kprobe = {
 /*
  * Called from kretprobe_trampoline
  */
-static __used void *trampoline_handler(struct pt_regs *regs)
+__used __visible void *trampoline_handler(struct pt_regs *regs)
 {
        struct kprobe_ctlblk *kcb;
        struct kretprobe_instance *ri = NULL;
index 7de466eb960b8f708b8b0e55fbda5c850185874f..8b6d03e55d2f79888c0aa9740c51bc486d8fa05f 100644 (file)
@@ -58,7 +58,6 @@
 #include <asm/alternative.h>
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
-#include <asm/nospec-branch.h>
 #include <asm/mpx.h>
 #include <asm/vm86.h>
 #include <asm/umip.h>
@@ -368,13 +367,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
                regs->ip = (unsigned long)general_protection;
                regs->sp = (unsigned long)&gpregs->orig_ax;
 
-               /*
-                * This situation can be triggered by userspace via
-                * modify_ldt(2) and the return does not take the regular
-                * user space exit, so a CPU buffer clear is required when
-                * MDS mitigation is enabled.
-                */
-               mds_user_clear_cpu_buffers();
                return;
        }
 #endif
index 20d14254b6869263a00beb160694ff4f36f41628..62fc457f3849af0bf867fa202149f4fa556cd6dc 100644 (file)
 
 #include "ident_map.c"
 
+#define DEFINE_POPULATE(fname, type1, type2, init)             \
+static inline void fname##_init(struct mm_struct *mm,          \
+               type1##_t *arg1, type2##_t *arg2, bool init)    \
+{                                                              \
+       if (init)                                               \
+               fname##_safe(mm, arg1, arg2);                   \
+       else                                                    \
+               fname(mm, arg1, arg2);                          \
+}
+
+DEFINE_POPULATE(p4d_populate, p4d, pud, init)
+DEFINE_POPULATE(pgd_populate, pgd, p4d, init)
+DEFINE_POPULATE(pud_populate, pud, pmd, init)
+DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init)
+
+#define DEFINE_ENTRY(type1, type2, init)                       \
+static inline void set_##type1##_init(type1##_t *arg1,         \
+                       type2##_t arg2, bool init)              \
+{                                                              \
+       if (init)                                               \
+               set_##type1##_safe(arg1, arg2);                 \
+       else                                                    \
+               set_##type1(arg1, arg2);                        \
+}
+
+DEFINE_ENTRY(p4d, p4d, init)
+DEFINE_ENTRY(pud, pud, init)
+DEFINE_ENTRY(pmd, pmd, init)
+DEFINE_ENTRY(pte, pte, init)
+
+
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
@@ -414,7 +445,7 @@ void __init cleanup_highmap(void)
  */
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
-             pgprot_t prot)
+             pgprot_t prot, bool init)
 {
        unsigned long pages = 0, paddr_next;
        unsigned long paddr_last = paddr_end;
@@ -432,7 +463,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
                                             E820_TYPE_RAM) &&
                            !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
                                             E820_TYPE_RESERVED_KERN))
-                               set_pte_safe(pte, __pte(0));
+                               set_pte_init(pte, __pte(0), init);
                        continue;
                }
 
@@ -452,7 +483,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
                        pr_info("   pte=%p addr=%lx pte=%016lx\n", pte, paddr,
                                pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
                pages++;
-               set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+               set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init);
                paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
        }
 
@@ -468,7 +499,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
  */
 static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
-             unsigned long page_size_mask, pgprot_t prot)
+             unsigned long page_size_mask, pgprot_t prot, bool init)
 {
        unsigned long pages = 0, paddr_next;
        unsigned long paddr_last = paddr_end;
@@ -487,7 +518,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
                                             E820_TYPE_RAM) &&
                            !e820__mapped_any(paddr & PMD_MASK, paddr_next,
                                             E820_TYPE_RESERVED_KERN))
-                               set_pmd_safe(pmd, __pmd(0));
+                               set_pmd_init(pmd, __pmd(0), init);
                        continue;
                }
 
@@ -496,7 +527,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
                                spin_lock(&init_mm.page_table_lock);
                                pte = (pte_t *)pmd_page_vaddr(*pmd);
                                paddr_last = phys_pte_init(pte, paddr,
-                                                          paddr_end, prot);
+                                                          paddr_end, prot,
+                                                          init);
                                spin_unlock(&init_mm.page_table_lock);
                                continue;
                        }
@@ -524,19 +556,20 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
                if (page_size_mask & (1<<PG_LEVEL_2M)) {
                        pages++;
                        spin_lock(&init_mm.page_table_lock);
-                       set_pte_safe((pte_t *)pmd,
-                               pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
-                                       __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+                       set_pte_init((pte_t *)pmd,
+                                    pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
+                                            __pgprot(pgprot_val(prot) | _PAGE_PSE)),
+                                    init);
                        spin_unlock(&init_mm.page_table_lock);
                        paddr_last = paddr_next;
                        continue;
                }
 
                pte = alloc_low_page();
-               paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
+               paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init);
 
                spin_lock(&init_mm.page_table_lock);
-               pmd_populate_kernel_safe(&init_mm, pmd, pte);
+               pmd_populate_kernel_init(&init_mm, pmd, pte, init);
                spin_unlock(&init_mm.page_table_lock);
        }
        update_page_count(PG_LEVEL_2M, pages);
@@ -551,7 +584,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
  */
 static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
-             unsigned long page_size_mask)
+             unsigned long page_size_mask, bool init)
 {
        unsigned long pages = 0, paddr_next;
        unsigned long paddr_last = paddr_end;
@@ -573,7 +606,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
                                             E820_TYPE_RAM) &&
                            !e820__mapped_any(paddr & PUD_MASK, paddr_next,
                                             E820_TYPE_RESERVED_KERN))
-                               set_pud_safe(pud, __pud(0));
+                               set_pud_init(pud, __pud(0), init);
                        continue;
                }
 
@@ -583,7 +616,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
                                paddr_last = phys_pmd_init(pmd, paddr,
                                                           paddr_end,
                                                           page_size_mask,
-                                                          prot);
+                                                          prot, init);
                                continue;
                        }
                        /*
@@ -610,9 +643,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
                if (page_size_mask & (1<<PG_LEVEL_1G)) {
                        pages++;
                        spin_lock(&init_mm.page_table_lock);
-                       set_pte_safe((pte_t *)pud,
-                               pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
-                                       PAGE_KERNEL_LARGE));
+                       set_pte_init((pte_t *)pud,
+                                    pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
+                                            PAGE_KERNEL_LARGE),
+                                    init);
                        spin_unlock(&init_mm.page_table_lock);
                        paddr_last = paddr_next;
                        continue;
@@ -620,10 +654,10 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 
                pmd = alloc_low_page();
                paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
-                                          page_size_mask, prot);
+                                          page_size_mask, prot, init);
 
                spin_lock(&init_mm.page_table_lock);
-               pud_populate_safe(&init_mm, pud, pmd);
+               pud_populate_init(&init_mm, pud, pmd, init);
                spin_unlock(&init_mm.page_table_lock);
        }
 
@@ -634,14 +668,15 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 
 static unsigned long __meminit
 phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
-             unsigned long page_size_mask)
+             unsigned long page_size_mask, bool init)
 {
        unsigned long paddr_next, paddr_last = paddr_end;
        unsigned long vaddr = (unsigned long)__va(paddr);
        int i = p4d_index(vaddr);
 
        if (!pgtable_l5_enabled())
-               return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+               return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
+                                    page_size_mask, init);
 
        for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
                p4d_t *p4d;
@@ -657,39 +692,34 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
                                             E820_TYPE_RAM) &&
                            !e820__mapped_any(paddr & P4D_MASK, paddr_next,
                                             E820_TYPE_RESERVED_KERN))
-                               set_p4d_safe(p4d, __p4d(0));
+                               set_p4d_init(p4d, __p4d(0), init);
                        continue;
                }
 
                if (!p4d_none(*p4d)) {
                        pud = pud_offset(p4d, 0);
-                       paddr_last = phys_pud_init(pud, paddr,
-                                       paddr_end,
-                                       page_size_mask);
+                       paddr_last = phys_pud_init(pud, paddr, paddr_end,
+                                                  page_size_mask, init);
                        continue;
                }
 
                pud = alloc_low_page();
                paddr_last = phys_pud_init(pud, paddr, paddr_end,
-                                          page_size_mask);
+                                          page_size_mask, init);
 
                spin_lock(&init_mm.page_table_lock);
-               p4d_populate_safe(&init_mm, p4d, pud);
+               p4d_populate_init(&init_mm, p4d, pud, init);
                spin_unlock(&init_mm.page_table_lock);
        }
 
        return paddr_last;
 }
 
-/*
- * Create page table mapping for the physical memory for specific physical
- * addresses. The virtual and physical addresses have to be aligned on PMD level
- * down. It returns the last physical address mapped.
- */
-unsigned long __meminit
-kernel_physical_mapping_init(unsigned long paddr_start,
-                            unsigned long paddr_end,
-                            unsigned long page_size_mask)
+static unsigned long __meminit
+__kernel_physical_mapping_init(unsigned long paddr_start,
+                              unsigned long paddr_end,
+                              unsigned long page_size_mask,
+                              bool init)
 {
        bool pgd_changed = false;
        unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -709,19 +739,22 @@ kernel_physical_mapping_init(unsigned long paddr_start,
                        p4d = (p4d_t *)pgd_page_vaddr(*pgd);
                        paddr_last = phys_p4d_init(p4d, __pa(vaddr),
                                                   __pa(vaddr_end),
-                                                  page_size_mask);
+                                                  page_size_mask,
+                                                  init);
                        continue;
                }
 
                p4d = alloc_low_page();
                paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
-                                          page_size_mask);
+                                          page_size_mask, init);
 
                spin_lock(&init_mm.page_table_lock);
                if (pgtable_l5_enabled())
-                       pgd_populate_safe(&init_mm, pgd, p4d);
+                       pgd_populate_init(&init_mm, pgd, p4d, init);
                else
-                       p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
+                       p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr),
+                                         (pud_t *) p4d, init);
+
                spin_unlock(&init_mm.page_table_lock);
                pgd_changed = true;
        }
@@ -732,6 +765,37 @@ kernel_physical_mapping_init(unsigned long paddr_start,
        return paddr_last;
 }
 
+
+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. Note that it can only be used to populate non-present entries.
+ * The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ *
+ * This is a thin wrapper that calls __kernel_physical_mapping_init() with
+ * init == true.
+ */
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long paddr_start,
+                            unsigned long paddr_end,
+                            unsigned long page_size_mask)
+{
+       return __kernel_physical_mapping_init(paddr_start, paddr_end,
+                                             page_size_mask, true);
+}
+
+/*
+ * This function is similar to kernel_physical_mapping_init() above with the
+ * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe()
+ * when updating the mapping. The caller is responsible for flushing the TLBs
+ * after the function returns.
+ *
+ * This is a thin wrapper that calls __kernel_physical_mapping_init() with
+ * init == false.
+ */
+unsigned long __meminit
+kernel_physical_mapping_change(unsigned long paddr_start,
+                              unsigned long paddr_end,
+                              unsigned long page_size_mask)
+{
+       return __kernel_physical_mapping_init(paddr_start, paddr_end,
+                                             page_size_mask, false);
+}
+
 #ifndef CONFIG_NUMA
 void __init initmem_init(void)
 {
index 385afa2b9e17a7ac15683b0d75252274499a8bc0..51f50a7a07ef7842c7e038f58066314dc04d01b5 100644 (file)
@@ -301,9 +301,13 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr,
                else
                        split_page_size_mask = 1 << PG_LEVEL_2M;
 
-               kernel_physical_mapping_init(__pa(vaddr & pmask),
-                                            __pa((vaddr_end & pmask) + psize),
-                                            split_page_size_mask);
+               /*
+                * kernel_physical_mapping_change() does not flush the TLBs, so
+                * a TLB flush is required after we exit from the for loop.
+                */
+               kernel_physical_mapping_change(__pa(vaddr & pmask),
+                                              __pa((vaddr_end & pmask) + psize),
+                                              split_page_size_mask);
        }
 
        ret = 0;
index 319bde386d5f4a9402f695ffb6948b76b58f47bd..eeae142062ed4b06afbb8247a08d3a19b2f00766 100644 (file)
@@ -13,6 +13,9 @@ void early_ioremap_page_table_range_init(void);
 unsigned long kernel_physical_mapping_init(unsigned long start,
                                             unsigned long end,
                                             unsigned long page_size_mask);
+unsigned long kernel_physical_mapping_change(unsigned long start,
+                                            unsigned long end,
+                                            unsigned long page_size_mask);
 void zone_sizes_init(void);
 
 extern int after_bootmem;
diff --git a/arch/xtensa/include/asm/segment.h b/arch/xtensa/include/asm/segment.h
deleted file mode 100644 (file)
index 98964ad..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * include/asm-xtensa/segment.h
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2001 - 2005 Tensilica Inc.
- */
-
-#ifndef _XTENSA_SEGMENT_H
-#define _XTENSA_SEGMENT_H
-
-#include <linux/uaccess.h>
-
-#endif /* _XTENSA_SEGEMENT_H */
index 2210c1b9491ba2e9f690dd4a26b209b64f4ad925..e5009a34f9c2622570272cd9ea7789e0e5492b2c 100644 (file)
@@ -934,7 +934,7 @@ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
        struct rbd_client *rbdc;
        int ret;
 
-       mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
+       mutex_lock(&client_mutex);
        rbdc = rbd_client_find(ceph_opts);
        if (rbdc) {
                ceph_destroy_options(ceph_opts);
@@ -1326,7 +1326,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
                zero_bvecs(&obj_req->bvec_pos, off, bytes);
                break;
        default:
-               rbd_assert(0);
+               BUG();
        }
 }
 
@@ -1581,7 +1581,7 @@ static void rbd_obj_request_destroy(struct kref *kref)
                kfree(obj_request->bvec_pos.bvecs);
                break;
        default:
-               rbd_assert(0);
+               BUG();
        }
 
        kfree(obj_request->img_extents);
@@ -1781,7 +1781,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
                                                    &obj_req->bvec_pos);
                break;
        default:
-               rbd_assert(0);
+               BUG();
        }
 }
 
@@ -2036,7 +2036,7 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
                        ret = rbd_obj_setup_zeroout(obj_req);
                        break;
                default:
-                       rbd_assert(0);
+                       BUG();
                }
                if (ret < 0)
                        return ret;
@@ -2383,7 +2383,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
                                                      &obj_req->bvec_pos);
                        break;
                default:
-                       rbd_assert(0);
+                       BUG();
                }
        } else {
                ret = rbd_img_fill_from_bvecs(child_img_req,
@@ -2515,7 +2515,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
                num_osd_ops += count_zeroout_ops(obj_req);
                break;
        default:
-               rbd_assert(0);
+               BUG();
        }
 
        obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
@@ -2542,7 +2542,7 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
                __rbd_obj_setup_zeroout(obj_req, which);
                break;
        default:
-               rbd_assert(0);
+               BUG();
        }
 
        ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
@@ -3842,8 +3842,12 @@ static void rbd_queue_workfn(struct work_struct *work)
                goto err_rq;
        }
 
-       rbd_assert(op_type == OBJ_OP_READ ||
-                  rbd_dev->spec->snap_id == CEPH_NOSNAP);
+       if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
+               rbd_warn(rbd_dev, "%s on read-only snapshot",
+                        obj_op_name(op_type));
+               result = -EIO;
+               goto err;
+       }
 
        /*
         * Quit early if the mapped snapshot no longer exists.  It's
index 47eb4d13ed5f870c38e2ba9f38de35957418c825..5e2e0348d460fdef29d1e6ff14b82905f14bbb4a 100644 (file)
@@ -263,8 +263,8 @@ config EDAC_PND2
          micro-server but may appear on others in the future.
 
 config EDAC_MPC85XX
-       tristate "Freescale MPC83xx / MPC85xx"
-       depends on FSL_SOC
+       bool "Freescale MPC83xx / MPC85xx"
+       depends on FSL_SOC && EDAC=y
        help
          Support for error detection and correction on the Freescale
          MPC8349, MPC8560, MPC8540, MPC8548, T4240
index 13594ffadcb3aca4b50d0c6e3b89afc78a6addf3..64922c8fa7e3b729ea6b7919b371fcd76515491b 100644 (file)
@@ -679,22 +679,18 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci)
 
 struct mem_ctl_info *edac_mc_find(int idx)
 {
-       struct mem_ctl_info *mci = NULL;
+       struct mem_ctl_info *mci;
        struct list_head *item;
 
        mutex_lock(&mem_ctls_mutex);
 
        list_for_each(item, &mc_devices) {
                mci = list_entry(item, struct mem_ctl_info, link);
-
-               if (mci->mc_idx >= idx) {
-                       if (mci->mc_idx == idx) {
-                               goto unlock;
-                       }
-                       break;
-               }
+               if (mci->mc_idx == idx)
+                       goto unlock;
        }
 
+       mci = NULL;
 unlock:
        mutex_unlock(&mem_ctls_mutex);
        return mci;
index cd91510a5387d604b86e28af721f3002072b8f70..e694c46ff039cce3c38cfee073d92d0e1b0f4b40 100644 (file)
@@ -118,9 +118,7 @@ static DEFINE_IDA(hwmon_ida);
  * The complex conditional is necessary to avoid a cyclic dependency
  * between hwmon and thermal_sys modules.
  */
-#if IS_REACHABLE(CONFIG_THERMAL) && defined(CONFIG_THERMAL_OF) && \
-       (!defined(CONFIG_THERMAL_HWMON) || \
-        !(defined(MODULE) && IS_MODULE(CONFIG_THERMAL)))
+#ifdef CONFIG_THERMAL_OF
 static int hwmon_thermal_get_temp(void *data, int *temp)
 {
        struct hwmon_thermal_data *tdata = data;
index 2557f198e1750002a1a2d7dacd3fefbb926a7fd7..db269a348b20917b035f9c9ec203c6e2d3fdecb9 100644 (file)
@@ -436,6 +436,15 @@ config DM_DELAY
 
        If unsure, say N.
 
+config DM_DUST
+       tristate "Bad sector simulation target"
+       depends on BLK_DEV_DM
+       ---help---
+       A target that simulates bad sector behavior.
+       Useful for testing.
+
+       If unsure, say N.
+
 config DM_INIT
        bool "DM \"dm-mod.create=\" parameter support"
        depends on BLK_DEV_DM=y
index a52b703e588e23dfa3a240f8882acc2d1eb29212..be7a6eb92abcb47a4371fe6ed435f466124dda7a 100644 (file)
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO)                += dm-bufio.o
 obj-$(CONFIG_DM_BIO_PRISON)    += dm-bio-prison.o
 obj-$(CONFIG_DM_CRYPT)         += dm-crypt.o
 obj-$(CONFIG_DM_DELAY)         += dm-delay.o
+obj-$(CONFIG_DM_DUST)          += dm-dust.o
 obj-$(CONFIG_DM_FLAKEY)                += dm-flakey.o
 obj-$(CONFIG_DM_MULTIPATH)     += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
index 6fc93834da44648176a5163a098fdd485af2bf50..151aa95775be2daee11c7721eb62dde28ec702b1 100644 (file)
@@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd,
                if (r)
                        return r;
 
-               for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+               for (b = 0; ; b++) {
                        r = fn(context, cmd->discard_block_size, to_dblock(b),
                               dm_bitset_cursor_get_value(&c));
                        if (r)
                                break;
+
+                       if (b >= (from_dblock(cmd->discard_nr_blocks) - 1))
+                               break;
+
+                       r = dm_bitset_cursor_next(&c);
+                       if (r)
+                               break;
                }
 
                dm_bitset_cursor_end(&c);
index 7f6462f74ac8fa99dbb3be7ccfe2b5c39068ee7a..1b16d34bb78518a1849aa331c6da2ac1da0e8586 100644 (file)
@@ -946,6 +946,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
 {
 #ifdef CONFIG_BLK_DEV_INTEGRITY
        struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
+       struct mapped_device *md = dm_table_get_md(ti->table);
 
        /* From now we require underlying device with our integrity profile */
        if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
@@ -965,7 +966,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
 
        if (crypt_integrity_aead(cc)) {
                cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
-               DMINFO("Integrity AEAD, tag size %u, IV size %u.",
+               DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md),
                       cc->integrity_tag_size, cc->integrity_iv_size);
 
                if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
@@ -973,7 +974,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
                        return -EINVAL;
                }
        } else if (cc->integrity_iv_size)
-               DMINFO("Additional per-sector space %u bytes for IV.",
+               DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md),
                       cc->integrity_iv_size);
 
        if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
@@ -1031,11 +1032,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc,
        return iv_of_dmreq(cc, dmreq) + cc->iv_size;
 }
 
-static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
+static __le64 *org_sector_of_dmreq(struct crypt_config *cc,
                       struct dm_crypt_request *dmreq)
 {
        u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
-       return (uint64_t*) ptr;
+       return (__le64 *) ptr;
 }
 
 static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
@@ -1071,7 +1072,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
        struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
        struct dm_crypt_request *dmreq;
        u8 *iv, *org_iv, *tag_iv, *tag;
-       uint64_t *sector;
+       __le64 *sector;
        int r = 0;
 
        BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
@@ -1143,9 +1144,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
                r = crypto_aead_decrypt(req);
        }
 
-       if (r == -EBADMSG)
-               DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+       if (r == -EBADMSG) {
+               char b[BDEVNAME_SIZE];
+               DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
                            (unsigned long long)le64_to_cpu(*sector));
+       }
 
        if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
                r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
@@ -1166,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
        struct scatterlist *sg_in, *sg_out;
        struct dm_crypt_request *dmreq;
        u8 *iv, *org_iv, *tag_iv;
-       uint64_t *sector;
+       __le64 *sector;
        int r = 0;
 
        /* Reject unexpected unaligned bio. */
@@ -1788,7 +1791,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
                error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
 
        if (error == -EBADMSG) {
-               DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+               char b[BDEVNAME_SIZE];
+               DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
                            (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
                io->error = BLK_STS_PROTECTION;
        } else if (error < 0)
@@ -1887,7 +1891,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
         * algorithm implementation is used.  Help people debug performance
         * problems by logging the ->cra_driver_name.
         */
-       DMINFO("%s using implementation \"%s\"", ciphermode,
+       DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
               crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name);
        return 0;
 }
@@ -1907,7 +1911,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
                return err;
        }
 
-       DMINFO("%s using implementation \"%s\"", ciphermode,
+       DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
               crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name);
        return 0;
 }
index fddffe251bf6bf5c2ded195c31e392bc228568d8..f496213f8b6753b8760901848140c3f9901e7e06 100644 (file)
@@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti)
 {
        struct delay_c *dc = ti->private;
 
-       destroy_workqueue(dc->kdelayd_wq);
+       if (dc->kdelayd_wq)
+               destroy_workqueue(dc->kdelayd_wq);
 
        if (dc->read.dev)
                dm_put_device(ti, dc->read.dev);
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
new file mode 100644 (file)
index 0000000..845f376
--- /dev/null
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This is a test "dust" device, which fails reads on specified
+ * sectors, emulating the behavior of a hard disk drive sending
+ * a "Read Medium Error" sense.
+ *
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+
+#define DM_MSG_PREFIX "dust"
+
+/*
+ * One entry in a dust device's bad block tree.
+ */
+struct badblock {
+       /* Linkage into dust_device.badblocklist. */
+       struct rb_node node;
+       /* Tree key; stored as (block number * sect_per_block). */
+       sector_t bb;
+};
+
+/*
+ * Per-target context, allocated in dust_ctr() and freed in dust_dtr().
+ */
+struct dust_device {
+       /* Underlying device obtained with dm_get_device(). */
+       struct dm_dev *dev;
+       /* Root of the rbtree of struct badblock entries, keyed by ->bb. */
+       struct rb_root badblocklist;
+       /* Number of entries currently in badblocklist. */
+       unsigned long long badblock_count;
+       /* Taken around accesses to badblocklist and badblock_count. */
+       spinlock_t dust_lock;
+       /* Block size from the <blksz> constructor argument. */
+       unsigned int blksz;
+       /* blksz >> SECTOR_SHIFT. */
+       unsigned int sect_per_block;
+       /* <offset> constructor argument; added to each bio's sector. */
+       sector_t start;
+       /* Set by the "enable" message, cleared by "disable". */
+       bool fail_read_on_bb:1;
+       /* Toggled by the "quiet" message; suppresses some log messages. */
+       bool quiet_mode:1;
+};
+
+/*
+ * Look up the entry whose ->bb equals @blk in the tree rooted at @root.
+ *
+ * Returns the matching entry, or NULL when there is none.
+ */
+static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk)
+{
+       struct rb_node *cur = root->rb_node;
+
+       while (cur) {
+               struct badblock *entry = rb_entry(cur, struct badblock, node);
+
+               if (entry->bb == blk)
+                       return entry;
+
+               /* Smaller keys live on the left, larger keys on the right. */
+               cur = (entry->bb > blk) ? cur->rb_left : cur->rb_right;
+       }
+
+       return NULL;
+}
+
+/*
+ * Link @new into the tree rooted at @root, ordered by ->bb.
+ *
+ * Returns true when the entry was inserted, or false (tree left untouched)
+ * when an entry with the same ->bb is already present.
+ */
+static bool dust_rb_insert(struct rb_root *root, struct badblock *new)
+{
+       struct rb_node **slot = &root->rb_node;
+       struct rb_node *parent = NULL;
+       sector_t key = new->bb;
+
+       while (*slot) {
+               struct badblock *cur;
+
+               parent = *slot;
+               cur = rb_entry(parent, struct badblock, node);
+
+               if (cur->bb == key)
+                       return false;
+
+               slot = (cur->bb > key) ? &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, slot);
+       rb_insert_color(&new->node, root);
+
+       return true;
+}
+
+/*
+ * Remove block number @block from the bad block tree of @dd.
+ *
+ * Returns 0 on success, or -EINVAL when the block is not in the tree.
+ * Log messages are skipped while quiet_mode is set.
+ */
+static int dust_remove_block(struct dust_device *dd, unsigned long long block)
+{
+       struct badblock *entry;
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&dd->dust_lock, flags);
+
+       entry = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+       if (!entry) {
+               if (!dd->quiet_mode) {
+                       DMERR("%s: block %llu not found in badblocklist",
+                             __func__, block);
+               }
+               ret = -EINVAL;
+               goto out_unlock;
+       }
+
+       rb_erase(&entry->node, &dd->badblocklist);
+       dd->badblock_count--;
+       if (!dd->quiet_mode)
+               DMINFO("%s: badblock removed at block %llu", __func__, block);
+       kfree(entry);
+
+out_unlock:
+       spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+       return ret;
+}
+
+/*
+ * Add block number @block to the bad block tree of @dd.
+ *
+ * The entry is allocated before the lock is taken. Returns 0 on success,
+ * -ENOMEM when the allocation fails, or -EINVAL when the block is already
+ * in the tree. Log messages are skipped while quiet_mode is set.
+ */
+static int dust_add_block(struct dust_device *dd, unsigned long long block)
+{
+       struct badblock *entry;
+       unsigned long flags;
+
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry) {
+               if (!dd->quiet_mode)
+                       DMERR("%s: badblock allocation failed", __func__);
+               return -ENOMEM;
+       }
+
+       spin_lock_irqsave(&dd->dust_lock, flags);
+       entry->bb = block * dd->sect_per_block;
+
+       if (dust_rb_insert(&dd->badblocklist, entry)) {
+               dd->badblock_count++;
+               if (!dd->quiet_mode)
+                       DMINFO("%s: badblock added at block %llu", __func__, block);
+               spin_unlock_irqrestore(&dd->dust_lock, flags);
+               return 0;
+       }
+
+       /* Duplicate key: the new entry was never linked, so drop it. */
+       if (!dd->quiet_mode) {
+               DMERR("%s: block %llu already in badblocklist",
+                     __func__, block);
+       }
+       spin_unlock_irqrestore(&dd->dust_lock, flags);
+       kfree(entry);
+
+       return -EINVAL;
+}
+
+/*
+ * Log whether block number @block is present in the bad block tree of @dd.
+ *
+ * The result is only reported through the log; the function always
+ * returns 0.
+ */
+static int dust_query_block(struct dust_device *dd, unsigned long long block)
+{
+       unsigned long flags;
+       bool present;
+
+       spin_lock_irqsave(&dd->dust_lock, flags);
+
+       present = dust_rb_search(&dd->badblocklist,
+                                block * dd->sect_per_block) != NULL;
+       if (present)
+               DMINFO("%s: block %llu found in badblocklist", __func__, block);
+       else
+               DMINFO("%s: block %llu not found in badblocklist", __func__, block);
+
+       spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+       return 0;
+}
+
+/*
+ * Decide the fate of a read: DM_MAPIO_KILL when @thisblock is a key in the
+ * bad block tree, DM_MAPIO_REMAPPED otherwise. Called by dust_map_read()
+ * with dd->dust_lock held.
+ */
+static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
+{
+       if (dust_rb_search(&dd->badblocklist, thisblock))
+               return DM_MAPIO_KILL;
+
+       return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Read-side mapping decision. When @fail_read_on_bb is false no lookup is
+ * done and the read is always remapped; otherwise the bad block tree is
+ * consulted under dd->dust_lock.
+ */
+static int dust_map_read(struct dust_device *dd, sector_t thisblock,
+                        bool fail_read_on_bb)
+{
+       unsigned long flags;
+       int verdict;
+
+       if (!fail_read_on_bb)
+               return DM_MAPIO_REMAPPED;
+
+       spin_lock_irqsave(&dd->dust_lock, flags);
+       verdict = __dust_map_read(dd, thisblock);
+       spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+       return verdict;
+}
+
+/*
+ * If @thisblock is a key in the bad block tree, remove and free that entry
+ * and, unless quiet_mode is set, log the block number that was cleared.
+ * Called by dust_map_write() with dd->dust_lock held.
+ */
+static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
+{
+       struct badblock *hit = dust_rb_search(&dd->badblocklist, thisblock);
+
+       if (!hit)
+               return;
+
+       rb_erase(&hit->node, &dd->badblocklist);
+       dd->badblock_count--;
+       kfree(hit);
+
+       if (dd->quiet_mode)
+               return;
+
+       /* Convert the key back to a block number for the log message. */
+       sector_div(thisblock, dd->sect_per_block);
+       DMINFO("block %llu removed from badblocklist by write",
+              (unsigned long long)thisblock);
+}
+
+/*
+ * Write-side mapping decision. Writes are always remapped; when
+ * @fail_read_on_bb is true the write additionally clears a matching bad
+ * block entry under dd->dust_lock.
+ */
+static int dust_map_write(struct dust_device *dd, sector_t thisblock,
+                         bool fail_read_on_bb)
+{
+       unsigned long flags;
+
+       if (!fail_read_on_bb)
+               return DM_MAPIO_REMAPPED;
+
+       spin_lock_irqsave(&dd->dust_lock, flags);
+       __dust_map_write(dd, thisblock);
+       spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+       return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Map a bio onto the underlying device and apply bad block handling.
+ *
+ * The bio is redirected to dd->dev and its sector is shifted by dd->start.
+ * Reads go through dust_map_read() and everything else through
+ * dust_map_write(); the return value is the DM_MAPIO_* code they produce.
+ */
+static int dust_map(struct dm_target *ti, struct bio *bio)
+{
+       struct dust_device *dd = ti->private;
+       sector_t blk_start;
+
+       bio_set_dev(bio, dd->dev->bdev);
+       bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+       /*
+        * Bad blocks are keyed by the first sector of the block
+        * (block * sect_per_block), so round the bio's start sector down to
+        * its block boundary before the lookup.  Without this, a bio that
+        * starts anywhere else inside a bad block would never match.
+        * dust_ctr() only accepts power-of-2 block sizes of at least 512
+        * bytes, so sect_per_block is a power of 2 and a mask is sufficient.
+        */
+       blk_start = bio->bi_iter.bi_sector & ~((sector_t)dd->sect_per_block - 1);
+
+       if (bio_data_dir(bio) == READ)
+               return dust_map_read(dd, blk_start, dd->fail_read_on_bb);
+
+       return dust_map_write(dd, blk_start, dd->fail_read_on_bb);
+}
+
+/*
+ * Erase and free every entry of @tree.
+ *
+ * @count is the number of entries the caller believes the tree holds; a
+ * mismatch with what is actually found triggers BUG_ON().
+ *
+ * Returns false when the tree was already empty, true when at least one
+ * entry was freed.
+ */
+static bool __dust_clear_badblocks(struct rb_root *tree,
+                                  unsigned long long count)
+{
+       struct rb_node *node = NULL, *nnode = NULL;
+
+       nnode = rb_first(tree);
+       if (nnode == NULL) {
+               BUG_ON(count != 0);
+               return false;
+       }
+
+       while (nnode) {
+               struct badblock *bblk;
+
+               node = nnode;
+               /* Fetch the successor before the current node is erased. */
+               nnode = rb_next(node);
+               /*
+                * Free the containing object rather than the embedded
+                * rb_node, so this stays correct even if 'node' stops being
+                * the first member of struct badblock.
+                */
+               bblk = rb_entry(node, struct badblock, node);
+               rb_erase(node, tree);
+               count--;
+               kfree(bblk);
+       }
+       BUG_ON(count != 0);
+       BUG_ON(tree->rb_node != NULL);
+
+       return true;
+}
+
+/*
+ * Drop every bad block of @dd.
+ *
+ * The tree root and the entry count are copied out and reset while
+ * dd->dust_lock is held; the detached entries are then freed after the
+ * lock has been released. Always returns 0.
+ */
+static int dust_clear_badblocks(struct dust_device *dd)
+{
+       unsigned long flags;
+       struct rb_root badblocklist;
+       unsigned long long badblock_count;
+
+       spin_lock_irqsave(&dd->dust_lock, flags);
+       badblocklist = dd->badblocklist;
+       badblock_count = dd->badblock_count;
+       dd->badblocklist = RB_ROOT;
+       dd->badblock_count = 0;
+       spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+       /* Operates on the local copy of the root, not on dd->badblocklist. */
+       if (!__dust_clear_badblocks(&badblocklist, badblock_count))
+               DMINFO("%s: no badblocks found", __func__);
+       else
+               DMINFO("%s: badblocks cleared", __func__);
+
+       return 0;
+}
+
+/*
+ * Target parameters:
+ *
+ * <device_path> <offset> <blksz>
+ *
+ * device_path: path to the block device
+ * offset: offset to data area from start of device_path
+ * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2)
+ *
+ * Returns 0 on success or a negative errno. On failure ti->error describes
+ * the problem and everything acquired here has been released again.
+ */
+static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+       struct dust_device *dd;
+       unsigned long long tmp;
+       char dummy;
+       unsigned int blksz;
+       unsigned int sect_per_block;
+       sector_t DUST_MAX_BLKSZ_SECTORS = 2097152;
+       sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS);
+       int ret;
+
+       if (argc != 3) {
+               ti->error = "Invalid argument count";
+               return -EINVAL;
+       }
+
+       if (kstrtouint(argv[2], 10, &blksz) || !blksz) {
+               ti->error = "Invalid block size parameter";
+               return -EINVAL;
+       }
+
+       if (blksz < 512) {
+               ti->error = "Block size must be at least 512";
+               return -EINVAL;
+       }
+
+       if (!is_power_of_2(blksz)) {
+               ti->error = "Block size must be a power of 2";
+               return -EINVAL;
+       }
+
+       /* A block may not exceed the target length or the fixed maximum. */
+       if (to_sector(blksz) > max_block_sectors) {
+               ti->error = "Block size is too large";
+               return -EINVAL;
+       }
+
+       sect_per_block = (blksz >> SECTOR_SHIFT);
+
+       /* The trailing %c rejects input with characters after the number. */
+       if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) {
+               ti->error = "Invalid device offset sector";
+               return -EINVAL;
+       }
+
+       dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL);
+       if (dd == NULL) {
+               ti->error = "Cannot allocate context";
+               return -ENOMEM;
+       }
+
+       if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) {
+               ti->error = "Device lookup failed";
+               kfree(dd);
+               return -EINVAL;
+       }
+
+       dd->sect_per_block = sect_per_block;
+       dd->blksz = blksz;
+       dd->start = tmp;
+
+       /*
+        * Whether to fail a read on a "bad" block.
+        * Defaults to false; enabled later by message.
+        */
+       dd->fail_read_on_bb = false;
+
+       /*
+        * Initialize bad block list rbtree.
+        */
+       dd->badblocklist = RB_ROOT;
+       dd->badblock_count = 0;
+       spin_lock_init(&dd->dust_lock);
+
+       dd->quiet_mode = false;
+
+       /*
+        * Limit each I/O to one dust block.  A failure here is an ordinary
+        * constructor error: undo the setup and report it instead of
+        * bringing the kernel down with BUG_ON().
+        */
+       ret = dm_set_target_max_io_len(ti, dd->sect_per_block);
+       if (ret) {
+               ti->error = "Cannot set maximum I/O length";
+               dm_put_device(ti, dd->dev);
+               kfree(dd);
+               return ret;
+       }
+
+       ti->num_discard_bios = 1;
+       ti->num_flush_bios = 1;
+       ti->private = dd;
+
+       return 0;
+}
+
+/*
+ * Target destructor: frees all remaining bad block entries, releases the
+ * underlying device and frees the context. The tree is cleared here
+ * without taking dd->dust_lock.
+ */
+static void dust_dtr(struct dm_target *ti)
+{
+       struct dust_device *dd = ti->private;
+
+       __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count);
+       dm_put_device(ti, dd->dev);
+       kfree(dd);
+}
+
+/*
+ * Handle a message sent to the target.
+ *
+ * One-argument messages: "enable", "disable", "countbadblocks",
+ * "clearbadblocks" and "quiet" (which toggles quiet_mode).
+ * Two-argument messages: "addbadblock", "removebadblock" and "queryblock",
+ * each followed by a decimal block number.
+ *
+ * Command names are matched case-insensitively. Returns the result of the
+ * selected operation, or -EINVAL for unknown commands, a wrong argument
+ * count, an unparsable block number or a block number out of range.
+ * @result_buf and @maxlen are not used.
+ */
+static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
+                       char *result_buf, unsigned int maxlen)
+{
+       struct dust_device *dd = ti->private;
+       sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+       bool invalid_msg = false;
+       int result = -EINVAL;
+       unsigned long long tmp, block;
+       unsigned long flags;
+       char dummy;
+
+       if (argc == 1) {
+               /* These commands exist but need a block number. */
+               if (!strcasecmp(argv[0], "addbadblock") ||
+                   !strcasecmp(argv[0], "removebadblock") ||
+                   !strcasecmp(argv[0], "queryblock")) {
+                       DMERR("%s requires an additional argument", argv[0]);
+               } else if (!strcasecmp(argv[0], "disable")) {
+                       DMINFO("disabling read failures on bad sectors");
+                       dd->fail_read_on_bb = false;
+                       result = 0;
+               } else if (!strcasecmp(argv[0], "enable")) {
+                       DMINFO("enabling read failures on bad sectors");
+                       dd->fail_read_on_bb = true;
+                       result = 0;
+               } else if (!strcasecmp(argv[0], "countbadblocks")) {
+                       spin_lock_irqsave(&dd->dust_lock, flags);
+                       DMINFO("countbadblocks: %llu badblock(s) found",
+                              dd->badblock_count);
+                       spin_unlock_irqrestore(&dd->dust_lock, flags);
+                       result = 0;
+               } else if (!strcasecmp(argv[0], "clearbadblocks")) {
+                       result = dust_clear_badblocks(dd);
+               } else if (!strcasecmp(argv[0], "quiet")) {
+                       /* "quiet" is a toggle, not a one-way switch. */
+                       if (!dd->quiet_mode)
+                               dd->quiet_mode = true;
+                       else
+                               dd->quiet_mode = false;
+                       result = 0;
+               } else {
+                       invalid_msg = true;
+               }
+       } else if (argc == 2) {
+               /* The trailing %c rejects input with characters after the number. */
+               if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
+                       return result;
+
+               block = tmp;
+               /* Convert the device size from sectors to whole dust blocks. */
+               sector_div(size, dd->sect_per_block);
+               /*
+                * NOTE(review): this accepts block == size, which lies past
+                * the last whole block -- confirm whether '>=' was intended.
+                */
+               if (block > size) {
+                       DMERR("selected block value out of range");
+                       return result;
+               }
+
+               if (!strcasecmp(argv[0], "addbadblock"))
+                       result = dust_add_block(dd, block);
+               else if (!strcasecmp(argv[0], "removebadblock"))
+                       result = dust_remove_block(dd, block);
+               else if (!strcasecmp(argv[0], "queryblock"))
+                       result = dust_query_block(dd, block);
+               else
+                       invalid_msg = true;
+
+       } else
+               DMERR("invalid number of arguments '%d'", argc);
+
+       if (invalid_msg)
+               DMERR("unrecognized message '%s' received", argv[0]);
+
+       return result;
+}
+
+/*
+ * Emit the target status into @result.
+ *
+ * STATUSTYPE_INFO:  "<device> <fail_read_on_bad_block|bypass> <quiet|verbose>"
+ * STATUSTYPE_TABLE: "<device> <start> <blksz>"
+ */
+static void dust_status(struct dm_target *ti, status_type_t type,
+                       unsigned int status_flags, char *result, unsigned int maxlen)
+{
+       struct dust_device *dd = ti->private;
+       unsigned int sz = 0;
+
+       switch (type) {
+       case STATUSTYPE_INFO:
+               DMEMIT("%s %s %s", dd->dev->name,
+                      dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass",
+                      dd->quiet_mode ? "quiet" : "verbose");
+               break;
+
+       case STATUSTYPE_TABLE:
+               DMEMIT("%s %llu %u", dd->dev->name,
+                      (unsigned long long)dd->start, dd->blksz);
+               break;
+       }
+}
+
+/*
+ * Hand the underlying block device back through @bdev.
+ *
+ * Returns 0 when the target starts at offset 0 and its length equals the
+ * size of the underlying device, and 1 otherwise.
+ */
+static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+{
+       struct dust_device *dd = ti->private;
+       struct block_device *lower = dd->dev->bdev;
+
+       *bdev = lower;
+
+       /*
+        * Only pass ioctls through if the device sizes match exactly.
+        */
+       if (dd->start)
+               return 1;
+
+       if (ti->len != i_size_read(lower->bd_inode) >> SECTOR_SHIFT)
+               return 1;
+
+       return 0;
+}
+
+/*
+ * Invoke @fn once for the single underlying device, passing the target's
+ * start offset and length, and return whatever @fn returns.
+ */
+static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
+                               void *data)
+{
+       struct dust_device *dd = ti->private;
+
+       return fn(ti, dd->dev, dd->start, ti->len, data);
+}
+
+/*
+ * Target type descriptor for the "dust" target; registered in
+ * dm_dust_init() and unregistered in dm_dust_exit().
+ */
+static struct target_type dust_target = {
+       .name = "dust",
+       .version = {1, 0, 0},
+       .module = THIS_MODULE,
+       .ctr = dust_ctr,
+       .dtr = dust_dtr,
+       .iterate_devices = dust_iterate_devices,
+       .map = dust_map,
+       .message = dust_message,
+       .status = dust_status,
+       .prepare_ioctl = dust_prepare_ioctl,
+};
+
+/*
+ * Module init: register the "dust" target. A negative result from the
+ * registration is logged, and the result is returned to the caller.
+ */
+static int __init dm_dust_init(void)
+{
+       int rc;
+
+       rc = dm_register_target(&dust_target);
+       if (rc < 0)
+               DMERR("dm_register_target failed %d", rc);
+
+       return rc;
+}
+
+/*
+ * Module exit: unregister the "dust" target.
+ */
+static void __exit dm_dust_exit(void)
+{
+       dm_unregister_target(&dust_target);
+}
+
+module_init(dm_dust_init);
+module_exit(dm_dust_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dust test target");
+MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
index 721efc4939422011a188ed1015473f118285352f..3f4139ac1f602463a452ddacef7cf092c3db6c85 100644 (file)
@@ -11,6 +11,7 @@
 #define _LINUX_DM_EXCEPTION_STORE
 
 #include <linux/blkdev.h>
+#include <linux/list_bl.h>
 #include <linux/device-mapper.h>
 
 /*
@@ -27,7 +28,7 @@ typedef sector_t chunk_t;
  * chunk within the device.
  */
 struct dm_exception {
-       struct list_head hash_list;
+       struct hlist_bl_node hash_list;
 
        chunk_t old_chunk;
        chunk_t new_chunk;
index 4b76f84424c3c1a73ef3bc3b9605a1486e3bf88b..352e803f566e1073b107c0beb2eff1c6256dd482 100644 (file)
@@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str)
 
        while (table_entry) {
                DMDEBUG("parsing table \"%s\"", str);
-               if (++dev->dmi.target_count >= DM_MAX_TARGETS) {
+               if (++dev->dmi.target_count > DM_MAX_TARGETS) {
                        DMERR("too many targets %u > %d",
                              dev->dmi.target_count, DM_MAX_TARGETS);
                        return -EINVAL;
@@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str)
                        return -ENOMEM;
                list_add_tail(&dev->list, devices);
 
-               if (++ndev >= DM_MAX_DEVICES) {
-                       DMERR("too many targets %u > %d",
-                             dev->dmi.target_count, DM_MAX_TARGETS);
+               if (++ndev > DM_MAX_DEVICES) {
+                       DMERR("too many devices %lu > %d",
+                             ndev, DM_MAX_DEVICES);
                        return -EINVAL;
                }
 
index c27c32cf4a30df7164793f9129717e3fa020f218..44e76cda087aa658fbb13195fa5d91f4117902b8 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/rbtree.h>
 #include <linux/delay.h>
 #include <linux/random.h>
+#include <linux/reboot.h>
 #include <crypto/hash.h>
 #include <crypto/skcipher.h>
 #include <linux/async_tx.h>
@@ -24,6 +25,7 @@
 
 #define DEFAULT_INTERLEAVE_SECTORS     32768
 #define DEFAULT_JOURNAL_SIZE_FACTOR    7
+#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768
 #define DEFAULT_BUFFER_SECTORS         128
 #define DEFAULT_JOURNAL_WATERMARK      50
 #define DEFAULT_SYNC_MSEC              10000
@@ -33,6 +35,8 @@
 #define METADATA_WORKQUEUE_MAX_ACTIVE  16
 #define RECALC_SECTORS                 8192
 #define RECALC_WRITE_SUPER             16
+#define BITMAP_BLOCK_SIZE              4096    /* don't change it */
+#define BITMAP_FLUSH_INTERVAL          (10 * HZ)
 
 /*
  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -48,6 +52,7 @@
 #define SB_MAGIC                       "integrt"
 #define SB_VERSION_1                   1
 #define SB_VERSION_2                   2
+#define SB_VERSION_3                   3
 #define SB_SECTORS                     8
 #define MAX_SECTORS_PER_BLOCK          8
 
@@ -60,12 +65,14 @@ struct superblock {
        __u64 provided_data_sectors;    /* userspace uses this value */
        __u32 flags;
        __u8 log2_sectors_per_block;
-       __u8 pad[3];
+       __u8 log2_blocks_per_bitmap_bit;
+       __u8 pad[2];
        __u64 recalc_sector;
 };
 
 #define SB_FLAG_HAVE_JOURNAL_MAC       0x1
 #define SB_FLAG_RECALCULATING          0x2
+#define SB_FLAG_DIRTY_BITMAP           0x4
 
 #define        JOURNAL_ENTRY_ROUNDUP           8
 
@@ -151,9 +158,18 @@ struct dm_integrity_c {
        struct workqueue_struct *metadata_wq;
        struct superblock *sb;
        unsigned journal_pages;
+       unsigned n_bitmap_blocks;
+
        struct page_list *journal;
        struct page_list *journal_io;
        struct page_list *journal_xor;
+       struct page_list *recalc_bitmap;
+       struct page_list *may_write_bitmap;
+       struct bitmap_block_status *bbs;
+       unsigned bitmap_flush_interval;
+       int synchronous_mode;
+       struct bio_list synchronous_bios;
+       struct delayed_work bitmap_flush_work;
 
        struct crypto_skcipher *journal_crypt;
        struct scatterlist **journal_scatterlist;
@@ -180,6 +196,7 @@ struct dm_integrity_c {
        __s8 log2_metadata_run;
        __u8 log2_buffer_sectors;
        __u8 sectors_per_block;
+       __u8 log2_blocks_per_bitmap_bit;
 
        unsigned char mode;
        int suspending;
@@ -232,17 +249,20 @@ struct dm_integrity_c {
 
        bool journal_uptodate;
        bool just_formatted;
+       bool recalculate_flag;
 
        struct alg_spec internal_hash_alg;
        struct alg_spec journal_crypt_alg;
        struct alg_spec journal_mac_alg;
 
        atomic64_t number_of_mismatches;
+
+       struct notifier_block reboot_notifier;
 };
 
 struct dm_integrity_range {
        sector_t logical_sector;
-       unsigned n_sectors;
+       sector_t n_sectors;
        bool waiting;
        union {
                struct rb_node node;
@@ -288,6 +308,16 @@ struct journal_io {
        struct journal_completion *comp;
 };
 
+struct bitmap_block_status {
+       struct work_struct work;        /* bitmap_block_work() recovers this struct via container_of() */
+       struct dm_integrity_c *ic;      /* instance this bitmap block belongs to */
+       unsigned idx;                   /* block index; scaled to a sector offset for rw_journal_sectors() */
+       unsigned long *bitmap;          /* NOTE(review): not referenced in the code visible here - confirm use */
+       struct bio_list bio_queue;      /* write bios parked until bitmap_block_work() processes them */
+       spinlock_t bio_queue_lock;      /* taken around every bio_queue access */
+
+};
+
 static struct kmem_cache *journal_io_cache;
 
 #define JOURNAL_IO_MEMPOOL     32
@@ -423,7 +453,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
 
 static void sb_set_version(struct dm_integrity_c *ic)
 {
-       if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+       if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
+               ic->sb->version = SB_VERSION_3;
+       else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
                ic->sb->version = SB_VERSION_2;
        else
                ic->sb->version = SB_VERSION_1;
@@ -447,6 +479,137 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
        return dm_io(&io_req, 1, &io_loc, NULL);
 }
 
+#define BITMAP_OP_TEST_ALL_SET         0
+#define BITMAP_OP_TEST_ALL_CLEAR       1
+#define BITMAP_OP_SET                  2
+#define BITMAP_OP_CLEAR                        3
+
+static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
+                           sector_t sector, sector_t n_sectors, int mode)
+{
+       unsigned long bit, end_bit, this_end_bit, page, end_page;
+       unsigned long *data;
+       /* Test, set or clear (per "mode", a BITMAP_OP_* value) the bits covering n_sectors from sector. */
+       if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
+               DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
+                       (unsigned long long)sector,
+                       (unsigned long long)n_sectors,
+                       ic->sb->log2_sectors_per_block,
+                       ic->log2_blocks_per_bitmap_bit,
+                       mode);
+               BUG();  /* sector or n_sectors is not a multiple of the block size */
+       }
+       /* Empty range: nothing to test or modify. */
+       if (unlikely(!n_sectors))
+               return true;
+       /* Each bit stands for 1 << (log2_sectors_per_block + log2_blocks_per_bitmap_bit) sectors. */
+       bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+       end_bit = (sector + n_sectors - 1) >>
+               (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+       /* Split both bit numbers into a page index plus a bit offset inside that page. */
+       page = bit / (PAGE_SIZE * 8);
+       bit %= PAGE_SIZE * 8;
+
+       end_page = end_bit / (PAGE_SIZE * 8);
+       end_bit %= PAGE_SIZE * 8;
+
+repeat:
+       if (page < end_page) {
+               this_end_bit = PAGE_SIZE * 8 - 1;
+       } else {
+               this_end_bit = end_bit;
+       }
+
+       data = lowmem_page_address(bitmap[page].page);
+       /* In every mode, whole aligned longs go through the inner do-loop; leftover bits one at a time. */
+       if (mode == BITMAP_OP_TEST_ALL_SET) {
+               while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+                               do {
+                                       if (data[bit / BITS_PER_LONG] != -1)
+                                               return false;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+                               continue;
+                       }
+                       if (!test_bit(bit, data))
+                               return false;
+                       bit++;
+               }
+       } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
+               while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+                               do {
+                                       if (data[bit / BITS_PER_LONG] != 0)
+                                               return false;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+                               continue;
+                       }
+                       if (test_bit(bit, data))
+                               return false;
+                       bit++;
+               }
+       } else if (mode == BITMAP_OP_SET) {
+               while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+                               do {
+                                       data[bit / BITS_PER_LONG] = -1;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+                               continue;
+                       }
+                       __set_bit(bit, data);
+                       bit++;
+               }
+       } else if (mode == BITMAP_OP_CLEAR) {
+               if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)  /* the entire page is covered */
+                       clear_page(data);
+               else while (bit <= this_end_bit) {
+                       if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+                               do {
+                                       data[bit / BITS_PER_LONG] = 0;
+                                       bit += BITS_PER_LONG;
+                               } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+                               continue;
+                       }
+                       __clear_bit(bit, data);
+                       bit++;
+               }
+       } else {
+               BUG();
+       }
+       /* Pages remain: continue with bit 0 of the next page. */
+       if (unlikely(page < end_page)) {
+               bit = 0;
+               page++;
+               goto repeat;
+       }
+       /* Reached after SET/CLEAR, and in the TEST modes when no bit contradicted the test. */
+       return true;
+}
+
+static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
+{
+       unsigned n_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+       unsigned pg;
+
+       /* Copy page by page; n_pages is n_bitmap_blocks rounded up to whole pages. */
+       for (pg = 0; pg != n_pages; pg++) {
+               copy_page(lowmem_page_address(dst[pg].page),
+                         lowmem_page_address(src[pg].page));
+       }
+}
+
+static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
+{
+       sector_t bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+       sector_t bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);  /* a bitmap block holds BITMAP_BLOCK_SIZE * 8 bits */
+       /* Full-width index: a 32-bit one could wrap and slip past this bounds check. */
+       BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
+       return &ic->bbs[bitmap_block];
+}
+
 static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
                                 bool e, const char *function)
 {
@@ -455,8 +618,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un
 
        if (unlikely(section >= ic->journal_sections) ||
            unlikely(offset >= limit)) {
-               printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
-                       function, section, offset, ic->journal_sections, limit);
+               DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
+                      function, section, offset, ic->journal_sections, limit);
                BUG();
        }
 #endif
@@ -756,12 +919,12 @@ static void complete_journal_io(unsigned long error, void *context)
        complete_journal_op(comp);
 }
 
-static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
-                      unsigned n_sections, struct journal_completion *comp)
+static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
+                              unsigned sector, unsigned n_sectors, struct journal_completion *comp)
 {
        struct dm_io_request io_req;
        struct dm_io_region io_loc;
-       unsigned sector, n_sectors, pl_index, pl_offset;
+       unsigned pl_index, pl_offset;
        int r;
 
        if (unlikely(dm_integrity_failed(ic))) {
@@ -770,9 +933,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
                return;
        }
 
-       sector = section * ic->journal_section_sectors;
-       n_sectors = n_sections * ic->journal_section_sectors;
-
        pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
        pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
 
@@ -805,6 +965,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
        }
 }
 
+static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
+                      unsigned n_sections, struct journal_completion *comp)
+{
+       /* Scale section units to sectors, then defer to the sector-based helper. */
+       unsigned per_section = ic->journal_section_sectors;
+       unsigned first_sector = section * per_section;
+       unsigned sector_count = n_sections * per_section;
+
+       rw_journal_sectors(ic, op, op_flags, first_sector, sector_count, comp);
+}
+
 static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
 {
        struct journal_completion io_comp;
@@ -988,6 +1159,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit
        } while (unlikely(new_range->waiting));
 }
 
+static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+       if (unlikely(!add_new_range(ic, new_range, true)))      /* callers seen here hold ic->endio_wait.lock */
+               wait_and_add_new_range(ic, new_range);          /* add_new_range() failed: use the waiting variant */
+}
+
 static void init_journal_node(struct journal_node *node)
 {
        RB_CLEAR_NODE(&node->node);
@@ -1204,6 +1381,14 @@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
        int r = dm_integrity_failed(ic);
        if (unlikely(r) && !bio->bi_status)
                bio->bi_status = errno_to_blk_status(r);
+       if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
+               unsigned long flags;
+               spin_lock_irqsave(&ic->endio_wait.lock, flags);
+               bio_list_add(&ic->synchronous_bios, bio);
+               queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+               spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+               return;
+       }
        bio_endio(bio);
 }
 
@@ -1477,7 +1662,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
                        else
                                wanted_tag_size *= ic->tag_size;
                        if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
-                               DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
+                               DMERR("Invalid integrity data size %u, expected %u",
+                                     bip->bip_iter.bi_size, wanted_tag_size);
                                return DM_MAPIO_KILL;
                        }
                }
@@ -1681,7 +1867,7 @@ retry:
                        unsigned ws, we, range_sectors;
 
                        dio->range.n_sectors = min(dio->range.n_sectors,
-                                                  ic->free_sectors << ic->sb->log2_sectors_per_block);
+                                                  (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
                        if (unlikely(!dio->range.n_sectors)) {
                                if (from_map)
                                        goto offload_to_thread;
@@ -1764,6 +1950,20 @@ offload_to_thread:
                goto journal_read_write;
        }
 
+       if (ic->mode == 'B' && dio->write) {
+               if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+                                    dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
+                       struct bitmap_block_status *bbs;
+
+                       bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
+                       spin_lock(&bbs->bio_queue_lock);
+                       bio_list_add(&bbs->bio_queue, bio);
+                       spin_unlock(&bbs->bio_queue_lock);
+                       queue_work(ic->writer_wq, &bbs->work);
+                       return;
+               }
+       }
+
        dio->in_flight = (atomic_t)ATOMIC_INIT(2);
 
        if (need_sync_io) {
@@ -1790,10 +1990,15 @@ offload_to_thread:
 
        if (need_sync_io) {
                wait_for_completion_io(&read_comp);
-               if (unlikely(ic->recalc_wq != NULL) &&
-                   ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+               if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
                    dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
                        goto skip_check;
+               if (ic->mode == 'B') {
+                       if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
+                                            dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
+                               goto skip_check;
+               }
+
                if (likely(!bio->bi_status))
                        integrity_metadata(&dio->work);
                else
@@ -1831,8 +2036,16 @@ static void pad_uncommitted(struct dm_integrity_c *ic)
                wraparound_section(ic, &ic->free_section);
                ic->n_uncommitted_sections++;
        }
-       WARN_ON(ic->journal_sections * ic->journal_section_entries !=
-               (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
+       if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
+                   (ic->n_uncommitted_sections + ic->n_committed_sections) *
+                   ic->journal_section_entries + ic->free_sectors)) {
+               DMCRIT("journal_sections %u, journal_section_entries %u, "
+                      "n_uncommitted_sections %u, n_committed_sections %u, "
+                      "journal_section_entries %u, free_sectors %u",
+                      ic->journal_sections, ic->journal_section_entries,
+                      ic->n_uncommitted_sections, ic->n_committed_sections,
+                      ic->journal_section_entries, ic->free_sectors);
+       }
 }
 
 static void integrity_commit(struct work_struct *w)
@@ -1981,8 +2194,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
                        io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
 
                        spin_lock_irq(&ic->endio_wait.lock);
-                       if (unlikely(!add_new_range(ic, &io->range, true)))
-                               wait_and_add_new_range(ic, &io->range);
+                       add_new_range_and_wait(ic, &io->range);
 
                        if (likely(!from_replay)) {
                                struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
@@ -2120,11 +2332,14 @@ static void integrity_recalc(struct work_struct *w)
        sector_t area, offset;
        sector_t metadata_block;
        unsigned metadata_offset;
+       sector_t logical_sector, n_sectors;
        __u8 *t;
        unsigned i;
        int r;
        unsigned super_counter = 0;
 
+       DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
+
        spin_lock_irq(&ic->endio_wait.lock);
 
 next_chunk:
@@ -2133,21 +2348,49 @@ next_chunk:
                goto unlock_ret;
 
        range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
-       if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+       if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
+               if (ic->mode == 'B') {
+                       DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
+                       queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+               }
                goto unlock_ret;
+       }
 
        get_area_and_offset(ic, range.logical_sector, &area, &offset);
        range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
        if (!ic->meta_dev)
-               range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
-
-       if (unlikely(!add_new_range(ic, &range, true)))
-               wait_and_add_new_range(ic, &range);
+               range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
 
+       add_new_range_and_wait(ic, &range);
        spin_unlock_irq(&ic->endio_wait.lock);
+       logical_sector = range.logical_sector;
+       n_sectors = range.n_sectors;
+
+       if (ic->mode == 'B') {
+               if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
+                       goto advance_and_next;
+               }
+               while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
+                                      ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+                       logical_sector += ic->sectors_per_block;
+                       n_sectors -= ic->sectors_per_block;
+                       cond_resched();
+               }
+               while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
+                                      ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+                       n_sectors -= ic->sectors_per_block;
+                       cond_resched();
+               }
+               get_area_and_offset(ic, logical_sector, &area, &offset);
+       }
+
+       DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
 
        if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
                recalc_write_super(ic);
+               if (ic->mode == 'B') {
+                       queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
+               }
                super_counter = 0;
        }
 
@@ -2162,7 +2405,7 @@ next_chunk:
        io_req.client = ic->io;
        io_loc.bdev = ic->dev->bdev;
        io_loc.sector = get_data_sector(ic, area, offset);
-       io_loc.count = range.n_sectors;
+       io_loc.count = n_sectors;
 
        r = dm_io(&io_req, 1, &io_loc, NULL);
        if (unlikely(r)) {
@@ -2171,8 +2414,8 @@ next_chunk:
        }
 
        t = ic->recalc_tags;
-       for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
-               integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+       for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
+               integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
                t += ic->tag_size;
        }
 
@@ -2184,6 +2427,9 @@ next_chunk:
                goto err;
        }
 
+advance_and_next:
+       cond_resched();
+
        spin_lock_irq(&ic->endio_wait.lock);
        remove_range_unlocked(ic, &range);
        ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
@@ -2199,6 +2445,103 @@ unlock_ret:
        recalc_write_super(ic);
 }
 
+static void bitmap_block_work(struct work_struct *w)
+{
+       struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
+       struct dm_integrity_c *ic = bbs->ic;
+       struct bio *bio;
+       struct bio_list bio_queue;
+       struct bio_list waiting;
+       /* Work handler for one bitmap block: processes the bios parked on bbs->bio_queue. */
+       bio_list_init(&waiting);
+       /* Detach the whole pending list under the lock, then work on the private copy. */
+       spin_lock(&bbs->bio_queue_lock);
+       bio_queue = bbs->bio_queue;
+       bio_list_init(&bbs->bio_queue);
+       spin_unlock(&bbs->bio_queue_lock);
+
+       while ((bio = bio_list_pop(&bio_queue))) {
+               struct dm_integrity_io *dio;
+
+               dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+               /* Range fully set in may_write_bitmap: requeue at once; else mark ic->journal and hold the bio. */
+               if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+                                   dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
+                       remove_range(ic, &dio->range);
+                       INIT_WORK(&dio->work, integrity_bio_wait);
+                       queue_work(ic->wait_wq, &dio->work);
+               } else {
+                       block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
+                                       dio->range.n_sectors, BITMAP_OP_SET);
+                       bio_list_add(&waiting, bio);
+               }
+       }
+       /* No bio needed new bits, so there is nothing to write out. */
+       if (bio_list_empty(&waiting))
+               return;
+       /* Write this block's sectors with FUA (presumably synchronous when comp is NULL - confirm). */
+       rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC,
+                          bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
+                          BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL);
+
+       while ((bio = bio_list_pop(&waiting))) {
+               struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+
+               block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+                               dio->range.n_sectors, BITMAP_OP_SET);
+
+               remove_range(ic, &dio->range);
+               INIT_WORK(&dio->work, integrity_bio_wait);
+               queue_work(ic->wait_wq, &dio->work);
+       }
+       /* Request a delayed bitmap_flush_work run, with delay ic->bitmap_flush_interval. */
+       queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
+}
+
+static void bitmap_flush_work(struct work_struct *work)
+{
+       struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
+       struct dm_integrity_range range;
+       sector_t limit; /* a sector count: unsigned long would truncate it on 32-bit */
+       struct bio *bio;
+
+       dm_integrity_flush_buffers(ic);
+       /* Flush work: while holding a range over every provided sector, clear the bitmap and write it out. */
+       range.logical_sector = 0;
+       range.n_sectors = ic->provided_data_sectors;
+
+       spin_lock_irq(&ic->endio_wait.lock);
+       add_new_range_and_wait(ic, &range);
+       spin_unlock_irq(&ic->endio_wait.lock);
+
+       dm_integrity_flush_buffers(ic);
+       if (ic->meta_dev)
+               blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
+       /* While recalculating, clear only up to recalc_sector rounded down to a bitmap-bit boundary. */
+       limit = ic->provided_data_sectors;
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+               limit = le64_to_cpu(ic->sb->recalc_sector)
+                       >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
+                       << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+       }
+       /* Clear the bits for sectors [0, limit) in ic->journal and ic->may_write_bitmap. */
+       block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
+       block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
+
+       rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+                          ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+       /* Release the range, then complete the write bios that do_endio() parked on synchronous_bios. */
+       spin_lock_irq(&ic->endio_wait.lock);
+       remove_range_unlocked(ic, &range);
+       while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) {
+               bio_endio(bio);
+               spin_unlock_irq(&ic->endio_wait.lock);
+               spin_lock_irq(&ic->endio_wait.lock);
+       }
+       spin_unlock_irq(&ic->endio_wait.lock);
+}
+
+
 static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
                         unsigned n_sections, unsigned char commit_seq)
 {
@@ -2395,9 +2738,37 @@ clear_journal:
                init_journal_node(&ic->journal_tree[i]);
 }
 
+static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic)
+{
+       DEBUG_print("dm_integrity_enter_synchronous_mode\n");
+       if (ic->mode != 'B')
+               return;
+
+       ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1;
+       ic->synchronous_mode = 1;
+       /* Replace any pending delayed flush with an immediate one, then flush commit_wq. */
+       cancel_delayed_work_sync(&ic->bitmap_flush_work);
+       queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+       flush_workqueue(ic->commit_wq);
+}
+
+static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+       /* The notifier block is embedded in dm_integrity_c as reboot_notifier. */
+       struct dm_integrity_c *target = container_of(n, struct dm_integrity_c, reboot_notifier);
+
+       DEBUG_print("dm_integrity_reboot\n");
+       /* "code" and "x" are unused; every notification switches to synchronous mode. */
+       dm_integrity_enter_synchronous_mode(target);
+       return NOTIFY_DONE;
+}
+
 static void dm_integrity_postsuspend(struct dm_target *ti)
 {
        struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+       int r;
+
+       WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
 
        del_timer_sync(&ic->autocommit_timer);
 
@@ -2406,6 +2777,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
        if (ic->recalc_wq)
                drain_workqueue(ic->recalc_wq);
 
+       if (ic->mode == 'B')
+               cancel_delayed_work_sync(&ic->bitmap_flush_work);
+
        queue_work(ic->commit_wq, &ic->commit_work);
        drain_workqueue(ic->commit_wq);
 
@@ -2416,6 +2790,18 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
                dm_integrity_flush_buffers(ic);
        }
 
+       if (ic->mode == 'B') {
+               dm_integrity_flush_buffers(ic);
+#if 1
+               /* set to 0 to test bitmap replay code */
+               init_journal(ic, 0, ic->journal_sections, 0);
+               ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+               r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+               if (unlikely(r))
+                       dm_integrity_io_error(ic, "writing superblock", r);
+#endif
+       }
+
        WRITE_ONCE(ic->suspending, 0);
 
        BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
@@ -2426,11 +2812,70 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
 static void dm_integrity_resume(struct dm_target *ti)
 {
        struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+       int r;
+       DEBUG_print("resume\n");
+
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
+               DEBUG_print("resume dirty_bitmap\n");
+               rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
+                                  ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+               if (ic->mode == 'B') {
+                       if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+                               block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
+                               block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
+                               if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
+                                                    BITMAP_OP_TEST_ALL_CLEAR)) {
+                                       ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+                                       ic->sb->recalc_sector = cpu_to_le64(0);
+                               }
+                       } else {
+                               DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
+                                           ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
+                               ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
+                               block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+                               block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+                               block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+                               rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+                                                  ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+                               ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+                               ic->sb->recalc_sector = cpu_to_le64(0);
+                       }
+               } else {
+                       if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
+                             block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
+                               ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+                               ic->sb->recalc_sector = cpu_to_le64(0);
+                       }
+                       init_journal(ic, 0, ic->journal_sections, 0);
+                       replay_journal(ic);
+                       ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+               }
+               r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+               if (unlikely(r))
+                       dm_integrity_io_error(ic, "writing superblock", r);
+       } else {
+               replay_journal(ic);
+               if (ic->mode == 'B') {
+                       int mode;
+                       ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+                       ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
+                       r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+                       if (unlikely(r))
+                               dm_integrity_io_error(ic, "writing superblock", r);
+
+                       mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR;
+                       block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode);
+                       block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode);
+                       block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode);
+                       rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+                                          ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+               }
+       }
 
-       replay_journal(ic);
-
-       if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+       DEBUG_print("testing recalc: %x\n", ic->sb->flags);
+       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
                __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
+               DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
                if (recalc_pos < ic->provided_data_sectors) {
                        queue_work(ic->recalc_wq, &ic->recalc_work);
                } else if (recalc_pos > ic->provided_data_sectors) {
@@ -2438,6 +2883,16 @@ static void dm_integrity_resume(struct dm_target *ti)
                        recalc_write_super(ic);
                }
        }
+
+       ic->reboot_notifier.notifier_call = dm_integrity_reboot;
+       ic->reboot_notifier.next = NULL;
+       ic->reboot_notifier.priority = INT_MAX - 1;     /* be notified after md and before hardware drivers */
+       WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
+
+#if 0
+       /* set to 1 to stress test synchronous mode */
+       dm_integrity_enter_synchronous_mode(ic);
+#endif
 }
 
 static void dm_integrity_status(struct dm_target *ti, status_type_t type,
@@ -2462,10 +2917,14 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
                __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
                watermark_percentage += ic->journal_entries / 2;
                do_div(watermark_percentage, ic->journal_entries);
-               arg_count = 5;
+               arg_count = 3;
                arg_count += !!ic->meta_dev;
                arg_count += ic->sectors_per_block != 1;
                arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+               arg_count += ic->mode == 'J';
+               arg_count += ic->mode == 'J';
+               arg_count += ic->mode == 'B';
+               arg_count += ic->mode == 'B';
                arg_count += !!ic->internal_hash_alg.alg_string;
                arg_count += !!ic->journal_crypt_alg.alg_string;
                arg_count += !!ic->journal_mac_alg.alg_string;
@@ -2475,13 +2934,19 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
                        DMEMIT(" meta_device:%s", ic->meta_dev->name);
                if (ic->sectors_per_block != 1)
                        DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
-               if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+               if (ic->recalculate_flag)
                        DMEMIT(" recalculate");
                DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
                DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
                DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
-               DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
-               DMEMIT(" commit_time:%u", ic->autocommit_msec);
+               if (ic->mode == 'J') {
+                       DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
+                       DMEMIT(" commit_time:%u", ic->autocommit_msec);
+               }
+               if (ic->mode == 'B') {
+                       DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+                       DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
+               }
 
 #define EMIT_ALG(a, n)                                                 \
                do {                                                    \
@@ -2562,7 +3027,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
                if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
                        return -EINVAL;
        } else {
-               __u64 meta_size = ic->provided_data_sectors * ic->tag_size;
+               __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
                meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
                                >> (ic->log2_buffer_sectors + SECTOR_SHIFT);
                meta_size <<= ic->log2_buffer_sectors;
@@ -2659,37 +3124,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
        blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
 }
 
-static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
+static void dm_integrity_free_page_list(struct page_list *pl)
 {
        unsigned i;
 
        if (!pl)
                return;
-       for (i = 0; i < ic->journal_pages; i++)
-               if (pl[i].page)
-                       __free_page(pl[i].page);
+       for (i = 0; pl[i].page; i++)
+               __free_page(pl[i].page);
        kvfree(pl);
 }
 
-static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
+static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages)
 {
-       size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
        struct page_list *pl;
        unsigned i;
 
-       pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
+       pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
        if (!pl)
                return NULL;
 
-       for (i = 0; i < ic->journal_pages; i++) {
+       for (i = 0; i < n_pages; i++) {
                pl[i].page = alloc_page(GFP_KERNEL);
                if (!pl[i].page) {
-                       dm_integrity_free_page_list(ic, pl);
+                       dm_integrity_free_page_list(pl);
                        return NULL;
                }
                if (i)
                        pl[i - 1].next = &pl[i];
        }
+       pl[i].page = NULL;
+       pl[i].next = NULL;
 
        return pl;
 }
@@ -2702,7 +3167,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str
        kvfree(sl);
 }
 
-static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
+static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
+                                                                  struct page_list *pl)
 {
        struct scatterlist **sl;
        unsigned i;
@@ -2721,7 +3187,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
                unsigned idx;
 
                page_list_location(ic, i, 0, &start_index, &start_offset);
-               page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
+               page_list_location(ic, i, ic->journal_section_sectors - 1,
+                                  &end_index, &end_offset);
 
                n_pages = (end_index - start_index + 1);
 
@@ -2842,7 +3309,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
        }
        ic->journal_pages = journal_pages;
 
-       ic->journal = dm_integrity_alloc_page_list(ic);
+       ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
        if (!ic->journal) {
                *error = "Could not allocate memory for journal";
                r = -ENOMEM;
@@ -2874,7 +3341,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
                DEBUG_print("cipher %s, block size %u iv size %u\n",
                            ic->journal_crypt_alg.alg_string, blocksize, ivsize);
 
-               ic->journal_io = dm_integrity_alloc_page_list(ic);
+               ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
                if (!ic->journal_io) {
                        *error = "Could not allocate memory for journal io";
                        r = -ENOMEM;
@@ -2898,7 +3365,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
                                goto bad;
                        }
 
-                       ic->journal_xor = dm_integrity_alloc_page_list(ic);
+                       ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
                        if (!ic->journal_xor) {
                                *error = "Could not allocate memory for journal xor";
                                r = -ENOMEM;
@@ -2922,7 +3389,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
                        sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
                        memset(crypt_iv, 0x00, ivsize);
 
-                       skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
+                       skcipher_request_set_crypt(req, sg, sg,
+                                                  PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
                        init_completion(&comp.comp);
                        comp.in_flight = (atomic_t)ATOMIC_INIT(1);
                        if (do_crypt(true, req, &comp))
@@ -3063,7 +3531,7 @@ bad:
  *     device
  *     offset from the start of the device
  *     tag size
- *     D - direct writes, J - journal writes, R - recovery mode
+ *     D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode
  *     number of optional arguments
  *     optional arguments:
  *             journal_sectors
@@ -3071,10 +3539,14 @@ bad:
  *             buffer_sectors
  *             journal_watermark
  *             commit_time
+ *             meta_device
+ *             block_size
+ *             sectors_per_bit
+ *             bitmap_flush_interval
  *             internal_hash
  *             journal_crypt
  *             journal_mac
- *             block_size
+ *             recalculate
  */
 static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
@@ -3087,10 +3559,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                {0, 9, "Invalid number of feature args"},
        };
        unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
-       bool recalculate;
        bool should_write_sb;
        __u64 threshold;
        unsigned long long start;
+       __s8 log2_sectors_per_bitmap_bit = -1;
+       __s8 log2_blocks_per_bitmap_bit;
+       __u64 bits_in_journal;
+       __u64 n_bitmap_bits;
 
 #define DIRECT_ARGUMENTS       4
 
@@ -3114,6 +3589,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        init_waitqueue_head(&ic->copy_to_journal_wait);
        init_completion(&ic->crypto_backoff);
        atomic64_set(&ic->number_of_mismatches, 0);
+       ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
 
        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
        if (r) {
@@ -3136,10 +3612,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                }
        }
 
-       if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
+       if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
+           !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
                ic->mode = argv[3][0];
-       else {
-               ti->error = "Invalid mode (expecting J, D, R)";
+       } else {
+               ti->error = "Invalid mode (expecting J, B, D, R)";
                r = -EINVAL;
                goto bad;
        }
@@ -3149,7 +3626,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        buffer_sectors = DEFAULT_BUFFER_SECTORS;
        journal_watermark = DEFAULT_JOURNAL_WATERMARK;
        sync_msec = DEFAULT_SYNC_MSEC;
-       recalculate = false;
        ic->sectors_per_block = 1;
 
        as.argc = argc - DIRECT_ARGUMENTS;
@@ -3161,6 +3637,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        while (extra_args--) {
                const char *opt_string;
                unsigned val;
+               unsigned long long llval;
                opt_string = dm_shift_arg(&as);
                if (!opt_string) {
                        r = -EINVAL;
@@ -3182,7 +3659,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                                dm_put_device(ti, ic->meta_dev);
                                ic->meta_dev = NULL;
                        }
-                       r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev);
+                       r = dm_get_device(ti, strchr(opt_string, ':') + 1,
+                                         dm_table_get_mode(ti->table), &ic->meta_dev);
                        if (r) {
                                ti->error = "Device lookup failed";
                                goto bad;
@@ -3196,6 +3674,15 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                                goto bad;
                        }
                        ic->sectors_per_block = val >> SECTOR_SHIFT;
+               } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
+                       log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
+               } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
+                       if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
+                               r = -EINVAL;
+                               ti->error = "Invalid bitmap_flush_interval argument";
+                               goto bad;
+                       }
+                       ic->bitmap_flush_interval = msecs_to_jiffies(val);
                } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
                        r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
                                            "Invalid internal_hash argument");
@@ -3212,7 +3698,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                        if (r)
                                goto bad;
                } else if (!strcmp(opt_string, "recalculate")) {
-                       recalculate = true;
+                       ic->recalculate_flag = true;
                } else {
                        r = -EINVAL;
                        ti->error = "Invalid argument";
@@ -3228,7 +3714,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
        if (!journal_sectors) {
                journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
-                       ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
+                                     ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
        }
 
        if (!buffer_sectors)
@@ -3263,6 +3749,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        else
                ic->log2_tag_size = -1;
 
+       if (ic->mode == 'B' && !ic->internal_hash) {
+               r = -EINVAL;
+               ti->error = "Bitmap mode can be only used with internal hash";
+               goto bad;
+       }
+
        ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
        ic->autocommit_msec = sync_msec;
        timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3308,7 +3800,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
        }
        INIT_WORK(&ic->commit_work, integrity_commit);
 
-       if (ic->mode == 'J') {
+       if (ic->mode == 'J' || ic->mode == 'B') {
                ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
                if (!ic->writer_wq) {
                        ti->error = "Cannot allocate workqueue";
@@ -3349,7 +3841,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
                        should_write_sb = true;
        }
 
-       if (!ic->sb->version || ic->sb->version > SB_VERSION_2) {
+       if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
                r = -EINVAL;
                ti->error = "Unknown version";
                goto bad;
@@ -3409,6 +3901,27 @@ try_smaller_buffer:
                ti->error = "The device is too small";
                goto bad;
        }
+
+       if (log2_sectors_per_bitmap_bit < 0)
+               log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
+       if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
+               log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
+
+       bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
+       if (bits_in_journal > UINT_MAX)
+               bits_in_journal = UINT_MAX;
+       while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
+               log2_sectors_per_bitmap_bit++;
+
+       log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
+       ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+       if (should_write_sb) {
+               ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+       }
+       n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
+                               + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
+       ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
+
        if (!ic->meta_dev)
                ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
 
@@ -3433,25 +3946,21 @@ try_smaller_buffer:
        DEBUG_print("   journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
        DEBUG_print("   journal_entries %u\n", ic->journal_entries);
        DEBUG_print("   log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
-       DEBUG_print("   device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
+       DEBUG_print("   data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
        DEBUG_print("   initial_sectors 0x%x\n", ic->initial_sectors);
        DEBUG_print("   metadata_run 0x%x\n", ic->metadata_run);
        DEBUG_print("   log2_metadata_run %d\n", ic->log2_metadata_run);
        DEBUG_print("   provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
                    (unsigned long long)ic->provided_data_sectors);
        DEBUG_print("   log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
+       DEBUG_print("   bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
 
-       if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
+       if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
                ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
                ic->sb->recalc_sector = cpu_to_le64(0);
        }
 
-       if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
-               if (!ic->internal_hash) {
-                       r = -EINVAL;
-                       ti->error = "Recalculate is only valid with internal hash";
-                       goto bad;
-               }
+       if (ic->internal_hash) {
                ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
                if (!ic->recalc_wq ) {
                        ti->error = "Cannot allocate workqueue";
@@ -3488,6 +3997,48 @@ try_smaller_buffer:
                r = create_journal(ic, &ti->error);
                if (r)
                        goto bad;
+
+       }
+
+       if (ic->mode == 'B') {
+               unsigned i;
+               unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+
+               ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+               if (!ic->recalc_bitmap) {
+                       ti->error = "Could not allocate memory for bitmap";
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+               if (!ic->may_write_bitmap) {
+                       ti->error = "Could not allocate memory for bitmap";
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
+               if (!ic->bbs) {
+                       ti->error = "Could not allocate memory for bitmap";
+                       r = -ENOMEM;
+                       goto bad;
+               }
+               INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
+               for (i = 0; i < ic->n_bitmap_blocks; i++) {
+                       struct bitmap_block_status *bbs = &ic->bbs[i];
+                       unsigned sector, pl_index, pl_offset;
+
+                       INIT_WORK(&bbs->work, bitmap_block_work);
+                       bbs->ic = ic;
+                       bbs->idx = i;
+                       bio_list_init(&bbs->bio_queue);
+                       spin_lock_init(&bbs->bio_queue_lock);
+
+                       sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
+                       pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+                       pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+                       bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
+               }
        }
 
        if (should_write_sb) {
@@ -3512,6 +4060,17 @@ try_smaller_buffer:
                if (r)
                        goto bad;
        }
+       if (ic->mode == 'B') {
+               unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
+               if (!max_io_len)
+                       max_io_len = 1U << 31;
+               DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
+               if (!ti->max_io_len || ti->max_io_len > max_io_len) {
+                       r = dm_set_target_max_io_len(ti, max_io_len);
+                       if (r)
+                               goto bad;
+               }
+       }
 
        if (!ic->internal_hash)
                dm_integrity_set(ti, ic);
@@ -3520,6 +4079,7 @@ try_smaller_buffer:
        ti->flush_supported = true;
 
        return 0;
+
 bad:
        dm_integrity_dtr(ti);
        return r;
@@ -3542,10 +4102,9 @@ static void dm_integrity_dtr(struct dm_target *ti)
                destroy_workqueue(ic->writer_wq);
        if (ic->recalc_wq)
                destroy_workqueue(ic->recalc_wq);
-       if (ic->recalc_buffer)
-               vfree(ic->recalc_buffer);
-       if (ic->recalc_tags)
-               kvfree(ic->recalc_tags);
+       vfree(ic->recalc_buffer);
+       kvfree(ic->recalc_tags);
+       kvfree(ic->bbs);
        if (ic->bufio)
                dm_bufio_client_destroy(ic->bufio);
        mempool_exit(&ic->journal_io_mempool);
@@ -3555,9 +4114,11 @@ static void dm_integrity_dtr(struct dm_target *ti)
                dm_put_device(ti, ic->dev);
        if (ic->meta_dev)
                dm_put_device(ti, ic->meta_dev);
-       dm_integrity_free_page_list(ic, ic->journal);
-       dm_integrity_free_page_list(ic, ic->journal_io);
-       dm_integrity_free_page_list(ic, ic->journal_xor);
+       dm_integrity_free_page_list(ic->journal);
+       dm_integrity_free_page_list(ic->journal_io);
+       dm_integrity_free_page_list(ic->journal_xor);
+       dm_integrity_free_page_list(ic->recalc_bitmap);
+       dm_integrity_free_page_list(ic->may_write_bitmap);
        if (ic->journal_scatterlist)
                dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
        if (ic->journal_io_scatterlist)
@@ -3595,7 +4156,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
        .name                   = "integrity",
-       .version                = {1, 2, 0},
+       .version                = {1, 3, 0},
        .module                 = THIS_MODULE,
        .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
        .ctr                    = dm_integrity_ctr,
index c740153b4e52df15ee7b4cab6c9b1e83d897143f..1e03bc89e20f68ce25c7e0c60e178c1985cec8bf 100644 (file)
@@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi,
        /* alloc table */
        r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
        if (r)
-               goto err_destroy_dm;
+               goto err_hash_remove;
 
        /* add targets */
        for (i = 0; i < dmi->target_count; i++) {
@@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi,
 
 err_destroy_table:
        dm_table_destroy(t);
+err_hash_remove:
+       (void) __hash_remove(__get_name_cell(dmi->name));
+       /* release reference from __get_name_cell */
+       dm_put(md);
 err_destroy_dm:
        dm_put(md);
        dm_destroy(md);
index 2ee5e357a0a717acf94defdd4fa335a0a49d14b1..dbcc1e41cd57dd5a669466a6907a70f78a16014f 100644 (file)
@@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
        return DM_MAPIO_REMAPPED;
 }
 
-static void multipath_release_clone(struct request *clone)
+static void multipath_release_clone(struct request *clone,
+                                   union map_info *map_context)
 {
+       if (unlikely(map_context)) {
+               /*
+                * non-NULL map_context means caller is still map
+                * method; must undo multipath_clone_and_map()
+                */
+               struct dm_mpath_io *mpio = get_mpio(map_context);
+               struct pgpath *pgpath = mpio->pgpath;
+
+               if (pgpath && pgpath->pg->ps.type->end_io)
+                       pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
+                                                   &pgpath->path,
+                                                   mpio->nr_bytes);
+       }
+
        blk_put_request(clone);
 }
 
@@ -882,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
        if (attached_handler_name || m->hw_handler_name) {
                INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
                r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
+               kfree(attached_handler_name);
                if (r) {
                        dm_put_device(ti, p->path.dev);
                        goto bad;
@@ -896,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 
        return p;
  bad:
-       kfree(attached_handler_name);
        free_pgpath(p);
        return ERR_PTR(r);
 }
index b66745bd08bbcc2dd1ab349f47c7326199518778..5f7063f05ae0771e3a8ebaaf697da58b9d4308d5 100644 (file)
@@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error)
        struct request *rq = tio->orig;
 
        blk_rq_unprep_clone(clone);
-       tio->ti->type->release_clone_rq(clone);
+       tio->ti->type->release_clone_rq(clone, NULL);
 
        rq_end_stats(md, rq);
        blk_mq_end_request(rq, error);
@@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
        rq_end_stats(md, rq);
        if (tio->clone) {
                blk_rq_unprep_clone(tio->clone);
-               tio->ti->type->release_clone_rq(tio->clone);
+               tio->ti->type->release_clone_rq(tio->clone, NULL);
        }
 
        dm_mq_delay_requeue_request(rq, delay_ms);
@@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio)
        case DM_MAPIO_REMAPPED:
                if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
                        /* -ENOMEM */
-                       ti->type->release_clone_rq(clone);
+                       ti->type->release_clone_rq(clone, &tio->info);
                        return DM_MAPIO_REQUEUE;
                }
 
@@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
                ret = dm_dispatch_clone_request(clone, rq);
                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                        blk_rq_unprep_clone(clone);
-                       tio->ti->type->release_clone_rq(clone);
+                       tio->ti->type->release_clone_rq(clone, &tio->info);
                        tio->clone = NULL;
                        return DM_MAPIO_REQUEUE;
                }
index a168963b757df4fe12c885c48456a50d586851fe..3107f2b1988b35f3b22f8324becb48bdffe3bc13 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/kdev_t.h>
 #include <linux/list.h>
+#include <linux/list_bl.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -44,11 +45,11 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
 struct dm_exception_table {
        uint32_t hash_mask;
        unsigned hash_shift;
-       struct list_head *table;
+       struct hlist_bl_head *table; /* bucket array; dm_exception_table_lock() locks individual heads with hlist_bl_lock() */
 };
 
 struct dm_snapshot {
-       struct mutex lock;
+       struct rw_semaphore lock;
 
        struct dm_dev *origin;
        struct dm_dev *cow;
@@ -76,7 +77,9 @@ struct dm_snapshot {
 
        atomic_t pending_exceptions_count;
 
-       /* Protected by "lock" */
+       spinlock_t pe_allocation_lock;
+
+       /* Protected by "pe_allocation_lock" */
        sector_t exception_start_sequence;
 
        /* Protected by kcopyd single-threaded callback */
@@ -457,9 +460,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
                if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
                        continue;
 
-               mutex_lock(&s->lock);
+               down_read(&s->lock);
                active = s->active;
-               mutex_unlock(&s->lock);
+               up_read(&s->lock);
 
                if (active) {
                        if (snap_src)
@@ -618,6 +621,36 @@ static void unregister_snapshot(struct dm_snapshot *s)
  * The lowest hash_shift bits of the chunk number are ignored, allowing
  * some consecutive chunks to be grouped together.
  */
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
+
+/* Lock to protect access to the completed and pending exception hash tables. */
+struct dm_exception_table_lock {
+       struct hlist_bl_head *complete_slot; /* chunk's bucket in s->complete */
+       struct hlist_bl_head *pending_slot; /* chunk's bucket in s->pending */
+};
+
+static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
+                                        struct dm_exception_table_lock *lock)
+{ /* Only resolves the two hash buckets for @chunk; nothing is locked here. */
+       struct dm_exception_table *complete = &s->complete;
+       struct dm_exception_table *pending = &s->pending;
+
+       lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
+       lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+}
+
+static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
+{
+       hlist_bl_lock(lock->complete_slot); /* complete slot is always taken first ... */
+       hlist_bl_lock(lock->pending_slot); /* ... then the pending slot */
+}
+
+static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
+{
+       hlist_bl_unlock(lock->pending_slot); /* reverse of the order used by dm_exception_table_lock() */
+       hlist_bl_unlock(lock->complete_slot);
+}
+
 static int dm_exception_table_init(struct dm_exception_table *et,
                                   uint32_t size, unsigned hash_shift)
 {
@@ -625,12 +658,12 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 
        et->hash_shift = hash_shift;
        et->hash_mask = size - 1;
-       et->table = dm_vcalloc(size, sizeof(struct list_head));
+       et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
        if (!et->table)
                return -ENOMEM;
 
        for (i = 0; i < size; i++)
-               INIT_LIST_HEAD(et->table + i);
+               INIT_HLIST_BL_HEAD(et->table + i);
 
        return 0;
 }
@@ -638,15 +671,16 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 static void dm_exception_table_exit(struct dm_exception_table *et,
                                    struct kmem_cache *mem)
 {
-       struct list_head *slot;
-       struct dm_exception *ex, *next;
+       struct hlist_bl_head *slot;
+       struct dm_exception *ex;
+       struct hlist_bl_node *pos, *n;
        int i, size;
 
        size = et->hash_mask + 1;
        for (i = 0; i < size; i++) {
                slot = et->table + i;
 
-               list_for_each_entry_safe (ex, next, slot, hash_list)
+               hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
                        kmem_cache_free(mem, ex);
        }
 
@@ -660,7 +694,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
 
 static void dm_remove_exception(struct dm_exception *e)
 {
-       list_del(&e->hash_list);
+       hlist_bl_del(&e->hash_list); /* pending_complete() calls this with the chunk's table lock held */
 }
 
 /*
@@ -670,11 +704,12 @@ static void dm_remove_exception(struct dm_exception *e)
 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
                                                chunk_t chunk)
 {
-       struct list_head *slot;
+       struct hlist_bl_head *slot;
+       struct hlist_bl_node *pos;
        struct dm_exception *e;
 
        slot = &et->table[exception_hash(et, chunk)];
-       list_for_each_entry (e, slot, hash_list)
+       hlist_bl_for_each_entry(e, pos, slot, hash_list)
                if (chunk >= e->old_chunk &&
                    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
                        return e;
@@ -721,7 +756,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 static void dm_insert_exception(struct dm_exception_table *eh,
                                struct dm_exception *new_e)
 {
-       struct list_head *l;
+       struct hlist_bl_head *l;
+       struct hlist_bl_node *pos;
        struct dm_exception *e = NULL;
 
        l = &eh->table[exception_hash(eh, new_e->old_chunk)];
@@ -731,7 +767,7 @@ static void dm_insert_exception(struct dm_exception_table *eh,
                goto out;
 
        /* List is ordered by old_chunk */
-       list_for_each_entry_reverse(e, l, hash_list) {
+       hlist_bl_for_each_entry(e, pos, l, hash_list) {
                /* Insert after an existing chunk? */
                if (new_e->old_chunk == (e->old_chunk +
                                         dm_consecutive_chunk_count(e) + 1) &&
@@ -752,12 +788,24 @@ static void dm_insert_exception(struct dm_exception_table *eh,
                        return;
                }
 
-               if (new_e->old_chunk > e->old_chunk)
+               if (new_e->old_chunk < e->old_chunk)
                        break;
        }
 
 out:
-       list_add(&new_e->hash_list, e ? &e->hash_list : l);
+       if (!e) {
+               /*
+                * Either the table doesn't support consecutive chunks or slot
+                * l is empty.
+                */
+               hlist_bl_add_head(&new_e->hash_list, l);
+       } else if (new_e->old_chunk < e->old_chunk) {
+               /* Add before an existing exception */
+               hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
+       } else {
+               /* Add to l's tail: e is the last exception in this slot */
+               hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
+       }
 }
 
 /*
@@ -766,6 +814,7 @@ out:
  */
 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 {
+       struct dm_exception_table_lock lock;
        struct dm_snapshot *s = context;
        struct dm_exception *e;
 
@@ -778,7 +827,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
        /* Consecutive_count is implicitly initialised to zero */
        e->new_chunk = new;
 
+       /*
+        * Although there is no need to lock access to the exception tables
+        * here, if we don't then hlist_bl_add_head(), called by
+        * dm_insert_exception(), will complain about accessing the
+        * corresponding list without locking it first.
+        */
+       dm_exception_table_lock_init(s, old, &lock);
+
+       dm_exception_table_lock(&lock);
        dm_insert_exception(&s->complete, e);
+       dm_exception_table_unlock(&lock);
 
        return 0;
 }
@@ -807,7 +866,7 @@ static int calc_max_buckets(void)
 {
        /* use a fixed size of 2MB */
        unsigned long mem = 2 * 1024 * 1024;
-       mem /= sizeof(struct list_head);
+       mem /= sizeof(struct hlist_bl_head);
 
        return mem;
 }
@@ -927,7 +986,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
        int r;
        chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
 
-       mutex_lock(&s->lock);
+       down_write(&s->lock);
 
        /*
         * Process chunks (and associated exceptions) in reverse order
@@ -942,7 +1001,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
        b = __release_queued_bios_after_merge(s);
 
 out:
-       mutex_unlock(&s->lock);
+       up_write(&s->lock);
        if (b)
                flush_bios(b);
 
@@ -1001,9 +1060,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
                if (linear_chunks < 0) {
                        DMERR("Read error in exception store: "
                              "shutting down merge");
-                       mutex_lock(&s->lock);
+                       down_write(&s->lock);
                        s->merge_failed = 1;
-                       mutex_unlock(&s->lock);
+                       up_write(&s->lock);
                }
                goto shut;
        }
@@ -1044,10 +1103,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
                previous_count = read_pending_exceptions_done_count();
        }
 
-       mutex_lock(&s->lock);
+       down_write(&s->lock);
        s->first_merging_chunk = old_chunk;
        s->num_merging_chunks = linear_chunks;
-       mutex_unlock(&s->lock);
+       up_write(&s->lock);
 
        /* Wait until writes to all 'linear_chunks' drain */
        for (i = 0; i < linear_chunks; i++)
@@ -1089,10 +1148,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
        return;
 
 shut:
-       mutex_lock(&s->lock);
+       down_write(&s->lock);
        s->merge_failed = 1;
        b = __release_queued_bios_after_merge(s);
-       mutex_unlock(&s->lock);
+       up_write(&s->lock);
        error_bios(b);
 
        merge_shutdown(s);
@@ -1188,10 +1247,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        s->snapshot_overflowed = 0;
        s->active = 0;
        atomic_set(&s->pending_exceptions_count, 0);
+       spin_lock_init(&s->pe_allocation_lock);
        s->exception_start_sequence = 0;
        s->exception_complete_sequence = 0;
        s->out_of_order_tree = RB_ROOT;
-       mutex_init(&s->lock);
+       init_rwsem(&s->lock);
        INIT_LIST_HEAD(&s->list);
        spin_lock_init(&s->pe_lock);
        s->state_bits = 0;
@@ -1357,9 +1417,9 @@ static void snapshot_dtr(struct dm_target *ti)
        /* Check whether exception handover must be cancelled */
        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
        if (snap_src && snap_dest && (s == snap_src)) {
-               mutex_lock(&snap_dest->lock);
+               down_write(&snap_dest->lock);
                snap_dest->valid = 0;
-               mutex_unlock(&snap_dest->lock);
+               up_write(&snap_dest->lock);
                DMERR("Cancelling snapshot handover.");
        }
        up_read(&_origins_lock);
@@ -1390,8 +1450,6 @@ static void snapshot_dtr(struct dm_target *ti)
 
        dm_exception_store_destroy(s->store);
 
-       mutex_destroy(&s->lock);
-
        dm_put_device(ti, s->cow);
 
        dm_put_device(ti, s->origin);
@@ -1467,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
        dm_table_event(s->ti->table);
 }
 
+static void invalidate_snapshot(struct dm_snapshot *s, int err)
+{
+       down_write(&s->lock); /* locked wrapper; callers in this patch do not hold s->lock at this point */
+       __invalidate_snapshot(s, err);
+       up_write(&s->lock);
+}
+
 static void pending_complete(void *context, int success)
 {
        struct dm_snap_pending_exception *pe = context;
@@ -1475,43 +1540,63 @@ static void pending_complete(void *context, int success)
        struct bio *origin_bios = NULL;
        struct bio *snapshot_bios = NULL;
        struct bio *full_bio = NULL;
+       struct dm_exception_table_lock lock;
        int error = 0;
 
+       dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
+
        if (!success) {
                /* Read/write error - snapshot is unusable */
-               mutex_lock(&s->lock);
-               __invalidate_snapshot(s, -EIO);
+               invalidate_snapshot(s, -EIO);
                error = 1;
+
+               dm_exception_table_lock(&lock);
                goto out;
        }
 
        e = alloc_completed_exception(GFP_NOIO);
        if (!e) {
-               mutex_lock(&s->lock);
-               __invalidate_snapshot(s, -ENOMEM);
+               invalidate_snapshot(s, -ENOMEM);
                error = 1;
+
+               dm_exception_table_lock(&lock);
                goto out;
        }
        *e = pe->e;
 
-       mutex_lock(&s->lock);
+       down_read(&s->lock);
+       dm_exception_table_lock(&lock);
        if (!s->valid) {
+               up_read(&s->lock);
                free_completed_exception(e);
                error = 1;
+
                goto out;
        }
 
-       /* Check for conflicting reads */
-       __check_for_conflicting_io(s, pe->e.old_chunk);
-
        /*
-        * Add a proper exception, and remove the
-        * in-flight exception from the list.
+        * Add a proper exception. After inserting the completed exception all
+        * subsequent snapshot reads to this chunk will be redirected to the
+        * COW device.  This ensures that we do not starve. Moreover, as long
+        * as the pending exception exists, neither origin writes nor snapshot
+        * merging can overwrite the chunk in origin.
         */
        dm_insert_exception(&s->complete, e);
+       up_read(&s->lock);
+
+       /* Wait for conflicting reads to drain */
+       if (__chunk_is_tracked(s, pe->e.old_chunk)) {
+               dm_exception_table_unlock(&lock);
+               __check_for_conflicting_io(s, pe->e.old_chunk);
+               dm_exception_table_lock(&lock);
+       }
 
 out:
+       /* Remove the in-flight exception from the list */
        dm_remove_exception(&pe->e);
+
+       dm_exception_table_unlock(&lock);
+
        snapshot_bios = bio_list_get(&pe->snapshot_bios);
        origin_bios = bio_list_get(&pe->origin_bios);
        full_bio = pe->full_bio;
@@ -1519,8 +1604,6 @@ out:
                full_bio->bi_end_io = pe->full_bio_end_io;
        increment_pending_exceptions_done_count();
 
-       mutex_unlock(&s->lock);
-
        /* Submit any pending write bios */
        if (error) {
                if (full_bio)
@@ -1660,43 +1743,59 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 }
 
 /*
- * Looks to see if this snapshot already has a pending exception
- * for this chunk, otherwise it allocates a new one and inserts
- * it into the pending table.
+ * Inserts a pending exception into the pending table.
  *
- * NOTE: a write lock must be held on snap->lock before calling
- * this.
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
  */
 static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s,
-                        struct dm_snap_pending_exception *pe, chunk_t chunk)
+__insert_pending_exception(struct dm_snapshot *s,
+                          struct dm_snap_pending_exception *pe, chunk_t chunk)
 {
-       struct dm_snap_pending_exception *pe2;
-
-       pe2 = __lookup_pending_exception(s, chunk);
-       if (pe2) {
-               free_pending_exception(pe);
-               return pe2;
-       }
-
        pe->e.old_chunk = chunk;
        bio_list_init(&pe->origin_bios);
        bio_list_init(&pe->snapshot_bios);
        pe->started = 0;
        pe->full_bio = NULL;
 
+       spin_lock(&s->pe_allocation_lock); /* covers prepare_exception() and the exception_start_sequence increment */
        if (s->store->type->prepare_exception(s->store, &pe->e)) {
+               spin_unlock(&s->pe_allocation_lock); /* failure path: unlock before pe is freed and NULL is returned */
                free_pending_exception(pe);
                return NULL;
        }
 
        pe->exception_sequence = s->exception_start_sequence++;
+       spin_unlock(&s->pe_allocation_lock);
 
        dm_insert_exception(&s->pending, &pe->e);
 
        return pe;
 }
 
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ *
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
+ */
+static struct dm_snap_pending_exception *
+__find_pending_exception(struct dm_snapshot *s,
+                        struct dm_snap_pending_exception *pe, chunk_t chunk)
+{
+       struct dm_snap_pending_exception *pe2;
+
+       pe2 = __lookup_pending_exception(s, chunk);
+       if (pe2) {
+               free_pending_exception(pe); /* an entry already exists; the caller's preallocated pe is discarded */
+               return pe2;
+       }
+
+       return __insert_pending_exception(s, pe, chunk); /* NULL, with pe already freed, if prepare_exception() fails */
+}
+
 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
                            struct bio *bio, chunk_t chunk)
 {
@@ -1714,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
        int r = DM_MAPIO_REMAPPED;
        chunk_t chunk;
        struct dm_snap_pending_exception *pe = NULL;
+       struct dm_exception_table_lock lock;
 
        init_tracked_chunk(bio);
 
@@ -1723,13 +1823,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
        }
 
        chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+       dm_exception_table_lock_init(s, chunk, &lock);
 
        /* Full snapshots are not usable */
        /* To get here the table must be live so s->active is always set. */
        if (!s->valid)
                return DM_MAPIO_KILL;
 
-       mutex_lock(&s->lock);
+       down_read(&s->lock);
+       dm_exception_table_lock(&lock);
 
        if (!s->valid || (unlikely(s->snapshot_overflowed) &&
            bio_data_dir(bio) == WRITE)) {
@@ -1752,15 +1854,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
        if (bio_data_dir(bio) == WRITE) {
                pe = __lookup_pending_exception(s, chunk);
                if (!pe) {
-                       mutex_unlock(&s->lock);
+                       dm_exception_table_unlock(&lock);
                        pe = alloc_pending_exception(s);
-                       mutex_lock(&s->lock);
-
-                       if (!s->valid || s->snapshot_overflowed) {
-                               free_pending_exception(pe);
-                               r = DM_MAPIO_KILL;
-                               goto out_unlock;
-                       }
+                       dm_exception_table_lock(&lock);
 
                        e = dm_lookup_exception(&s->complete, chunk);
                        if (e) {
@@ -1771,13 +1867,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
                        pe = __find_pending_exception(s, pe, chunk);
                        if (!pe) {
+                               dm_exception_table_unlock(&lock);
+                               up_read(&s->lock);
+
+                               down_write(&s->lock);
+
                                if (s->store->userspace_supports_overflow) {
-                                       s->snapshot_overflowed = 1;
-                                       DMERR("Snapshot overflowed: Unable to allocate exception.");
+                                       if (s->valid && !s->snapshot_overflowed) {
+                                               s->snapshot_overflowed = 1;
+                                               DMERR("Snapshot overflowed: Unable to allocate exception.");
+                                       }
                                } else
                                        __invalidate_snapshot(s, -ENOMEM);
+                               up_write(&s->lock);
+
                                r = DM_MAPIO_KILL;
-                               goto out_unlock;
+                               goto out;
                        }
                }
 
@@ -1789,7 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
                    bio->bi_iter.bi_size ==
                    (s->store->chunk_size << SECTOR_SHIFT)) {
                        pe->started = 1;
-                       mutex_unlock(&s->lock);
+
+                       dm_exception_table_unlock(&lock);
+                       up_read(&s->lock);
+
                        start_full_bio(pe, bio);
                        goto out;
                }
@@ -1797,9 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
                bio_list_add(&pe->snapshot_bios, bio);
 
                if (!pe->started) {
-                       /* this is protected by snap->lock */
+                       /* this is protected by the exception table lock */
                        pe->started = 1;
-                       mutex_unlock(&s->lock);
+
+                       dm_exception_table_unlock(&lock);
+                       up_read(&s->lock);
+
                        start_copy(pe);
                        goto out;
                }
@@ -1809,7 +1920,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
        }
 
 out_unlock:
-       mutex_unlock(&s->lock);
+       dm_exception_table_unlock(&lock);
+       up_read(&s->lock);
 out:
        return r;
 }
@@ -1845,7 +1957,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
        chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
 
-       mutex_lock(&s->lock);
+       down_write(&s->lock);
 
        /* Full merging snapshots are redirected to the origin */
        if (!s->valid)
@@ -1876,12 +1988,12 @@ redirect_to_origin:
        bio_set_dev(bio, s->origin->bdev);
 
        if (bio_data_dir(bio) == WRITE) {
-               mutex_unlock(&s->lock);
+               up_write(&s->lock);
                return do_origin(s->origin, bio);
        }
 
 out_unlock:
-       mutex_unlock(&s->lock);
+       up_write(&s->lock);
 
        return r;
 }
@@ -1913,7 +2025,7 @@ static int snapshot_preresume(struct dm_target *ti)
        down_read(&_origins_lock);
        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
        if (snap_src && snap_dest) {
-               mutex_lock(&snap_src->lock);
+               down_read(&snap_src->lock);
                if (s == snap_src) {
                        DMERR("Unable to resume snapshot source until "
                              "handover completes.");
@@ -1923,7 +2035,7 @@ static int snapshot_preresume(struct dm_target *ti)
                              "source is suspended.");
                        r = -EINVAL;
                }
-               mutex_unlock(&snap_src->lock);
+               up_read(&snap_src->lock);
        }
        up_read(&_origins_lock);
 
@@ -1969,11 +2081,11 @@ static void snapshot_resume(struct dm_target *ti)
 
        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
        if (snap_src && snap_dest) {
-               mutex_lock(&snap_src->lock);
-               mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+               down_write(&snap_src->lock);
+               down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
                __handover_exceptions(snap_src, snap_dest);
-               mutex_unlock(&snap_dest->lock);
-               mutex_unlock(&snap_src->lock);
+               up_write(&snap_dest->lock);
+               up_write(&snap_src->lock);
        }
 
        up_read(&_origins_lock);
@@ -1988,9 +2100,9 @@ static void snapshot_resume(struct dm_target *ti)
        /* Now we have correct chunk size, reregister */
        reregister_snapshot(s);
 
-       mutex_lock(&s->lock);
+       down_write(&s->lock);
        s->active = 1;
-       mutex_unlock(&s->lock);
+       up_write(&s->lock);
 }
 
 static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2030,7 +2142,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
        switch (type) {
        case STATUSTYPE_INFO:
 
-               mutex_lock(&snap->lock);
+               down_write(&snap->lock);
 
                if (!snap->valid)
                        DMEMIT("Invalid");
@@ -2055,7 +2167,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
                                DMEMIT("Unknown");
                }
 
-               mutex_unlock(&snap->lock);
+               up_write(&snap->lock);
 
                break;
 
@@ -2107,9 +2219,10 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
        int r = DM_MAPIO_REMAPPED;
        struct dm_snapshot *snap;
        struct dm_exception *e;
-       struct dm_snap_pending_exception *pe;
+       struct dm_snap_pending_exception *pe, *pe2;
        struct dm_snap_pending_exception *pe_to_start_now = NULL;
        struct dm_snap_pending_exception *pe_to_start_last = NULL;
+       struct dm_exception_table_lock lock;
        chunk_t chunk;
 
        /* Do all the snapshots on this origin */
@@ -2121,52 +2234,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
                if (dm_target_is_snapshot_merge(snap->ti))
                        continue;
 
-               mutex_lock(&snap->lock);
-
-               /* Only deal with valid and active snapshots */
-               if (!snap->valid || !snap->active)
-                       goto next_snapshot;
-
                /* Nothing to do if writing beyond end of snapshot */
                if (sector >= dm_table_get_size(snap->ti->table))
-                       goto next_snapshot;
+                       continue;
 
                /*
                 * Remember, different snapshots can have
                 * different chunk sizes.
                 */
                chunk = sector_to_chunk(snap->store, sector);
+               dm_exception_table_lock_init(snap, chunk, &lock);
 
-               /*
-                * Check exception table to see if block
-                * is already remapped in this snapshot
-                * and trigger an exception if not.
-                */
-               e = dm_lookup_exception(&snap->complete, chunk);
-               if (e)
+               down_read(&snap->lock);
+               dm_exception_table_lock(&lock);
+
+               /* Only deal with valid and active snapshots */
+               if (!snap->valid || !snap->active)
                        goto next_snapshot;
 
                pe = __lookup_pending_exception(snap, chunk);
                if (!pe) {
-                       mutex_unlock(&snap->lock);
-                       pe = alloc_pending_exception(snap);
-                       mutex_lock(&snap->lock);
-
-                       if (!snap->valid) {
-                               free_pending_exception(pe);
-                               goto next_snapshot;
-                       }
-
+                       /*
+                        * Check exception table to see if block is already
+                        * remapped in this snapshot and trigger an exception
+                        * if not.
+                        */
                        e = dm_lookup_exception(&snap->complete, chunk);
-                       if (e) {
-                               free_pending_exception(pe);
+                       if (e)
                                goto next_snapshot;
-                       }
 
-                       pe = __find_pending_exception(snap, pe, chunk);
-                       if (!pe) {
-                               __invalidate_snapshot(snap, -ENOMEM);
-                               goto next_snapshot;
+                       dm_exception_table_unlock(&lock);
+                       pe = alloc_pending_exception(snap);
+                       dm_exception_table_lock(&lock);
+
+                       pe2 = __lookup_pending_exception(snap, chunk);
+
+                       if (!pe2) {
+                               e = dm_lookup_exception(&snap->complete, chunk);
+                               if (e) {
+                                       free_pending_exception(pe);
+                                       goto next_snapshot;
+                               }
+
+                               pe = __insert_pending_exception(snap, pe, chunk);
+                               if (!pe) {
+                                       dm_exception_table_unlock(&lock);
+                                       up_read(&snap->lock);
+
+                                       invalidate_snapshot(snap, -ENOMEM);
+                                       continue;
+                               }
+                       } else {
+                               free_pending_exception(pe);
+                               pe = pe2;
                        }
                }
 
@@ -2193,7 +2313,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
                }
 
 next_snapshot:
-               mutex_unlock(&snap->lock);
+               dm_exception_table_unlock(&lock);
+               up_read(&snap->lock);
 
                if (pe_to_start_now) {
                        start_copy(pe_to_start_now);
index 314d17ca64668a70ea1f6445111ca19b2024141e..64dd0b34fcf490cee3779e179e9b8c7c543b7e53 100644 (file)
@@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
        return DM_MAPIO_KILL;
 }
 
-static void io_err_release_clone_rq(struct request *clone)
+static void io_err_release_clone_rq(struct request *clone,
+                                   union map_info *map_context) /* neither argument is used: the body is intentionally empty */
 {
 }
 
index ed3caceaed07c07c33e16b9038f0c3bffd7616d5..7f0840601737f473dfde9bdb06de8fc4f639d264 100644 (file)
@@ -201,6 +201,13 @@ struct dm_pool_metadata {
         */
        bool fail_io:1;
 
+       /*
+        * Set once a thin-pool has been accessed through one of the interfaces
+        * that imply the pool is in-service (e.g. thin devices created/deleted,
+        * thin-pool message, metadata snapshots, etc).
+        */
+       bool in_service:1;
+
        /*
         * Reading the space map roots can fail, so we read it into these
         * buffers before the superblock is locked and updated.
@@ -367,6 +374,32 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Variant that is used for in-core only changes or code that
+ * shouldn't put the pool in service on its own (e.g. commit).
+ */
+static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+       __acquires(pmd->root_lock)
+{
+       down_write(&pmd->root_lock); /* unlike pmd_write_lock(), leaves pmd->in_service untouched */
+}
+#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) /* alias naming the in-core-only use of the raw lock */
+
+static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
+{
+       __pmd_write_lock(pmd);
+       if (unlikely(!pmd->in_service)) /* while false, __commit_transaction() returns 0 early */
+               pmd->in_service = true; /* set with root_lock held for writing */
+}
+
+static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
+       __releases(pmd->root_lock)
+{
+       up_write(&pmd->root_lock); /* drops the write lock taken by __pmd_write_lock() or pmd_write_lock() */
+}
+
+/*----------------------------------------------------------------*/
+
 static int superblock_lock_zero(struct dm_pool_metadata *pmd,
                                struct dm_block **sblock)
 {
@@ -790,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
         */
        BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
 
+       if (unlikely(!pmd->in_service))
+               return 0;
+
        r = __write_changed_details(pmd);
        if (r < 0)
                return r;
@@ -853,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
        pmd->time = 0;
        INIT_LIST_HEAD(&pmd->thin_devices);
        pmd->fail_io = false;
+       pmd->in_service = false;
        pmd->bdev = bdev;
        pmd->data_block_size = data_block_size;
 
@@ -903,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
                        DMWARN("%s: __commit_transaction() failed, error = %d",
                               __func__, r);
        }
-
        if (!pmd->fail_io)
                __destroy_persistent_data_objects(pmd);
 
@@ -1032,10 +1068,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = __create_thin(pmd, dev);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1123,10 +1159,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = __create_snap(pmd, dev, origin);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1166,10 +1202,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = __delete_device(pmd, dev);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1180,7 +1216,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
 
        if (pmd->fail_io)
                goto out;
@@ -1194,7 +1230,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
        r = 0;
 
 out:
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1225,7 +1261,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
         * We commit to ensure the btree roots which we increment in a
         * moment are up to date.
         */
-       __commit_transaction(pmd);
+       r = __commit_transaction(pmd);
+       if (r < 0) {
+               DMWARN("%s: __commit_transaction() failed, error = %d",
+                      __func__, r);
+               return r;
+       }
 
        /*
         * Copy the superblock.
@@ -1283,10 +1324,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = __reserve_metadata_snap(pmd);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1331,10 +1372,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = __release_metadata_snap(pmd);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1377,19 +1418,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock_in_core(pmd);
        if (!pmd->fail_io)
                r = __open_device(pmd, dev, 0, td);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
 
 int dm_pool_close_thin_device(struct dm_thin_device *td)
 {
-       down_write(&td->pmd->root_lock);
+       pmd_write_lock_in_core(td->pmd);
        __close_device(td);
-       up_write(&td->pmd->root_lock);
+       pmd_write_unlock(td->pmd);
 
        return 0;
 }
@@ -1570,10 +1611,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
 {
        int r = -EINVAL;
 
-       down_write(&td->pmd->root_lock);
+       pmd_write_lock(td->pmd);
        if (!td->pmd->fail_io)
                r = __insert(td, block, data_block);
-       up_write(&td->pmd->root_lock);
+       pmd_write_unlock(td->pmd);
 
        return r;
 }
@@ -1657,10 +1698,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
        int r = -EINVAL;
 
-       down_write(&td->pmd->root_lock);
+       pmd_write_lock(td->pmd);
        if (!td->pmd->fail_io)
                r = __remove(td, block);
-       up_write(&td->pmd->root_lock);
+       pmd_write_unlock(td->pmd);
 
        return r;
 }
@@ -1670,10 +1711,10 @@ int dm_thin_remove_range(struct dm_thin_device *td,
 {
        int r = -EINVAL;
 
-       down_write(&td->pmd->root_lock);
+       pmd_write_lock(td->pmd);
        if (!td->pmd->fail_io)
                r = __remove_range(td, begin, end);
-       up_write(&td->pmd->root_lock);
+       pmd_write_unlock(td->pmd);
 
        return r;
 }
@@ -1696,13 +1737,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
 {
        int r = 0;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        for (; b != e; b++) {
                r = dm_sm_inc_block(pmd->data_sm, b);
                if (r)
                        break;
        }
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1711,13 +1752,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
 {
        int r = 0;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        for (; b != e; b++) {
                r = dm_sm_dec_block(pmd->data_sm, b);
                if (r)
                        break;
        }
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1765,10 +1806,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = dm_sm_new_block(pmd->data_sm, result);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1777,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       /*
+        * Care is taken to not have commit be what
+        * triggers putting the thin-pool in-service.
+        */
+       __pmd_write_lock(pmd);
        if (pmd->fail_io)
                goto out;
 
        r = __commit_transaction(pmd);
-       if (r <= 0)
+       if (r < 0)
                goto out;
 
        /*
@@ -1790,7 +1835,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
         */
        r = __begin_transaction(pmd);
 out:
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
        return r;
 }
 
@@ -1806,7 +1851,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (pmd->fail_io)
                goto out;
 
@@ -1817,7 +1862,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
                pmd->fail_io = true;
 
 out:
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1948,10 +1993,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io)
                r = __resize_space_map(pmd->data_sm, new_count);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -1960,29 +2005,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
 {
        int r = -EINVAL;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        if (!pmd->fail_io) {
                r = __resize_space_map(pmd->metadata_sm, new_count);
                if (!r)
                        __set_metadata_reserve(pmd);
        }
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
 
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 {
-       down_write(&pmd->root_lock);
+       pmd_write_lock_in_core(pmd);
        dm_bm_set_read_only(pmd->bm);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 }
 
 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
 {
-       down_write(&pmd->root_lock);
+       pmd_write_lock_in_core(pmd);
        dm_bm_set_read_write(pmd->bm);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 }
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
@@ -1992,9 +2037,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 {
        int r;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock_in_core(pmd);
        r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
 
        return r;
 }
@@ -2005,7 +2050,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
        struct dm_block *sblock;
        struct thin_disk_superblock *disk_super;
 
-       down_write(&pmd->root_lock);
+       pmd_write_lock(pmd);
        pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
 
        r = superblock_lock(pmd, &sblock);
@@ -2019,7 +2064,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
 
        dm_bm_unlock(sblock);
 out:
-       up_write(&pmd->root_lock);
+       pmd_write_unlock(pmd);
        return r;
 }
 
index f7822875589ea8439764d72a488adc2846a28f6f..1cb137f0ef9d7f4b265779563273ed82d136eee3 100644 (file)
@@ -190,7 +190,6 @@ struct writeback_struct {
        struct dm_writecache *wc;
        struct wc_entry **wc_list;
        unsigned wc_list_n;
-       unsigned page_offset;
        struct page *page;
        struct wc_entry *wc_list_inline[WB_LIST_INLINE];
        struct bio bio;
@@ -546,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
                e = container_of(node, struct wc_entry, rb_node);
                if (read_original_sector(wc, e) == block)
                        break;
+
                node = (read_original_sector(wc, e) >= block ?
                        e->rb_node.rb_left : e->rb_node.rb_right);
                if (unlikely(!node)) {
-                       if (!(flags & WFE_RETURN_FOLLOWING)) {
+                       if (!(flags & WFE_RETURN_FOLLOWING))
                                return NULL;
-                       }
                        if (read_original_sector(wc, e) >= block) {
-                               break;
+                               return e;
                        } else {
                                node = rb_next(&e->rb_node);
-                               if (unlikely(!node)) {
+                               if (unlikely(!node))
                                        return NULL;
-                               }
                                e = container_of(node, struct wc_entry, rb_node);
-                               break;
+                               return e;
                        }
                }
        }
@@ -571,7 +569,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
                        node = rb_prev(&e->rb_node);
                else
                        node = rb_next(&e->rb_node);
-               if (!node)
+               if (unlikely(!node))
                        return e;
                e2 = container_of(node, struct wc_entry, rb_node);
                if (read_original_sector(wc, e2) != block)
@@ -804,7 +802,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
                        writecache_free_entry(wc, e);
                }
 
-               if (!node)
+               if (unlikely(!node))
                        break;
 
                e = container_of(node, struct wc_entry, rb_node);
@@ -1478,10 +1476,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
                bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
                wb = container_of(bio, struct writeback_struct, bio);
                wb->wc = wc;
-               wb->bio.bi_end_io = writecache_writeback_endio;
-               bio_set_dev(&wb->bio, wc->dev->bdev);
-               wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
-               wb->page_offset = PAGE_SIZE;
+               bio->bi_end_io = writecache_writeback_endio;
+               bio_set_dev(bio, wc->dev->bdev);
+               bio->bi_iter.bi_sector = read_original_sector(wc, e);
                if (max_pages <= WB_LIST_INLINE ||
                    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
                                                           GFP_NOIO | __GFP_NORETRY |
@@ -1507,12 +1504,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
                        wb->wc_list[wb->wc_list_n++] = f;
                        e = f;
                }
-               bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+               bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
                if (writecache_has_error(wc)) {
                        bio->bi_status = BLK_STS_IOERR;
-                       bio_endio(&wb->bio);
+                       bio_endio(bio);
                } else {
-                       submit_bio(&wb->bio);
+                       submit_bio(bio);
                }
 
                __writeback_throttle(wc, wbl);
index fa68336560c34dab4acdc2bd9f2dd207ce1d2f5e..d8334cd45d7cb5eb90745b09463cf4daaa231021 100644 (file)
@@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
                        goto out;
                }
 
+               if (!nr_blkz)
+                       break;
+
                /* Process report */
                for (i = 0; i < nr_blkz; i++) {
                        ret = dmz_init_zone(zmd, zone, &blkz[i]);
@@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
        /* Get zone information from disk */
        ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
                                  &blkz, &nr_blkz, GFP_NOIO);
+       if (!nr_blkz)
+               ret = -EIO;
        if (ret) {
                dmz_dev_err(zmd->dev, "Get zone %u report failed",
                            dmz_id(zmd, zone));
index 8865c1709e16357178ede5284c4477536bf2369d..51d029bbb740c4f032ea63005611b194fcc5cee8 100644 (file)
@@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
 
        q = bdev_get_queue(dev->bdev);
        dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
-       aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
+       aligned_capacity = dev->capacity &
+                               ~((sector_t)blk_queue_zone_sectors(q) - 1);
        if (ti->begin ||
            ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
                ti->error = "Partial mapping not supported";
index 043f0761e4a0aea8a22a1c6745f3f9bbbc021dfd..1fb1333fefec12b881ec5e32f0a12bad8af6108c 100644 (file)
@@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 }
 
 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
-                                             fmode_t mode) {
+                                             fmode_t mode)
+{
        struct table_device *td;
 
        list_for_each_entry(td, l, list)
@@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 }
 
 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
-                       struct dm_dev **result) {
+                       struct dm_dev **result)
+{
        int r;
        struct table_device *td;
 
@@ -1906,7 +1908,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
 static struct mapped_device *alloc_dev(int minor)
 {
        int r, numa_node_id = dm_get_numa_node();
-       struct dax_device *dax_dev = NULL;
        struct mapped_device *md;
        void *old_md;
 
@@ -1969,11 +1970,10 @@ static struct mapped_device *alloc_dev(int minor)
        sprintf(md->disk->disk_name, "dm-%d", minor);
 
        if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-               dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
-               if (!dax_dev)
+               md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+               if (!md->dax_dev)
                        goto bad;
        }
-       md->dax_dev = dax_dev;
 
        add_disk_no_queue_reg(md->disk);
        format_dev_t(md->name, MKDEV(_major, minor));
index 0a3b8ae4a29c6789b400e91a912f90b8ced1a806..b8a62188f6be5906630983ef8fe183c9ba68ef2f 100644 (file)
@@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end,
 
 static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
 {
+       memset(ll, 0, sizeof(struct ll_disk));
+
        ll->tm = tm;
 
        ll->bitmap_info.tm = tm;
index d730693f299cfccf098f6466f66076a1765109b8..8f7f8efc71a7d945e3e5b014f12f2492e64948ff 100644 (file)
 #define ISC_PFG_CFG0_BPS_TWELVE (0x0 << 28)
 #define ISC_PFE_CFG0_BPS_MASK   GENMASK(30, 28)
 
+#define ISC_PFE_CFG0_COLEN     BIT(12)
+#define ISC_PFE_CFG0_ROWEN     BIT(13)
+
+/* ISC Parallel Front End Configuration 1 Register */
+#define ISC_PFE_CFG1    0x00000010
+
+#define ISC_PFE_CFG1_COLMIN(v)         ((v))
+#define ISC_PFE_CFG1_COLMIN_MASK       GENMASK(15, 0)
+#define ISC_PFE_CFG1_COLMAX(v)         ((v) << 16)
+#define ISC_PFE_CFG1_COLMAX_MASK       GENMASK(31, 16)
+
+/* ISC Parallel Front End Configuration 2 Register */
+#define ISC_PFE_CFG2    0x00000014
+
+#define ISC_PFE_CFG2_ROWMIN(v)         ((v))
+#define ISC_PFE_CFG2_ROWMIN_MASK       GENMASK(15, 0)
+#define ISC_PFE_CFG2_ROWMAX(v)         ((v) << 16)
+#define ISC_PFE_CFG2_ROWMAX_MASK       GENMASK(31, 16)
+
 /* ISC Clock Enable Register */
 #define ISC_CLKEN               0x00000018
 
index 4bba9da206e416a65dde7834b6c25cee14eeed3f..94cb309fdb527dffdf501b0c8fa3fb946e76bdc9 100644 (file)
@@ -721,6 +721,40 @@ static void isc_start_dma(struct isc_device *isc)
        u32 sizeimage = isc->fmt.fmt.pix.sizeimage;
        u32 dctrl_dview;
        dma_addr_t addr0;
+       u32 h, w;
+
+       h = isc->fmt.fmt.pix.height;
+       w = isc->fmt.fmt.pix.width;
+
+       /*
+        * In case the sensor is not RAW, it will output a pixel (12-16 bits)
+        * with two samples on the ISC Data bus (which is 8-12)
+        * ISC will count each sample, so, we need to multiply these values
+        * by two, to get the real number of samples for the required pixels.
+        */
+       if (!ISC_IS_FORMAT_RAW(isc->config.sd_format->mbus_code)) {
+               h <<= 1;
+               w <<= 1;
+       }
+
+       /*
+        * We limit the column/row count that the ISC will output according
+        * to the configured resolution that we want.
+        * This will avoid the situation where the sensor is misconfigured,
+        * sending more data, and the ISC will just take it and DMA to memory,
+        * causing corruption.
+        */
+       regmap_write(regmap, ISC_PFE_CFG1,
+                    (ISC_PFE_CFG1_COLMIN(0) & ISC_PFE_CFG1_COLMIN_MASK) |
+                    (ISC_PFE_CFG1_COLMAX(w - 1) & ISC_PFE_CFG1_COLMAX_MASK));
+
+       regmap_write(regmap, ISC_PFE_CFG2,
+                    (ISC_PFE_CFG2_ROWMIN(0) & ISC_PFE_CFG2_ROWMIN_MASK) |
+                    (ISC_PFE_CFG2_ROWMAX(h - 1) & ISC_PFE_CFG2_ROWMAX_MASK));
+
+       regmap_update_bits(regmap, ISC_PFE_CFG0,
+                          ISC_PFE_CFG0_COLEN | ISC_PFE_CFG0_ROWEN,
+                          ISC_PFE_CFG0_COLEN | ISC_PFE_CFG0_ROWEN);
 
        addr0 = vb2_dma_contig_plane_dma_addr(&isc->cur_frm->vb.vb2_buf, 0);
        regmap_write(regmap, ISC_DAD0, addr0);
@@ -1965,6 +1999,8 @@ static int isc_async_complete(struct v4l2_async_notifier *notifier)
        struct vb2_queue *q = &isc->vb2_vidq;
        int ret;
 
+       INIT_WORK(&isc->awb_work, isc_awb_work);
+
        ret = v4l2_device_register_subdev_nodes(&isc->v4l2_dev);
        if (ret < 0) {
                v4l2_err(&isc->v4l2_dev, "Failed to register subdev nodes\n");
@@ -2018,8 +2054,6 @@ static int isc_async_complete(struct v4l2_async_notifier *notifier)
                return ret;
        }
 
-       INIT_WORK(&isc->awb_work, isc_awb_work);
-
        /* Register video device */
        strscpy(vdev->name, ATMEL_ISC_NAME, sizeof(vdev->name));
        vdev->release           = video_device_release_empty;
@@ -2135,8 +2169,11 @@ static int isc_parse_dt(struct device *dev, struct isc_device *isc)
                        break;
                }
 
-               subdev_entity->asd = devm_kzalloc(dev,
-                                    sizeof(*subdev_entity->asd), GFP_KERNEL);
+               /* asd will be freed by the subsystem once it's added to the
+                * notifier list
+                */
+               subdev_entity->asd = kzalloc(sizeof(*subdev_entity->asd),
+                                            GFP_KERNEL);
                if (!subdev_entity->asd) {
                        of_node_put(rem);
                        ret = -ENOMEM;
@@ -2284,6 +2321,7 @@ static int atmel_isc_probe(struct platform_device *pdev)
                                                     subdev_entity->asd);
                if (ret) {
                        fwnode_handle_put(subdev_entity->asd->match.fwnode);
+                       kfree(subdev_entity->asd);
                        goto cleanup_subdev;
                }
 
index 3ce58dee4422bd16d61802c641e16e392a9597a3..1d96cca615477d513974e78cec2c7553eb603128 100644 (file)
@@ -1515,10 +1515,20 @@ static int coda_queue_setup(struct vb2_queue *vq,
 
 static int coda_buf_prepare(struct vb2_buffer *vb)
 {
+       struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb);
        struct coda_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue);
        struct coda_q_data *q_data;
 
        q_data = get_q_data(ctx, vb->vb2_queue->type);
+       if (V4L2_TYPE_IS_OUTPUT(vb->vb2_queue->type)) {
+               if (vbuf->field == V4L2_FIELD_ANY)
+                       vbuf->field = V4L2_FIELD_NONE;
+               if (vbuf->field != V4L2_FIELD_NONE) {
+                       v4l2_warn(&ctx->dev->v4l2_dev,
+                                 "%s field isn't supported\n", __func__);
+                       return -EINVAL;
+               }
+       }
 
        if (vb2_plane_size(vb, 0) < q_data->sizeimage) {
                v4l2_warn(&ctx->dev->v4l2_dev,
index 8339163a5231e37c5c0dcb926a175bfa64fd3507..4e24f5d781f4fe96da0e43648e13190eac578b49 100644 (file)
@@ -104,7 +104,7 @@ static int vpbe_enum_outputs(struct vpbe_device *vpbe_dev,
                             struct v4l2_output *output)
 {
        struct vpbe_config *cfg = vpbe_dev->cfg;
-       int temp_index = output->index;
+       unsigned int temp_index = output->index;
 
        if (temp_index >= cfg->num_outputs)
                return -EINVAL;
index 37f0d7146dfa480fa3e7b9551177c29495ee7208..cb6a9e3946b6d8fcea499e8d8f5800700e26feae 100644 (file)
@@ -1527,23 +1527,20 @@ static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *b)
        unsigned long size;
        struct videobuf_buffer *vb;
 
-       vb = q->bufs[b->index];
-
        if (!vout->streaming)
                return -EINVAL;
 
-       if (file->f_flags & O_NONBLOCK)
-               /* Call videobuf_dqbuf for non blocking mode */
-               ret = videobuf_dqbuf(q, (struct v4l2_buffer *)b, 1);
-       else
-               /* Call videobuf_dqbuf for  blocking mode */
-               ret = videobuf_dqbuf(q, (struct v4l2_buffer *)b, 0);
+       ret = videobuf_dqbuf(q, b, !!(file->f_flags & O_NONBLOCK));
+       if (ret)
+               return ret;
+
+       vb = q->bufs[b->index];
 
        addr = (unsigned long) vout->buf_phy_addr[vb->i];
        size = (unsigned long) vb->size;
        dma_unmap_single(vout->vid_dev->v4l2_dev.dev,  addr,
                                size, DMA_TO_DEVICE);
-       return ret;
+       return 0;
 }
 
 static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type i)
index 799e526fd3df555441a0821a8948e35166f575b4..8f097e514900307ff2de242b08f4cdd07b9c3188 100644 (file)
@@ -68,6 +68,7 @@ struct rcar_csi2;
 /* Field Detection Control */
 #define FLD_REG                                0x1c
 #define FLD_FLD_NUM(n)                 (((n) & 0xff) << 16)
+#define FLD_DET_SEL(n)                 (((n) & 0x3) << 4)
 #define FLD_FLD_EN4                    BIT(3)
 #define FLD_FLD_EN3                    BIT(2)
 #define FLD_FLD_EN2                    BIT(1)
@@ -84,6 +85,9 @@ struct rcar_csi2;
 
 /* Interrupt Enable */
 #define INTEN_REG                      0x30
+#define INTEN_INT_AFIFO_OF             BIT(27)
+#define INTEN_INT_ERRSOTHS             BIT(4)
+#define INTEN_INT_ERRSOTSYNCHS         BIT(3)
 
 /* Interrupt Source Mask */
 #define INTCLOSE_REG                   0x34
@@ -475,7 +479,7 @@ static int rcsi2_calc_mbps(struct rcar_csi2 *priv, unsigned int bpp)
 static int rcsi2_start_receiver(struct rcar_csi2 *priv)
 {
        const struct rcar_csi2_format *format;
-       u32 phycnt, vcdt = 0, vcdt2 = 0;
+       u32 phycnt, vcdt = 0, vcdt2 = 0, fld = 0;
        unsigned int i;
        int mbps, ret;
 
@@ -507,6 +511,16 @@ static int rcsi2_start_receiver(struct rcar_csi2 *priv)
                        vcdt2 |= vcdt_part << ((i % 2) * 16);
        }
 
+       if (priv->mf.field == V4L2_FIELD_ALTERNATE) {
+               fld = FLD_DET_SEL(1) | FLD_FLD_EN4 | FLD_FLD_EN3 | FLD_FLD_EN2
+                       | FLD_FLD_EN;
+
+               if (priv->mf.height == 240)
+                       fld |= FLD_FLD_NUM(0);
+               else
+                       fld |= FLD_FLD_NUM(1);
+       }
+
        phycnt = PHYCNT_ENABLECLK;
        phycnt |= (1 << priv->lanes) - 1;
 
@@ -514,6 +528,10 @@ static int rcsi2_start_receiver(struct rcar_csi2 *priv)
        if (mbps < 0)
                return mbps;
 
+       /* Enable interrupts. */
+       rcsi2_write(priv, INTEN_REG, INTEN_INT_AFIFO_OF | INTEN_INT_ERRSOTHS
+                   | INTEN_INT_ERRSOTSYNCHS);
+
        /* Init */
        rcsi2_write(priv, TREF_REG, TREF_TREF);
        rcsi2_write(priv, PHTC_REG, 0);
@@ -549,8 +567,7 @@ static int rcsi2_start_receiver(struct rcar_csi2 *priv)
        rcsi2_write(priv, PHYCNT_REG, phycnt);
        rcsi2_write(priv, LINKCNT_REG, LINKCNT_MONITOR_EN |
                    LINKCNT_REG_MONI_PACT_EN | LINKCNT_ICLK_NONSTOP);
-       rcsi2_write(priv, FLD_REG, FLD_FLD_NUM(2) | FLD_FLD_EN4 |
-                   FLD_FLD_EN3 | FLD_FLD_EN2 | FLD_FLD_EN);
+       rcsi2_write(priv, FLD_REG, fld);
        rcsi2_write(priv, PHYCNT_REG, phycnt | PHYCNT_SHUTDOWNZ);
        rcsi2_write(priv, PHYCNT_REG, phycnt | PHYCNT_SHUTDOWNZ | PHYCNT_RSTZ);
 
@@ -675,6 +692,43 @@ static const struct v4l2_subdev_ops rcar_csi2_subdev_ops = {
        .pad    = &rcar_csi2_pad_ops,
 };
 
+static irqreturn_t rcsi2_irq(int irq, void *data)
+{
+       struct rcar_csi2 *priv = data;
+       u32 status, err_status;
+
+       status = rcsi2_read(priv, INTSTATE_REG);
+       err_status = rcsi2_read(priv, INTERRSTATE_REG);
+
+       if (!status)
+               return IRQ_HANDLED;
+
+       rcsi2_write(priv, INTSTATE_REG, status);
+
+       if (!err_status)
+               return IRQ_HANDLED;
+
+       rcsi2_write(priv, INTERRSTATE_REG, err_status);
+
+       dev_info(priv->dev, "Transfer error, restarting CSI-2 receiver\n");
+
+       return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t rcsi2_irq_thread(int irq, void *data)
+{
+       struct rcar_csi2 *priv = data;
+
+       mutex_lock(&priv->lock);
+       rcsi2_stop(priv);
+       usleep_range(1000, 2000);
+       if (rcsi2_start(priv))
+               dev_warn(priv->dev, "Failed to restart CSI-2 receiver\n");
+       mutex_unlock(&priv->lock);
+
+       return IRQ_HANDLED;
+}
+
 /* -----------------------------------------------------------------------------
  * Async handling and registration of subdevices and links.
  */
@@ -947,7 +1001,7 @@ static int rcsi2_probe_resources(struct rcar_csi2 *priv,
                                 struct platform_device *pdev)
 {
        struct resource *res;
-       int irq;
+       int irq, ret;
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        priv->base = devm_ioremap_resource(&pdev->dev, res);
@@ -958,6 +1012,12 @@ static int rcsi2_probe_resources(struct rcar_csi2 *priv,
        if (irq < 0)
                return irq;
 
+       ret = devm_request_threaded_irq(&pdev->dev, irq, rcsi2_irq,
+                                       rcsi2_irq_thread, IRQF_SHARED,
+                                       KBUILD_MODNAME, priv);
+       if (ret)
+               return ret;
+
        priv->rstc = devm_reset_control_get(&pdev->dev, NULL);
        if (IS_ERR(priv->rstc))
                return PTR_ERR(priv->rstc);
index 7fb3a4fa07c1e7d674a4a39e414b51f40f5656e7..447bdfbe5afe190ca2e5ca67ce990a9291729e7a 100644 (file)
@@ -334,8 +334,8 @@ static int tegra_cec_probe(struct platform_device *pdev)
 
        hdmi_dev = cec_notifier_parse_hdmi_phandle(&pdev->dev);
 
-       if (!hdmi_dev)
-               return -ENODEV;
+       if (IS_ERR(hdmi_dev))
+               return PTR_ERR(hdmi_dev);
 
        cec = devm_kzalloc(&pdev->dev, sizeof(struct tegra_cec), GFP_KERNEL);
 
index b6b3ff0fe17f5c4e3a2c3f1460e6da57da4b79d8..7ccb950aa7d4aa30f4b4adf3248488201fecb023 100644 (file)
@@ -22,7 +22,6 @@ config MLXSW_CORE_HWMON
 config MLXSW_CORE_THERMAL
        bool "Thermal zone support for Mellanox Technologies Switch ASICs"
        depends on MLXSW_CORE && THERMAL
-       depends on !(MLXSW_CORE=y && THERMAL=m)
        default y
        ---help---
         Say Y here if you want to automatically control fans speed according
index 1ba4a5154fb5518e145f41ae579f42696067ae2e..64037b0a83877e15d2e6846e5abc662199335396 100644 (file)
@@ -1266,7 +1266,7 @@ static int prp_registered(struct v4l2_subdev *sd)
        if (ret)
                return ret;
 
-       ret = imx_media_capture_device_register(priv->vdev);
+       ret = imx_media_capture_device_register(priv->md, priv->vdev);
        if (ret)
                return ret;
 
index b7ce9d4392791ab3bea39a020644b6122bd11f41..9430c835c4349ddfe68ec5937ea250d6e2d6f121 100644 (file)
@@ -701,7 +701,8 @@ void imx_media_capture_device_error(struct imx_media_video_dev *vdev)
 }
 EXPORT_SYMBOL_GPL(imx_media_capture_device_error);
 
-int imx_media_capture_device_register(struct imx_media_video_dev *vdev)
+int imx_media_capture_device_register(struct imx_media_dev *md,
+                                     struct imx_media_video_dev *vdev)
 {
        struct capture_priv *priv = to_capture_priv(vdev);
        struct v4l2_subdev *sd = priv->src_sd;
@@ -710,8 +711,7 @@ int imx_media_capture_device_register(struct imx_media_video_dev *vdev)
        struct v4l2_subdev_format fmt_src;
        int ret;
 
-       /* get media device */
-       priv->md = dev_get_drvdata(sd->v4l2_dev->dev);
+       priv->md = md;
 
        vfd->v4l2_dev = sd->v4l2_dev;
 
index 28fe66052cc7486622d79ac91e5f859aa3fb0932..1d248aca40a9ec3795fcf27c70fb1fa81ce9a164 100644 (file)
@@ -1812,7 +1812,7 @@ static int csi_registered(struct v4l2_subdev *sd)
        if (ret)
                goto free_fim;
 
-       ret = imx_media_capture_device_register(priv->vdev);
+       ret = imx_media_capture_device_register(priv->md, priv->vdev);
        if (ret)
                goto free_fim;
 
index eb59ba0c3b62bfbc270142513ceb27cccd275adf..6587aa49e0051859b79076c515f9701a1876870d 100644 (file)
@@ -268,7 +268,8 @@ int imx_media_of_add_csi(struct imx_media_dev *imxmd,
 struct imx_media_video_dev *
 imx_media_capture_device_init(struct v4l2_subdev *src_sd, int pad);
 void imx_media_capture_device_remove(struct imx_media_video_dev *vdev);
-int imx_media_capture_device_register(struct imx_media_video_dev *vdev);
+int imx_media_capture_device_register(struct imx_media_dev *md,
+                                     struct imx_media_video_dev *vdev);
 void imx_media_capture_device_unregister(struct imx_media_video_dev *vdev);
 struct imx_media_buffer *
 imx_media_capture_device_next_buf(struct imx_media_video_dev *vdev);
index 18eb5d3ecf102ad5bca3f84379f1c290d2929a92..a708a0340eb18a0a0d5931870b946e13ba78181a 100644 (file)
@@ -1126,7 +1126,7 @@ static int imx7_csi_registered(struct v4l2_subdev *sd)
        if (ret < 0)
                return ret;
 
-       ret = imx_media_capture_device_register(csi->vdev);
+       ret = imx_media_capture_device_register(csi->imxmd, csi->vdev);
        if (ret < 0)
                return ret;
 
index 58721c46fba4184ea03ce95da4c574704b984f68..8bbc905b26c83d7b41c8159ad898f46bfeaa6b76 100644 (file)
@@ -352,7 +352,7 @@ static int rockchip_vpu_video_device_register(struct rockchip_vpu_dev *vpu)
        vpu->vfd_enc = vfd;
        video_set_drvdata(vfd, vpu);
 
-       ret = video_register_device(vfd, VFL_TYPE_GRABBER, 0);
+       ret = video_register_device(vfd, VFL_TYPE_GRABBER, -1);
        if (ret) {
                v4l2_err(&vpu->v4l2_dev, "Failed to register video device\n");
                goto err_free_dev;
@@ -463,6 +463,8 @@ static int rockchip_vpu_probe(struct platform_device *pdev)
 
        vpu->mdev.dev = vpu->dev;
        strscpy(vpu->mdev.model, DRIVER_NAME, sizeof(vpu->mdev.model));
+       strscpy(vpu->mdev.bus_info, "platform: " DRIVER_NAME,
+               sizeof(vpu->mdev.bus_info)); /* bound by the destination */
        media_device_init(&vpu->mdev);
        vpu->v4l2_dev.mdev = &vpu->mdev;
 
@@ -480,15 +482,18 @@ static int rockchip_vpu_probe(struct platform_device *pdev)
        return 0;
 err_video_dev_unreg:
        if (vpu->vfd_enc) {
+               v4l2_m2m_unregister_media_controller(vpu->m2m_dev);
                video_unregister_device(vpu->vfd_enc);
                video_device_release(vpu->vfd_enc);
        }
 err_m2m_rel:
+       media_device_cleanup(&vpu->mdev);
        v4l2_m2m_release(vpu->m2m_dev);
 err_v4l2_unreg:
        v4l2_device_unregister(&vpu->v4l2_dev);
 err_clk_unprepare:
        clk_bulk_unprepare(vpu->variant->num_clocks, vpu->clocks);
+       pm_runtime_dont_use_autosuspend(vpu->dev);
        pm_runtime_disable(vpu->dev);
        return ret;
 }
@@ -500,15 +505,16 @@ static int rockchip_vpu_remove(struct platform_device *pdev)
        v4l2_info(&vpu->v4l2_dev, "Removing %s\n", pdev->name);
 
        media_device_unregister(&vpu->mdev);
-       v4l2_m2m_unregister_media_controller(vpu->m2m_dev);
-       v4l2_m2m_release(vpu->m2m_dev);
-       media_device_cleanup(&vpu->mdev);
        if (vpu->vfd_enc) {
+               v4l2_m2m_unregister_media_controller(vpu->m2m_dev);
                video_unregister_device(vpu->vfd_enc);
                video_device_release(vpu->vfd_enc);
        }
+       media_device_cleanup(&vpu->mdev);
+       v4l2_m2m_release(vpu->m2m_dev);
        v4l2_device_unregister(&vpu->v4l2_dev);
        clk_bulk_unprepare(vpu->variant->num_clocks, vpu->clocks);
+       pm_runtime_dont_use_autosuspend(vpu->dev);
        pm_runtime_disable(vpu->dev);
        return 0;
 }
index fb5e36aedd8c5e58feda0b675eba83e038e309b3..dcbfc3cbc9f31539b26f0f10ad577fca3224c0be 100644 (file)
@@ -152,9 +152,10 @@ static int vidioc_querycap(struct file *file, void *priv,
                           struct v4l2_capability *cap)
 {
        struct rockchip_vpu_dev *vpu = video_drvdata(file);
+       struct video_device *vdev = video_devdata(file);
 
        strscpy(cap->driver, vpu->dev->driver->name, sizeof(cap->driver));
-       strscpy(cap->card, vpu->vfd_enc->name, sizeof(cap->card));
+       strscpy(cap->card, vdev->name, sizeof(cap->card));
        snprintf(cap->bus_info, sizeof(cap->bus_info), "platform: %s",
                 vpu->dev->driver->name);
        return 0;
index 66a709d5d6b92e747897f780ca09dbe626a708f1..15bdd25780beecc67eac993a31308c0257571c79 100644 (file)
@@ -3,7 +3,7 @@
 #
 
 menuconfig THERMAL
-       tristate "Generic Thermal sysfs driver"
+       bool "Generic Thermal sysfs driver"
        help
          Generic Thermal Sysfs driver offers a generic mechanism for
          thermal management. Usually it's made up of one or more thermal
@@ -11,7 +11,7 @@ menuconfig THERMAL
          Each thermal zone contains its own temperature, trip points,
          cooling devices.
          All platforms with ACPI thermal support can use this driver.
-         If you want this support, you should say Y or M here.
+         If you want this support, you should say Y here.
 
 if THERMAL
 
@@ -24,7 +24,6 @@ config THERMAL_STATISTICS
 
 config THERMAL_EMERGENCY_POWEROFF_DELAY_MS
        int "Emergency poweroff delay in milli-seconds"
-       depends on THERMAL
        default 0
        help
          Thermal subsystem will issue a graceful shutdown when
@@ -149,10 +148,9 @@ config THERMAL_GOV_POWER_ALLOCATOR
          allocating and limiting power to devices.
 
 config CPU_THERMAL
-       bool "generic cpu cooling support"
+       bool "Generic cpu cooling support"
        depends on CPU_FREQ
        depends on THERMAL_OF
-       depends on THERMAL=y
        help
          This implements the generic cpu cooling mechanism through frequency
          reduction. An ACPI version of this already exists
index 2e013eeb4a1d9f79d65512bca49e4042e959f0fe..2c727a820759cc8ac78e84a4f41e8d559aefb6ce 100644 (file)
@@ -1,6 +1,5 @@
 config INTEL_POWERCLAMP
        tristate "Intel PowerClamp idle injection driver"
-       depends on THERMAL
        depends on X86
        depends on CPU_SUP_INTEL
        help
index 0c19fcd56a0da02713e93778afc78c84017aeeef..79a7df2baa92450ee17b843789fdf648149f5c8a 100644 (file)
@@ -220,6 +220,7 @@ static int int3403_add(struct platform_device *pdev)
 {
        struct int3403_priv *priv;
        int result = 0;
+       unsigned long long tmp;
        acpi_status status;
 
        priv = devm_kzalloc(&pdev->dev, sizeof(struct int3403_priv),
@@ -234,19 +235,18 @@ static int int3403_add(struct platform_device *pdev)
                goto err;
        }
 
-       status = acpi_evaluate_integer(priv->adev->handle, "PTYP",
-                                      NULL, &priv->type);
-       if (ACPI_FAILURE(status)) {
-               unsigned long long tmp;
 
-               status = acpi_evaluate_integer(priv->adev->handle, "_TMP",
-                                              NULL, &tmp);
+       status = acpi_evaluate_integer(priv->adev->handle, "_TMP",
+                                      NULL, &tmp);
+       if (ACPI_FAILURE(status)) {
+               status = acpi_evaluate_integer(priv->adev->handle, "PTYP",
+                                      NULL, &priv->type);
                if (ACPI_FAILURE(status)) {
                        result = -EINVAL;
                        goto err;
-               } else {
-                       priv->type = INT3403_TYPE_SENSOR;
                }
+       } else {
+               priv->type = INT3403_TYPE_SENSOR;
        }
 
        platform_set_drvdata(pdev, priv);
index 8e1cf4d789be10df2413e1311bba63dde3545f43..2e6071a82da2748071971dd37e65a42ce48e54dd 100644 (file)
@@ -81,22 +81,13 @@ static ssize_t power_limit_##index##_##suffix##_show(struct device *dev, \
                                        struct device_attribute *attr, \
                                        char *buf) \
 { \
-       struct pci_dev *pci_dev; \
-       struct platform_device *pdev; \
-       struct proc_thermal_device *proc_dev; \
+       struct proc_thermal_device *proc_dev = dev_get_drvdata(dev); \
        \
        if (proc_thermal_emum_mode == PROC_THERMAL_NONE) { \
                dev_warn(dev, "Attempted to get power limit before device was initialized!\n"); \
                return 0; \
        } \
        \
-       if (proc_thermal_emum_mode == PROC_THERMAL_PLATFORM_DEV) { \
-               pdev = to_platform_device(dev); \
-               proc_dev = platform_get_drvdata(pdev); \
-       } else { \
-               pci_dev = to_pci_dev(dev); \
-               proc_dev = pci_get_drvdata(pci_dev); \
-       } \
        return sprintf(buf, "%lu\n",\
        (unsigned long)proc_dev->power_limits[index].suffix * 1000); \
 }
@@ -274,7 +265,7 @@ static void proc_thermal_notify(acpi_handle handle, u32 event, void *data)
                                THERMAL_DEVICE_POWER_CAPABILITY_CHANGED);
                break;
        default:
-               dev_err(proc_priv->dev, "Unsupported event [0x%x]\n", event);
+               dev_dbg(proc_priv->dev, "Unsupported event [0x%x]\n", event);
                break;
        }
 }
index cdb455ffd5752ef1e7c9094ebf5205ee97bf4403..3ce20fec86a2bd583922fc68a07fc716d8d4755e 100644 (file)
@@ -1,6 +1,5 @@
 config QCOM_TSENS
        tristate "Qualcomm TSENS Temperature Alarm"
-       depends on THERMAL
        depends on QCOM_QFPROM
        depends on ARCH_QCOM || COMPILE_TEST
        help
index e0b530603db6a100a1343f22bbada1aa629911be..46cfb7de4eb289d7b81db525178a5dd5ef6656d4 100644 (file)
@@ -266,7 +266,7 @@ static int __init thermal_register_governors(void)
        return thermal_gov_power_allocator_register();
 }
 
-static void thermal_unregister_governors(void)
+static void __init thermal_unregister_governors(void)
 {
        thermal_gov_step_wise_unregister();
        thermal_gov_fair_share_unregister();
@@ -941,7 +941,7 @@ static void bind_cdev(struct thermal_cooling_device *cdev)
  */
 static struct thermal_cooling_device *
 __thermal_cooling_device_register(struct device_node *np,
-                                 char *type, void *devdata,
+                                 const char *type, void *devdata,
                                  const struct thermal_cooling_device_ops *ops)
 {
        struct thermal_cooling_device *cdev;
@@ -1015,7 +1015,7 @@ __thermal_cooling_device_register(struct device_node *np,
  * ERR_PTR. Caller must check return value with IS_ERR*() helpers.
  */
 struct thermal_cooling_device *
-thermal_cooling_device_register(char *type, void *devdata,
+thermal_cooling_device_register(const char *type, void *devdata,
                                const struct thermal_cooling_device_ops *ops)
 {
        return __thermal_cooling_device_register(NULL, type, devdata, ops);
@@ -1039,7 +1039,7 @@ EXPORT_SYMBOL_GPL(thermal_cooling_device_register);
  */
 struct thermal_cooling_device *
 thermal_of_cooling_device_register(struct device_node *np,
-                                  char *type, void *devdata,
+                                  const char *type, void *devdata,
                                   const struct thermal_cooling_device_ops *ops)
 {
        return __thermal_cooling_device_register(np, type, devdata, ops);
@@ -1543,6 +1543,7 @@ static int thermal_pm_notify(struct notifier_block *nb,
                             unsigned long mode, void *_unused)
 {
        struct thermal_zone_device *tz;
+       enum thermal_device_mode tz_mode;
 
        switch (mode) {
        case PM_HIBERNATION_PREPARE:
@@ -1555,6 +1556,13 @@ static int thermal_pm_notify(struct notifier_block *nb,
        case PM_POST_SUSPEND:
                atomic_set(&in_suspend, 0);
                list_for_each_entry(tz, &thermal_tz_list, node) {
+                       tz_mode = THERMAL_DEVICE_ENABLED;
+                       if (tz->ops->get_mode)
+                               tz->ops->get_mode(tz, &tz_mode);
+
+                       if (tz_mode == THERMAL_DEVICE_DISABLED)
+                               continue;
+
                        thermal_zone_device_init(tz);
                        thermal_zone_device_update(tz,
                                                   THERMAL_EVENT_UNSPECIFIED);
@@ -1612,19 +1620,4 @@ error:
        mutex_destroy(&poweroff_lock);
        return result;
 }
-
-static void __exit thermal_exit(void)
-{
-       unregister_pm_notifier(&thermal_pm_nb);
-       of_thermal_destroy_zones();
-       genetlink_exit();
-       class_unregister(&thermal_class);
-       thermal_unregister_governors();
-       ida_destroy(&thermal_tz_ida);
-       ida_destroy(&thermal_cdev_ida);
-       mutex_destroy(&thermal_list_lock);
-       mutex_destroy(&thermal_governor_lock);
-}
-
 fs_initcall(thermal_init);
-module_exit(thermal_exit);
index 967db336d11ae016324f4f15d7cbd33b809045c2..9eaff55df7b43ee490187b629856a4f35be6a46f 100644 (file)
@@ -251,7 +251,7 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
        _enter("%s", cell->name);
 
        ret = dns_query("afsdb", cell->name, cell->name_len, "srv=1",
-                       &result, _expiry);
+                       &result, _expiry, true);
        if (ret < 0) {
                _leave(" = %d [dns]", ret);
                return ERR_PTR(ret);
index d12ffb457e4745809460707c02176d2e4a657e4b..3f4e460c6655ab3e259d7e17e6b6bbe8dfdb0036 100644 (file)
@@ -23,6 +23,9 @@
 #define AFSPATHMAX             1024    /* Maximum length of a pathname plus NUL */
 #define AFSOPAQUEMAX           1024    /* Maximum length of an opaque field */
 
+#define AFS_VL_MAX_LIFESPAN    (120 * HZ)
+#define AFS_PROBE_MAX_LIFESPAN (30 * HZ)
+
 typedef u64                    afs_volid_t;
 typedef u64                    afs_vnodeid_t;
 typedef u64                    afs_dataversion_t;
@@ -69,8 +72,8 @@ typedef enum {
 
 struct afs_callback {
        time64_t                expires_at;     /* Time at which expires */
-       unsigned                version;        /* Callback version */
-       afs_callback_type_t     type;           /* Type of callback */
+       //unsigned              version;        /* Callback version */
+       //afs_callback_type_t   type;           /* Type of callback */
 };
 
 struct afs_callback_break {
@@ -144,6 +147,15 @@ struct afs_file_status {
        u32                     abort_code;     /* Abort if bulk-fetching this failed */
 };
 
+struct afs_status_cb {
+       struct afs_file_status  status;
+       struct afs_callback     callback;
+       unsigned int            cb_break;       /* Pre-op callback break counter */
+       bool                    have_status;    /* True if status record was retrieved */
+       bool                    have_cb;        /* True if cb record was retrieved */
+       bool                    have_error;     /* True if status.abort_code indicates an error */
+};
+
 /*
  * AFS file status change request
  */
index 128f2dbe256a4eb0f6124294f883b29d8a57e10e..d441bef72163289cfcde8ceff2d3271438d2c0f9 100644 (file)
@@ -94,15 +94,15 @@ int afs_register_server_cb_interest(struct afs_vnode *vnode,
        struct afs_server *server = entry->server;
 
 again:
-       if (vnode->cb_interest &&
-           likely(vnode->cb_interest == entry->cb_interest))
+       vcbi = rcu_dereference_protected(vnode->cb_interest,
+                                        lockdep_is_held(&vnode->io_lock));
+       if (vcbi && likely(vcbi == entry->cb_interest))
                return 0;
 
        read_lock(&slist->lock);
        cbi = afs_get_cb_interest(entry->cb_interest);
        read_unlock(&slist->lock);
 
-       vcbi = vnode->cb_interest;
        if (vcbi) {
                if (vcbi == cbi) {
                        afs_put_cb_interest(afs_v2net(vnode), cbi);
@@ -114,8 +114,9 @@ again:
                 */
                if (cbi && vcbi->server == cbi->server) {
                        write_seqlock(&vnode->cb_lock);
-                       old = vnode->cb_interest;
-                       vnode->cb_interest = cbi;
+                       old = rcu_dereference_protected(vnode->cb_interest,
+                                                       lockdep_is_held(&vnode->cb_lock.lock));
+                       rcu_assign_pointer(vnode->cb_interest, cbi);
                        write_sequnlock(&vnode->cb_lock);
                        afs_put_cb_interest(afs_v2net(vnode), old);
                        return 0;
@@ -160,8 +161,9 @@ again:
         */
        write_seqlock(&vnode->cb_lock);
 
-       old = vnode->cb_interest;
-       vnode->cb_interest = cbi;
+       old = rcu_dereference_protected(vnode->cb_interest,
+                                       lockdep_is_held(&vnode->cb_lock.lock));
+       rcu_assign_pointer(vnode->cb_interest, cbi);
        vnode->cb_s_break = cbi->server->cb_s_break;
        vnode->cb_v_break = vnode->volume->cb_v_break;
        clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
@@ -191,10 +193,11 @@ void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
                                vi = NULL;
 
                        write_unlock(&cbi->server->cb_break_lock);
-                       kfree(vi);
+                       if (vi)
+                               kfree_rcu(vi, rcu);
                        afs_put_server(net, cbi->server);
                }
-               kfree(cbi);
+               kfree_rcu(cbi, rcu);
        }
 }
 
@@ -218,14 +221,8 @@ void __afs_break_callback(struct afs_vnode *vnode)
                vnode->cb_break++;
                afs_clear_permits(vnode);
 
-               spin_lock(&vnode->lock);
-
-               _debug("break callback");
-
-               if (list_empty(&vnode->granted_locks) &&
-                   !list_empty(&vnode->pending_locks))
+               if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
                        afs_lock_may_be_available(vnode);
-               spin_unlock(&vnode->lock);
        }
 }
 
index 9de46116c7492a712ede0c8a241eea71ef94230b..9c3b07ba22221f80d5d59751c1fd994765b1b98a 100644 (file)
@@ -123,6 +123,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
                                       const char *name, unsigned int namelen,
                                       const char *addresses)
 {
+       struct afs_vlserver_list *vllist;
        struct afs_cell *cell;
        int i, ret;
 
@@ -151,18 +152,14 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 
        atomic_set(&cell->usage, 2);
        INIT_WORK(&cell->manager, afs_manage_cell);
-       cell->flags = ((1 << AFS_CELL_FL_NOT_READY) |
-                      (1 << AFS_CELL_FL_NO_LOOKUP_YET));
        INIT_LIST_HEAD(&cell->proc_volumes);
        rwlock_init(&cell->proc_lock);
        rwlock_init(&cell->vl_servers_lock);
 
-       /* Fill in the VL server list if we were given a list of addresses to
-        * use.
+       /* Provide a VL server list, filling it in if we were given a list of
+        * addresses to use.
         */
        if (addresses) {
-               struct afs_vlserver_list *vllist;
-
                vllist = afs_parse_text_addrs(net,
                                              addresses, strlen(addresses), ':',
                                              VL_SERVICE, AFS_VL_PORT);
@@ -171,19 +168,32 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
                        goto parse_failed;
                }
 
-               rcu_assign_pointer(cell->vl_servers, vllist);
+               vllist->source = DNS_RECORD_FROM_CONFIG;
+               vllist->status = DNS_LOOKUP_NOT_DONE;
                cell->dns_expiry = TIME64_MAX;
-               __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags);
        } else {
+               ret = -ENOMEM;
+               vllist = afs_alloc_vlserver_list(0);
+               if (!vllist)
+                       goto error;
+               vllist->source = DNS_RECORD_UNAVAILABLE;
+               vllist->status = DNS_LOOKUP_NOT_DONE;
                cell->dns_expiry = ktime_get_real_seconds();
        }
 
+       rcu_assign_pointer(cell->vl_servers, vllist);
+
+       cell->dns_source = vllist->source;
+       cell->dns_status = vllist->status;
+       smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
+
        _leave(" = %p", cell);
        return cell;
 
 parse_failed:
        if (ret == -EINVAL)
                printk(KERN_ERR "kAFS: bad VL server IP address\n");
+error:
        kfree(cell);
        _leave(" = %d", ret);
        return ERR_PTR(ret);
@@ -208,6 +218,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 {
        struct afs_cell *cell, *candidate, *cursor;
        struct rb_node *parent, **pp;
+       enum afs_cell_state state;
        int ret, n;
 
        _enter("%s,%s", name, vllist);
@@ -267,18 +278,16 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 
 wait_for_cell:
        _debug("wait_for_cell");
-       ret = wait_on_bit(&cell->flags, AFS_CELL_FL_NOT_READY, TASK_INTERRUPTIBLE);
-       smp_rmb();
-
-       switch (READ_ONCE(cell->state)) {
-       case AFS_CELL_FAILED:
+       wait_var_event(&cell->state,
+                      ({
+                              state = smp_load_acquire(&cell->state); /* vs error */
+                              state == AFS_CELL_ACTIVE || state == AFS_CELL_FAILED;
+                      }));
+
+       /* Check the state obtained from the wait check. */
+       if (state == AFS_CELL_FAILED) {
                ret = cell->error;
                goto error;
-       default:
-               _debug("weird %u %d", cell->state, cell->error);
-               goto error;
-       case AFS_CELL_ACTIVE:
-               break;
        }
 
        _leave(" = %p [cell]", cell);
@@ -360,16 +369,46 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
 /*
  * Update a cell's VL server address list from the DNS.
  */
-static void afs_update_cell(struct afs_cell *cell)
+static int afs_update_cell(struct afs_cell *cell)
 {
-       struct afs_vlserver_list *vllist, *old;
+       struct afs_vlserver_list *vllist, *old = NULL, *p;
        unsigned int min_ttl = READ_ONCE(afs_cell_min_ttl);
        unsigned int max_ttl = READ_ONCE(afs_cell_max_ttl);
        time64_t now, expiry = 0;
+       int ret = 0;
 
        _enter("%s", cell->name);
 
        vllist = afs_dns_query(cell, &expiry);
+       if (IS_ERR(vllist)) {
+               ret = PTR_ERR(vllist);
+
+               _debug("%s: fail %d", cell->name, ret);
+               if (ret == -ENOMEM)
+                       goto out_wake;
+
+               ret = -ENOMEM;
+               vllist = afs_alloc_vlserver_list(0);
+               if (!vllist)
+                       goto out_wake;
+
+               switch (ret) {
+               case -ENODATA:
+               case -EDESTADDRREQ:
+                       vllist->status = DNS_LOOKUP_GOT_NOT_FOUND;
+                       break;
+               case -EAGAIN:
+               case -ECONNREFUSED:
+                       vllist->status = DNS_LOOKUP_GOT_TEMP_FAILURE;
+                       break;
+               default:
+                       vllist->status = DNS_LOOKUP_GOT_LOCAL_FAILURE;
+                       break;
+               }
+       }
+
+       _debug("%s: got list %d %d", cell->name, vllist->source, vllist->status);
+       cell->dns_status = vllist->status;
 
        now = ktime_get_real_seconds();
        if (min_ttl > max_ttl)
@@ -379,48 +418,47 @@ static void afs_update_cell(struct afs_cell *cell)
        else if (expiry > now + max_ttl)
                expiry = now + max_ttl;
 
-       if (IS_ERR(vllist)) {
-               switch (PTR_ERR(vllist)) {
-               case -ENODATA:
-               case -EDESTADDRREQ:
+       _debug("%s: status %d", cell->name, vllist->status);
+       if (vllist->source == DNS_RECORD_UNAVAILABLE) {
+               switch (vllist->status) {
+               case DNS_LOOKUP_GOT_NOT_FOUND:
                        /* The DNS said that the cell does not exist or there
                         * weren't any addresses to be had.
                         */
-                       set_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
-                       clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
                        cell->dns_expiry = expiry;
                        break;
 
-               case -EAGAIN:
-               case -ECONNREFUSED:
+               case DNS_LOOKUP_BAD:
+               case DNS_LOOKUP_GOT_LOCAL_FAILURE:
+               case DNS_LOOKUP_GOT_TEMP_FAILURE:
+               case DNS_LOOKUP_GOT_NS_FAILURE:
                default:
-                       set_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
                        cell->dns_expiry = now + 10;
                        break;
                }
-
-               cell->error = -EDESTADDRREQ;
        } else {
-               clear_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags);
-               clear_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags);
-
-               /* Exclusion on changing vl_addrs is achieved by a
-                * non-reentrant work item.
-                */
-               old = rcu_dereference_protected(cell->vl_servers, true);
-               rcu_assign_pointer(cell->vl_servers, vllist);
                cell->dns_expiry = expiry;
-
-               if (old)
-                       afs_put_vlserverlist(cell->net, old);
        }
 
-       if (test_and_clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags))
-               wake_up_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET);
+       /* Replace the VL server list if the new record has servers or the old
+        * record doesn't.
+        */
+       write_lock(&cell->vl_servers_lock);
+       p = rcu_dereference_protected(cell->vl_servers, true);
+       if (vllist->nr_servers > 0 || p->nr_servers == 0) {
+               rcu_assign_pointer(cell->vl_servers, vllist);
+               cell->dns_source = vllist->source;
+               old = p;
+       }
+       write_unlock(&cell->vl_servers_lock);
+       afs_put_vlserverlist(cell->net, old);
 
-       now = ktime_get_real_seconds();
-       afs_set_cell_timer(cell->net, cell->dns_expiry - now);
-       _leave("");
+out_wake:
+       smp_store_release(&cell->dns_lookup_count,
+                         cell->dns_lookup_count + 1); /* vs source/status */
+       wake_up_var(&cell->dns_lookup_count);
+       _leave(" = %d", ret);
+       return ret;
 }
 
 /*
@@ -491,8 +529,7 @@ void afs_put_cell(struct afs_net *net, struct afs_cell *cell)
        now = ktime_get_real_seconds();
        cell->last_inactive = now;
        expire_delay = 0;
-       if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) &&
-           !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
+       if (cell->vl_servers->nr_servers)
                expire_delay = afs_cell_gc_delay;
 
        if (atomic_dec_return(&cell->usage) > 1)
@@ -623,11 +660,13 @@ again:
                        goto final_destruction;
                if (cell->state == AFS_CELL_FAILED)
                        goto done;
-               cell->state = AFS_CELL_UNSET;
+               smp_store_release(&cell->state, AFS_CELL_UNSET);
+               wake_up_var(&cell->state);
                goto again;
 
        case AFS_CELL_UNSET:
-               cell->state = AFS_CELL_ACTIVATING;
+               smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
+               wake_up_var(&cell->state);
                goto again;
 
        case AFS_CELL_ACTIVATING:
@@ -635,28 +674,29 @@ again:
                if (ret < 0)
                        goto activation_failed;
 
-               cell->state = AFS_CELL_ACTIVE;
-               smp_wmb();
-               clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
-               wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+               smp_store_release(&cell->state, AFS_CELL_ACTIVE);
+               wake_up_var(&cell->state);
                goto again;
 
        case AFS_CELL_ACTIVE:
                if (atomic_read(&cell->usage) > 1) {
-                       time64_t now = ktime_get_real_seconds();
-                       if (cell->dns_expiry <= now && net->live)
-                               afs_update_cell(cell);
+                       if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+                               ret = afs_update_cell(cell);
+                               if (ret < 0)
+                                       cell->error = ret;
+                       }
                        goto done;
                }
-               cell->state = AFS_CELL_DEACTIVATING;
+               smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
+               wake_up_var(&cell->state);
                goto again;
 
        case AFS_CELL_DEACTIVATING:
-               set_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
                if (atomic_read(&cell->usage) > 1)
                        goto reverse_deactivation;
                afs_deactivate_cell(net, cell);
-               cell->state = AFS_CELL_INACTIVE;
+               smp_store_release(&cell->state, AFS_CELL_INACTIVE);
+               wake_up_var(&cell->state);
                goto again;
 
        default:
@@ -669,17 +709,13 @@ activation_failed:
        cell->error = ret;
        afs_deactivate_cell(net, cell);
 
-       cell->state = AFS_CELL_FAILED;
-       smp_wmb();
-       if (test_and_clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags))
-               wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+       smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
+       wake_up_var(&cell->state);
        goto again;
 
 reverse_deactivation:
-       cell->state = AFS_CELL_ACTIVE;
-       smp_wmb();
-       clear_bit(AFS_CELL_FL_NOT_READY, &cell->flags);
-       wake_up_bit(&cell->flags, AFS_CELL_FL_NOT_READY);
+       smp_store_release(&cell->state, AFS_CELL_ACTIVE);
+       wake_up_var(&cell->state);
        _leave(" [deact->act]");
        return;
 
@@ -739,11 +775,16 @@ void afs_manage_cells(struct work_struct *work)
                }
 
                if (usage == 1) {
+                       struct afs_vlserver_list *vllist;
                        time64_t expire_at = cell->last_inactive;
 
-                       if (!test_bit(AFS_CELL_FL_DNS_FAIL, &cell->flags) &&
-                           !test_bit(AFS_CELL_FL_NOT_FOUND, &cell->flags))
+                       read_lock(&cell->vl_servers_lock);
+                       vllist = rcu_dereference_protected(
+                               cell->vl_servers,
+                               lockdep_is_held(&cell->vl_servers_lock));
+                       if (vllist->nr_servers > 0)
                                expire_at += afs_cell_gc_delay;
+                       read_unlock(&cell->vl_servers_lock);
                        if (purging || expire_at <= now)
                                sched_cell = true;
                        else if (expire_at < next_manage)
@@ -751,10 +792,8 @@ void afs_manage_cells(struct work_struct *work)
                }
 
                if (!purging) {
-                       if (cell->dns_expiry <= now)
+                       if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
                                sched_cell = true;
-                       else if (cell->dns_expiry <= next_manage)
-                               next_manage = cell->dns_expiry;
                }
 
                if (sched_cell)
index 748090014519d10af0152a8c695df7bdf77500b2..01437cfe54326e974cfdad81760f7d1d96b9d61b 100644 (file)
@@ -213,7 +213,7 @@ static int afs_find_cm_server_by_peer(struct afs_call *call)
                return 0;
        }
 
-       call->cm_server = server;
+       call->server = server;
        return afs_record_cm_probe(call, server);
 }
 
@@ -234,7 +234,7 @@ static int afs_find_cm_server_by_uuid(struct afs_call *call,
                return 0;
        }
 
-       call->cm_server = server;
+       call->server = server;
        return afs_record_cm_probe(call, server);
 }
 
@@ -260,8 +260,8 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
         * server holds up change visibility till it receives our reply so as
         * to maintain cache coherency.
         */
-       if (call->cm_server)
-               afs_break_callbacks(call->cm_server, call->count, call->request);
+       if (call->server)
+               afs_break_callbacks(call->server, call->count, call->request);
 
        afs_send_empty_reply(call);
        afs_put_call(call);
@@ -376,10 +376,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 {
        struct afs_call *call = container_of(work, struct afs_call, work);
 
-       _enter("{%p}", call->cm_server);
+       _enter("{%p}", call->server);
 
-       if (call->cm_server)
-               afs_init_callback_state(call->cm_server);
+       if (call->server)
+               afs_init_callback_state(call->server);
        afs_send_empty_reply(call);
        afs_put_call(call);
        _leave("");
index 9a466be583d2a9e57ddaec723e9dae8f2be45580..79d93a26759ae7cc0dcf633a79f85efd5a2c7711 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/task_io_accounting_ops.h>
 #include "internal.h"
+#include "afs_fs.h"
 #include "xdr_fs.h"
 
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -102,8 +103,8 @@ struct afs_lookup_cookie {
        bool                    found;
        bool                    one_only;
        unsigned short          nr_fids;
-       struct afs_file_status  *statuses;
-       struct afs_callback     *callbacks;
+       struct inode            **inodes;
+       struct afs_status_cb    *statuses;
        struct afs_fid          fids[50];
 };
 
@@ -638,12 +639,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
                                   struct key *key)
 {
        struct afs_lookup_cookie *cookie;
-       struct afs_cb_interest *cbi = NULL;
+       struct afs_cb_interest *dcbi, *cbi = NULL;
        struct afs_super_info *as = dir->i_sb->s_fs_info;
-       struct afs_iget_data data;
+       struct afs_status_cb *scb;
+       struct afs_iget_data iget_data;
        struct afs_fs_cursor fc;
-       struct afs_vnode *dvnode = AFS_FS_I(dir);
-       struct inode *inode = NULL;
+       struct afs_server *server;
+       struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
+       struct inode *inode = NULL, *ti;
        int ret, i;
 
        _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry);
@@ -657,10 +660,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
        cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */
 
        read_seqlock_excl(&dvnode->cb_lock);
-       if (dvnode->cb_interest &&
-           dvnode->cb_interest->server &&
-           test_bit(AFS_SERVER_FL_NO_IBULK, &dvnode->cb_interest->server->flags))
-               cookie->one_only = true;
+       dcbi = rcu_dereference_protected(dvnode->cb_interest,
+                                        lockdep_is_held(&dvnode->cb_lock.lock));
+       if (dcbi) {
+               server = dcbi->server;
+               if (server &&
+                   test_bit(AFS_SERVER_FL_NO_IBULK, &server->flags))
+                       cookie->one_only = true;
+       }
        read_sequnlock_excl(&dvnode->cb_lock);
 
        for (i = 0; i < 50; i++)
@@ -678,24 +685,43 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
                goto out;
 
        /* Check to see if we already have an inode for the primary fid. */
-       data.volume = dvnode->volume;
-       data.fid = cookie->fids[0];
-       inode = ilookup5(dir->i_sb, cookie->fids[0].vnode, afs_iget5_test, &data);
+       iget_data.fid = cookie->fids[0];
+       iget_data.volume = dvnode->volume;
+       iget_data.cb_v_break = dvnode->volume->cb_v_break;
+       iget_data.cb_s_break = 0;
+       inode = ilookup5(dir->i_sb, cookie->fids[0].vnode,
+                        afs_iget5_test, &iget_data);
        if (inode)
                goto out;
 
        /* Need space for examining all the selected files */
        inode = ERR_PTR(-ENOMEM);
-       cookie->statuses = kcalloc(cookie->nr_fids, sizeof(struct afs_file_status),
-                                  GFP_KERNEL);
+       cookie->statuses = kvcalloc(cookie->nr_fids, sizeof(struct afs_status_cb),
+                                   GFP_KERNEL);
        if (!cookie->statuses)
                goto out;
 
-       cookie->callbacks = kcalloc(cookie->nr_fids, sizeof(struct afs_callback),
-                                   GFP_KERNEL);
-       if (!cookie->callbacks)
+       cookie->inodes = kcalloc(cookie->nr_fids, sizeof(struct inode *),
+                                GFP_KERNEL);
+       if (!cookie->inodes)
                goto out_s;
 
+       for (i = 1; i < cookie->nr_fids; i++) {
+               scb = &cookie->statuses[i];
+
+               /* Find any inodes that already exist and get their
+                * callback counters.
+                */
+               iget_data.fid = cookie->fids[i];
+               ti = ilookup5_nowait(dir->i_sb, iget_data.fid.vnode,
+                                    afs_iget5_test, &iget_data);
+               if (!IS_ERR_OR_NULL(ti)) {
+                       vnode = AFS_FS_I(ti);
+                       scb->cb_break = afs_calc_vnode_cb_break(vnode);
+                       cookie->inodes[i] = ti;
+               }
+       }
+
        /* Try FS.InlineBulkStatus first.  Abort codes for the individual
         * lookups contained therein are stored in the reply without aborting
         * the whole operation.
@@ -704,7 +730,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
                goto no_inline_bulk_status;
 
        inode = ERR_PTR(-ERESTARTSYS);
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
                while (afs_select_fileserver(&fc)) {
                        if (test_bit(AFS_SERVER_FL_NO_IBULK,
                                      &fc.cbi->server->flags)) {
@@ -712,11 +738,12 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
                                fc.ac.error = -ECONNABORTED;
                                break;
                        }
+                       iget_data.cb_v_break = dvnode->volume->cb_v_break;
+                       iget_data.cb_s_break = fc.cbi->server->cb_s_break;
                        afs_fs_inline_bulk_status(&fc,
                                                  afs_v2net(dvnode),
                                                  cookie->fids,
                                                  cookie->statuses,
-                                                 cookie->callbacks,
                                                  cookie->nr_fids, NULL);
                }
 
@@ -737,15 +764,16 @@ no_inline_bulk_status:
         * any of the lookups fails - so, for the moment, revert to
         * FS.FetchStatus for just the primary fid.
         */
-       cookie->nr_fids = 1;
        inode = ERR_PTR(-ERESTARTSYS);
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
                while (afs_select_fileserver(&fc)) {
+                       iget_data.cb_v_break = dvnode->volume->cb_v_break;
+                       iget_data.cb_s_break = fc.cbi->server->cb_s_break;
+                       scb = &cookie->statuses[0];
                        afs_fs_fetch_status(&fc,
                                            afs_v2net(dvnode),
                                            cookie->fids,
-                                           cookie->statuses,
-                                           cookie->callbacks,
+                                           scb,
                                            NULL);
                }
 
@@ -757,26 +785,36 @@ no_inline_bulk_status:
        if (IS_ERR(inode))
                goto out_c;
 
-       for (i = 0; i < cookie->nr_fids; i++)
-               cookie->statuses[i].abort_code = 0;
-
 success:
        /* Turn all the files into inodes and save the first one - which is the
         * one we actually want.
         */
-       if (cookie->statuses[0].abort_code != 0)
-               inode = ERR_PTR(afs_abort_to_error(cookie->statuses[0].abort_code));
+       scb = &cookie->statuses[0];
+       if (scb->status.abort_code != 0)
+               inode = ERR_PTR(afs_abort_to_error(scb->status.abort_code));
 
        for (i = 0; i < cookie->nr_fids; i++) {
-               struct inode *ti;
+               struct afs_status_cb *scb = &cookie->statuses[i];
+
+               if (!scb->have_status && !scb->have_error)
+                       continue;
+
+               if (cookie->inodes[i]) {
+                       afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]),
+                                               scb->cb_break, NULL, scb);
+                       continue;
+               }
 
-               if (cookie->statuses[i].abort_code != 0)
+               if (scb->status.abort_code != 0)
                        continue;
 
-               ti = afs_iget(dir->i_sb, key, &cookie->fids[i],
-                             &cookie->statuses[i],
-                             &cookie->callbacks[i],
-                             cbi, dvnode);
+               iget_data.fid = cookie->fids[i];
+               ti = afs_iget(dir->i_sb, key, &iget_data, scb, cbi, dvnode);
+               if (!IS_ERR(ti))
+                       afs_cache_permit(AFS_FS_I(ti), key,
+                                        0 /* Assume vnode->cb_break is 0 */ +
+                                        iget_data.cb_v_break,
+                                        scb);
                if (i == 0) {
                        inode = ti;
                } else {
@@ -787,9 +825,13 @@ success:
 
 out_c:
        afs_put_cb_interest(afs_v2net(dvnode), cbi);
-       kfree(cookie->callbacks);
+       if (cookie->inodes) {
+               for (i = 0; i < cookie->nr_fids; i++)
+                       iput(cookie->inodes[i]);
+               kfree(cookie->inodes);
+       }
 out_s:
-       kfree(cookie->statuses);
+       kvfree(cookie->statuses);
 out:
        kfree(cookie);
        return inode;
@@ -1114,9 +1156,8 @@ void afs_d_release(struct dentry *dentry)
  */
 static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
                                struct dentry *new_dentry,
-                               struct afs_fid *newfid,
-                               struct afs_file_status *newstatus,
-                               struct afs_callback *newcb)
+                               struct afs_iget_data *new_data,
+                               struct afs_status_cb *new_scb)
 {
        struct afs_vnode *vnode;
        struct inode *inode;
@@ -1125,7 +1166,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
                return;
 
        inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key,
-                        newfid, newstatus, newcb, fc->cbi, fc->vnode);
+                        new_data, new_scb, fc->cbi, fc->vnode);
        if (IS_ERR(inode)) {
                /* ENOMEM or EINTR at a really inconvenient time - just abandon
                 * the new directory on the server.
@@ -1136,22 +1177,29 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc,
 
        vnode = AFS_FS_I(inode);
        set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-       afs_vnode_commit_status(fc, vnode, 0);
+       if (fc->ac.error == 0)
+               afs_cache_permit(vnode, fc->key, vnode->cb_break, new_scb);
        d_instantiate(new_dentry, inode);
 }
 
+static void afs_prep_for_new_inode(struct afs_fs_cursor *fc,
+                                  struct afs_iget_data *iget_data)
+{
+       iget_data->volume = fc->vnode->volume;
+       iget_data->cb_v_break = fc->vnode->volume->cb_v_break;
+       iget_data->cb_s_break = fc->cbi->server->cb_s_break;
+}
+
 /*
  * create a directory on an AFS filesystem
  */
 static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-       struct afs_file_status newstatus;
+       struct afs_iget_data iget_data;
+       struct afs_status_cb *scb;
        struct afs_fs_cursor fc;
-       struct afs_callback newcb;
        struct afs_vnode *dvnode = AFS_FS_I(dir);
-       struct afs_fid newfid;
        struct key *key;
-       u64 data_version = dvnode->status.data_version;
        int ret;
 
        mode |= S_IFDIR;
@@ -1159,23 +1207,32 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        _enter("{%llx:%llu},{%pd},%ho",
               dvnode->fid.vid, dvnode->fid.vnode, dentry, mode);
 
+       ret = -ENOMEM;
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error;
+
        key = afs_request_key(dvnode->volume->cell);
        if (IS_ERR(key)) {
                ret = PTR_ERR(key);
-               goto error;
+               goto error_scb;
        }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-                       afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
-                                     &newfid, &newstatus, &newcb);
+                       afs_prep_for_new_inode(&fc, &iget_data);
+                       afs_fs_create(&fc, dentry->d_name.name, mode,
+                                     &scb[0], &iget_data.fid, &scb[1]);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-               afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+               afs_check_for_remote_deletion(&fc, dvnode);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &data_version, &scb[0]);
+               afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);
                ret = afs_end_vnode_operation(&fc);
                if (ret < 0)
                        goto error_key;
@@ -1185,15 +1242,18 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
        if (ret == 0 &&
            test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-               afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+               afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
                                 afs_edit_dir_for_create);
 
        key_put(key);
+       kfree(scb);
        _leave(" = 0");
        return 0;
 
 error_key:
        key_put(key);
+error_scb:
+       kfree(scb);
 error:
        d_drop(dentry);
        _leave(" = %d", ret);
@@ -1220,15 +1280,19 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
  */
 static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 {
+       struct afs_status_cb *scb;
        struct afs_fs_cursor fc;
        struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
        struct key *key;
-       u64 data_version = dvnode->status.data_version;
        int ret;
 
        _enter("{%llx:%llu},{%pd}",
               dvnode->fid.vid, dvnode->fid.vnode, dentry);
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        key = afs_request_key(dvnode->volume->cell);
        if (IS_ERR(key)) {
                ret = PTR_ERR(key);
@@ -1250,14 +1314,16 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
        }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-                       afs_fs_remove(&fc, vnode, dentry->d_name.name, true,
-                                     data_version);
+                       afs_fs_remove(&fc, vnode, dentry->d_name.name, true, scb);
                }
 
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
                if (ret == 0) {
                        afs_dir_remove_subdir(dentry);
@@ -1272,6 +1338,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 error_key:
        key_put(key);
 error:
+       kfree(scb);
        return ret;
 }
 
@@ -1285,32 +1352,27 @@ error:
  * However, if we didn't have a callback promise outstanding, or it was
  * outstanding on a different server, then it won't break it either...
  */
-int afs_dir_remove_link(struct dentry *dentry, struct key *key,
-                       unsigned long d_version_before,
-                       unsigned long d_version_after)
+static int afs_dir_remove_link(struct afs_vnode *dvnode, struct dentry *dentry,
+                              struct key *key)
 {
-       bool dir_valid;
        int ret = 0;
 
-       /* There were no intervening changes on the server if the version
-        * number we got back was incremented by exactly 1.
-        */
-       dir_valid = (d_version_after == d_version_before + 1);
-
        if (d_really_is_positive(dentry)) {
                struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
 
                if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
                        /* Already done */
-               } else if (dir_valid) {
+               } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
+                       write_seqlock(&vnode->cb_lock);
                        drop_nlink(&vnode->vfs_inode);
                        if (vnode->vfs_inode.i_nlink == 0) {
                                set_bit(AFS_VNODE_DELETED, &vnode->flags);
-                               clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+                               __afs_break_callback(vnode);
                        }
+                       write_sequnlock(&vnode->cb_lock);
                        ret = 0;
                } else {
-                       clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+                       afs_break_callback(vnode);
 
                        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
                                kdebug("AFS_VNODE_DELETED");
@@ -1331,11 +1393,10 @@ int afs_dir_remove_link(struct dentry *dentry, struct key *key,
 static int afs_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
        struct key *key;
-       unsigned long d_version = (unsigned long)dentry->d_fsdata;
        bool need_rehash = false;
-       u64 data_version = dvnode->status.data_version;
        int ret;
 
        _enter("{%llx:%llu},{%pd}",
@@ -1344,10 +1405,15 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
        if (dentry->d_name.len >= AFSNAMEMAX)
                return -ENAMETOOLONG;
 
+       ret = -ENOMEM;
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error;
+
        key = afs_request_key(dvnode->volume->cell);
        if (IS_ERR(key)) {
                ret = PTR_ERR(key);
-               goto error;
+               goto error_scb;
        }
 
        /* Try to make sure we have a callback promise on the victim. */
@@ -1374,30 +1440,34 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
        spin_unlock(&dentry->d_lock);
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t data_version = dvnode->status.data_version + 1;
+               afs_dataversion_t data_version_2 = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
+                       fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
 
                        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) &&
                            !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) {
                                yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name,
-                                                   data_version);
+                                                   &scb[0], &scb[1]);
                                if (fc.ac.error != -ECONNABORTED ||
                                    fc.ac.abort_code != RXGEN_OPCODE)
                                        continue;
                                set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags);
                        }
 
-                       afs_fs_remove(&fc, vnode, dentry->d_name.name, false,
-                                     data_version);
+                       afs_fs_remove(&fc, vnode, dentry->d_name.name, false, &scb[0]);
                }
 
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &data_version, &scb[0]);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break_2,
+                                       &data_version_2, &scb[1]);
                ret = afs_end_vnode_operation(&fc);
-               if (ret == 0)
-                       ret = afs_dir_remove_link(
-                               dentry, key, d_version,
-                               (unsigned long)dvnode->status.data_version);
+               if (ret == 0 && !(scb[1].have_status || scb[1].have_error))
+                       ret = afs_dir_remove_link(dvnode, dentry, key);
                if (ret == 0 &&
                    test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
                        afs_edit_dir_remove(dvnode, &dentry->d_name,
@@ -1409,6 +1479,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 
 error_key:
        key_put(key);
+error_scb:
+       kfree(scb);
 error:
        _leave(" = %d", ret);
        return ret;
@@ -1420,13 +1492,11 @@ error:
 static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                      bool excl)
 {
+       struct afs_iget_data iget_data;
        struct afs_fs_cursor fc;
-       struct afs_file_status newstatus;
-       struct afs_callback newcb;
+       struct afs_status_cb *scb;
        struct afs_vnode *dvnode = AFS_FS_I(dir);
-       struct afs_fid newfid;
        struct key *key;
-       u64 data_version = dvnode->status.data_version;
        int ret;
 
        mode |= S_IFREG;
@@ -1444,17 +1514,26 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                goto error;
        }
 
+       ret = -ENOMEM;
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error_scb;
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-                       afs_fs_create(&fc, dentry->d_name.name, mode, data_version,
-                                     &newfid, &newstatus, &newcb);
+                       afs_prep_for_new_inode(&fc, &iget_data);
+                       afs_fs_create(&fc, dentry->d_name.name, mode,
+                                     &scb[0], &iget_data.fid, &scb[1]);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-               afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, &newcb);
+               afs_check_for_remote_deletion(&fc, dvnode);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &data_version, &scb[0]);
+               afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);
                ret = afs_end_vnode_operation(&fc);
                if (ret < 0)
                        goto error_key;
@@ -1463,13 +1542,16 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
        }
 
        if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-               afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+               afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
                                 afs_edit_dir_for_create);
 
+       kfree(scb);
        key_put(key);
        _leave(" = 0");
        return 0;
 
+error_scb:
+       kfree(scb);
 error_key:
        key_put(key);
 error:
@@ -1485,15 +1567,12 @@ static int afs_link(struct dentry *from, struct inode *dir,
                    struct dentry *dentry)
 {
        struct afs_fs_cursor fc;
-       struct afs_vnode *dvnode, *vnode;
+       struct afs_status_cb *scb;
+       struct afs_vnode *dvnode = AFS_FS_I(dir);
+       struct afs_vnode *vnode = AFS_FS_I(d_inode(from));
        struct key *key;
-       u64 data_version;
        int ret;
 
-       vnode = AFS_FS_I(d_inode(from));
-       dvnode = AFS_FS_I(dir);
-       data_version = dvnode->status.data_version;
-
        _enter("{%llx:%llu},{%llx:%llu},{%pd}",
               vnode->fid.vid, vnode->fid.vnode,
               dvnode->fid.vid, dvnode->fid.vnode,
@@ -1503,14 +1582,21 @@ static int afs_link(struct dentry *from, struct inode *dir,
        if (dentry->d_name.len >= AFSNAMEMAX)
                goto error;
 
+       ret = -ENOMEM;
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error;
+
        key = afs_request_key(dvnode->volume->cell);
        if (IS_ERR(key)) {
                ret = PTR_ERR(key);
-               goto error;
+               goto error_scb;
        }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
                if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
                        afs_end_vnode_operation(&fc);
                        goto error_key;
@@ -1519,11 +1605,14 @@ static int afs_link(struct dentry *from, struct inode *dir,
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
                        fc.cb_break_2 = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_link(&fc, vnode, dentry->d_name.name, data_version);
+                       afs_fs_link(&fc, vnode, dentry->d_name.name,
+                                   &scb[0], &scb[1]);
                }
 
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break_2);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &data_version, &scb[0]);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break_2,
+                                       NULL, &scb[1]);
                ihold(&vnode->vfs_inode);
                d_instantiate(dentry, &vnode->vfs_inode);
 
@@ -1540,11 +1629,14 @@ static int afs_link(struct dentry *from, struct inode *dir,
                                 afs_edit_dir_for_link);
 
        key_put(key);
+       kfree(scb);
        _leave(" = 0");
        return 0;
 
 error_key:
        key_put(key);
+error_scb:
+       kfree(scb);
 error:
        d_drop(dentry);
        _leave(" = %d", ret);
@@ -1557,12 +1649,11 @@ error:
 static int afs_symlink(struct inode *dir, struct dentry *dentry,
                       const char *content)
 {
+       struct afs_iget_data iget_data;
        struct afs_fs_cursor fc;
-       struct afs_file_status newstatus;
+       struct afs_status_cb *scb;
        struct afs_vnode *dvnode = AFS_FS_I(dir);
-       struct afs_fid newfid;
        struct key *key;
-       u64 data_version = dvnode->status.data_version;
        int ret;
 
        _enter("{%llx:%llu},{%pd},%s",
@@ -1577,24 +1668,32 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
        if (strlen(content) >= AFSPATHMAX)
                goto error;
 
+       ret = -ENOMEM;
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error;
+
        key = afs_request_key(dvnode->volume->cell);
        if (IS_ERR(key)) {
                ret = PTR_ERR(key);
-               goto error;
+               goto error_scb;
        }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t data_version = dvnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
-                       afs_fs_symlink(&fc, dentry->d_name.name,
-                                      content, data_version,
-                                      &newfid, &newstatus);
+                       afs_prep_for_new_inode(&fc, &iget_data);
+                       afs_fs_symlink(&fc, dentry->d_name.name, content,
+                                      &scb[0], &iget_data.fid, &scb[1]);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
-               afs_vnode_new_inode(&fc, dentry, &newfid, &newstatus, NULL);
+               afs_check_for_remote_deletion(&fc, dvnode);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &data_version, &scb[0]);
+               afs_vnode_new_inode(&fc, dentry, &iget_data, &scb[1]);
                ret = afs_end_vnode_operation(&fc);
                if (ret < 0)
                        goto error_key;
@@ -1603,15 +1702,18 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
        }
 
        if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
-               afs_edit_dir_add(dvnode, &dentry->d_name, &newfid,
+               afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
                                 afs_edit_dir_for_symlink);
 
        key_put(key);
+       kfree(scb);
        _leave(" = 0");
        return 0;
 
 error_key:
        key_put(key);
+error_scb:
+       kfree(scb);
 error:
        d_drop(dentry);
        _leave(" = %d", ret);
@@ -1626,11 +1728,11 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
                      unsigned int flags)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
        struct dentry *tmp = NULL, *rehash = NULL;
        struct inode *new_inode;
        struct key *key;
-       u64 orig_data_version, new_data_version;
        bool new_negative = d_is_negative(new_dentry);
        int ret;
 
@@ -1644,8 +1746,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
        vnode = AFS_FS_I(d_inode(old_dentry));
        orig_dvnode = AFS_FS_I(old_dir);
        new_dvnode = AFS_FS_I(new_dir);
-       orig_data_version = orig_dvnode->status.data_version;
-       new_data_version = new_dvnode->status.data_version;
 
        _enter("{%llx:%llu},{%llx:%llu},{%llx:%llu},{%pd}",
               orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
@@ -1653,10 +1753,15 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
               new_dvnode->fid.vid, new_dvnode->fid.vnode,
               new_dentry);
 
+       ret = -ENOMEM;
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error;
+
        key = afs_request_key(orig_dvnode->volume->cell);
        if (IS_ERR(key)) {
                ret = PTR_ERR(key);
-               goto error;
+               goto error_scb;
        }
 
        /* For non-directories, check whether the target is busy and if so,
@@ -1690,31 +1795,43 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        new_dentry = tmp;
                        rehash = NULL;
                        new_negative = true;
-                       orig_data_version = orig_dvnode->status.data_version;
-                       new_data_version = new_dvnode->status.data_version;
                }
        }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, orig_dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) {
+               afs_dataversion_t orig_data_version;
+               afs_dataversion_t new_data_version;
+               struct afs_status_cb *new_scb = &scb[1];
+
+               orig_data_version = orig_dvnode->status.data_version + 1;
+
                if (orig_dvnode != new_dvnode) {
                        if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) {
                                afs_end_vnode_operation(&fc);
                                goto error_rehash;
                        }
+                       new_data_version = new_dvnode->status.data_version;
+               } else {
+                       new_data_version = orig_data_version;
+                       new_scb = &scb[0];
                }
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(orig_dvnode);
                        fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
                        afs_fs_rename(&fc, old_dentry->d_name.name,
                                      new_dvnode, new_dentry->d_name.name,
-                                     orig_data_version, new_data_version);
+                                     &scb[0], new_scb);
                }
 
-               afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break);
-               afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2);
-               if (orig_dvnode != new_dvnode)
+               afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break,
+                                       &orig_data_version, &scb[0]);
+               if (new_dvnode != orig_dvnode) {
+                       afs_vnode_commit_status(&fc, new_dvnode, fc.cb_break_2,
+                                               &new_data_version, &scb[1]);
                        mutex_unlock(&new_dvnode->io_lock);
+               }
                ret = afs_end_vnode_operation(&fc);
                if (ret < 0)
                        goto error_rehash;
@@ -1754,6 +1871,8 @@ error_tmp:
        if (tmp)
                dput(tmp);
        key_put(key);
+error_scb:
+       kfree(scb);
 error:
        _leave(" = %d", ret);
        return ret;
index f6f89fdab6b2e6efe27c75d4efcbc9b4ae7505cf..28f4aa0152290555925cea03a4bba5a18ea28490 100644 (file)
@@ -24,21 +24,28 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
                               struct key *key)
 {
        struct afs_fs_cursor fc;
-       u64 dir_data_version = dvnode->status.data_version;
+       struct afs_status_cb *scb;
        int ret = -ERESTARTSYS;
 
        _enter("%pd,%pd", old, new);
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        trace_afs_silly_rename(vnode, false);
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
+               afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
                        afs_fs_rename(&fc, old->d_name.name,
                                      dvnode, new->d_name.name,
-                                     dir_data_version, dir_data_version);
+                                     scb, scb);
                }
 
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &dir_data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
@@ -64,6 +71,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
                fsnotify_nameremove(old, 0);
        }
 
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -143,31 +151,37 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
                               struct dentry *dentry, struct key *key)
 {
        struct afs_fs_cursor fc;
-       u64 dir_data_version = dvnode->status.data_version;
+       struct afs_status_cb *scb;
        int ret = -ERESTARTSYS;
 
        _enter("");
 
+       scb = kcalloc(2, sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        trace_afs_silly_rename(vnode, true);
-       if (afs_begin_vnode_operation(&fc, dvnode, key)) {
+       if (afs_begin_vnode_operation(&fc, dvnode, key, false)) {
+               afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(dvnode);
 
                        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc.cbi->server->flags) &&
                            !test_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags)) {
                                yfs_fs_remove_file2(&fc, vnode, dentry->d_name.name,
-                                                   dir_data_version);
+                                                   &scb[0], &scb[1]);
                                if (fc.ac.error != -ECONNABORTED ||
                                    fc.ac.abort_code != RXGEN_OPCODE)
                                        continue;
                                set_bit(AFS_SERVER_FL_NO_RM2, &fc.cbi->server->flags);
                        }
 
-                       afs_fs_remove(&fc, vnode, dentry->d_name.name, false,
-                                     dir_data_version);
+                       afs_fs_remove(&fc, vnode, dentry->d_name.name, false, &scb[0]);
                }
 
-               afs_vnode_commit_status(&fc, dvnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, dvnode, fc.cb_break,
+                                       &dir_data_version, &scb[0]);
                ret = afs_end_vnode_operation(&fc);
                if (ret == 0) {
                        drop_nlink(&vnode->vfs_inode);
@@ -182,6 +196,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
                                            afs_edit_dir_for_unlink);
        }
 
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
index a9ba81ddf1546272d4a5cbb7e0885326c250c6ff..af1689d1f32e7e699e1ecd13e4b7f84443a0c4a6 100644 (file)
@@ -46,7 +46,7 @@ static int afs_probe_cell_name(struct dentry *dentry)
                return 0;
        }
 
-       ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL);
+       ret = dns_query("afsdb", name, len, "srv=1", NULL, NULL, false);
        if (ret == -ENODATA)
                ret = -EDESTADDRREQ;
        return ret;
@@ -261,8 +261,7 @@ int afs_dynroot_populate(struct super_block *sb)
        struct afs_net *net = afs_sb2net(sb);
        int ret;
 
-       if (mutex_lock_interruptible(&net->proc_cells_lock) < 0)
-               return -ERESTARTSYS;
+       mutex_lock(&net->proc_cells_lock);
 
        net->dynroot_sb = sb;
        hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
index e8d6619890a91bffb1d3ce12cf51c23547f55508..11e69c5fb7abb5554605fbb4974146d9f375af75 100644 (file)
@@ -170,11 +170,12 @@ int afs_release(struct inode *inode, struct file *file)
 {
        struct afs_vnode *vnode = AFS_FS_I(inode);
        struct afs_file *af = file->private_data;
+       int ret = 0;
 
        _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode);
 
        if ((file->f_mode & FMODE_WRITE))
-               return vfs_fsync(file, 0);
+               ret = vfs_fsync(file, 0);
 
        file->private_data = NULL;
        if (af->wb)
@@ -182,8 +183,8 @@ int afs_release(struct inode *inode, struct file *file)
        key_put(af->key);
        kfree(af);
        afs_prune_wb_keys(vnode);
-       _leave(" = 0");
-       return 0;
+       _leave(" = %d", ret);
+       return ret;
 }
 
 /*
@@ -227,6 +228,7 @@ static void afs_file_readpage_read_complete(struct page *page,
 int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        int ret;
 
        _enter("%s{%llx:%llu.%u},%x,,,",
@@ -236,15 +238,22 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
               vnode->fid.unique,
               key_serial(key));
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_fetch_data(&fc, desc);
+                       afs_fs_fetch_data(&fc, scb, desc);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_check_for_remote_deletion(&fc, vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
@@ -254,6 +263,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
                                &afs_v2net(vnode)->n_fetch_bytes);
        }
 
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -404,10 +414,10 @@ static int afs_readpage(struct file *file, struct page *page)
 /*
  * Make pages available as they're filled.
  */
-static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
+static void afs_readpages_page_done(struct afs_read *req)
 {
 #ifdef CONFIG_AFS_FSCACHE
-       struct afs_vnode *vnode = call->reply[0];
+       struct afs_vnode *vnode = req->vnode;
 #endif
        struct page *page = req->pages[req->index];
 
@@ -461,6 +471,7 @@ static int afs_readpages_one(struct file *file, struct address_space *mapping,
                return -ENOMEM;
 
        refcount_set(&req->usage, 1);
+       req->vnode = vnode;
        req->page_done = afs_readpages_page_done;
        req->pos = first->index;
        req->pos <<= PAGE_SHIFT;
index adc88eff7849e2f6ba2b0542fb27935be8eec799..ed3ac03682d70c3a71358297eace10b7846c6dc0 100644 (file)
@@ -41,9 +41,6 @@ void afs_lock_may_be_available(struct afs_vnode *vnode)
 {
        _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
 
-       if (vnode->lock_state != AFS_VNODE_LOCK_WAITING_FOR_CB)
-               return;
-
        spin_lock(&vnode->lock);
        if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
                afs_next_locker(vnode, 0);
@@ -77,7 +74,7 @@ static void afs_schedule_lock_extension(struct afs_vnode *vnode)
  */
 void afs_lock_op_done(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
+       struct afs_vnode *vnode = call->lvnode;
 
        if (call->error == 0) {
                spin_lock(&vnode->lock);
@@ -185,6 +182,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
 static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
                        afs_lock_type_t type)
 {
+       struct afs_status_cb *scb;
        struct afs_fs_cursor fc;
        int ret;
 
@@ -195,18 +193,23 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
               vnode->fid.unique,
               key_serial(key), type);
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_set_lock(&fc, type);
+                       afs_fs_set_lock(&fc, type, scb);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_check_for_remote_deletion(&fc, vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -216,6 +219,7 @@ static int afs_set_lock(struct afs_vnode *vnode, struct key *key,
  */
 static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
 {
+       struct afs_status_cb *scb;
        struct afs_fs_cursor fc;
        int ret;
 
@@ -226,18 +230,23 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
               vnode->fid.unique,
               key_serial(key));
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
                while (afs_select_current_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_extend_lock(&fc);
+                       afs_fs_extend_lock(&fc, scb);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_check_for_remote_deletion(&fc, vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -247,6 +256,7 @@ static int afs_extend_lock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
 {
+       struct afs_status_cb *scb;
        struct afs_fs_cursor fc;
        int ret;
 
@@ -257,18 +267,23 @@ static int afs_release_lock(struct afs_vnode *vnode, struct key *key)
               vnode->fid.unique,
               key_serial(key));
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
                while (afs_select_current_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_release_lock(&fc);
+                       afs_fs_release_lock(&fc, scb);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_check_for_remote_deletion(&fc, vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break, NULL, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -736,7 +751,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
        posix_test_lock(file, fl);
        if (fl->fl_type == F_UNLCK) {
                /* no local locks; consult the server */
-               ret = afs_fetch_status(vnode, key, false);
+               ret = afs_fetch_status(vnode, key, false, NULL);
                if (ret < 0)
                        goto error;
 
index 5d3abde52a0f094599aef6533f6b83f5ab3d97b6..9b72662093433ce126853354e7a9c7c6ce80b7ac 100644 (file)
@@ -33,8 +33,8 @@ static bool afs_fs_probe_done(struct afs_server *server)
 void afs_fileserver_probe_result(struct afs_call *call)
 {
        struct afs_addr_list *alist = call->alist;
-       struct afs_server *server = call->reply[0];
-       unsigned int server_index = (long)call->reply[1];
+       struct afs_server *server = call->server;
+       unsigned int server_index = call->server_index;
        unsigned int index = call->addr_ix;
        unsigned int rtt = UINT_MAX;
        bool have_result = false;
index 1296f5dc4c1e5f23e0019701646d35080dbbdf26..48298408d6ac7a944285a0f273e23b4925f8def4 100644 (file)
@@ -59,79 +59,18 @@ static void xdr_dump_bad(const __be32 *bp)
        pr_notice("0x50: %08x\n", ntohl(x[0]));
 }
 
-/*
- * Update the core inode struct from a returned status record.
- */
-void afs_update_inode_from_status(struct afs_vnode *vnode,
-                                 struct afs_file_status *status,
-                                 const afs_dataversion_t *expected_version,
-                                 u8 flags)
-{
-       struct timespec64 t;
-       umode_t mode;
-
-       t = status->mtime_client;
-       vnode->vfs_inode.i_ctime = t;
-       vnode->vfs_inode.i_mtime = t;
-       vnode->vfs_inode.i_atime = t;
-
-       if (flags & (AFS_VNODE_META_CHANGED | AFS_VNODE_NOT_YET_SET)) {
-               vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
-               vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
-               set_nlink(&vnode->vfs_inode, status->nlink);
-
-               mode = vnode->vfs_inode.i_mode;
-               mode &= ~S_IALLUGO;
-               mode |= status->mode;
-               barrier();
-               vnode->vfs_inode.i_mode = mode;
-       }
-
-       if (!(flags & AFS_VNODE_NOT_YET_SET)) {
-               if (expected_version &&
-                   *expected_version != status->data_version) {
-                       _debug("vnode modified %llx on {%llx:%llu} [exp %llx]",
-                              (unsigned long long) status->data_version,
-                              vnode->fid.vid, vnode->fid.vnode,
-                              (unsigned long long) *expected_version);
-                       vnode->invalid_before = status->data_version;
-                       if (vnode->status.type == AFS_FTYPE_DIR) {
-                               if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-                                       afs_stat_v(vnode, n_inval);
-                       } else {
-                               set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
-                       }
-               } else if (vnode->status.type == AFS_FTYPE_DIR) {
-                       /* Expected directory change is handled elsewhere so
-                        * that we can locally edit the directory and save on a
-                        * download.
-                        */
-                       if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-                               flags &= ~AFS_VNODE_DATA_CHANGED;
-               }
-       }
-
-       if (flags & (AFS_VNODE_DATA_CHANGED | AFS_VNODE_NOT_YET_SET)) {
-               inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
-               i_size_write(&vnode->vfs_inode, status->size);
-       }
-}
-
 /*
  * decode an AFSFetchStatus block
  */
-static int xdr_decode_AFSFetchStatus(struct afs_call *call,
-                                    const __be32 **_bp,
-                                    struct afs_file_status *status,
-                                    struct afs_vnode *vnode,
-                                    const afs_dataversion_t *expected_version,
-                                    struct afs_read *read_req)
+static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
+                                    struct afs_call *call,
+                                    struct afs_status_cb *scb)
 {
        const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp;
+       struct afs_file_status *status = &scb->status;
        bool inline_error = (call->operation_ID == afs_FS_InlineBulkStatus);
        u64 data_version, size;
        u32 type, abort_code;
-       u8 flags = 0;
 
        abort_code = ntohl(xdr->abort_code);
 
@@ -144,6 +83,7 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
                         * case.
                         */
                        status->abort_code = abort_code;
+                       scb->have_error = true;
                        return 0;
                }
 
@@ -161,44 +101,25 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
        case AFS_FTYPE_FILE:
        case AFS_FTYPE_DIR:
        case AFS_FTYPE_SYMLINK:
-               if (type != status->type &&
-                   vnode &&
-                   !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
-                       pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
-                                  vnode->fid.vid,
-                                  vnode->fid.vnode,
-                                  vnode->fid.unique,
-                                  status->type, type);
-                       goto bad;
-               }
                status->type = type;
                break;
        default:
                goto bad;
        }
 
-#define EXTRACT_M(FIELD)                                       \
-       do {                                                    \
-               u32 x = ntohl(xdr->FIELD);                      \
-               if (status->FIELD != x) {                       \
-                       flags |= AFS_VNODE_META_CHANGED;        \
-                       status->FIELD = x;                      \
-               }                                               \
-       } while (0)
-
-       EXTRACT_M(nlink);
-       EXTRACT_M(author);
-       EXTRACT_M(owner);
-       EXTRACT_M(caller_access); /* call ticket dependent */
-       EXTRACT_M(anon_access);
-       EXTRACT_M(mode);
-       EXTRACT_M(group);
+       status->nlink           = ntohl(xdr->nlink);
+       status->author          = ntohl(xdr->author);
+       status->owner           = ntohl(xdr->owner);
+       status->caller_access   = ntohl(xdr->caller_access); /* Ticket dependent */
+       status->anon_access     = ntohl(xdr->anon_access);
+       status->mode            = ntohl(xdr->mode) & S_IALLUGO;
+       status->group           = ntohl(xdr->group);
+       status->lock_count      = ntohl(xdr->lock_count);
 
        status->mtime_client.tv_sec = ntohl(xdr->mtime_client);
        status->mtime_client.tv_nsec = 0;
        status->mtime_server.tv_sec = ntohl(xdr->mtime_server);
        status->mtime_server.tv_nsec = 0;
-       status->lock_count   = ntohl(xdr->lock_count);
 
        size  = (u64)ntohl(xdr->size_lo);
        size |= (u64)ntohl(xdr->size_hi) << 32;
@@ -206,25 +127,10 @@ static int xdr_decode_AFSFetchStatus(struct afs_call *call,
 
        data_version  = (u64)ntohl(xdr->data_version_lo);
        data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
-       if (data_version != status->data_version) {
-               status->data_version = data_version;
-               flags |= AFS_VNODE_DATA_CHANGED;
-       }
-
-       if (read_req) {
-               read_req->data_version = data_version;
-               read_req->file_size = size;
-       }
+       status->data_version = data_version;
+       scb->have_status = true;
 
        *_bp = (const void *)*_bp + sizeof(*xdr);
-
-       if (vnode) {
-               if (test_bit(AFS_VNODE_UNSET, &vnode->flags))
-                       flags |= AFS_VNODE_NOT_YET_SET;
-               afs_update_inode_from_status(vnode, status, expected_version,
-                                            flags);
-       }
-
        return 0;
 
 bad:
@@ -232,77 +138,22 @@ bad:
        return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
 }
 
-/*
- * Decode the file status.  We need to lock the target vnode if we're going to
- * update its status so that stat() sees the attributes update atomically.
- */
-static int afs_decode_status(struct afs_call *call,
-                            const __be32 **_bp,
-                            struct afs_file_status *status,
-                            struct afs_vnode *vnode,
-                            const afs_dataversion_t *expected_version,
-                            struct afs_read *read_req)
+static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
 {
-       int ret;
-
-       if (!vnode)
-               return xdr_decode_AFSFetchStatus(call, _bp, status, vnode,
-                                                expected_version, read_req);
-
-       write_seqlock(&vnode->cb_lock);
-       ret = xdr_decode_AFSFetchStatus(call, _bp, status, vnode,
-                                       expected_version, read_req);
-       write_sequnlock(&vnode->cb_lock);
-       return ret;
+       return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
 }
 
-/*
- * decode an AFSCallBack block
- */
-static void xdr_decode_AFSCallBack(struct afs_call *call,
-                                  struct afs_vnode *vnode,
-                                  const __be32 **_bp)
+static void xdr_decode_AFSCallBack(const __be32 **_bp,
+                                  struct afs_call *call,
+                                  struct afs_status_cb *scb)
 {
-       struct afs_cb_interest *old, *cbi = call->cbi;
+       struct afs_callback *cb = &scb->callback;
        const __be32 *bp = *_bp;
-       u32 cb_expiry;
-
-       write_seqlock(&vnode->cb_lock);
-
-       if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) {
-               vnode->cb_version       = ntohl(*bp++);
-               cb_expiry               = ntohl(*bp++);
-               vnode->cb_type          = ntohl(*bp++);
-               vnode->cb_expires_at    = cb_expiry + ktime_get_real_seconds();
-               old = vnode->cb_interest;
-               if (old != call->cbi) {
-                       vnode->cb_interest = cbi;
-                       cbi = old;
-               }
-               set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-       } else {
-               bp += 3;
-       }
 
-       write_sequnlock(&vnode->cb_lock);
-       call->cbi = cbi;
-       *_bp = bp;
-}
-
-static ktime_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
-{
-       return ktime_add_ns(call->reply_time, expiry * NSEC_PER_SEC);
-}
-
-static void xdr_decode_AFSCallBack_raw(struct afs_call *call,
-                                      const __be32 **_bp,
-                                      struct afs_callback *cb)
-{
-       const __be32 *bp = *_bp;
-
-       cb->version     = ntohl(*bp++);
+       bp++; /* version */
        cb->expires_at  = xdr_decode_expiry(call, ntohl(*bp++));
-       cb->type        = ntohl(*bp++);
+       bp++; /* type */
+       scb->have_cb    = true;
        *_bp = bp;
 }
 
@@ -395,7 +246,6 @@ static void xdr_decode_AFSFetchVolumeStatus(const __be32 **_bp,
  */
 static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -403,16 +253,13 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_AFSCallBack(call, vnode, &bp);
-       xdr_decode_AFSVolSync(&bp, call->reply[1]);
+       xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -431,8 +278,8 @@ static const struct afs_call_type afs_RXFSFetchStatus_vnode = {
 /*
  * fetch the status information for a file
  */
-int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync,
-                            bool new_inode)
+int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
+                            struct afs_volsync *volsync)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -440,7 +287,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_fetch_file_status(fc, volsync, new_inode);
+               return yfs_fs_fetch_file_status(fc, scb, volsync);
 
        _enter(",%x,{%llx:%llu},,",
               key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
@@ -453,10 +300,8 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
        }
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = volsync;
-       call->expected_version = new_inode ? 1 : vnode->status.data_version;
-       call->want_reply_time = true;
+       call->out_scb = scb;
+       call->out_volsync = volsync;
 
        /* marshall the parameters */
        bp = call->request;
@@ -465,10 +310,10 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
        bp[2] = htonl(vnode->fid.vnode);
        bp[3] = htonl(vnode->fid.unique);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
 
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -478,8 +323,7 @@ int afs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
  */
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
-       struct afs_read *req = call->reply[2];
+       struct afs_read *req = call->read_request;
        const __be32 *bp;
        unsigned int size;
        int ret;
@@ -541,7 +385,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
                if (req->offset == PAGE_SIZE) {
                        req->offset = 0;
                        if (req->page_done)
-                               req->page_done(call, req);
+                               req->page_done(req);
                        req->index++;
                        if (req->remain > 0)
                                goto begin_page;
@@ -575,12 +419,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                                       &vnode->status.data_version, req);
+               ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
                if (ret < 0)
                        return ret;
-               xdr_decode_AFSCallBack(call, vnode, &bp);
-               xdr_decode_AFSVolSync(&bp, call->reply[1]);
+               xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+               xdr_decode_AFSVolSync(&bp, call->out_volsync);
+
+               req->data_version = call->out_scb->status.data_version;
+               req->file_size = call->out_scb->status.size;
 
                call->unmarshall++;
 
@@ -593,7 +439,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
                        zero_user_segment(req->pages[req->index],
                                          req->offset, PAGE_SIZE);
                if (req->page_done)
-                       req->page_done(call, req);
+                       req->page_done(req);
                req->offset = 0;
        }
 
@@ -603,7 +449,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 
 static void afs_fetch_data_destructor(struct afs_call *call)
 {
-       struct afs_read *req = call->reply[2];
+       struct afs_read *req = call->read_request;
 
        afs_put_read(req);
        afs_flat_call_destructor(call);
@@ -629,7 +475,9 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
 /*
  * fetch data from a very large file
  */
-static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
+static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
+                              struct afs_status_cb *scb,
+                              struct afs_read *req)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -643,11 +491,9 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = NULL; /* volsync */
-       call->reply[2] = req;
-       call->expected_version = vnode->status.data_version;
-       call->want_reply_time = true;
+       call->out_scb = scb;
+       call->out_volsync = NULL;
+       call->read_request = req;
 
        /* marshall the parameters */
        bp = call->request;
@@ -661,9 +507,9 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
        bp[7] = htonl(lower_32_bits(req->len));
 
        refcount_inc(&req->usage);
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -671,7 +517,9 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc, struct afs_read *req)
 /*
  * fetch data from a file
  */
-int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
+int afs_fs_fetch_data(struct afs_fs_cursor *fc,
+                     struct afs_status_cb *scb,
+                     struct afs_read *req)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -679,12 +527,12 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_fetch_data(fc, req);
+               return yfs_fs_fetch_data(fc, scb, req);
 
        if (upper_32_bits(req->pos) ||
            upper_32_bits(req->len) ||
            upper_32_bits(req->pos + req->len))
-               return afs_fs_fetch_data64(fc, req);
+               return afs_fs_fetch_data64(fc, scb, req);
 
        _enter("");
 
@@ -693,11 +541,9 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = NULL; /* volsync */
-       call->reply[2] = req;
-       call->expected_version = vnode->status.data_version;
-       call->want_reply_time = true;
+       call->out_scb = scb;
+       call->out_volsync = NULL;
+       call->read_request = req;
 
        /* marshall the parameters */
        bp = call->request;
@@ -709,9 +555,9 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
        bp[5] = htonl(lower_32_bits(req->len));
 
        refcount_inc(&req->usage);
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -721,28 +567,24 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
  */
 static int afs_deliver_fs_create_vnode(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
-       _enter("{%u}", call->unmarshall);
-
        ret = afs_transfer_reply(call);
        if (ret < 0)
                return ret;
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       xdr_decode_AFSFid(&bp, call->reply[1]);
-       ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+       xdr_decode_AFSFid(&bp, call->out_fid);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_AFSCallBack_raw(call, &bp, call->reply[3]);
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -771,24 +613,23 @@ static const struct afs_call_type afs_RXFSMakeDir = {
 int afs_fs_create(struct afs_fs_cursor *fc,
                  const char *name,
                  umode_t mode,
-                 u64 current_data_version,
+                 struct afs_status_cb *dvnode_scb,
                  struct afs_fid *newfid,
-                 struct afs_file_status *newstatus,
-                 struct afs_callback *newcb)
+                 struct afs_status_cb *new_scb)
 {
-       struct afs_vnode *vnode = fc->vnode;
+       struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
-       struct afs_net *net = afs_v2net(vnode);
+       struct afs_net *net = afs_v2net(dvnode);
        size_t namesz, reqsz, padsz;
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags)){
                if (S_ISDIR(mode))
-                       return yfs_fs_make_dir(fc, name, mode, current_data_version,
-                                              newfid, newstatus, newcb);
+                       return yfs_fs_make_dir(fc, name, mode, dvnode_scb,
+                                              newfid, new_scb);
                else
-                       return yfs_fs_create_file(fc, name, mode, current_data_version,
-                                                 newfid, newstatus, newcb);
+                       return yfs_fs_create_file(fc, name, mode, dvnode_scb,
+                                                 newfid, new_scb);
        }
 
        _enter("");
@@ -804,19 +645,16 @@ int afs_fs_create(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = newfid;
-       call->reply[2] = newstatus;
-       call->reply[3] = newcb;
-       call->expected_version = current_data_version + 1;
-       call->want_reply_time = true;
+       call->out_dir_scb = dvnode_scb;
+       call->out_fid = newfid;
+       call->out_scb = new_scb;
 
        /* marshall the parameters */
        bp = call->request;
        *bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
-       *bp++ = htonl(vnode->fid.vid);
-       *bp++ = htonl(vnode->fid.vnode);
-       *bp++ = htonl(vnode->fid.unique);
+       *bp++ = htonl(dvnode->fid.vid);
+       *bp++ = htonl(dvnode->fid.vnode);
+       *bp++ = htonl(dvnode->fid.unique);
        *bp++ = htonl(namesz);
        memcpy(bp, name, namesz);
        bp = (void *) bp + namesz;
@@ -825,41 +663,38 @@ int afs_fs_create(struct afs_fs_cursor *fc,
                bp = (void *) bp + padsz;
        }
        *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME);
-       *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */
+       *bp++ = htonl(dvnode->vfs_inode.i_mtime.tv_sec); /* mtime */
        *bp++ = 0; /* owner */
        *bp++ = 0; /* group */
        *bp++ = htonl(mode & S_IALLUGO); /* unix mode */
        *bp++ = 0; /* segment size */
 
        afs_use_fs_server(call, fc->cbi);
-       trace_afs_make_fs_call1(call, &vnode->fid, name);
+       trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
 /*
- * Deliver reply data to any operation that returns file status and volume
+ * Deliver reply data to any operation that returns directory status and volume
  * sync.
  */
-static int afs_deliver_fs_status_and_vol(struct afs_call *call)
+static int afs_deliver_fs_dir_status_and_vol(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
-       _enter("{%u}", call->unmarshall);
-
        ret = afs_transfer_reply(call);
        if (ret < 0)
                return ret;
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -871,14 +706,14 @@ static int afs_deliver_fs_status_and_vol(struct afs_call *call)
 static const struct afs_call_type afs_RXFSRemoveFile = {
        .name           = "FS.RemoveFile",
        .op             = afs_FS_RemoveFile,
-       .deliver        = afs_deliver_fs_status_and_vol,
+       .deliver        = afs_deliver_fs_dir_status_and_vol,
        .destructor     = afs_flat_call_destructor,
 };
 
 static const struct afs_call_type afs_RXFSRemoveDir = {
        .name           = "FS.RemoveDir",
        .op             = afs_FS_RemoveDir,
-       .deliver        = afs_deliver_fs_status_and_vol,
+       .deliver        = afs_deliver_fs_dir_status_and_vol,
        .destructor     = afs_flat_call_destructor,
 };
 
@@ -886,7 +721,7 @@ static const struct afs_call_type afs_RXFSRemoveDir = {
  * remove a file or directory
  */
 int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-                 const char *name, bool isdir, u64 current_data_version)
+                 const char *name, bool isdir, struct afs_status_cb *dvnode_scb)
 {
        struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
@@ -895,7 +730,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_remove(fc, vnode, name, isdir, current_data_version);
+               return yfs_fs_remove(fc, vnode, name, isdir, dvnode_scb);
 
        _enter("");
 
@@ -910,9 +745,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = dvnode;
-       call->reply[1] = vnode;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -930,6 +763,7 @@ int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -939,7 +773,6 @@ int afs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  */
 static int afs_deliver_fs_link(struct afs_call *call)
 {
-       struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1];
        const __be32 *bp;
        int ret;
 
@@ -951,14 +784,13 @@ static int afs_deliver_fs_link(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       ret = afs_decode_status(call, &bp, &dvnode->status, dvnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -978,7 +810,9 @@ static const struct afs_call_type afs_RXFSLink = {
  * make a hard link
  */
 int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-               const char *name, u64 current_data_version)
+               const char *name,
+               struct afs_status_cb *dvnode_scb,
+               struct afs_status_cb *vnode_scb)
 {
        struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
@@ -987,7 +821,7 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_link(fc, vnode, name, current_data_version);
+               return yfs_fs_link(fc, vnode, name, dvnode_scb, vnode_scb);
 
        _enter("");
 
@@ -1000,9 +834,8 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = dvnode;
-       call->reply[1] = vnode;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_scb = vnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1023,6 +856,7 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call1(call, &vnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1032,7 +866,6 @@ int afs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  */
 static int afs_deliver_fs_symlink(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -1044,15 +877,14 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       xdr_decode_AFSFid(&bp, call->reply[1]);
-       ret = afs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+       xdr_decode_AFSFid(&bp, call->out_fid);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -1074,19 +906,19 @@ static const struct afs_call_type afs_RXFSSymlink = {
 int afs_fs_symlink(struct afs_fs_cursor *fc,
                   const char *name,
                   const char *contents,
-                  u64 current_data_version,
+                  struct afs_status_cb *dvnode_scb,
                   struct afs_fid *newfid,
-                  struct afs_file_status *newstatus)
+                  struct afs_status_cb *new_scb)
 {
-       struct afs_vnode *vnode = fc->vnode;
+       struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
-       struct afs_net *net = afs_v2net(vnode);
+       struct afs_net *net = afs_v2net(dvnode);
        size_t namesz, reqsz, padsz, c_namesz, c_padsz;
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_symlink(fc, name, contents, current_data_version,
-                                     newfid, newstatus);
+               return yfs_fs_symlink(fc, name, contents, dvnode_scb,
+                                     newfid, new_scb);
 
        _enter("");
 
@@ -1104,17 +936,16 @@ int afs_fs_symlink(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = newfid;
-       call->reply[2] = newstatus;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_fid = newfid;
+       call->out_scb = new_scb;
 
        /* marshall the parameters */
        bp = call->request;
        *bp++ = htonl(FSSYMLINK);
-       *bp++ = htonl(vnode->fid.vid);
-       *bp++ = htonl(vnode->fid.vnode);
-       *bp++ = htonl(vnode->fid.unique);
+       *bp++ = htonl(dvnode->fid.vid);
+       *bp++ = htonl(dvnode->fid.vnode);
+       *bp++ = htonl(dvnode->fid.unique);
        *bp++ = htonl(namesz);
        memcpy(bp, name, namesz);
        bp = (void *) bp + namesz;
@@ -1130,14 +961,15 @@ int afs_fs_symlink(struct afs_fs_cursor *fc,
                bp = (void *) bp + c_padsz;
        }
        *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME);
-       *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */
+       *bp++ = htonl(dvnode->vfs_inode.i_mtime.tv_sec); /* mtime */
        *bp++ = 0; /* owner */
        *bp++ = 0; /* group */
        *bp++ = htonl(S_IRWXUGO); /* unix mode */
        *bp++ = 0; /* segment size */
 
        afs_use_fs_server(call, fc->cbi);
-       trace_afs_make_fs_call1(call, &vnode->fid, name);
+       trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1147,29 +979,24 @@ int afs_fs_symlink(struct afs_fs_cursor *fc,
  */
 static int afs_deliver_fs_rename(struct afs_call *call)
 {
-       struct afs_vnode *orig_dvnode = call->reply[0], *new_dvnode = call->reply[1];
        const __be32 *bp;
        int ret;
 
-       _enter("{%u}", call->unmarshall);
-
        ret = afs_transfer_reply(call);
        if (ret < 0)
                return ret;
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       if (new_dvnode != orig_dvnode) {
-               ret = afs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
-                                       &call->expected_version_2, NULL);
+       if (call->out_dir_scb != call->out_scb) {
+               ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
                if (ret < 0)
                        return ret;
        }
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -1186,14 +1013,14 @@ static const struct afs_call_type afs_RXFSRename = {
 };
 
 /*
- * create a symbolic link
+ * Rename/move a file or directory.
  */
 int afs_fs_rename(struct afs_fs_cursor *fc,
                  const char *orig_name,
                  struct afs_vnode *new_dvnode,
                  const char *new_name,
-                 u64 current_orig_data_version,
-                 u64 current_new_data_version)
+                 struct afs_status_cb *orig_dvnode_scb,
+                 struct afs_status_cb *new_dvnode_scb)
 {
        struct afs_vnode *orig_dvnode = fc->vnode;
        struct afs_call *call;
@@ -1204,8 +1031,8 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
                return yfs_fs_rename(fc, orig_name,
                                     new_dvnode, new_name,
-                                    current_orig_data_version,
-                                    current_new_data_version);
+                                    orig_dvnode_scb,
+                                    new_dvnode_scb);
 
        _enter("");
 
@@ -1225,10 +1052,8 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = orig_dvnode;
-       call->reply[1] = new_dvnode;
-       call->expected_version = current_orig_data_version + 1;
-       call->expected_version_2 = current_new_data_version + 1;
+       call->out_dir_scb = orig_dvnode_scb;
+       call->out_scb = new_dvnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1257,6 +1082,7 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call2(call, &orig_dvnode->fid, orig_name, new_name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1266,7 +1092,6 @@ int afs_fs_rename(struct afs_fs_cursor *fc,
  */
 static int afs_deliver_fs_store_data(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -1278,13 +1103,10 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
-
-       afs_pages_written_back(vnode, call);
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -1314,7 +1136,8 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
                               struct address_space *mapping,
                               pgoff_t first, pgoff_t last,
                               unsigned offset, unsigned to,
-                              loff_t size, loff_t pos, loff_t i_size)
+                              loff_t size, loff_t pos, loff_t i_size,
+                              struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1332,13 +1155,12 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
 
        call->key = fc->key;
        call->mapping = mapping;
-       call->reply[0] = vnode;
        call->first = first;
        call->last = last;
        call->first_offset = offset;
        call->last_to = to;
        call->send_pages = true;
-       call->expected_version = vnode->status.data_version + 1;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1362,6 +1184,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
        *bp++ = htonl((u32) i_size);
 
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1371,7 +1194,8 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
  */
 int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
                      pgoff_t first, pgoff_t last,
-                     unsigned offset, unsigned to)
+                     unsigned offset, unsigned to,
+                     struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1380,7 +1204,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_store_data(fc, mapping, first, last, offset, to);
+               return yfs_fs_store_data(fc, mapping, first, last, offset, to, scb);
 
        _enter(",%x,{%llx:%llu},,",
               key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
@@ -1401,7 +1225,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 
        if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
                return afs_fs_store_data64(fc, mapping, first, last, offset, to,
-                                          size, pos, i_size);
+                                          size, pos, i_size, scb);
 
        call = afs_alloc_flat_call(net, &afs_RXFSStoreData,
                                   (4 + 6 + 3) * 4,
@@ -1411,13 +1235,12 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 
        call->key = fc->key;
        call->mapping = mapping;
-       call->reply[0] = vnode;
        call->first = first;
        call->last = last;
        call->first_offset = offset;
        call->last_to = to;
        call->send_pages = true;
-       call->expected_version = vnode->status.data_version + 1;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1439,6 +1262,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1448,7 +1272,6 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
  */
 static int afs_deliver_fs_store_status(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -1460,11 +1283,10 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -1498,7 +1320,8 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
  * set the attributes on a very large file, using FS.StoreData rather than
  * FS.StoreStatus so as to alter the file size also
  */
-static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
+static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr,
+                                struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1517,8 +1340,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->expected_version = vnode->status.data_version + 1;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1538,6 +1360,7 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1546,7 +1369,8 @@ static int afs_fs_setattr_size64(struct afs_fs_cursor *fc, struct iattr *attr)
  * set the attributes on a file, using FS.StoreData rather than FS.StoreStatus
  * so as to alter the file size also
  */
-static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
+static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr,
+                              struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1558,7 +1382,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
 
        ASSERT(attr->ia_valid & ATTR_SIZE);
        if (attr->ia_size >> 32)
-               return afs_fs_setattr_size64(fc, attr);
+               return afs_fs_setattr_size64(fc, attr, scb);
 
        call = afs_alloc_flat_call(net, &afs_RXFSStoreData_as_Status,
                                   (4 + 6 + 3) * 4,
@@ -1567,8 +1391,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->expected_version = vnode->status.data_version + 1;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1585,6 +1408,7 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1593,7 +1417,8 @@ static int afs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
  * set the attributes on a file, using FS.StoreData if there's a change in file
  * size, and FS.StoreStatus otherwise
  */
-int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
+int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr,
+                  struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1601,10 +1426,10 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_setattr(fc, attr);
+               return yfs_fs_setattr(fc, attr, scb);
 
        if (attr->ia_valid & ATTR_SIZE)
-               return afs_fs_setattr_size(fc, attr);
+               return afs_fs_setattr_size(fc, attr, scb);
 
        _enter(",%x,{%llx:%llu},,",
               key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
@@ -1616,8 +1441,7 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->expected_version = vnode->status.data_version;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1630,6 +1454,7 @@ int afs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1659,7 +1484,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               xdr_decode_AFSFetchVolumeStatus(&bp, call->reply[1]);
+               xdr_decode_AFSFetchVolumeStatus(&bp, call->out_volstatus);
                call->unmarshall++;
                afs_extract_to_tmp(call);
 
@@ -1675,7 +1500,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                        return afs_protocol_error(call, -EBADMSG,
                                                  afs_eproto_volname_len);
                size = (call->count + 3) & ~3; /* It's padded */
-               afs_extract_begin(call, call->reply[2], size);
+               afs_extract_to_buf(call, size);
                call->unmarshall++;
 
                /* Fall through - and extract the volume name */
@@ -1685,7 +1510,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               p = call->reply[2];
+               p = call->buffer;
                p[call->count] = 0;
                _debug("volname '%s'", p);
                afs_extract_to_tmp(call);
@@ -1703,7 +1528,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                        return afs_protocol_error(call, -EBADMSG,
                                                  afs_eproto_offline_msg_len);
                size = (call->count + 3) & ~3; /* It's padded */
-               afs_extract_begin(call, call->reply[2], size);
+               afs_extract_to_buf(call, size);
                call->unmarshall++;
 
                /* Fall through - and extract the offline message */
@@ -1713,7 +1538,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               p = call->reply[2];
+               p = call->buffer;
                p[call->count] = 0;
                _debug("offline '%s'", p);
 
@@ -1732,7 +1557,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                        return afs_protocol_error(call, -EBADMSG,
                                                  afs_eproto_motd_len);
                size = (call->count + 3) & ~3; /* It's padded */
-               afs_extract_begin(call, call->reply[2], size);
+               afs_extract_to_buf(call, size);
                call->unmarshall++;
 
                /* Fall through - and extract the message of the day */
@@ -1742,7 +1567,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               p = call->reply[2];
+               p = call->buffer;
                p[call->count] = 0;
                _debug("motd '%s'", p);
 
@@ -1756,16 +1581,6 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
        return 0;
 }
 
-/*
- * destroy an FS.GetVolumeStatus call
- */
-static void afs_get_volume_status_call_destructor(struct afs_call *call)
-{
-       kfree(call->reply[2]);
-       call->reply[2] = NULL;
-       afs_flat_call_destructor(call);
-}
-
 /*
  * FS.GetVolumeStatus operation type
  */
@@ -1773,7 +1588,7 @@ static const struct afs_call_type afs_RXFSGetVolumeStatus = {
        .name           = "FS.GetVolumeStatus",
        .op             = afs_FS_GetVolumeStatus,
        .deliver        = afs_deliver_fs_get_volume_status,
-       .destructor     = afs_get_volume_status_call_destructor,
+       .destructor     = afs_flat_call_destructor,
 };
 
 /*
@@ -1786,27 +1601,19 @@ int afs_fs_get_volume_status(struct afs_fs_cursor *fc,
        struct afs_call *call;
        struct afs_net *net = afs_v2net(vnode);
        __be32 *bp;
-       void *tmpbuf;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
                return yfs_fs_get_volume_status(fc, vs);
 
        _enter("");
 
-       tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
-       if (!tmpbuf)
-               return -ENOMEM;
-
-       call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4, 12 * 4);
-       if (!call) {
-               kfree(tmpbuf);
+       call = afs_alloc_flat_call(net, &afs_RXFSGetVolumeStatus, 2 * 4,
+                                  max(12 * 4, AFSOPAQUEMAX + 1));
+       if (!call)
                return -ENOMEM;
-       }
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = vs;
-       call->reply[2] = tmpbuf;
+       call->out_volstatus = vs;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1815,6 +1622,7 @@ int afs_fs_get_volume_status(struct afs_fs_cursor *fc,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1835,7 +1643,7 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       /* xdr_decode_AFSVolSync(&bp, call->reply[X]); */
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -1876,7 +1684,8 @@ static const struct afs_call_type afs_RXFSReleaseLock = {
 /*
  * Set a lock on a file
  */
-int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
+int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type,
+                   struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1884,7 +1693,7 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_set_lock(fc, type);
+               return yfs_fs_set_lock(fc, type, scb);
 
        _enter("");
 
@@ -1893,8 +1702,8 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->want_reply_time = true;
+       call->lvnode = vnode;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1906,6 +1715,7 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_calli(call, &vnode->fid, type);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1913,7 +1723,7 @@ int afs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
 /*
  * extend a lock on a file
  */
-int afs_fs_extend_lock(struct afs_fs_cursor *fc)
+int afs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1921,7 +1731,7 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc)
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_extend_lock(fc);
+               return yfs_fs_extend_lock(fc, scb);
 
        _enter("");
 
@@ -1930,8 +1740,8 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->want_reply_time = true;
+       call->lvnode = vnode;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1942,6 +1752,7 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1949,7 +1760,7 @@ int afs_fs_extend_lock(struct afs_fs_cursor *fc)
 /*
  * release a lock on a file
  */
-int afs_fs_release_lock(struct afs_fs_cursor *fc)
+int afs_fs_release_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1957,7 +1768,7 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc)
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_release_lock(fc);
+               return yfs_fs_release_lock(fc, scb);
 
        _enter("");
 
@@ -1966,7 +1777,8 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
+       call->lvnode = vnode;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1977,6 +1789,7 @@ int afs_fs_release_lock(struct afs_fs_cursor *fc)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -2071,14 +1884,6 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
        return 0;
 }
 
-static void afs_destroy_fs_get_capabilities(struct afs_call *call)
-{
-       struct afs_server *server = call->reply[0];
-
-       afs_put_server(call->net, server);
-       afs_flat_call_destructor(call);
-}
-
 /*
  * FS.GetCapabilities operation type
  */
@@ -2087,7 +1892,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
        .op             = afs_FS_GetCapabilities,
        .deliver        = afs_deliver_fs_get_capabilities,
        .done           = afs_fileserver_probe_result,
-       .destructor     = afs_destroy_fs_get_capabilities,
+       .destructor     = afs_flat_call_destructor,
 };
 
 /*
@@ -2110,11 +1915,11 @@ struct afs_call *afs_fs_get_capabilities(struct afs_net *net,
                return ERR_PTR(-ENOMEM);
 
        call->key = key;
-       call->reply[0] = afs_get_server(server);
-       call->reply[1] = (void *)(long)server_index;
+       call->server = afs_get_server(server);
+       call->server_index = server_index;
        call->upgrade = true;
-       call->want_reply_time = true;
        call->async = true;
+       call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
 
        /* marshall the parameters */
        bp = call->request;
@@ -2131,10 +1936,6 @@ struct afs_call *afs_fs_get_capabilities(struct afs_net *net,
  */
 static int afs_deliver_fs_fetch_status(struct afs_call *call)
 {
-       struct afs_file_status *status = call->reply[1];
-       struct afs_callback *callback = call->reply[2];
-       struct afs_volsync *volsync = call->reply[3];
-       struct afs_fid *fid = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -2142,16 +1943,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       _enter("{%llx:%llu}", fid->vid, fid->vnode);
-
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = afs_decode_status(call, &bp, status, NULL,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_AFSCallBack_raw(call, &bp, callback);
-       xdr_decode_AFSVolSync(&bp, volsync);
+       xdr_decode_AFSCallBack(&bp, call, call->out_scb);
+       xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -2173,15 +1971,14 @@ static const struct afs_call_type afs_RXFSFetchStatus = {
 int afs_fs_fetch_status(struct afs_fs_cursor *fc,
                        struct afs_net *net,
                        struct afs_fid *fid,
-                       struct afs_file_status *status,
-                       struct afs_callback *callback,
+                       struct afs_status_cb *scb,
                        struct afs_volsync *volsync)
 {
        struct afs_call *call;
        __be32 *bp;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_fetch_status(fc, net, fid, status, callback, volsync);
+               return yfs_fs_fetch_status(fc, net, fid, scb, volsync);
 
        _enter(",%x,{%llx:%llu},,",
               key_serial(fc->key), fid->vid, fid->vnode);
@@ -2193,12 +1990,9 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
        }
 
        call->key = fc->key;
-       call->reply[0] = fid;
-       call->reply[1] = status;
-       call->reply[2] = callback;
-       call->reply[3] = volsync;
-       call->expected_version = 1; /* vnode->status.data_version */
-       call->want_reply_time = true;
+       call->out_fid = fid;
+       call->out_scb = scb;
+       call->out_volsync = volsync;
 
        /* marshall the parameters */
        bp = call->request;
@@ -2207,9 +2001,9 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
        bp[2] = htonl(fid->vnode);
        bp[3] = htonl(fid->unique);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -2219,9 +2013,7 @@ int afs_fs_fetch_status(struct afs_fs_cursor *fc,
  */
 static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 {
-       struct afs_file_status *statuses;
-       struct afs_callback *callbacks;
-       struct afs_vnode *vnode = call->reply[0];
+       struct afs_status_cb *scb;
        const __be32 *bp;
        u32 tmp;
        int ret;
@@ -2260,10 +2052,8 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               statuses = call->reply[1];
-               ret = afs_decode_status(call, &bp, &statuses[call->count],
-                                       call->count == 0 ? vnode : NULL,
-                                       NULL, NULL);
+               scb = &call->out_scb[call->count];
+               ret = xdr_decode_AFSFetchStatus(&bp, call, scb);
                if (ret < 0)
                        return ret;
 
@@ -2302,13 +2092,8 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
 
                _debug("unmarshall CB array");
                bp = call->buffer;
-               callbacks = call->reply[2];
-               callbacks[call->count].version  = ntohl(bp[0]);
-               callbacks[call->count].expires_at = xdr_decode_expiry(call, ntohl(bp[1]));
-               callbacks[call->count].type     = ntohl(bp[2]);
-               statuses = call->reply[1];
-               if (call->count == 0 && vnode && statuses[0].abort_code == 0)
-                       xdr_decode_AFSCallBack(call, vnode, &bp);
+               scb = &call->out_scb[call->count];
+               xdr_decode_AFSCallBack(&bp, call, scb);
                call->count++;
                if (call->count < call->count2)
                        goto more_cbs;
@@ -2323,7 +2108,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               xdr_decode_AFSVolSync(&bp, call->reply[3]);
+               xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
                call->unmarshall++;
 
@@ -2351,8 +2136,7 @@ static const struct afs_call_type afs_RXFSInlineBulkStatus = {
 int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
                              struct afs_net *net,
                              struct afs_fid *fids,
-                             struct afs_file_status *statuses,
-                             struct afs_callback *callbacks,
+                             struct afs_status_cb *statuses,
                              unsigned int nr_fids,
                              struct afs_volsync *volsync)
 {
@@ -2361,7 +2145,7 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
        int i;
 
        if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
-               return yfs_fs_inline_bulk_status(fc, net, fids, statuses, callbacks,
+               return yfs_fs_inline_bulk_status(fc, net, fids, statuses,
                                                 nr_fids, volsync);
 
        _enter(",%x,{%llx:%llu},%u",
@@ -2376,12 +2160,9 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
        }
 
        call->key = fc->key;
-       call->reply[0] = NULL; /* vnode for fid[0] */
-       call->reply[1] = statuses;
-       call->reply[2] = callbacks;
-       call->reply[3] = volsync;
+       call->out_scb = statuses;
+       call->out_volsync = volsync;
        call->count2 = nr_fids;
-       call->want_reply_time = true;
 
        /* marshall the parameters */
        bp = call->request;
@@ -2393,9 +2174,9 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
                *bp++ = htonl(fids[i].unique);
        }
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &fids[0]);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -2405,7 +2186,6 @@ int afs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
  */
 static int afs_deliver_fs_fetch_acl(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[1];
        struct afs_acl *acl;
        const __be32 *bp;
        unsigned int size;
@@ -2430,7 +2210,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
                acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL);
                if (!acl)
                        return -ENOMEM;
-               call->reply[0] = acl;
+               call->ret_acl = acl;
                acl->size = call->count2;
                afs_extract_begin(call, acl->data, size);
                call->unmarshall++;
@@ -2451,11 +2231,10 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               ret = afs_decode_status(call, &bp, &vnode->status, vnode,
-                                       &vnode->status.data_version, NULL);
+               ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
                if (ret < 0)
                        return ret;
-               xdr_decode_AFSVolSync(&bp, call->reply[2]);
+               xdr_decode_AFSVolSync(&bp, call->out_volsync);
 
                call->unmarshall++;
 
@@ -2469,7 +2248,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
 
 static void afs_destroy_fs_fetch_acl(struct afs_call *call)
 {
-       kfree(call->reply[0]);
+       kfree(call->ret_acl);
        afs_flat_call_destructor(call);
 }
 
@@ -2486,7 +2265,8 @@ static const struct afs_call_type afs_RXFSFetchACL = {
 /*
  * Fetch the ACL for a file.
  */
-struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *fc)
+struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *fc,
+                                struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -2503,10 +2283,9 @@ struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *fc)
        }
 
        call->key = fc->key;
-       call->reply[0] = NULL;
-       call->reply[1] = vnode;
-       call->reply[2] = NULL; /* volsync */
-       call->ret_reply0 = true;
+       call->ret_acl = NULL;
+       call->out_scb = scb;
+       call->out_volsync = NULL;
 
        /* marshall the parameters */
        bp = call->request;
@@ -2515,27 +2294,50 @@ struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *fc)
        bp[2] = htonl(vnode->fid.vnode);
        bp[3] = htonl(vnode->fid.unique);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
        afs_make_call(&fc->ac, call, GFP_KERNEL);
        return (struct afs_acl *)afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
+/*
+ * Reply handler shared by operations whose reply is decoded as an
+ * AFSFetchStatus record followed by an AFSVolSync record.
+ *
+ * A negative result from afs_transfer_reply() or from the status decoder is
+ * handed straight back to the caller.  Otherwise the status is decoded into
+ * call->out_scb, the volume sync record into call->out_volsync, and 0 is
+ * returned.
+ */
+static int afs_deliver_fs_file_status_and_vol(struct afs_call *call)
+{
+       const __be32 *xdr;
+       int err = afs_transfer_reply(call);
+
+       if (err < 0)
+               return err;
+
+       /* The same cursor is passed by address to each decoder in turn. */
+       xdr = call->buffer;
+
+       err = xdr_decode_AFSFetchStatus(&xdr, call, call->out_scb);
+       if (err < 0)
+               return err;
+
+       xdr_decode_AFSVolSync(&xdr, call->out_volsync);
+
+       _leave(" = 0 [done]");
+       return 0;
+}
+
 /*
  * FS.StoreACL operation type
  */
 static const struct afs_call_type afs_RXFSStoreACL = {
        .name           = "FS.StoreACL",
        .op             = afs_FS_StoreACL,
-       .deliver        = afs_deliver_fs_status_and_vol,
+       .deliver        = afs_deliver_fs_file_status_and_vol,
        .destructor     = afs_flat_call_destructor,
 };
 
 /*
  * Store the ACL for a file.
  */
-int afs_fs_store_acl(struct afs_fs_cursor *fc, const struct afs_acl *acl)
+int afs_fs_store_acl(struct afs_fs_cursor *fc, const struct afs_acl *acl,
+                    struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -2555,8 +2357,8 @@ int afs_fs_store_acl(struct afs_fs_cursor *fc, const struct afs_acl *acl)
        }
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[2] = NULL; /* volsync */
+       call->out_scb = scb;
+       call->out_volsync = NULL;
 
        /* marshall the parameters */
        bp = call->request;
index c4652b42d545ffa8b5999e3fe7358601969b79c3..b42d9d09669c863ce51ef463bc695085c072eccd 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/namei.h>
 #include <linux/iversion.h>
 #include "internal.h"
+#include "afs_fs.h"
 
 static const struct inode_operations afs_symlink_inode_operations = {
        .get_link       = page_get_link,
@@ -58,38 +59,50 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
  * Initialise an inode from the vnode status.
  */
 static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
-                                     struct afs_vnode *parent_vnode)
+                                     struct afs_cb_interest *cbi,
+                                     struct afs_vnode *parent_vnode,
+                                     struct afs_status_cb *scb)
 {
+       struct afs_cb_interest *old_cbi = NULL;
+       struct afs_file_status *status = &scb->status;
        struct inode *inode = AFS_VNODE_TO_I(vnode);
+       struct timespec64 t;
 
        _debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
-              vnode->status.type,
-              vnode->status.nlink,
-              (unsigned long long) vnode->status.size,
-              vnode->status.data_version,
-              vnode->status.mode);
+              status->type,
+              status->nlink,
+              (unsigned long long) status->size,
+              status->data_version,
+              status->mode);
 
-       read_seqlock_excl(&vnode->cb_lock);
+       write_seqlock(&vnode->cb_lock);
 
-       afs_update_inode_from_status(vnode, &vnode->status, NULL,
-                                    AFS_VNODE_NOT_YET_SET);
+       vnode->status = *status;
 
-       switch (vnode->status.type) {
+       t = status->mtime_client;
+       inode->i_ctime = t;
+       inode->i_mtime = t;
+       inode->i_atime = t;
+       inode->i_uid = make_kuid(&init_user_ns, status->owner);
+       inode->i_gid = make_kgid(&init_user_ns, status->group);
+       set_nlink(&vnode->vfs_inode, status->nlink);
+
+       switch (status->type) {
        case AFS_FTYPE_FILE:
-               inode->i_mode   = S_IFREG | vnode->status.mode;
+               inode->i_mode   = S_IFREG | status->mode;
                inode->i_op     = &afs_file_inode_operations;
                inode->i_fop    = &afs_file_operations;
                inode->i_mapping->a_ops = &afs_fs_aops;
                break;
        case AFS_FTYPE_DIR:
-               inode->i_mode   = S_IFDIR | vnode->status.mode;
+               inode->i_mode   = S_IFDIR | status->mode;
                inode->i_op     = &afs_dir_inode_operations;
                inode->i_fop    = &afs_dir_file_operations;
                inode->i_mapping->a_ops = &afs_dir_aops;
                break;
        case AFS_FTYPE_SYMLINK:
                /* Symlinks with a mode of 0644 are actually mountpoints. */
-               if ((vnode->status.mode & 0777) == 0644) {
+               if ((status->mode & 0777) == 0644) {
                        inode->i_flags |= S_AUTOMOUNT;
 
                        set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
@@ -99,7 +112,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
                        inode->i_fop    = &afs_mntpt_file_operations;
                        inode->i_mapping->a_ops = &afs_fs_aops;
                } else {
-                       inode->i_mode   = S_IFLNK | vnode->status.mode;
+                       inode->i_mode   = S_IFLNK | status->mode;
                        inode->i_op     = &afs_symlink_inode_operations;
                        inode->i_mapping->a_ops = &afs_fs_aops;
                }
@@ -107,7 +120,7 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
                break;
        default:
                dump_vnode(vnode, parent_vnode);
-               read_sequnlock_excl(&vnode->cb_lock);
+               write_sequnlock(&vnode->cb_lock);
                return afs_protocol_error(NULL, -EBADMSG, afs_eproto_file_type);
        }
 
@@ -116,17 +129,175 @@ static int afs_inode_init_from_status(struct afs_vnode *vnode, struct key *key,
         * for consistency with other AFS clients.
         */
        inode->i_blocks         = ((i_size_read(inode) + 1023) >> 10) << 1;
-       vnode->invalid_before   = vnode->status.data_version;
+       i_size_write(&vnode->vfs_inode, status->size);
+
+       vnode->invalid_before   = status->data_version;
+       inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+
+       if (!scb->have_cb) {
+               /* it's a symlink we just created (the fileserver
+                * didn't give us a callback) */
+               vnode->cb_expires_at = ktime_get_real_seconds();
+       } else {
+               vnode->cb_expires_at = scb->callback.expires_at;
+               old_cbi = rcu_dereference_protected(vnode->cb_interest,
+                                                   lockdep_is_held(&vnode->cb_lock.lock));
+               if (cbi != old_cbi)
+                       rcu_assign_pointer(vnode->cb_interest, afs_get_cb_interest(cbi));
+               else
+                       old_cbi = NULL;
+               set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+       }
 
-       read_sequnlock_excl(&vnode->cb_lock);
+       write_sequnlock(&vnode->cb_lock);
+       afs_put_cb_interest(afs_v2net(vnode), old_cbi);
        return 0;
 }
 
+/*
+ * Update the core inode struct from a returned status record.
+ *
+ * The fields of scb->status are compared with the copy cached in
+ * vnode->status and the link count, owner, group and mode of the VFS inode
+ * are only rewritten where they differ; the three timestamps are always
+ * rewritten from status->mtime_client.  If the record reports a different
+ * file type from the cached one, a warning is logged, a protocol error is
+ * noted and the vnode is left unmodified.
+ *
+ * expected_version, if not NULL, points to the data version the caller
+ * anticipated.  On a mismatch, vnode->invalid_before is advanced and either
+ * AFS_VNODE_DIR_VALID is cleared (directories) or AFS_VNODE_ZAP_DATA is set
+ * (everything else).  fc is used only to name the operation in the debugging
+ * message.
+ *
+ * afs_vnode_commit_status() calls this between write_seqlock() and
+ * write_sequnlock() on vnode->cb_lock.
+ */
+static void afs_apply_status(struct afs_fs_cursor *fc,
+                            struct afs_vnode *vnode,
+                            struct afs_status_cb *scb,
+                            const afs_dataversion_t *expected_version)
+{
+       struct afs_file_status *status = &scb->status;
+       struct timespec64 t;
+       umode_t mode;
+       bool data_changed = false;
+
+       BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags));
+
+       /* A change of file type is treated as a protocol error: warn and
+        * bail out before anything on the vnode has been modified.
+        */
+       if (status->type != vnode->status.type) {
+               /* Cached (old) type first, then the reported (new) type, so
+                * that the values match the "changed type %u to %u" wording.
+                */
+               pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
+                          vnode->fid.vid,
+                          vnode->fid.vnode,
+                          vnode->fid.unique,
+                          vnode->status.type, status->type);
+               afs_protocol_error(NULL, -EBADMSG, afs_eproto_bad_status);
+               return;
+       }
+
+       if (status->nlink != vnode->status.nlink)
+               set_nlink(&vnode->vfs_inode, status->nlink);
+
+       if (status->owner != vnode->status.owner)
+               vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner);
+
+       if (status->group != vnode->status.group)
+               vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group);
+
+       /* Only the S_IALLUGO bits are replaced; the rest of i_mode is kept. */
+       if (status->mode != vnode->status.mode) {
+               mode = vnode->vfs_inode.i_mode;
+               mode &= ~S_IALLUGO;
+               mode |= status->mode;
+               WRITE_ONCE(vnode->vfs_inode.i_mode, mode);
+       }
+
+       t = status->mtime_client;
+       vnode->vfs_inode.i_ctime = t;
+       vnode->vfs_inode.i_mtime = t;
+       vnode->vfs_inode.i_atime = t;
+
+       /* Compare the data versions before the cached status is overwritten. */
+       if (vnode->status.data_version != status->data_version)
+               data_changed = true;
+
+       vnode->status = *status;
+
+       if (expected_version &&
+           *expected_version != status->data_version) {
+               kdebug("vnode modified %llx on {%llx:%llu} [exp %llx] %s",
+                      (unsigned long long) status->data_version,
+                      vnode->fid.vid, vnode->fid.vnode,
+                      (unsigned long long) *expected_version,
+                      fc->type ? fc->type->name : "???");
+               vnode->invalid_before = status->data_version;
+               if (vnode->status.type == AFS_FTYPE_DIR) {
+                       if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+                               afs_stat_v(vnode, n_inval);
+               } else {
+                       set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+               }
+       } else if (vnode->status.type == AFS_FTYPE_DIR) {
+               /* Expected directory change is handled elsewhere so
+                * that we can locally edit the directory and save on a
+                * download.
+                */
+               if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+                       data_changed = false;
+       }
+
+       /* The inode version and size are only rewritten on a data change. */
+       if (data_changed) {
+               inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+               i_size_write(&vnode->vfs_inode, status->size);
+       }
+}
+
+/*
+ * Apply a callback to a vnode.
+ *
+ * Nothing is changed if afs_cb_is_broken() reports a break for cb_break on
+ * this vnode and the cursor's callback interest.  Otherwise the expiry time
+ * is copied from scb->callback, vnode->cb_interest is switched over to
+ * fc->cbi if it differs (taking a ref on the new one and putting the old
+ * one) and AFS_VNODE_CB_PROMISED is set.
+ *
+ * The lockdep annotation below expects vnode->cb_lock to be held.
+ */
+static void afs_apply_callback(struct afs_fs_cursor *fc,
+                              struct afs_vnode *vnode,
+                              struct afs_status_cb *scb,
+                              unsigned int cb_break)
+{
+       struct afs_cb_interest *prev_cbi;
+
+       if (afs_cb_is_broken(cb_break, vnode, fc->cbi))
+               return;
+
+       vnode->cb_expires_at = scb->callback.expires_at;
+
+       prev_cbi = rcu_dereference_protected(vnode->cb_interest,
+                                            lockdep_is_held(&vnode->cb_lock.lock));
+       if (prev_cbi != fc->cbi) {
+               rcu_assign_pointer(vnode->cb_interest,
+                                  afs_get_cb_interest(fc->cbi));
+               afs_put_cb_interest(afs_v2net(vnode), prev_cbi);
+       }
+
+       set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+}
+
+/*
+ * Commit a received status/callback record to a vnode.
+ *
+ * Nothing is done if the cursor has an error recorded in fc->ac.error.
+ * Otherwise the record is applied with vnode->cb_lock write-locked, so that
+ * the status and the callback are updated in the same critical section to
+ * avoid races with afs_validate():
+ *
+ *  - if the record carries an error, only a VNOVNODE abort has any effect:
+ *    the vnode is flagged AFS_VNODE_DELETED, its link count is cleared and
+ *    __afs_break_callback() is called on it;
+ *  - otherwise the status and/or the callback are applied, according to
+ *    which of them the record says it holds.
+ *
+ * Once the lock has been dropped, the record is passed to afs_cache_permit()
+ * if it holds a status.
+ */
+void afs_vnode_commit_status(struct afs_fs_cursor *fc,
+                            struct afs_vnode *vnode,
+                            unsigned int cb_break,
+                            const afs_dataversion_t *expected_version,
+                            struct afs_status_cb *scb)
+{
+       if (fc->ac.error)
+               return;
+
+       write_seqlock(&vnode->cb_lock);
+
+       if (!scb->have_error) {
+               if (scb->have_status)
+                       afs_apply_status(fc, vnode, scb, expected_version);
+               if (scb->have_cb)
+                       afs_apply_callback(fc, vnode, scb, cb_break);
+       } else if (scb->status.abort_code == VNOVNODE) {
+               set_bit(AFS_VNODE_DELETED, &vnode->flags);
+               clear_nlink(&vnode->vfs_inode);
+               __afs_break_callback(vnode);
+       }
+
+       write_sequnlock(&vnode->cb_lock);
+
+       if (!fc->ac.error && scb->have_status)
+               afs_cache_permit(vnode, fc->key, cb_break, scb);
+}
+
 /*
  * Fetch file status from the volume.
  */
-int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
+int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool is_new,
+                    afs_access_t *_caller_access)
 {
+       struct afs_status_cb *scb;
        struct afs_fs_cursor fc;
        int ret;
 
@@ -135,18 +306,38 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
               vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
               vnode->flags);
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               return -ENOMEM;
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_fetch_file_status(&fc, NULL, new_inode);
+                       afs_fs_fetch_file_status(&fc, scb, NULL);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               if (fc.error) {
+                       /* Do nothing. */
+               } else if (is_new) {
+                       ret = afs_inode_init_from_status(vnode, key, fc.cbi,
+                                                        NULL, scb);
+                       fc.error = ret;
+                       if (ret == 0)
+                               afs_cache_permit(vnode, key, fc.cb_break, scb);
+               } else {
+                       afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                               &data_version, scb);
+               }
+               afs_check_for_remote_deletion(&fc, vnode);
                ret = afs_end_vnode_operation(&fc);
        }
 
+       if (ret == 0 && _caller_access)
+               *_caller_access = scb->status.caller_access;
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -156,10 +347,10 @@ int afs_fetch_status(struct afs_vnode *vnode, struct key *key, bool new_inode)
  */
 int afs_iget5_test(struct inode *inode, void *opaque)
 {
-       struct afs_iget_data *data = opaque;
+       struct afs_iget_data *iget_data = opaque;
        struct afs_vnode *vnode = AFS_FS_I(inode);
 
-       return memcmp(&vnode->fid, &data->fid, sizeof(data->fid)) == 0;
+       return memcmp(&vnode->fid, &iget_data->fid, sizeof(iget_data->fid)) == 0;
 }
 
 /*
@@ -177,17 +368,19 @@ static int afs_iget5_pseudo_dir_test(struct inode *inode, void *opaque)
  */
 static int afs_iget5_set(struct inode *inode, void *opaque)
 {
-       struct afs_iget_data *data = opaque;
+       struct afs_iget_data *iget_data = opaque;
        struct afs_vnode *vnode = AFS_FS_I(inode);
 
-       vnode->fid = data->fid;
-       vnode->volume = data->volume;
+       vnode->fid              = iget_data->fid;
+       vnode->volume           = iget_data->volume;
+       vnode->cb_v_break       = iget_data->cb_v_break;
+       vnode->cb_s_break       = iget_data->cb_s_break;
 
        /* YFS supports 96-bit vnode IDs, but Linux only supports
         * 64-bit inode numbers.
         */
-       inode->i_ino = data->fid.vnode;
-       inode->i_generation = data->fid.unique;
+       inode->i_ino            = iget_data->fid.vnode;
+       inode->i_generation     = iget_data->fid.unique;
        return 0;
 }
 
@@ -197,38 +390,42 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
  */
 struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
 {
-       struct afs_iget_data data;
        struct afs_super_info *as;
        struct afs_vnode *vnode;
        struct inode *inode;
        static atomic_t afs_autocell_ino;
 
+       struct afs_iget_data iget_data = {
+               .cb_v_break = 0,
+               .cb_s_break = 0,
+       };
+
        _enter("");
 
        as = sb->s_fs_info;
        if (as->volume) {
-               data.volume = as->volume;
-               data.fid.vid = as->volume->vid;
+               iget_data.volume = as->volume;
+               iget_data.fid.vid = as->volume->vid;
        }
        if (root) {
-               data.fid.vnode = 1;
-               data.fid.unique = 1;
+               iget_data.fid.vnode = 1;
+               iget_data.fid.unique = 1;
        } else {
-               data.fid.vnode = atomic_inc_return(&afs_autocell_ino);
-               data.fid.unique = 0;
+               iget_data.fid.vnode = atomic_inc_return(&afs_autocell_ino);
+               iget_data.fid.unique = 0;
        }
 
-       inode = iget5_locked(sb, data.fid.vnode,
+       inode = iget5_locked(sb, iget_data.fid.vnode,
                             afs_iget5_pseudo_dir_test, afs_iget5_set,
-                            &data);
+                            &iget_data);
        if (!inode) {
                _leave(" = -ENOMEM");
                return ERR_PTR(-ENOMEM);
        }
 
        _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }",
-              inode, inode->i_ino, data.fid.vid, data.fid.vnode,
-              data.fid.unique);
+              inode, inode->i_ino, iget_data.fid.vid, iget_data.fid.vnode,
+              iget_data.fid.unique);
 
        vnode = AFS_FS_I(inode);
 
@@ -299,23 +496,24 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
  * inode retrieval
  */
 struct inode *afs_iget(struct super_block *sb, struct key *key,
-                      struct afs_fid *fid, struct afs_file_status *status,
-                      struct afs_callback *cb, struct afs_cb_interest *cbi,
+                      struct afs_iget_data *iget_data,
+                      struct afs_status_cb *scb,
+                      struct afs_cb_interest *cbi,
                       struct afs_vnode *parent_vnode)
 {
-       struct afs_iget_data data = { .fid = *fid };
        struct afs_super_info *as;
        struct afs_vnode *vnode;
+       struct afs_fid *fid = &iget_data->fid;
        struct inode *inode;
        int ret;
 
        _enter(",{%llx:%llu.%u},,", fid->vid, fid->vnode, fid->unique);
 
        as = sb->s_fs_info;
-       data.volume = as->volume;
+       iget_data->volume = as->volume;
 
        inode = iget5_locked(sb, fid->vnode, afs_iget5_test, afs_iget5_set,
-                            &data);
+                            iget_data);
        if (!inode) {
                _leave(" = -ENOMEM");
                return ERR_PTR(-ENOMEM);
@@ -332,43 +530,25 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
                return inode;
        }
 
-       if (!status) {
+       if (!scb) {
                /* it's a remotely extant inode */
-               ret = afs_fetch_status(vnode, key, true);
+               ret = afs_fetch_status(vnode, key, true, NULL);
                if (ret < 0)
                        goto bad_inode;
        } else {
-               /* it's an inode we just created */
-               memcpy(&vnode->status, status, sizeof(vnode->status));
-
-               if (!cb) {
-                       /* it's a symlink we just created (the fileserver
-                        * didn't give us a callback) */
-                       vnode->cb_version = 0;
-                       vnode->cb_type = 0;
-                       vnode->cb_expires_at = ktime_get();
-               } else {
-                       vnode->cb_version = cb->version;
-                       vnode->cb_type = cb->type;
-                       vnode->cb_expires_at = cb->expires_at;
-                       vnode->cb_interest = afs_get_cb_interest(cbi);
-                       set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-               }
-
-               vnode->cb_expires_at += ktime_get_real_seconds();
+               ret = afs_inode_init_from_status(vnode, key, cbi, parent_vnode,
+                                                scb);
+               if (ret < 0)
+                       goto bad_inode;
        }
 
-       ret = afs_inode_init_from_status(vnode, key, parent_vnode);
-       if (ret < 0)
-               goto bad_inode;
-
        afs_get_inode_cache(vnode);
 
        /* success */
        clear_bit(AFS_VNODE_UNSET, &vnode->flags);
        inode->i_flags |= S_NOATIME;
        unlock_new_inode(inode);
-       _leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+       _leave(" = %p", inode);
        return inode;
 
        /* failure */
@@ -399,6 +579,66 @@ void afs_zap_data(struct afs_vnode *vnode)
                invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
 }
 
+/*
+ * Check the validity of a vnode/inode.
+ */
+bool afs_check_validity(struct afs_vnode *vnode)
+{
+       struct afs_cb_interest *cbi;
+       struct afs_server *server;
+       struct afs_volume *volume = vnode->volume;
+       time64_t now = ktime_get_real_seconds();
+       bool valid, need_clear = false;
+       unsigned int cb_break, cb_s_break, cb_v_break;
+       int seq = 0;
+
+       do {
+               read_seqbegin_or_lock(&vnode->cb_lock, &seq);
+               cb_v_break = READ_ONCE(volume->cb_v_break);
+               cb_break = vnode->cb_break;
+
+               if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+                       cbi = rcu_dereference(vnode->cb_interest);
+                       server = rcu_dereference(cbi->server);
+                       cb_s_break = READ_ONCE(server->cb_s_break);
+
+                       if (vnode->cb_s_break != cb_s_break ||
+                           vnode->cb_v_break != cb_v_break) {
+                               vnode->cb_s_break = cb_s_break;
+                               vnode->cb_v_break = cb_v_break;
+                               need_clear = true;
+                               valid = false;
+                       } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+                               need_clear = true;
+                               valid = false;
+                       } else if (vnode->cb_expires_at - 10 <= now) {
+                               need_clear = true;
+                               valid = false;
+                       } else {
+                               valid = true;
+                       }
+               } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+                       valid = true;
+               } else {
+                       vnode->cb_v_break = cb_v_break;
+                       valid = false;
+               }
+
+       } while (need_seqretry(&vnode->cb_lock, seq));
+
+       done_seqretry(&vnode->cb_lock, seq);
+
+       if (need_clear) {
+               write_seqlock(&vnode->cb_lock);
+               if (cb_break == vnode->cb_break)
+                       __afs_break_callback(vnode);
+               write_sequnlock(&vnode->cb_lock);
+               valid = false;
+       }
+
+       return valid;
+}
+
 /*
  * validate a vnode/inode
  * - there are several things we need to check
@@ -410,7 +650,6 @@ void afs_zap_data(struct afs_vnode *vnode)
  */
 int afs_validate(struct afs_vnode *vnode, struct key *key)
 {
-       time64_t now = ktime_get_real_seconds();
        bool valid;
        int ret;
 
@@ -418,36 +657,9 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
               vnode->fid.vid, vnode->fid.vnode, vnode->flags,
               key_serial(key));
 
-       /* Quickly check the callback state.  Ideally, we'd use read_seqbegin
-        * here, but we have no way to pass the net namespace to the RCU
-        * cleanup for the server record.
-        */
-       read_seqlock_excl(&vnode->cb_lock);
-
-       if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
-               if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break ||
-                   vnode->cb_v_break != vnode->volume->cb_v_break) {
-                       vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
-                       vnode->cb_v_break = vnode->volume->cb_v_break;
-                       valid = false;
-               } else if (vnode->status.type == AFS_FTYPE_DIR &&
-                          (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags) ||
-                           vnode->cb_expires_at - 10 <= now)) {
-                       valid = false;
-               } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) ||
-                          vnode->cb_expires_at - 10 <= now) {
-                       valid = false;
-               } else {
-                       valid = true;
-               }
-       } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-               valid = true;
-       } else {
-               vnode->cb_v_break = vnode->volume->cb_v_break;
-               valid = false;
-       }
-
-       read_sequnlock_excl(&vnode->cb_lock);
+       rcu_read_lock();
+       valid = afs_check_validity(vnode);
+       rcu_read_unlock();
 
        if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
                clear_nlink(&vnode->vfs_inode);
@@ -463,7 +675,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
         * access */
        if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
                _debug("not promised");
-               ret = afs_fetch_status(vnode, key, false);
+               ret = afs_fetch_status(vnode, key, false, NULL);
                if (ret < 0) {
                        if (ret == -ENOENT) {
                                set_bit(AFS_VNODE_DELETED, &vnode->flags);
@@ -534,6 +746,7 @@ int afs_drop_inode(struct inode *inode)
  */
 void afs_evict_inode(struct inode *inode)
 {
+       struct afs_cb_interest *cbi;
        struct afs_vnode *vnode;
 
        vnode = AFS_FS_I(inode);
@@ -550,10 +763,14 @@ void afs_evict_inode(struct inode *inode)
        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);
 
-       if (vnode->cb_interest) {
-               afs_put_cb_interest(afs_i2net(inode), vnode->cb_interest);
-               vnode->cb_interest = NULL;
+       write_seqlock(&vnode->cb_lock);
+       cbi = rcu_dereference_protected(vnode->cb_interest,
+                                       lockdep_is_held(&vnode->cb_lock.lock));
+       if (cbi) {
+               afs_put_cb_interest(afs_i2net(inode), cbi);
+               rcu_assign_pointer(vnode->cb_interest, NULL);
        }
+       write_sequnlock(&vnode->cb_lock);
 
        while (!list_empty(&vnode->wb_keys)) {
                struct afs_wb_key *wbk = list_entry(vnode->wb_keys.next,
@@ -573,6 +790,7 @@ void afs_evict_inode(struct inode *inode)
        }
 #endif
 
+       afs_prune_wb_keys(vnode);
        afs_put_permits(rcu_access_pointer(vnode->permit_cache));
        key_put(vnode->silly_key);
        vnode->silly_key = NULL;
@@ -587,9 +805,10 @@ void afs_evict_inode(struct inode *inode)
 int afs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
        struct key *key;
-       int ret;
+       int ret = -ENOMEM;
 
        _enter("{%llx:%llu},{n=%pd},%x",
               vnode->fid.vid, vnode->fid.vnode, dentry,
@@ -601,6 +820,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
                return 0;
        }
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_KERNEL);
+       if (!scb)
+               goto error;
+
        /* flush any dirty data outstanding on a regular file */
        if (S_ISREG(vnode->vfs_inode.i_mode))
                filemap_write_and_wait(vnode->vfs_inode.i_mapping);
@@ -611,25 +834,33 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
                key = afs_request_key(vnode->volume->cell);
                if (IS_ERR(key)) {
                        ret = PTR_ERR(key);
-                       goto error;
+                       goto error_scb;
                }
        }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, false)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
+               if (attr->ia_valid & ATTR_SIZE)
+                       data_version++;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_setattr(&fc, attr);
+                       afs_fs_setattr(&fc, attr, scb);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_check_for_remote_deletion(&fc, vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
        if (!(attr->ia_valid & ATTR_FILE))
                key_put(key);
 
+error_scb:
+       kfree(scb);
 error:
        _leave(" = %d", ret);
        return ret;
index b3cd6e8ad59d1ea91fe7dd03ac7e63e6d036bfdf..2073c1a3ab4b658606c42b52a153d057de72cd1c 100644 (file)
@@ -66,6 +66,8 @@ struct afs_fs_context {
 struct afs_iget_data {
        struct afs_fid          fid;
        struct afs_volume       *volume;        /* volume on which resides */
+       unsigned int            cb_v_break;     /* Pre-fetch volume break count */
+       unsigned int            cb_s_break;     /* Pre-fetch server break count */
 };
 
 enum afs_call_state {
@@ -111,8 +113,12 @@ struct afs_call {
        struct rxrpc_call       *rxcall;        /* RxRPC call handle */
        struct key              *key;           /* security for this call */
        struct afs_net          *net;           /* The network namespace */
-       struct afs_server       *cm_server;     /* Server affected by incoming CM call */
+       union {
+               struct afs_server       *server;
+               struct afs_vlserver     *vlserver;
+       };
        struct afs_cb_interest  *cbi;           /* Callback interest for server used */
+       struct afs_vnode        *lvnode;        /* vnode being locked */
        void                    *request;       /* request data (first part) */
        struct address_space    *mapping;       /* Pages being written from */
        struct iov_iter         iter;           /* Buffer iterator */
@@ -122,7 +128,20 @@ struct afs_call {
                struct bio_vec  bvec[1];
        };
        void                    *buffer;        /* reply receive buffer */
-       void                    *reply[4];      /* Where to put the reply */
+       union {
+               long                    ret0;   /* Value to reply with instead of 0 */
+               struct afs_addr_list    *ret_alist;
+               struct afs_vldb_entry   *ret_vldb;
+               struct afs_acl          *ret_acl;
+       };
+       struct afs_fid          *out_fid;
+       struct afs_status_cb    *out_dir_scb;
+       struct afs_status_cb    *out_scb;
+       struct yfs_acl          *out_yacl;
+       struct afs_volsync      *out_volsync;
+       struct afs_volume_status *out_volstatus;
+       struct afs_read         *read_request;
+       unsigned int            server_index;
        pgoff_t                 first;          /* first page in mapping to deal with */
        pgoff_t                 last;           /* last page in mapping to deal with */
        atomic_t                usage;
@@ -131,10 +150,10 @@ struct afs_call {
        int                     error;          /* error code */
        u32                     abort_code;     /* Remote abort ID or 0 */
        u32                     epoch;
+       unsigned int            max_lifespan;   /* Maximum lifespan to set if not 0 */
        unsigned                request_size;   /* size of request data */
        unsigned                reply_max;      /* maximum size of reply */
        unsigned                first_offset;   /* offset into mapping[first] */
-       unsigned int            cb_break;       /* cb_break + cb_s_break before the call */
        union {
                unsigned        last_to;        /* amount of mapping[last] */
                unsigned        count2;         /* count used in unmarshalling */
@@ -145,9 +164,9 @@ struct afs_call {
        bool                    send_pages;     /* T if data from mapping should be sent */
        bool                    need_attention; /* T if RxRPC poked us */
        bool                    async;          /* T if asynchronous */
-       bool                    ret_reply0;     /* T if should return reply[0] on success */
        bool                    upgrade;        /* T to request service upgrade */
-       bool                    want_reply_time; /* T if want reply_time */
+       bool                    have_reply_time; /* T if have got reply_time */
+       bool                    intr;           /* T if interruptible */
        u16                     service_id;     /* Actual service ID (after upgrade) */
        unsigned int            debug_id;       /* Trace ID */
        u32                     operation_ID;   /* operation ID for an incoming call */
@@ -159,8 +178,6 @@ struct afs_call {
                } __attribute__((packed));
                __be64          tmp64;
        };
-       afs_dataversion_t       expected_version; /* Updated version expected from store */
-       afs_dataversion_t       expected_version_2; /* 2nd updated version expected from store */
        ktime_t                 reply_time;     /* Time of first reply packet */
 };
 
@@ -221,7 +238,8 @@ struct afs_read {
        unsigned int            index;          /* Which page we're reading into */
        unsigned int            nr_pages;
        unsigned int            offset;         /* offset into current page */
-       void (*page_done)(struct afs_call *, struct afs_read *);
+       struct afs_vnode        *vnode;
+       void (*page_done)(struct afs_read *);
        struct page             **pages;
        struct page             *array[];
 };
@@ -367,13 +385,13 @@ struct afs_cell {
        time64_t                last_inactive;  /* Time of last drop of usage count */
        atomic_t                usage;
        unsigned long           flags;
-#define AFS_CELL_FL_NOT_READY  0               /* The cell record is not ready for use */
-#define AFS_CELL_FL_NO_GC      1               /* The cell was added manually, don't auto-gc */
-#define AFS_CELL_FL_NOT_FOUND  2               /* Permanent DNS error */
-#define AFS_CELL_FL_DNS_FAIL   3               /* Failed to access DNS */
-#define AFS_CELL_FL_NO_LOOKUP_YET 4            /* Not completed first DNS lookup yet */
+#define AFS_CELL_FL_NO_GC      0               /* The cell was added manually, don't auto-gc */
+#define AFS_CELL_FL_DO_LOOKUP  1               /* DNS lookup requested */
        enum afs_cell_state     state;
        short                   error;
+       enum dns_record_source  dns_source:8;   /* Latest source of data from lookup */
+       enum dns_lookup_status  dns_status:8;   /* Latest status of data from lookup */
+       unsigned int            dns_lookup_count; /* Counter of DNS lookups */
 
        /* Active fileserver interaction state. */
        struct list_head        proc_volumes;   /* procfs volume list */
@@ -538,7 +556,10 @@ struct afs_server {
 struct afs_vol_interest {
        struct hlist_node       srv_link;       /* Link in server->cb_volumes */
        struct hlist_head       cb_interests;   /* List of callback interests on the server */
-       afs_volid_t             vid;            /* Volume ID to match */
+       union {
+               struct rcu_head rcu;
+               afs_volid_t     vid;            /* Volume ID to match */
+       };
        unsigned int            usage;
 };
 
@@ -550,7 +571,10 @@ struct afs_cb_interest {
        struct afs_vol_interest *vol_interest;
        struct afs_server       *server;        /* Server on which this interest resides */
        struct super_block      *sb;            /* Superblock on which inodes reside */
-       afs_volid_t             vid;            /* Volume ID to match */
+       union {
+               struct rcu_head rcu;
+               afs_volid_t     vid;            /* Volume ID to match */
+       };
        refcount_t              usage;
 };
 
@@ -660,15 +684,13 @@ struct afs_vnode {
        afs_lock_type_t         lock_type : 8;
 
        /* outstanding callback notification on this file */
-       struct afs_cb_interest  *cb_interest;   /* Server on which this resides */
+       struct afs_cb_interest __rcu *cb_interest; /* Server on which this resides */
        unsigned int            cb_s_break;     /* Mass break counter on ->server */
        unsigned int            cb_v_break;     /* Mass break counter on ->volume */
        unsigned int            cb_break;       /* Break counter on vnode */
        seqlock_t               cb_lock;        /* Lock for ->cb_interest, ->status, ->cb_*break */
 
        time64_t                cb_expires_at;  /* time at which callback expires */
-       unsigned                cb_version;     /* callback version */
-       afs_callback_type_t     cb_type;        /* type of callback */
 };
 
 static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
@@ -755,6 +777,7 @@ struct afs_vl_cursor {
  * Cursor for iterating over a set of fileservers.
  */
 struct afs_fs_cursor {
+       const struct afs_call_type *type;       /* Type of call done */
        struct afs_addr_cursor  ac;
        struct afs_vnode        *vnode;
        struct afs_server_list  *server_list;   /* Current server list (pins ref) */
@@ -772,6 +795,7 @@ struct afs_fs_cursor {
 #define AFS_FS_CURSOR_VNOVOL   0x0008          /* Set if seen VNOVOL */
 #define AFS_FS_CURSOR_CUR_ONLY 0x0010          /* Set if current server only (file lock held) */
 #define AFS_FS_CURSOR_NO_VSLEEP        0x0020          /* Set to prevent sleep on VBUSY, VOFFLINE, ... */
+#define AFS_FS_CURSOR_INTR     0x0040          /* Set if op is interruptible */
        unsigned short          nr_iterations;  /* Number of server iterations */
 };
 
@@ -882,7 +906,6 @@ extern const struct address_space_operations afs_dir_aops;
 extern const struct dentry_operations afs_fs_dentry_operations;
 
 extern void afs_d_release(struct dentry *);
-extern int afs_dir_remove_link(struct dentry *, struct key *, unsigned long, unsigned long);
 
 /*
  * dir_edit.c
@@ -940,50 +963,48 @@ extern int afs_flock(struct file *, int, struct file_lock *);
 /*
  * fsclient.c
  */
-#define AFS_VNODE_NOT_YET_SET  0x01
-#define AFS_VNODE_META_CHANGED 0x02
-#define AFS_VNODE_DATA_CHANGED 0x04
-extern void afs_update_inode_from_status(struct afs_vnode *, struct afs_file_status *,
-                                        const afs_dataversion_t *, u8);
-
-extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool);
+extern int afs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *,
+                                   struct afs_volsync *);
 extern int afs_fs_give_up_callbacks(struct afs_net *, struct afs_server *);
-extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
-extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t, u64,
-                        struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64);
-extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
-extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
-                         struct afs_fid *, struct afs_file_status *);
+extern int afs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *);
+extern int afs_fs_create(struct afs_fs_cursor *, const char *, umode_t,
+                        struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
+extern int afs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool,
+                        struct afs_status_cb *);
+extern int afs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *,
+                      struct afs_status_cb *, struct afs_status_cb *);
+extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
+                         struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
 extern int afs_fs_rename(struct afs_fs_cursor *, const char *,
-                        struct afs_vnode *, const char *, u64, u64);
+                        struct afs_vnode *, const char *,
+                        struct afs_status_cb *, struct afs_status_cb *);
 extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
-                            pgoff_t, pgoff_t, unsigned, unsigned);
-extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
+                            pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *);
+extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *);
 extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
-extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t);
-extern int afs_fs_extend_lock(struct afs_fs_cursor *);
-extern int afs_fs_release_lock(struct afs_fs_cursor *);
+extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *);
+extern int afs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int afs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *);
 extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
                                        struct afs_addr_cursor *, struct key *);
 extern struct afs_call *afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
                                                struct afs_addr_cursor *, struct key *,
                                                unsigned int);
 extern int afs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
-                                    struct afs_fid *, struct afs_file_status *,
-                                    struct afs_callback *, unsigned int,
-                                    struct afs_volsync *);
+                                    struct afs_fid *, struct afs_status_cb *,
+                                    unsigned int, struct afs_volsync *);
 extern int afs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
-                              struct afs_fid *, struct afs_file_status *,
-                              struct afs_callback *, struct afs_volsync *);
+                              struct afs_fid *, struct afs_status_cb *,
+                              struct afs_volsync *);
 
 struct afs_acl {
        u32     size;
        u8      data[];
 };
 
-extern struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *);
-extern int afs_fs_store_acl(struct afs_fs_cursor *, const struct afs_acl *);
+extern struct afs_acl *afs_fs_fetch_acl(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int afs_fs_store_acl(struct afs_fs_cursor *, const struct afs_acl *,
+                           struct afs_status_cb *);
 
 /*
  * fs_probe.c
@@ -995,15 +1016,20 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
 /*
  * inode.c
  */
-extern int afs_fetch_status(struct afs_vnode *, struct key *, bool);
+extern void afs_vnode_commit_status(struct afs_fs_cursor *,
+                                   struct afs_vnode *,
+                                   unsigned int,
+                                   const afs_dataversion_t *,
+                                   struct afs_status_cb *);
+extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
 extern int afs_iget5_test(struct inode *, void *);
 extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
 extern struct inode *afs_iget(struct super_block *, struct key *,
-                             struct afs_fid *, struct afs_file_status *,
-                             struct afs_callback *,
+                             struct afs_iget_data *, struct afs_status_cb *,
                              struct afs_cb_interest *,
                              struct afs_vnode *);
 extern void afs_zap_data(struct afs_vnode *);
+extern bool afs_check_validity(struct afs_vnode *);
 extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int afs_setattr(struct dentry *, struct iattr *);
@@ -1096,7 +1122,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {}
  * rotate.c
  */
 extern bool afs_begin_vnode_operation(struct afs_fs_cursor *, struct afs_vnode *,
-                                     struct key *);
+                                     struct key *, bool);
 extern bool afs_select_fileserver(struct afs_fs_cursor *);
 extern bool afs_select_current_fileserver(struct afs_fs_cursor *);
 extern int afs_end_vnode_operation(struct afs_fs_cursor *);
@@ -1121,6 +1147,12 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
 extern int afs_extract_data(struct afs_call *, bool);
 extern int afs_protocol_error(struct afs_call *, int, enum afs_eproto_cause);
 
+static inline void afs_set_fc_call(struct afs_call *call, struct afs_fs_cursor *fc)
+{
+       call->intr = fc->flags & AFS_FS_CURSOR_INTR;
+       fc->type = call->type;
+}
+
 static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
 {
        call->kvec[0].iov_base = buf;
@@ -1201,7 +1233,8 @@ static inline void afs_set_call_complete(struct afs_call *call,
  */
 extern void afs_put_permits(struct afs_permits *);
 extern void afs_clear_permits(struct afs_vnode *);
-extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int,
+                            struct afs_status_cb *);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
 extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *);
@@ -1327,7 +1360,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
                        struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
 extern int afs_fsync(struct file *, loff_t, loff_t, int);
 extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
@@ -1343,33 +1375,36 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t);
 /*
  * yfsclient.c
  */
-extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_volsync *, bool);
-extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_read *);
-extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, u64,
-                             struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, u64,
-                        struct afs_fid *, struct afs_file_status *, struct afs_callback *);
-extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
-extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool, u64);
-extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *, u64);
-extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *, u64,
-                         struct afs_fid *, struct afs_file_status *);
-extern int yfs_fs_rename(struct afs_fs_cursor *, const char *,
-                        struct afs_vnode *, const char *, u64, u64);
+extern int yfs_fs_fetch_file_status(struct afs_fs_cursor *, struct afs_status_cb *,
+                                   struct afs_volsync *);
+extern int yfs_fs_fetch_data(struct afs_fs_cursor *, struct afs_status_cb *, struct afs_read *);
+extern int yfs_fs_create_file(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *,
+                             struct afs_fid *, struct afs_status_cb *);
+extern int yfs_fs_make_dir(struct afs_fs_cursor *, const char *, umode_t, struct afs_status_cb *,
+                          struct afs_fid *, struct afs_status_cb *);
+extern int yfs_fs_remove_file2(struct afs_fs_cursor *, struct afs_vnode *, const char *,
+                              struct afs_status_cb *, struct afs_status_cb *);
+extern int yfs_fs_remove(struct afs_fs_cursor *, struct afs_vnode *, const char *, bool,
+                        struct afs_status_cb *);
+extern int yfs_fs_link(struct afs_fs_cursor *, struct afs_vnode *, const char *,
+                      struct afs_status_cb *, struct afs_status_cb *);
+extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
+                         struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
+extern int yfs_fs_rename(struct afs_fs_cursor *, const char *, struct afs_vnode *, const char *,
+                        struct afs_status_cb *, struct afs_status_cb *);
 extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
-                            pgoff_t, pgoff_t, unsigned, unsigned);
-extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *);
+                            pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *);
+extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *);
 extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
-extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t);
-extern int yfs_fs_extend_lock(struct afs_fs_cursor *);
-extern int yfs_fs_release_lock(struct afs_fs_cursor *);
+extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *);
+extern int yfs_fs_extend_lock(struct afs_fs_cursor *, struct afs_status_cb *);
+extern int yfs_fs_release_lock(struct afs_fs_cursor *, struct afs_status_cb *);
 extern int yfs_fs_fetch_status(struct afs_fs_cursor *, struct afs_net *,
-                              struct afs_fid *, struct afs_file_status *,
-                              struct afs_callback *, struct afs_volsync *);
+                              struct afs_fid *, struct afs_status_cb *,
+                              struct afs_volsync *);
 extern int yfs_fs_inline_bulk_status(struct afs_fs_cursor *, struct afs_net *,
-                                    struct afs_fid *, struct afs_file_status *,
-                                    struct afs_callback *, unsigned int,
-                                    struct afs_volsync *);
+                                    struct afs_fid *, struct afs_status_cb *,
+                                    unsigned int, struct afs_volsync *);
 
 struct yfs_acl {
        struct afs_acl  *acl;           /* Dir/file/symlink ACL */
@@ -1382,8 +1417,10 @@ struct yfs_acl {
 };
 
 extern void yfs_free_opaque_acl(struct yfs_acl *);
-extern struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *, unsigned int);
-extern int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *, const struct afs_acl *);
+extern struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *, struct yfs_acl *,
+                                              struct afs_status_cb *);
+extern int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *, const struct afs_acl *,
+                                   struct afs_status_cb *);
 
 /*
  * Miscellaneous inline functions.
@@ -1398,14 +1435,6 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
        return &vnode->vfs_inode;
 }
 
-static inline void afs_vnode_commit_status(struct afs_fs_cursor *fc,
-                                          struct afs_vnode *vnode,
-                                          unsigned int cb_break)
-{
-       if (fc->ac.error == 0)
-               afs_cache_permit(vnode, fc->key, cb_break);
-}
-
 static inline void afs_check_for_remote_deletion(struct afs_fs_cursor *fc,
                                                 struct afs_vnode *vnode)
 {
index be2ee3bbd0a953349ccba4a30eecbd2366b840c1..371501d28e08752a7e012361a53724fb41297c8a 100644 (file)
@@ -53,7 +53,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
        seq_printf(m, "%3u %6lld %2u %s\n",
                   atomic_read(&cell->usage),
                   cell->dns_expiry - ktime_get_real_seconds(),
-                  vllist ? vllist->nr_servers : 0,
+                  vllist->nr_servers,
                   cell->name);
        return 0;
 }
@@ -296,8 +296,8 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 
        if (v == SEQ_START_TOKEN) {
                seq_printf(m, "# source %s, status %s\n",
-                          dns_record_sources[vllist->source],
-                          dns_lookup_statuses[vllist->status]);
+                          dns_record_sources[vllist ? vllist->source : 0],
+                          dns_lookup_statuses[vllist ? vllist->status : 0]);
                return 0;
        }
 
@@ -336,7 +336,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
        if (pos == 0)
                return SEQ_START_TOKEN;
 
-       if (!vllist || pos - 1 >= vllist->nr_servers)
+       if (pos - 1 >= vllist->nr_servers)
                return NULL;
 
        return &vllist->servers[pos - 1];
index c3ae324781f846b8122b6b0a80085efaafdcaaba..b00c739e0e63aaf03d288ad94096e0ab075ee9b4 100644 (file)
@@ -25,7 +25,7 @@
  * them here also using the io_lock.
  */
 bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-                              struct key *key)
+                              struct key *key, bool intr)
 {
        memset(fc, 0, sizeof(*fc));
        fc->vnode = vnode;
@@ -33,10 +33,15 @@ bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode
        fc->ac.error = SHRT_MAX;
        fc->error = -EDESTADDRREQ;
 
-       if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
-               fc->error = -EINTR;
-               fc->flags |= AFS_FS_CURSOR_STOP;
-               return false;
+       if (intr) {
+               fc->flags |= AFS_FS_CURSOR_INTR;
+               if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
+                       fc->error = -EINTR;
+                       fc->flags |= AFS_FS_CURSOR_STOP;
+                       return false;
+               }
+       } else {
+               mutex_lock(&vnode->io_lock);
        }
 
        if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
@@ -61,7 +66,8 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
        fc->untried = (1UL << fc->server_list->nr_servers) - 1;
        fc->index = READ_ONCE(fc->server_list->preferred);
 
-       cbi = vnode->cb_interest;
+       cbi = rcu_dereference_protected(vnode->cb_interest,
+                                       lockdep_is_held(&vnode->io_lock));
        if (cbi) {
                /* See if the vnode's preferred record is still available */
                for (i = 0; i < fc->server_list->nr_servers; i++) {
@@ -82,8 +88,8 @@ static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
 
                /* Note that the callback promise is effectively broken */
                write_seqlock(&vnode->cb_lock);
-               ASSERTCMP(cbi, ==, vnode->cb_interest);
-               vnode->cb_interest = NULL;
+               ASSERTCMP(cbi, ==, rcu_access_pointer(vnode->cb_interest));
+               rcu_assign_pointer(vnode->cb_interest, NULL);
                if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
                        vnode->cb_break++;
                write_sequnlock(&vnode->cb_lock);
@@ -118,10 +124,14 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
  */
 static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
 {
-       msleep_interruptible(1000);
-       if (signal_pending(current)) {
-               fc->error = -ERESTARTSYS;
-               return false;
+       if (fc->flags & AFS_FS_CURSOR_INTR) {
+               msleep_interruptible(1000);
+               if (signal_pending(current)) {
+                       fc->error = -ERESTARTSYS;
+                       return false;
+               }
+       } else {
+               msleep(1000);
        }
 
        return true;
@@ -408,7 +418,9 @@ selected_server:
        if (error < 0)
                goto failed_set_error;
 
-       fc->cbi = afs_get_cb_interest(vnode->cb_interest);
+       fc->cbi = afs_get_cb_interest(
+               rcu_dereference_protected(vnode->cb_interest,
+                                         lockdep_is_held(&vnode->io_lock)));
 
        read_lock(&server->fs_lock);
        alist = rcu_dereference_protected(server->addresses,
@@ -459,6 +471,8 @@ no_more_servers:
                                     s->probe.abort_code);
        }
 
+       error = e.error;
+
 failed_set_error:
        fc->error = error;
 failed:
@@ -476,12 +490,15 @@ failed:
 bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
 {
        struct afs_vnode *vnode = fc->vnode;
-       struct afs_cb_interest *cbi = vnode->cb_interest;
+       struct afs_cb_interest *cbi;
        struct afs_addr_list *alist;
        int error = fc->ac.error;
 
        _enter("");
 
+       cbi = rcu_dereference_protected(vnode->cb_interest,
+                                       lockdep_is_held(&vnode->io_lock));
+
        switch (error) {
        case SHRT_MAX:
                if (!cbi) {
@@ -490,7 +507,7 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
                        return false;
                }
 
-               fc->cbi = afs_get_cb_interest(vnode->cb_interest);
+               fc->cbi = afs_get_cb_interest(cbi);
 
                read_lock(&cbi->server->fs_lock);
                alist = rcu_dereference_protected(cbi->server->addresses,
index a34a89c75c6ac6e75195c0b9f5675aff10008bbf..4fa5ce92b9b97339f0cb2094c73a0223f9dd4da0 100644 (file)
@@ -188,7 +188,7 @@ void afs_put_call(struct afs_call *call)
                if (call->type->destructor)
                        call->type->destructor(call);
 
-               afs_put_server(call->net, call->cm_server);
+               afs_put_server(call->net, call->server);
                afs_put_cb_interest(call->net, call->cbi);
                afs_put_addrlist(call->alist);
                kfree(call->request);
@@ -417,6 +417,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
                                          afs_wake_up_async_call :
                                          afs_wake_up_call_waiter),
                                         call->upgrade,
+                                        call->intr,
                                         call->debug_id);
        if (IS_ERR(rxcall)) {
                ret = PTR_ERR(rxcall);
@@ -426,6 +427,10 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
 
        call->rxcall = rxcall;
 
+       if (call->max_lifespan)
+               rxrpc_kernel_set_max_life(call->net->socket, rxcall,
+                                         call->max_lifespan);
+
        /* send the request */
        iov[0].iov_base = call->request;
        iov[0].iov_len  = call->request_size;
@@ -529,11 +534,11 @@ static void afs_deliver_to_call(struct afs_call *call)
                        return;
                }
 
-               if (call->want_reply_time &&
+               if (!call->have_reply_time &&
                    rxrpc_kernel_get_reply_time(call->net->socket,
                                                call->rxcall,
                                                &call->reply_time))
-                       call->want_reply_time = false;
+                       call->have_reply_time = true;
 
                ret = call->type->deliver(call);
                state = READ_ONCE(call->state);
@@ -648,7 +653,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
                        break;
                }
 
-               if (timeout == 0 &&
+               if (call->intr && timeout == 0 &&
                    life == last_life && signal_pending(current)) {
                        if (stalled)
                                break;
@@ -691,10 +696,9 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
        ret = ac->error;
        switch (ret) {
        case 0:
-               if (call->ret_reply0) {
-                       ret = (long)call->reply[0];
-                       call->reply[0] = NULL;
-               }
+               ret = call->ret0;
+               call->ret0 = 0;
+
                /* Fall through */
        case -ECONNABORTED:
                ac->responded = true;
index 5f58a9a17e694a09dbe0d0b70d9dbc0cc9833aa4..5d8ece98561e6cde9eea417a155fa7be1c5ca07b 100644 (file)
@@ -87,11 +87,9 @@ void afs_clear_permits(struct afs_vnode *vnode)
        permits = rcu_dereference_protected(vnode->permit_cache,
                                            lockdep_is_held(&vnode->lock));
        RCU_INIT_POINTER(vnode->permit_cache, NULL);
-       vnode->cb_break++;
        spin_unlock(&vnode->lock);
 
-       if (permits)
-               afs_put_permits(permits);
+       afs_put_permits(permits);
 }
 
 /*
@@ -118,10 +116,10 @@ static void afs_hash_permits(struct afs_permits *permits)
  * as the ACL *may* have changed.
  */
 void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
-                     unsigned int cb_break)
+                     unsigned int cb_break, struct afs_status_cb *scb)
 {
        struct afs_permits *permits, *xpermits, *replacement, *zap, *new = NULL;
-       afs_access_t caller_access = READ_ONCE(vnode->status.caller_access);
+       afs_access_t caller_access = scb->status.caller_access;
        size_t size = 0;
        bool changed = false;
        int i, j;
@@ -148,7 +146,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
                                }
 
                                if (afs_cb_is_broken(cb_break, vnode,
-                                                    vnode->cb_interest)) {
+                                                    rcu_dereference(vnode->cb_interest))) {
                                        changed = true;
                                        break;
                                }
@@ -178,7 +176,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key,
                }
        }
 
-       if (afs_cb_is_broken(cb_break, vnode, vnode->cb_interest))
+       if (afs_cb_is_broken(cb_break, vnode, rcu_dereference(vnode->cb_interest)))
                goto someone_else_changed_it;
 
        /* We need a ref on any permits list we want to copy as we'll have to
@@ -255,14 +253,16 @@ found:
 
        kfree(new);
 
+       rcu_read_lock();
        spin_lock(&vnode->lock);
        zap = rcu_access_pointer(vnode->permit_cache);
-       if (!afs_cb_is_broken(cb_break, vnode, vnode->cb_interest) &&
+       if (!afs_cb_is_broken(cb_break, vnode, rcu_dereference(vnode->cb_interest)) &&
            zap == permits)
                rcu_assign_pointer(vnode->permit_cache, replacement);
        else
                zap = replacement;
        spin_unlock(&vnode->lock);
+       rcu_read_unlock();
        afs_put_permits(zap);
 out_put:
        afs_put_permits(permits);
@@ -322,13 +322,12 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
                 */
                _debug("no valid permit");
 
-               ret = afs_fetch_status(vnode, key, false);
+               ret = afs_fetch_status(vnode, key, false, _access);
                if (ret < 0) {
                        *_access = 0;
                        _leave(" = %d", ret);
                        return ret;
                }
-               *_access = vnode->status.caller_access;
        }
 
        _leave(" = 0 [access %x]", *_access);
index 65b33b6da48b9411c8385a27869785d5076713b1..52c170b59cfdca4202d58c243cd7a3b986f06d61 100644 (file)
@@ -521,8 +521,15 @@ static noinline bool afs_update_server_record(struct afs_fs_cursor *fc, struct a
        alist = afs_vl_lookup_addrs(fc->vnode->volume->cell, fc->key,
                                    &server->uuid);
        if (IS_ERR(alist)) {
-               fc->ac.error = PTR_ERR(alist);
-               _leave(" = f [%d]", fc->ac.error);
+               if ((PTR_ERR(alist) == -ERESTARTSYS ||
+                    PTR_ERR(alist) == -EINTR) &&
+                   !(fc->flags & AFS_FS_CURSOR_INTR) &&
+                   server->addresses) {
+                       _leave(" = t [intr]");
+                       return true;
+               }
+               fc->error = PTR_ERR(alist);
+               _leave(" = f [%d]", fc->error);
                return false;
        }
 
@@ -574,7 +581,11 @@ retry:
        ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
                          TASK_INTERRUPTIBLE);
        if (ret == -ERESTARTSYS) {
-               fc->ac.error = ret;
+               if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) {
+                       _leave(" = t [intr]");
+                       return true;
+               }
+               fc->error = ret;
                _leave(" = f [intr]");
                return false;
        }
index 783c68cd1a3587de5474ddaa67aff6a6d783e41c..f18911e8d77035f4fe757c03343eb8ce06b5e13c 100644 (file)
@@ -426,7 +426,7 @@ static int afs_set_super(struct super_block *sb, struct fs_context *fc)
 static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
 {
        struct afs_super_info *as = AFS_FS_S(sb);
-       struct afs_fid fid;
+       struct afs_iget_data iget_data;
        struct inode *inode = NULL;
        int ret;
 
@@ -451,11 +451,13 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
        } else {
                sprintf(sb->s_id, "%llu", as->volume->vid);
                afs_activate_volume(as->volume);
-               fid.vid         = as->volume->vid;
-               fid.vnode       = 1;
-               fid.vnode_hi    = 0;
-               fid.unique      = 1;
-               inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL, NULL);
+               iget_data.fid.vid       = as->volume->vid;
+               iget_data.fid.vnode     = 1;
+               iget_data.fid.vnode_hi  = 0;
+               iget_data.fid.unique    = 1;
+               iget_data.cb_v_break    = as->volume->cb_v_break;
+               iget_data.cb_s_break    = 0;
+               inode = afs_iget(sb, ctx->key, &iget_data, NULL, NULL, NULL);
        }
 
        if (IS_ERR(inode))
@@ -677,13 +679,12 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
        vnode->volume           = NULL;
        vnode->lock_key         = NULL;
        vnode->permit_cache     = NULL;
-       vnode->cb_interest      = NULL;
+       RCU_INIT_POINTER(vnode->cb_interest, NULL);
 #ifdef CONFIG_AFS_FSCACHE
        vnode->cache            = NULL;
 #endif
 
        vnode->flags            = 1 << AFS_VNODE_UNSET;
-       vnode->cb_type          = 0;
        vnode->lock_state       = AFS_VNODE_LOCK_NONE;
 
        init_rwsem(&vnode->rmdir_lock);
@@ -708,7 +709,7 @@ static void afs_destroy_inode(struct inode *inode)
 
        _debug("DESTROY INODE %p", inode);
 
-       ASSERTCMP(vnode->cb_interest, ==, NULL);
+       ASSERTCMP(rcu_access_pointer(vnode->cb_interest), ==, NULL);
 
        atomic_dec(&afs_count_active_inodes);
 }
@@ -741,7 +742,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
                return PTR_ERR(key);
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
                fc.flags |= AFS_FS_CURSOR_NO_VSLEEP;
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
@@ -749,7 +750,6 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
                }
 
                afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
                ret = afs_end_vnode_operation(&fc);
        }
 
index b4f1a84519b952dfe65295aea24cae30d2b9aa9e..61e25010ff335c2be12c6ff85e582d353b6eac95 100644 (file)
@@ -232,18 +232,16 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
                if (bs.status > NR__dns_lookup_status)
                        bs.status = NR__dns_lookup_status;
 
+               /* See if we can update an old server record */
                server = NULL;
-               if (previous) {
-                       /* See if we can update an old server record */
-                       for (i = 0; i < previous->nr_servers; i++) {
-                               struct afs_vlserver *p = previous->servers[i].server;
-
-                               if (p->name_len == bs.name_len &&
-                                   p->port == bs.port &&
-                                   strncasecmp(b, p->name, bs.name_len) == 0) {
-                                       server = afs_get_vlserver(p);
-                                       break;
-                               }
+               for (i = 0; i < previous->nr_servers; i++) {
+                       struct afs_vlserver *p = previous->servers[i].server;
+
+                       if (p->name_len == bs.name_len &&
+                           p->port == bs.port &&
+                           strncasecmp(b, p->name, bs.name_len) == 0) {
+                               server = afs_get_vlserver(p);
+                               break;
                        }
                }
 
index b05e0de04f422a1f7f8122e245cc3cb2a8958776..beb991563939aebbb799ba852b2d0411929cd131 100644 (file)
@@ -33,8 +33,8 @@ static bool afs_vl_probe_done(struct afs_vlserver *server)
 void afs_vlserver_probe_result(struct afs_call *call)
 {
        struct afs_addr_list *alist = call->alist;
-       struct afs_vlserver *server = call->reply[0];
-       unsigned int server_index = (long)call->reply[1];
+       struct afs_vlserver *server = call->vlserver;
+       unsigned int server_index = call->server_index;
        unsigned int index = call->addr_ix;
        unsigned int rtt = UINT_MAX;
        bool have_result = false;
index 7adde83a06482b56c555c18c3629c335a2315397..3f845489a9f0eba96ae86b657d58170b1ff92df0 100644 (file)
@@ -43,11 +43,29 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cel
 static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
 {
        struct afs_cell *cell = vc->cell;
+       unsigned int dns_lookup_count;
+
+       if (cell->dns_source == DNS_RECORD_UNAVAILABLE ||
+           cell->dns_expiry <= ktime_get_real_seconds()) {
+               dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
+               set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
+               queue_work(afs_wq, &cell->manager);
+
+               if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
+                       if (wait_var_event_interruptible(
+                                   &cell->dns_lookup_count,
+                                   smp_load_acquire(&cell->dns_lookup_count)
+                                   != dns_lookup_count) < 0) {
+                               vc->error = -ERESTARTSYS;
+                               return false;
+                       }
+               }
 
-       if (wait_on_bit(&cell->flags, AFS_CELL_FL_NO_LOOKUP_YET,
-                       TASK_INTERRUPTIBLE)) {
-               vc->error = -ERESTARTSYS;
-               return false;
+               /* Status load is ordered after lookup counter load */
+               if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
+                       vc->error = -EDESTADDRREQ;
+                       return false;
+               }
        }
 
        read_lock(&cell->vl_servers_lock);
@@ -55,7 +73,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
                rcu_dereference_protected(cell->vl_servers,
                                          lockdep_is_held(&cell->vl_servers_lock)));
        read_unlock(&cell->vl_servers_lock);
-       if (!vc->server_list || !vc->server_list->nr_servers)
+       if (!vc->server_list->nr_servers)
                return false;
 
        vc->untried = (1UL << vc->server_list->nr_servers) - 1;
index dd9ba4e96fb3ecc14d2fe4552fe209a09e6b2abc..3d4b9836a2e2f0ef0f5eb13f1ce49b90e2605284 100644 (file)
@@ -34,7 +34,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        uvldb = call->buffer;
-       entry = call->reply[0];
+       entry = call->ret_vldb;
 
        nr_servers = ntohl(uvldb->nServers);
        if (nr_servers > AFS_NMAXNSERVERS)
@@ -110,7 +110,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
 
 static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
 {
-       kfree(call->reply[0]);
+       kfree(call->ret_vldb);
        afs_flat_call_destructor(call);
 }
 
@@ -155,8 +155,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
        }
 
        call->key = vc->key;
-       call->reply[0] = entry;
-       call->ret_reply0 = true;
+       call->ret_vldb = entry;
+       call->max_lifespan = AFS_VL_MAX_LIFESPAN;
 
        /* Marshall the parameters */
        bp = call->request;
@@ -214,7 +214,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
                if (!alist)
                        return -ENOMEM;
                alist->version = uniquifier;
-               call->reply[0] = alist;
+               call->ret_alist = alist;
                call->count = count;
                call->count2 = nentries;
                call->unmarshall++;
@@ -229,7 +229,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               alist = call->reply[0];
+               alist = call->ret_alist;
                bp = call->buffer;
                count = min(call->count, 4U);
                for (i = 0; i < count; i++)
@@ -249,8 +249,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
 
 static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
 {
-       afs_put_server(call->net, (struct afs_server *)call->reply[0]);
-       kfree(call->reply[1]);
+       afs_put_addrlist(call->ret_alist);
        return afs_flat_call_destructor(call);
 }
 
@@ -287,8 +286,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
                return ERR_PTR(-ENOMEM);
 
        call->key = vc->key;
-       call->reply[0] = NULL;
-       call->ret_reply0 = true;
+       call->ret_alist = NULL;
+       call->max_lifespan = AFS_VL_MAX_LIFESPAN;
 
        /* Marshall the parameters */
        bp = call->request;
@@ -358,9 +357,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
 
 static void afs_destroy_vl_get_capabilities(struct afs_call *call)
 {
-       struct afs_vlserver *server = call->reply[0];
-
-       afs_put_vlserver(call->net, server);
+       afs_put_vlserver(call->net, call->vlserver);
        afs_flat_call_destructor(call);
 }
 
@@ -398,11 +395,11 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
                return ERR_PTR(-ENOMEM);
 
        call->key = key;
-       call->reply[0] = afs_get_vlserver(server);
-       call->reply[1] = (void *)(long)server_index;
+       call->vlserver = afs_get_vlserver(server);
+       call->server_index = server_index;
        call->upgrade = true;
-       call->want_reply_time = true;
        call->async = true;
+       call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
 
        /* marshall the parameters */
        bp = call->request;
@@ -460,7 +457,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
                if (!alist)
                        return -ENOMEM;
                alist->version = uniquifier;
-               call->reply[0] = alist;
+               call->ret_alist = alist;
 
                if (call->count == 0)
                        goto extract_volendpoints;
@@ -488,7 +485,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               alist = call->reply[0];
+               alist = call->ret_alist;
                bp = call->buffer;
                switch (call->count2) {
                case YFS_ENDPOINT_IPV4:
@@ -609,7 +606,6 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
                break;
        }
 
-       alist = call->reply[0];
        _leave(" = 0 [done]");
        return 0;
 }
@@ -644,8 +640,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
                return ERR_PTR(-ENOMEM);
 
        call->key = vc->key;
-       call->reply[0] = NULL;
-       call->ret_reply0 = true;
+       call->ret_alist = NULL;
+       call->max_lifespan = AFS_VL_MAX_LIFESPAN;
 
        /* Marshall the parameters */
        bp = call->request;
index 0122d7445fba1e07eaf62b4be1d1e69c66e5f7c4..8bcab95f11273aeeb94e0347b9696960e0f1138a 100644 (file)
@@ -313,6 +313,46 @@ static void afs_redirty_pages(struct writeback_control *wbc,
        _leave("");
 }
 
+/*
+ * completion of write to server
+ */
+static void afs_pages_written_back(struct afs_vnode *vnode,
+                                  pgoff_t first, pgoff_t last)
+{
+       struct pagevec pv;
+       unsigned long priv;
+       unsigned count, loop;
+
+       _enter("{%llx:%llu},{%lx-%lx}",
+              vnode->fid.vid, vnode->fid.vnode, first, last);
+
+       pagevec_init(&pv);
+
+       do {
+               _debug("done %lx-%lx", first, last);
+
+               count = last - first + 1;
+               if (count > PAGEVEC_SIZE)
+                       count = PAGEVEC_SIZE;
+               pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
+                                             first, count, pv.pages);
+               ASSERTCMP(pv.nr, ==, count);
+
+               for (loop = 0; loop < count; loop++) {
+                       priv = page_private(pv.pages[loop]);
+                       trace_afs_page_dirty(vnode, tracepoint_string("clear"),
+                                            pv.pages[loop]->index, priv);
+                       set_page_private(pv.pages[loop], 0);
+                       end_page_writeback(pv.pages[loop]);
+               }
+               first += count;
+               __pagevec_release(&pv);
+       } while (first <= last);
+
+       afs_prune_wb_keys(vnode);
+       _leave("");
+}
+
 /*
  * write to a file
  */
@@ -322,6 +362,7 @@ static int afs_store_data(struct address_space *mapping,
 {
        struct afs_vnode *vnode = AFS_FS_I(mapping->host);
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_wb_key *wbk = NULL;
        struct list_head *p;
        int ret = -ENOKEY, ret2;
@@ -333,6 +374,10 @@ static int afs_store_data(struct address_space *mapping,
               vnode->fid.unique,
               first, last, offset, to);
 
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+       if (!scb)
+               return -ENOMEM;
+
        spin_lock(&vnode->wb_lock);
        p = vnode->wb_keys.next;
 
@@ -351,6 +396,7 @@ try_next_key:
 
        spin_unlock(&vnode->wb_lock);
        afs_put_wb_key(wbk);
+       kfree(scb);
        _leave(" = %d [no keys]", ret);
        return ret;
 
@@ -361,14 +407,19 @@ found_key:
        _debug("USE WB KEY %u", key_serial(wbk->key));
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, wbk->key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, wbk->key, false)) {
+               afs_dataversion_t data_version = vnode->status.data_version + 1;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_store_data(&fc, mapping, first, last, offset, to);
+                       afs_fs_store_data(&fc, mapping, first, last, offset, to, scb);
                }
 
-               afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_check_for_remote_deletion(&fc, vnode);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
+               if (fc.ac.error == 0)
+                       afs_pages_written_back(vnode, first, last);
                ret = afs_end_vnode_operation(&fc);
        }
 
@@ -393,6 +444,7 @@ found_key:
        }
 
        afs_put_wb_key(wbk);
+       kfree(scb);
        _leave(" = %d", ret);
        return ret;
 }
@@ -678,46 +730,6 @@ int afs_writepages(struct address_space *mapping,
        return ret;
 }
 
-/*
- * completion of write to server
- */
-void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
-{
-       struct pagevec pv;
-       unsigned long priv;
-       unsigned count, loop;
-       pgoff_t first = call->first, last = call->last;
-
-       _enter("{%llx:%llu},{%lx-%lx}",
-              vnode->fid.vid, vnode->fid.vnode, first, last);
-
-       pagevec_init(&pv);
-
-       do {
-               _debug("done %lx-%lx", first, last);
-
-               count = last - first + 1;
-               if (count > PAGEVEC_SIZE)
-                       count = PAGEVEC_SIZE;
-               pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
-                                             first, count, pv.pages);
-               ASSERTCMP(pv.nr, ==, count);
-
-               for (loop = 0; loop < count; loop++) {
-                       priv = page_private(pv.pages[loop]);
-                       trace_afs_page_dirty(vnode, tracepoint_string("clear"),
-                                            pv.pages[loop]->index, priv);
-                       set_page_private(pv.pages[loop], 0);
-                       end_page_writeback(pv.pages[loop]);
-               }
-               first += count;
-               __pagevec_release(&pv);
-       } while (first <= last);
-
-       afs_prune_wb_keys(vnode);
-       _leave("");
-}
-
 /*
  * write to an AFS file
  */
index c81f85003fc7dbe012d3c1efe1910a102e6476ac..17f58fea7ec1f0f8d82811142597c0fa0520e821 100644 (file)
@@ -47,40 +47,52 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler,
                             void *buffer, size_t size)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *vnode = AFS_FS_I(inode);
        struct afs_acl *acl = NULL;
        struct key *key;
-       int ret;
+       int ret = -ENOMEM;
+
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+       if (!scb)
+               goto error;
 
        key = afs_request_key(vnode->volume->cell);
-       if (IS_ERR(key))
-               return PTR_ERR(key);
+       if (IS_ERR(key)) {
+               ret = PTR_ERR(key);
+               goto error_scb;
+       }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       acl = afs_fs_fetch_acl(&fc);
+                       acl = afs_fs_fetch_acl(&fc, scb);
                }
 
                afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
        if (ret == 0) {
                ret = acl->size;
                if (size > 0) {
-                       ret = -ERANGE;
-                       if (acl->size > size)
-                               return -ERANGE;
-                       memcpy(buffer, acl->data, acl->size);
-                       ret = acl->size;
+                       if (acl->size <= size)
+                               memcpy(buffer, acl->data, acl->size);
+                       else
+                               ret = -ERANGE;
                }
                kfree(acl);
        }
 
        key_put(key);
+error_scb:
+       kfree(scb);
+error:
        return ret;
 }
 
@@ -93,41 +105,53 @@ static int afs_xattr_set_acl(const struct xattr_handler *handler,
                              const void *buffer, size_t size, int flags)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *vnode = AFS_FS_I(inode);
        struct afs_acl *acl = NULL;
        struct key *key;
-       int ret;
+       int ret = -ENOMEM;
 
        if (flags == XATTR_CREATE)
                return -EINVAL;
 
-       key = afs_request_key(vnode->volume->cell);
-       if (IS_ERR(key))
-               return PTR_ERR(key);
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+       if (!scb)
+               goto error;
 
        acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL);
-       if (!acl) {
-               key_put(key);
-               return -ENOMEM;
+       if (!acl)
+               goto error_scb;
+
+       key = afs_request_key(vnode->volume->cell);
+       if (IS_ERR(key)) {
+               ret = PTR_ERR(key);
+               goto error_acl;
        }
 
        acl->size = size;
        memcpy(acl->data, buffer, size);
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       afs_fs_store_acl(&fc, acl);
+                       afs_fs_store_acl(&fc, acl, scb);
                }
 
                afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
-       kfree(acl);
        key_put(key);
+error_acl:
+       kfree(acl);
+error_scb:
+       kfree(scb);
+error:
        return ret;
 }
 
@@ -146,12 +170,12 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
                             void *buffer, size_t size)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *vnode = AFS_FS_I(inode);
        struct yfs_acl *yacl = NULL;
        struct key *key;
-       unsigned int flags = 0;
        char buf[16], *data;
-       int which = 0, dsize, ret;
+       int which = 0, dsize, ret = -ENOMEM;
 
        if (strcmp(name, "acl") == 0)
                which = 0;
@@ -164,65 +188,81 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
        else
                return -EOPNOTSUPP;
 
+       yacl = kzalloc(sizeof(struct yfs_acl), GFP_KERNEL);
+       if (!yacl)
+               goto error;
+
        if (which == 0)
-               flags |= YFS_ACL_WANT_ACL;
+               yacl->flags |= YFS_ACL_WANT_ACL;
        else if (which == 3)
-               flags |= YFS_ACL_WANT_VOL_ACL;
+               yacl->flags |= YFS_ACL_WANT_VOL_ACL;
+
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+       if (!scb)
+               goto error_yacl;
 
        key = afs_request_key(vnode->volume->cell);
-       if (IS_ERR(key))
-               return PTR_ERR(key);
+       if (IS_ERR(key)) {
+               ret = PTR_ERR(key);
+               goto error_scb;
+       }
 
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       yacl = yfs_fs_fetch_opaque_acl(&fc, flags);
+                       yfs_fs_fetch_opaque_acl(&fc, yacl, scb);
                }
 
                afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
-       if (ret == 0) {
-               switch (which) {
-               case 0:
-                       data = yacl->acl->data;
-                       dsize = yacl->acl->size;
-                       break;
-               case 1:
-                       data = buf;
-                       dsize = snprintf(buf, sizeof(buf), "%u",
-                                        yacl->inherit_flag);
-                       break;
-               case 2:
-                       data = buf;
-                       dsize = snprintf(buf, sizeof(buf), "%u",
-                                        yacl->num_cleaned);
-                       break;
-               case 3:
-                       data = yacl->vol_acl->data;
-                       dsize = yacl->vol_acl->size;
-                       break;
-               default:
-                       ret = -EOPNOTSUPP;
-                       goto out;
-               }
+       if (ret < 0)
+               goto error_key;
+
+       switch (which) {
+       case 0:
+               data = yacl->acl->data;
+               dsize = yacl->acl->size;
+               break;
+       case 1:
+               data = buf;
+               dsize = snprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
+               break;
+       case 2:
+               data = buf;
+               dsize = snprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
+               break;
+       case 3:
+               data = yacl->vol_acl->data;
+               dsize = yacl->vol_acl->size;
+               break;
+       default:
+               ret = -EOPNOTSUPP;
+               goto error_key;
+       }
 
-               ret = dsize;
-               if (size > 0) {
-                       if (dsize > size) {
-                               ret = -ERANGE;
-                               goto out;
-                       }
-                       memcpy(buffer, data, dsize);
+       ret = dsize;
+       if (size > 0) {
+               if (dsize > size) {
+                       ret = -ERANGE;
+                       goto error_key;
                }
+               memcpy(buffer, data, dsize);
        }
 
-out:
-       yfs_free_opaque_acl(yacl);
+error_key:
        key_put(key);
+error_scb:
+       kfree(scb);
+error_yacl:
+       yfs_free_opaque_acl(yacl);
+error:
        return ret;
 }
 
@@ -235,42 +275,54 @@ static int afs_xattr_set_yfs(const struct xattr_handler *handler,
                              const void *buffer, size_t size, int flags)
 {
        struct afs_fs_cursor fc;
+       struct afs_status_cb *scb;
        struct afs_vnode *vnode = AFS_FS_I(inode);
        struct afs_acl *acl = NULL;
        struct key *key;
-       int ret;
+       int ret = -ENOMEM;
 
        if (flags == XATTR_CREATE ||
            strcmp(name, "acl") != 0)
                return -EINVAL;
 
-       key = afs_request_key(vnode->volume->cell);
-       if (IS_ERR(key))
-               return PTR_ERR(key);
+       scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
+       if (!scb)
+               goto error;
 
        acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL);
-       if (!acl) {
-               key_put(key);
-               return -ENOMEM;
-       }
+       if (!acl)
+               goto error_scb;
 
        acl->size = size;
        memcpy(acl->data, buffer, size);
 
+       key = afs_request_key(vnode->volume->cell);
+       if (IS_ERR(key)) {
+               ret = PTR_ERR(key);
+               goto error_acl;
+       }
+
        ret = -ERESTARTSYS;
-       if (afs_begin_vnode_operation(&fc, vnode, key)) {
+       if (afs_begin_vnode_operation(&fc, vnode, key, true)) {
+               afs_dataversion_t data_version = vnode->status.data_version;
+
                while (afs_select_fileserver(&fc)) {
                        fc.cb_break = afs_calc_vnode_cb_break(vnode);
-                       yfs_fs_store_opaque_acl2(&fc, acl);
+                       yfs_fs_store_opaque_acl2(&fc, acl, scb);
                }
 
                afs_check_for_remote_deletion(&fc, fc.vnode);
-               afs_vnode_commit_status(&fc, vnode, fc.cb_break);
+               afs_vnode_commit_status(&fc, vnode, fc.cb_break,
+                                       &data_version, scb);
                ret = afs_end_vnode_operation(&fc);
        }
 
+error_acl:
        kfree(acl);
        key_put(key);
+error_scb:
+       kfree(scb);
+error:
        return ret;
 }
 
index 6cf7d161baa1ecd3d069ecf1d30b214e8d6f8c37..10de675dc6fcaa0839a997bc1636434787cde0ee 100644 (file)
@@ -183,24 +183,19 @@ static void xdr_dump_bad(const __be32 *bp)
 /*
  * Decode a YFSFetchStatus block
  */
-static int xdr_decode_YFSFetchStatus(struct afs_call *call,
-                                    const __be32 **_bp,
-                                    struct afs_file_status *status,
-                                    struct afs_vnode *vnode,
-                                    const afs_dataversion_t *expected_version,
-                                    struct afs_read *read_req)
+static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
+                                    struct afs_call *call,
+                                    struct afs_status_cb *scb)
 {
        const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
+       struct afs_file_status *status = &scb->status;
        u32 type;
-       u8 flags = 0;
 
        status->abort_code = ntohl(xdr->abort_code);
        if (status->abort_code != 0) {
-               if (vnode && status->abort_code == VNOVNODE) {
-                       set_bit(AFS_VNODE_DELETED, &vnode->flags);
+               if (status->abort_code == VNOVNODE)
                        status->nlink = 0;
-                       __afs_break_callback(vnode);
-               }
+               scb->have_error = true;
                return 0;
        }
 
@@ -209,77 +204,28 @@ static int xdr_decode_YFSFetchStatus(struct afs_call *call,
        case AFS_FTYPE_FILE:
        case AFS_FTYPE_DIR:
        case AFS_FTYPE_SYMLINK:
-               if (type != status->type &&
-                   vnode &&
-                   !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
-                       pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
-                                  vnode->fid.vid,
-                                  vnode->fid.vnode,
-                                  vnode->fid.unique,
-                                  status->type, type);
-                       goto bad;
-               }
                status->type = type;
                break;
        default:
                goto bad;
        }
 
-#define EXTRACT_M4(FIELD)                                      \
-       do {                                                    \
-               u32 x = ntohl(xdr->FIELD);                      \
-               if (status->FIELD != x) {                       \
-                       flags |= AFS_VNODE_META_CHANGED;        \
-                       status->FIELD = x;                      \
-               }                                               \
-       } while (0)
-
-#define EXTRACT_M8(FIELD)                                      \
-       do {                                                    \
-               u64 x = xdr_to_u64(xdr->FIELD);                 \
-               if (status->FIELD != x) {                       \
-                       flags |= AFS_VNODE_META_CHANGED;        \
-                       status->FIELD = x;                      \
-               }                                               \
-       } while (0)
-
-#define EXTRACT_D8(FIELD)                                      \
-       do {                                                    \
-               u64 x = xdr_to_u64(xdr->FIELD);                 \
-               if (status->FIELD != x) {                       \
-                       flags |= AFS_VNODE_DATA_CHANGED;        \
-                       status->FIELD = x;                      \
-               }                                               \
-       } while (0)
-
-       EXTRACT_M4(nlink);
-       EXTRACT_D8(size);
-       EXTRACT_D8(data_version);
-       EXTRACT_M8(author);
-       EXTRACT_M8(owner);
-       EXTRACT_M8(group);
-       EXTRACT_M4(mode);
-       EXTRACT_M4(caller_access); /* call ticket dependent */
-       EXTRACT_M4(anon_access);
-
-       status->mtime_client = xdr_to_time(xdr->mtime_client);
-       status->mtime_server = xdr_to_time(xdr->mtime_server);
-       status->lock_count   = ntohl(xdr->lock_count);
-
-       if (read_req) {
-               read_req->data_version = status->data_version;
-               read_req->file_size = status->size;
-       }
+       status->nlink           = ntohl(xdr->nlink);
+       status->author          = xdr_to_u64(xdr->author);
+       status->owner           = xdr_to_u64(xdr->owner);
+       status->caller_access   = ntohl(xdr->caller_access); /* Ticket dependent */
+       status->anon_access     = ntohl(xdr->anon_access);
+       status->mode            = ntohl(xdr->mode) & S_IALLUGO;
+       status->group           = xdr_to_u64(xdr->group);
+       status->lock_count      = ntohl(xdr->lock_count);
+
+       status->mtime_client    = xdr_to_time(xdr->mtime_client);
+       status->mtime_server    = xdr_to_time(xdr->mtime_server);
+       status->size            = xdr_to_u64(xdr->size);
+       status->data_version    = xdr_to_u64(xdr->data_version);
+       scb->have_status        = true;
 
        *_bp += xdr_size(xdr);
-
-       if (vnode) {
-               if (test_bit(AFS_VNODE_UNSET, &vnode->flags))
-                       flags |= AFS_VNODE_NOT_YET_SET;
-               afs_update_inode_from_status(vnode, status, expected_version,
-                                            flags);
-       }
-
        return 0;
 
 bad:
@@ -287,74 +233,21 @@ bad:
        return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
 }
 
-/*
- * Decode the file status.  We need to lock the target vnode if we're going to
- * update its status so that stat() sees the attributes update atomically.
- */
-static int yfs_decode_status(struct afs_call *call,
-                            const __be32 **_bp,
-                            struct afs_file_status *status,
-                            struct afs_vnode *vnode,
-                            const afs_dataversion_t *expected_version,
-                            struct afs_read *read_req)
-{
-       int ret;
-
-       if (!vnode)
-               return xdr_decode_YFSFetchStatus(call, _bp, status, vnode,
-                                                expected_version, read_req);
-
-       write_seqlock(&vnode->cb_lock);
-       ret = xdr_decode_YFSFetchStatus(call, _bp, status, vnode,
-                                       expected_version, read_req);
-       write_sequnlock(&vnode->cb_lock);
-       return ret;
-}
-
 /*
  * Decode a YFSCallBack block
  */
-static void xdr_decode_YFSCallBack(struct afs_call *call,
-                                  struct afs_vnode *vnode,
-                                  const __be32 **_bp)
-{
-       struct yfs_xdr_YFSCallBack *xdr = (void *)*_bp;
-       struct afs_cb_interest *old, *cbi = call->cbi;
-       u64 cb_expiry;
-
-       write_seqlock(&vnode->cb_lock);
-
-       if (!afs_cb_is_broken(call->cb_break, vnode, cbi)) {
-               cb_expiry = xdr_to_u64(xdr->expiration_time);
-               do_div(cb_expiry, 10 * 1000 * 1000);
-               vnode->cb_version       = ntohl(xdr->version);
-               vnode->cb_type          = ntohl(xdr->type);
-               vnode->cb_expires_at    = cb_expiry + ktime_get_real_seconds();
-               old = vnode->cb_interest;
-               if (old != call->cbi) {
-                       vnode->cb_interest = cbi;
-                       cbi = old;
-               }
-               set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
-       }
-
-       write_sequnlock(&vnode->cb_lock);
-       call->cbi = cbi;
-       *_bp += xdr_size(xdr);
-}
-
-static void xdr_decode_YFSCallBack_raw(const __be32 **_bp,
-                                      struct afs_callback *cb)
+static void xdr_decode_YFSCallBack(const __be32 **_bp,
+                                  struct afs_call *call,
+                                  struct afs_status_cb *scb)
 {
        struct yfs_xdr_YFSCallBack *x = (void *)*_bp;
-       u64 cb_expiry;
-
-       cb_expiry = xdr_to_u64(x->expiration_time);
-       do_div(cb_expiry, 10 * 1000 * 1000);
-       cb->version     = ntohl(x->version);
-       cb->type        = ntohl(x->type);
-       cb->expires_at  = cb_expiry + ktime_get_real_seconds();
+       struct afs_callback *cb = &scb->callback;
+       ktime_t cb_expiry;
 
+       cb_expiry = call->reply_time;
+       cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+       cb->expires_at  = ktime_divns(cb_expiry, NSEC_PER_SEC);
+       scb->have_cb    = true;
        *_bp += xdr_size(x);
 }
 
@@ -442,11 +335,10 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp,
 }
 
 /*
- * deliver reply data to an FS.FetchStatus
+ * Deliver a reply that's a status, callback and volsync.
  */
-static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call)
+static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -454,16 +346,36 @@ static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_YFSCallBack(call, vnode, &bp);
-       xdr_decode_YFSVolSync(&bp, call->reply[1]);
+       xdr_decode_YFSCallBack(&bp, call, call->out_scb);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+       _leave(" = 0 [done]");
+       return 0;
+}
+
+/*
+ * Deliver reply data to operations that just return a file status and a volume
+ * sync record.
+ */
+static int yfs_deliver_status_and_volsync(struct afs_call *call)
+{
+       const __be32 *bp;
+       int ret;
+
+       ret = afs_transfer_reply(call);
+       if (ret < 0)
+               return ret;
+
+       bp = call->buffer;
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+       if (ret < 0)
+               return ret;
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -475,15 +387,15 @@ static int yfs_deliver_fs_fetch_status_vnode(struct afs_call *call)
 static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = {
        .name           = "YFS.FetchStatus(vnode)",
        .op             = yfs_FS_FetchStatus,
-       .deliver        = yfs_deliver_fs_fetch_status_vnode,
+       .deliver        = yfs_deliver_fs_status_cb_and_volsync,
        .destructor     = afs_flat_call_destructor,
 };
 
 /*
  * Fetch the status information for a file.
  */
-int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsync,
-                            bool new_inode)
+int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
+                            struct afs_volsync *volsync)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -505,9 +417,8 @@ int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
        }
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = volsync;
-       call->expected_version = new_inode ? 1 : vnode->status.data_version;
+       call->out_scb = scb;
+       call->out_volsync = volsync;
 
        /* marshall the parameters */
        bp = call->request;
@@ -516,9 +427,9 @@ int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
        bp = xdr_encode_YFSFid(bp, &vnode->fid);
        yfs_check_req(call, bp);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -528,8 +439,7 @@ int yfs_fs_fetch_file_status(struct afs_fs_cursor *fc, struct afs_volsync *volsy
  */
 static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
-       struct afs_read *req = call->reply[2];
+       struct afs_read *req = call->read_request;
        const __be32 *bp;
        unsigned int size;
        int ret;
@@ -586,7 +496,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
                if (req->offset == PAGE_SIZE) {
                        req->offset = 0;
                        if (req->page_done)
-                               req->page_done(call, req);
+                               req->page_done(req);
                        req->index++;
                        if (req->remain > 0)
                                goto begin_page;
@@ -623,12 +533,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                                       &vnode->status.data_version, req);
+               ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
                if (ret < 0)
                        return ret;
-               xdr_decode_YFSCallBack(call, vnode, &bp);
-               xdr_decode_YFSVolSync(&bp, call->reply[1]);
+               xdr_decode_YFSCallBack(&bp, call, call->out_scb);
+               xdr_decode_YFSVolSync(&bp, call->out_volsync);
+
+               req->data_version = call->out_scb->status.data_version;
+               req->file_size = call->out_scb->status.size;
 
                call->unmarshall++;
 
@@ -642,7 +554,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
                        zero_user_segment(req->pages[req->index],
                                          req->offset, PAGE_SIZE);
                if (req->page_done)
-                       req->page_done(call, req);
+                       req->page_done(req);
                req->offset = 0;
        }
 
@@ -652,9 +564,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 
 static void yfs_fetch_data_destructor(struct afs_call *call)
 {
-       struct afs_read *req = call->reply[2];
-
-       afs_put_read(req);
+       afs_put_read(call->read_request);
        afs_flat_call_destructor(call);
 }
 
@@ -671,7 +581,8 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = {
 /*
  * Fetch data from a file.
  */
-int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
+int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
+                     struct afs_read *req)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -693,11 +604,9 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = NULL; /* volsync */
-       call->reply[2] = req;
-       call->expected_version = vnode->status.data_version;
-       call->want_reply_time = true;
+       call->out_scb = scb;
+       call->out_volsync = NULL;
+       call->read_request = req;
 
        /* marshall the parameters */
        bp = call->request;
@@ -709,9 +618,9 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
        yfs_check_req(call, bp);
 
        refcount_inc(&req->usage);
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -721,7 +630,6 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_read *req)
  */
 static int yfs_deliver_fs_create_vnode(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -733,16 +641,15 @@ static int yfs_deliver_fs_create_vnode(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       xdr_decode_YFSFid(&bp, call->reply[1]);
-       ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+       xdr_decode_YFSFid(&bp, call->out_fid);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_YFSCallBack_raw(&bp, call->reply[3]);
-       xdr_decode_YFSVolSync(&bp, NULL);
+       xdr_decode_YFSCallBack(&bp, call, call->out_scb);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -764,14 +671,13 @@ static const struct afs_call_type afs_RXFSCreateFile = {
 int yfs_fs_create_file(struct afs_fs_cursor *fc,
                       const char *name,
                       umode_t mode,
-                      u64 current_data_version,
+                      struct afs_status_cb *dvnode_scb,
                       struct afs_fid *newfid,
-                      struct afs_file_status *newstatus,
-                      struct afs_callback *newcb)
+                      struct afs_status_cb *new_scb)
 {
-       struct afs_vnode *vnode = fc->vnode;
+       struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
-       struct afs_net *net = afs_v2net(vnode);
+       struct afs_net *net = afs_v2net(dvnode);
        size_t namesz, reqsz, rplsz;
        __be32 *bp;
 
@@ -795,24 +701,23 @@ int yfs_fs_create_file(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = newfid;
-       call->reply[2] = newstatus;
-       call->reply[3] = newcb;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_fid = newfid;
+       call->out_scb = new_scb;
 
        /* marshall the parameters */
        bp = call->request;
        bp = xdr_encode_u32(bp, YFSCREATEFILE);
        bp = xdr_encode_u32(bp, 0); /* RPC flags */
-       bp = xdr_encode_YFSFid(bp, &vnode->fid);
+       bp = xdr_encode_YFSFid(bp, &dvnode->fid);
        bp = xdr_encode_string(bp, name, namesz);
        bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
        bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
        yfs_check_req(call, bp);
 
        afs_use_fs_server(call, fc->cbi);
-       trace_afs_make_fs_call1(call, &vnode->fid, name);
+       trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -830,14 +735,13 @@ static const struct afs_call_type yfs_RXFSMakeDir = {
 int yfs_fs_make_dir(struct afs_fs_cursor *fc,
                    const char *name,
                    umode_t mode,
-                   u64 current_data_version,
+                   struct afs_status_cb *dvnode_scb,
                    struct afs_fid *newfid,
-                   struct afs_file_status *newstatus,
-                   struct afs_callback *newcb)
+                   struct afs_status_cb *new_scb)
 {
-       struct afs_vnode *vnode = fc->vnode;
+       struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
-       struct afs_net *net = afs_v2net(vnode);
+       struct afs_net *net = afs_v2net(dvnode);
        size_t namesz, reqsz, rplsz;
        __be32 *bp;
 
@@ -860,23 +764,22 @@ int yfs_fs_make_dir(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = newfid;
-       call->reply[2] = newstatus;
-       call->reply[3] = newcb;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_fid = newfid;
+       call->out_scb = new_scb;
 
        /* marshall the parameters */
        bp = call->request;
        bp = xdr_encode_u32(bp, YFSMAKEDIR);
        bp = xdr_encode_u32(bp, 0); /* RPC flags */
-       bp = xdr_encode_YFSFid(bp, &vnode->fid);
+       bp = xdr_encode_YFSFid(bp, &dvnode->fid);
        bp = xdr_encode_string(bp, name, namesz);
        bp = xdr_encode_YFSStoreStatus_mode(bp, mode);
        yfs_check_req(call, bp);
 
        afs_use_fs_server(call, fc->cbi);
-       trace_afs_make_fs_call1(call, &vnode->fid, name);
+       trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -886,8 +789,6 @@ int yfs_fs_make_dir(struct afs_fs_cursor *fc,
  */
 static int yfs_deliver_fs_remove_file2(struct afs_call *call)
 {
-       struct afs_vnode *dvnode = call->reply[0];
-       struct afs_vnode *vnode = call->reply[1];
        struct afs_fid fid;
        const __be32 *bp;
        int ret;
@@ -898,20 +799,18 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
 
        xdr_decode_YFSFid(&bp, &fid);
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
        /* Was deleted if vnode->status.abort_code == VNOVNODE. */
 
-       xdr_decode_YFSVolSync(&bp, NULL);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
        return 0;
 }
 
@@ -929,7 +828,8 @@ static const struct afs_call_type yfs_RXYFSRemoveFile2 = {
  * Remove a file and retrieve new file status.
  */
 int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-                       const char *name, u64 current_data_version)
+                       const char *name, struct afs_status_cb *dvnode_scb,
+                       struct afs_status_cb *vnode_scb)
 {
        struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
@@ -954,9 +854,8 @@ int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = dvnode;
-       call->reply[1] = vnode;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_scb = vnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -968,6 +867,7 @@ int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -977,7 +877,6 @@ int yfs_fs_remove_file2(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  */
 static int yfs_deliver_fs_remove(struct afs_call *call)
 {
-       struct afs_vnode *dvnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -987,14 +886,12 @@ static int yfs_deliver_fs_remove(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
 
-       xdr_decode_YFSVolSync(&bp, NULL);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
        return 0;
 }
 
@@ -1019,7 +916,8 @@ static const struct afs_call_type yfs_RXYFSRemoveDir = {
  * remove a file or directory
  */
 int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-                 const char *name, bool isdir, u64 current_data_version)
+                 const char *name, bool isdir,
+                 struct afs_status_cb *dvnode_scb)
 {
        struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
@@ -1042,9 +940,7 @@ int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = dvnode;
-       call->reply[1] = vnode;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1056,6 +952,7 @@ int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1065,7 +962,6 @@ int yfs_fs_remove(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  */
 static int yfs_deliver_fs_link(struct afs_call *call)
 {
-       struct afs_vnode *dvnode = call->reply[0], *vnode = call->reply[1];
        const __be32 *bp;
        int ret;
 
@@ -1075,16 +971,14 @@ static int yfs_deliver_fs_link(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode, NULL, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       ret = yfs_decode_status(call, &bp, &dvnode->status, dvnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_YFSVolSync(&bp, NULL);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
        _leave(" = 0 [done]");
        return 0;
 }
@@ -1103,7 +997,9 @@ static const struct afs_call_type yfs_RXYFSLink = {
  * Make a hard link.
  */
 int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
-               const char *name, u64 current_data_version)
+               const char *name,
+               struct afs_status_cb *dvnode_scb,
+               struct afs_status_cb *vnode_scb)
 {
        struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
@@ -1127,9 +1023,8 @@ int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = dvnode;
-       call->reply[1] = vnode;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_scb = vnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1142,6 +1037,7 @@ int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call1(call, &vnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1151,7 +1047,6 @@ int yfs_fs_link(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  */
 static int yfs_deliver_fs_symlink(struct afs_call *call)
 {
-       struct afs_vnode *vnode = call->reply[0];
        const __be32 *bp;
        int ret;
 
@@ -1163,15 +1058,14 @@ static int yfs_deliver_fs_symlink(struct afs_call *call)
 
        /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       xdr_decode_YFSFid(&bp, call->reply[1]);
-       ret = yfs_decode_status(call, &bp, call->reply[2], NULL, NULL, NULL);
+       xdr_decode_YFSFid(&bp, call->out_fid);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
        if (ret < 0)
                return ret;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       xdr_decode_YFSVolSync(&bp, NULL);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
        _leave(" = 0 [done]");
        return 0;
@@ -1193,9 +1087,9 @@ static const struct afs_call_type yfs_RXYFSSymlink = {
 int yfs_fs_symlink(struct afs_fs_cursor *fc,
                   const char *name,
                   const char *contents,
-                  u64 current_data_version,
+                  struct afs_status_cb *dvnode_scb,
                   struct afs_fid *newfid,
-                  struct afs_file_status *newstatus)
+                  struct afs_status_cb *vnode_scb)
 {
        struct afs_vnode *dvnode = fc->vnode;
        struct afs_call *call;
@@ -1222,10 +1116,9 @@ int yfs_fs_symlink(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = dvnode;
-       call->reply[1] = newfid;
-       call->reply[2] = newstatus;
-       call->expected_version = current_data_version + 1;
+       call->out_dir_scb = dvnode_scb;
+       call->out_fid = newfid;
+       call->out_scb = vnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1239,6 +1132,7 @@ int yfs_fs_symlink(struct afs_fs_cursor *fc,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call1(call, &dvnode->fid, name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1248,8 +1142,6 @@ int yfs_fs_symlink(struct afs_fs_cursor *fc,
  */
 static int yfs_deliver_fs_rename(struct afs_call *call)
 {
-       struct afs_vnode *orig_dvnode = call->reply[0];
-       struct afs_vnode *new_dvnode = call->reply[1];
        const __be32 *bp;
        int ret;
 
@@ -1259,20 +1151,17 @@ static int yfs_deliver_fs_rename(struct afs_call *call)
        if (ret < 0)
                return ret;
 
-       /* unmarshall the reply once we've received all of it */
        bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &orig_dvnode->status, orig_dvnode,
-                               &call->expected_version, NULL);
+       ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
        if (ret < 0)
                return ret;
-       if (new_dvnode != orig_dvnode) {
-               ret = yfs_decode_status(call, &bp, &new_dvnode->status, new_dvnode,
-                                       &call->expected_version_2, NULL);
+       if (call->out_dir_scb != call->out_scb) {
+               ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
                if (ret < 0)
                        return ret;
        }
 
-       xdr_decode_YFSVolSync(&bp, NULL);
+       xdr_decode_YFSVolSync(&bp, call->out_volsync);
        _leave(" = 0 [done]");
        return 0;
 }
@@ -1294,8 +1183,8 @@ int yfs_fs_rename(struct afs_fs_cursor *fc,
                  const char *orig_name,
                  struct afs_vnode *new_dvnode,
                  const char *new_name,
-                 u64 current_orig_data_version,
-                 u64 current_new_data_version)
+                 struct afs_status_cb *orig_dvnode_scb,
+                 struct afs_status_cb *new_dvnode_scb)
 {
        struct afs_vnode *orig_dvnode = fc->vnode;
        struct afs_call *call;
@@ -1321,10 +1210,8 @@ int yfs_fs_rename(struct afs_fs_cursor *fc,
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = orig_dvnode;
-       call->reply[1] = new_dvnode;
-       call->expected_version = current_orig_data_version + 1;
-       call->expected_version_2 = current_new_data_version + 1;
+       call->out_dir_scb = orig_dvnode_scb;
+       call->out_scb = new_dvnode_scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1338,46 +1225,18 @@ int yfs_fs_rename(struct afs_fs_cursor *fc,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call2(call, &orig_dvnode->fid, orig_name, new_name);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
-/*
- * Deliver reply data to a YFS.StoreData64 operation.
- */
-static int yfs_deliver_fs_store_data(struct afs_call *call)
-{
-       struct afs_vnode *vnode = call->reply[0];
-       const __be32 *bp;
-       int ret;
-
-       _enter("");
-
-       ret = afs_transfer_reply(call);
-       if (ret < 0)
-               return ret;
-
-       /* unmarshall the reply once we've received all of it */
-       bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
-       if (ret < 0)
-               return ret;
-       xdr_decode_YFSVolSync(&bp, NULL);
-
-       afs_pages_written_back(vnode, call);
-
-       _leave(" = 0 [done]");
-       return 0;
-}
-
 /*
  * YFS.StoreData64 operation type.
  */
 static const struct afs_call_type yfs_RXYFSStoreData64 = {
        .name           = "YFS.StoreData64",
        .op             = yfs_FS_StoreData64,
-       .deliver        = yfs_deliver_fs_store_data,
+       .deliver        = yfs_deliver_status_and_volsync,
        .destructor     = afs_flat_call_destructor,
 };
 
@@ -1386,7 +1245,8 @@ static const struct afs_call_type yfs_RXYFSStoreData64 = {
  */
 int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
                      pgoff_t first, pgoff_t last,
-                     unsigned offset, unsigned to)
+                     unsigned offset, unsigned to,
+                     struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1424,13 +1284,12 @@ int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 
        call->key = fc->key;
        call->mapping = mapping;
-       call->reply[0] = vnode;
        call->first = first;
        call->last = last;
        call->first_offset = offset;
        call->last_to = to;
        call->send_pages = true;
-       call->expected_version = vnode->status.data_version + 1;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1445,51 +1304,25 @@ int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
-/*
- * deliver reply data to an FS.StoreStatus
- */
-static int yfs_deliver_fs_store_status(struct afs_call *call)
-{
-       struct afs_vnode *vnode = call->reply[0];
-       const __be32 *bp;
-       int ret;
-
-       _enter("");
-
-       ret = afs_transfer_reply(call);
-       if (ret < 0)
-               return ret;
-
-       /* unmarshall the reply once we've received all of it */
-       bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
-       if (ret < 0)
-               return ret;
-       xdr_decode_YFSVolSync(&bp, NULL);
-
-       _leave(" = 0 [done]");
-       return 0;
-}
-
 /*
  * YFS.StoreStatus operation type
  */
 static const struct afs_call_type yfs_RXYFSStoreStatus = {
        .name           = "YFS.StoreStatus",
        .op             = yfs_FS_StoreStatus,
-       .deliver        = yfs_deliver_fs_store_status,
+       .deliver        = yfs_deliver_status_and_volsync,
        .destructor     = afs_flat_call_destructor,
 };
 
 static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = {
        .name           = "YFS.StoreData64",
        .op             = yfs_FS_StoreData64,
-       .deliver        = yfs_deliver_fs_store_status,
+       .deliver        = yfs_deliver_status_and_volsync,
        .destructor     = afs_flat_call_destructor,
 };
 
@@ -1497,7 +1330,8 @@ static const struct afs_call_type yfs_RXYFSStoreData64_as_Status = {
  * Set the attributes on a file, using YFS.StoreData64 rather than
  * YFS.StoreStatus so as to alter the file size also.
  */
-static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
+static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr,
+                              struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1518,8 +1352,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->expected_version = vnode->status.data_version + 1;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1534,6 +1367,7 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1542,7 +1376,8 @@ static int yfs_fs_setattr_size(struct afs_fs_cursor *fc, struct iattr *attr)
  * Set the attributes on a file, using YFS.StoreData64 if there's a change in
  * file size, and YFS.StoreStatus otherwise.
  */
-int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
+int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr,
+                  struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1550,7 +1385,7 @@ int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
        __be32 *bp;
 
        if (attr->ia_valid & ATTR_SIZE)
-               return yfs_fs_setattr_size(fc, attr);
+               return yfs_fs_setattr_size(fc, attr, scb);
 
        _enter(",%x,{%llx:%llu},,",
               key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);
@@ -1565,8 +1400,7 @@ int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->expected_version = vnode->status.data_version;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1578,6 +1412,7 @@ int yfs_fs_setattr(struct afs_fs_cursor *fc, struct iattr *attr)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1607,7 +1442,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               xdr_decode_YFSFetchVolumeStatus(&bp, call->reply[1]);
+               xdr_decode_YFSFetchVolumeStatus(&bp, call->out_volstatus);
                call->unmarshall++;
                afs_extract_to_tmp(call);
 
@@ -1623,7 +1458,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                        return afs_protocol_error(call, -EBADMSG,
                                                  afs_eproto_volname_len);
                size = (call->count + 3) & ~3; /* It's padded */
-               afs_extract_begin(call, call->reply[2], size);
+               afs_extract_to_buf(call, size);
                call->unmarshall++;
 
                /* Fall through - and extract the volume name */
@@ -1633,7 +1468,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               p = call->reply[2];
+               p = call->buffer;
                p[call->count] = 0;
                _debug("volname '%s'", p);
                afs_extract_to_tmp(call);
@@ -1651,7 +1486,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                        return afs_protocol_error(call, -EBADMSG,
                                                  afs_eproto_offline_msg_len);
                size = (call->count + 3) & ~3; /* It's padded */
-               afs_extract_begin(call, call->reply[2], size);
+               afs_extract_to_buf(call, size);
                call->unmarshall++;
 
                /* Fall through - and extract the offline message */
@@ -1661,7 +1496,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               p = call->reply[2];
+               p = call->buffer;
                p[call->count] = 0;
                _debug("offline '%s'", p);
 
@@ -1680,7 +1515,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                        return afs_protocol_error(call, -EBADMSG,
                                                  afs_eproto_motd_len);
                size = (call->count + 3) & ~3; /* It's padded */
-               afs_extract_begin(call, call->reply[2], size);
+               afs_extract_to_buf(call, size);
                call->unmarshall++;
 
                /* Fall through - and extract the message of the day */
@@ -1690,7 +1525,7 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
                if (ret < 0)
                        return ret;
 
-               p = call->reply[2];
+               p = call->buffer;
                p[call->count] = 0;
                _debug("motd '%s'", p);
 
@@ -1705,16 +1540,6 @@ static int yfs_deliver_fs_get_volume_status(struct afs_call *call)
        return 0;
 }
 
-/*
- * Destroy a YFS.GetVolumeStatus call.
- */
-static void yfs_get_volume_status_call_destructor(struct afs_call *call)
-{
-       kfree(call->reply[2]);
-       call->reply[2] = NULL;
-       afs_flat_call_destructor(call);
-}
-
 /*
  * YFS.GetVolumeStatus operation type
  */
@@ -1722,7 +1547,7 @@ static const struct afs_call_type yfs_RXYFSGetVolumeStatus = {
        .name           = "YFS.GetVolumeStatus",
        .op             = yfs_FS_GetVolumeStatus,
        .deliver        = yfs_deliver_fs_get_volume_status,
-       .destructor     = yfs_get_volume_status_call_destructor,
+       .destructor     = afs_flat_call_destructor,
 };
 
 /*
@@ -1735,28 +1560,21 @@ int yfs_fs_get_volume_status(struct afs_fs_cursor *fc,
        struct afs_call *call;
        struct afs_net *net = afs_v2net(vnode);
        __be32 *bp;
-       void *tmpbuf;
 
        _enter("");
 
-       tmpbuf = kmalloc(AFSOPAQUEMAX, GFP_KERNEL);
-       if (!tmpbuf)
-               return -ENOMEM;
-
        call = afs_alloc_flat_call(net, &yfs_RXYFSGetVolumeStatus,
                                   sizeof(__be32) * 2 +
                                   sizeof(struct yfs_xdr_u64),
-                                  sizeof(struct yfs_xdr_YFSFetchVolumeStatus) +
-                                  sizeof(__be32));
-       if (!call) {
-               kfree(tmpbuf);
+                                  max_t(size_t,
+                                        sizeof(struct yfs_xdr_YFSFetchVolumeStatus) +
+                                        sizeof(__be32),
+                                        AFSOPAQUEMAX + 1));
+       if (!call)
                return -ENOMEM;
-       }
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[1] = vs;
-       call->reply[2] = tmpbuf;
+       call->out_volstatus = vs;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1767,38 +1585,11 @@ int yfs_fs_get_volume_status(struct afs_fs_cursor *fc,
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
-/*
- * Deliver reply data to operations that just return a file status and a volume
- * sync record.
- */
-static int yfs_deliver_status_and_volsync(struct afs_call *call)
-{
-       struct afs_vnode *vnode = call->reply[0];
-       const __be32 *bp;
-       int ret;
-
-       _enter("{%u}", call->unmarshall);
-
-       ret = afs_transfer_reply(call);
-       if (ret < 0)
-               return ret;
-
-       /* unmarshall the reply once we've received all of it */
-       bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                               &call->expected_version, NULL);
-       if (ret < 0)
-               return ret;
-       xdr_decode_YFSVolSync(&bp, NULL);
-
-       _leave(" = 0 [done]");
-       return 0;
-}
-
 /*
  * YFS.SetLock operation type
  */
@@ -1834,7 +1625,8 @@ static const struct afs_call_type yfs_RXYFSReleaseLock = {
 /*
  * Set a lock on a file
  */
-int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
+int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type,
+                   struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1853,8 +1645,8 @@ int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->want_reply_time = true;
+       call->lvnode = vnode;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1866,6 +1658,7 @@ int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_calli(call, &vnode->fid, type);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1873,7 +1666,7 @@ int yfs_fs_set_lock(struct afs_fs_cursor *fc, afs_lock_type_t type)
 /*
  * extend a lock on a file
  */
-int yfs_fs_extend_lock(struct afs_fs_cursor *fc)
+int yfs_fs_extend_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1891,8 +1684,8 @@ int yfs_fs_extend_lock(struct afs_fs_cursor *fc)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->want_reply_time = true;
+       call->lvnode = vnode;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1903,6 +1696,7 @@ int yfs_fs_extend_lock(struct afs_fs_cursor *fc)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -1910,7 +1704,7 @@ int yfs_fs_extend_lock(struct afs_fs_cursor *fc)
 /*
  * release a lock on a file
  */
-int yfs_fs_release_lock(struct afs_fs_cursor *fc)
+int yfs_fs_release_lock(struct afs_fs_cursor *fc, struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -1928,7 +1722,8 @@ int yfs_fs_release_lock(struct afs_fs_cursor *fc)
                return -ENOMEM;
 
        call->key = fc->key;
-       call->reply[0] = vnode;
+       call->lvnode = vnode;
+       call->out_scb = scb;
 
        /* marshall the parameters */
        bp = call->request;
@@ -1939,48 +1734,18 @@ int yfs_fs_release_lock(struct afs_fs_cursor *fc)
 
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
 
-/*
- * Deliver reply data to an FS.FetchStatus with no vnode.
- */
-static int yfs_deliver_fs_fetch_status(struct afs_call *call)
-{
-       struct afs_file_status *status = call->reply[1];
-       struct afs_callback *callback = call->reply[2];
-       struct afs_volsync *volsync = call->reply[3];
-       struct afs_vnode *vnode = call->reply[0];
-       const __be32 *bp;
-       int ret;
-
-       ret = afs_transfer_reply(call);
-       if (ret < 0)
-               return ret;
-
-       _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-
-       /* unmarshall the reply once we've received all of it */
-       bp = call->buffer;
-       ret = yfs_decode_status(call, &bp, status, vnode,
-                               &call->expected_version, NULL);
-       if (ret < 0)
-               return ret;
-       xdr_decode_YFSCallBack_raw(&bp, callback);
-       xdr_decode_YFSVolSync(&bp, volsync);
-
-       _leave(" = 0 [done]");
-       return 0;
-}
-
 /*
  * YFS.FetchStatus operation type
  */
 static const struct afs_call_type yfs_RXYFSFetchStatus = {
        .name           = "YFS.FetchStatus",
        .op             = yfs_FS_FetchStatus,
-       .deliver        = yfs_deliver_fs_fetch_status,
+       .deliver        = yfs_deliver_fs_status_cb_and_volsync,
        .destructor     = afs_flat_call_destructor,
 };
 
@@ -1990,8 +1755,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = {
 int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
                        struct afs_net *net,
                        struct afs_fid *fid,
-                       struct afs_file_status *status,
-                       struct afs_callback *callback,
+                       struct afs_status_cb *scb,
                        struct afs_volsync *volsync)
 {
        struct afs_call *call;
@@ -2012,11 +1776,8 @@ int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
        }
 
        call->key = fc->key;
-       call->reply[0] = NULL; /* vnode for fid[0] */
-       call->reply[1] = status;
-       call->reply[2] = callback;
-       call->reply[3] = volsync;
-       call->expected_version = 1; /* vnode->status.data_version */
+       call->out_scb = scb;
+       call->out_volsync = volsync;
 
        /* marshall the parameters */
        bp = call->request;
@@ -2025,9 +1786,9 @@ int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
        bp = xdr_encode_YFSFid(bp, fid);
        yfs_check_req(call, bp);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, fid);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -2037,9 +1798,7 @@ int yfs_fs_fetch_status(struct afs_fs_cursor *fc,
  */
 static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 {
-       struct afs_file_status *statuses;
-       struct afs_callback *callbacks;
-       struct afs_vnode *vnode = call->reply[0];
+       struct afs_status_cb *scb;
        const __be32 *bp;
        u32 tmp;
        int ret;
@@ -2078,10 +1837,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               statuses = call->reply[1];
-               ret = yfs_decode_status(call, &bp, &statuses[call->count],
-                                       call->count == 0 ? vnode : NULL,
-                                       NULL, NULL);
+               scb = &call->out_scb[call->count];
+               ret = xdr_decode_YFSFetchStatus(&bp, call, scb);
                if (ret < 0)
                        return ret;
 
@@ -2120,13 +1877,8 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
 
                _debug("unmarshall CB array");
                bp = call->buffer;
-               callbacks = call->reply[2];
-               xdr_decode_YFSCallBack_raw(&bp, &callbacks[call->count]);
-               statuses = call->reply[1];
-               if (call->count == 0 && vnode && statuses[0].abort_code == 0) {
-                       bp = call->buffer;
-                       xdr_decode_YFSCallBack(call, vnode, &bp);
-               }
+               scb = &call->out_scb[call->count];
+               xdr_decode_YFSCallBack(&bp, call, scb);
                call->count++;
                if (call->count < call->count2)
                        goto more_cbs;
@@ -2141,7 +1893,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
                        return ret;
 
                bp = call->buffer;
-               xdr_decode_YFSVolSync(&bp, call->reply[3]);
+               xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
                call->unmarshall++;
 
@@ -2170,8 +1922,7 @@ static const struct afs_call_type yfs_RXYFSInlineBulkStatus = {
 int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
                              struct afs_net *net,
                              struct afs_fid *fids,
-                             struct afs_file_status *statuses,
-                             struct afs_callback *callbacks,
+                             struct afs_status_cb *statuses,
                              unsigned int nr_fids,
                              struct afs_volsync *volsync)
 {
@@ -2194,10 +1945,8 @@ int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
        }
 
        call->key = fc->key;
-       call->reply[0] = NULL; /* vnode for fid[0] */
-       call->reply[1] = statuses;
-       call->reply[2] = callbacks;
-       call->reply[3] = volsync;
+       call->out_scb = statuses;
+       call->out_volsync = volsync;
        call->count2 = nr_fids;
 
        /* marshall the parameters */
@@ -2209,9 +1958,9 @@ int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
                bp = xdr_encode_YFSFid(bp, &fids[i]);
        yfs_check_req(call, bp);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &fids[0]);
+       afs_set_fc_call(call, fc);
        afs_make_call(&fc->ac, call, GFP_NOFS);
        return afs_wait_for_call_to_complete(call, &fc->ac);
 }
@@ -2221,9 +1970,7 @@ int yfs_fs_inline_bulk_status(struct afs_fs_cursor *fc,
  */
 static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
 {
-       struct afs_volsync *volsync = call->reply[2];
-       struct afs_vnode *vnode = call->reply[1];
-       struct yfs_acl *yacl =  call->reply[0];
+       struct yfs_acl *yacl = call->out_yacl;
        struct afs_acl *acl;
        const __be32 *bp;
        unsigned int size;
@@ -2308,11 +2055,10 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
                bp = call->buffer;
                yacl->inherit_flag = ntohl(*bp++);
                yacl->num_cleaned = ntohl(*bp++);
-               ret = yfs_decode_status(call, &bp, &vnode->status, vnode,
-                                       &call->expected_version, NULL);
+               ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
                if (ret < 0)
                        return ret;
-               xdr_decode_YFSVolSync(&bp, volsync);
+               xdr_decode_YFSVolSync(&bp, call->out_volsync);
 
                call->unmarshall++;
 
@@ -2333,12 +2079,6 @@ void yfs_free_opaque_acl(struct yfs_acl *yacl)
        }
 }
 
-static void yfs_destroy_fs_fetch_opaque_acl(struct afs_call *call)
-{
-       yfs_free_opaque_acl(call->reply[0]);
-       afs_flat_call_destructor(call);
-}
-
 /*
  * YFS.FetchOpaqueACL operation type
  */
@@ -2346,18 +2086,18 @@ static const struct afs_call_type yfs_RXYFSFetchOpaqueACL = {
        .name           = "YFS.FetchOpaqueACL",
        .op             = yfs_FS_FetchOpaqueACL,
        .deliver        = yfs_deliver_fs_fetch_opaque_acl,
-       .destructor     = yfs_destroy_fs_fetch_opaque_acl,
+       .destructor     = afs_flat_call_destructor,
 };
 
 /*
  * Fetch the YFS advanced ACLs for a file.
  */
 struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *fc,
-                                       unsigned int flags)
+                                       struct yfs_acl *yacl,
+                                       struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
-       struct yfs_acl *yacl;
        struct afs_net *net = afs_v2net(vnode);
        __be32 *bp;
 
@@ -2370,19 +2110,15 @@ struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *fc,
                                   sizeof(__be32) * 2 +
                                   sizeof(struct yfs_xdr_YFSFetchStatus) +
                                   sizeof(struct yfs_xdr_YFSVolSync));
-       if (!call)
-               goto nomem;
-
-       yacl = kzalloc(sizeof(struct yfs_acl), GFP_KERNEL);
-       if (!yacl)
-               goto nomem_call;
+       if (!call) {
+               fc->ac.error = -ENOMEM;
+               return ERR_PTR(-ENOMEM);
+       }
 
-       yacl->flags = flags;
        call->key = fc->key;
-       call->reply[0] = yacl;
-       call->reply[1] = vnode;
-       call->reply[2] = NULL; /* volsync */
-       call->ret_reply0 = true;
+       call->out_yacl = yacl;
+       call->out_scb = scb;
+       call->out_volsync = NULL;
 
        /* marshall the parameters */
        bp = call->request;
@@ -2391,17 +2127,10 @@ struct yfs_acl *yfs_fs_fetch_opaque_acl(struct afs_fs_cursor *fc,
        bp = xdr_encode_YFSFid(bp, &vnode->fid);
        yfs_check_req(call, bp);
 
-       call->cb_break = fc->cb_break;
        afs_use_fs_server(call, fc->cbi);
        trace_afs_make_fs_call(call, &vnode->fid);
        afs_make_call(&fc->ac, call, GFP_KERNEL);
        return (struct yfs_acl *)afs_wait_for_call_to_complete(call, &fc->ac);
-
-nomem_call:
-       afs_put_call(call);
-nomem:
-       fc->ac.error = -ENOMEM;
-       return ERR_PTR(-ENOMEM);
 }
 
 /*
@@ -2417,7 +2146,8 @@ static const struct afs_call_type yfs_RXYFSStoreOpaqueACL2 = {
 /*
  * Fetch the YFS ACL for a file.
  */
-int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl)
+int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl,
+                            struct afs_status_cb *scb)
 {
        struct afs_vnode *vnode = fc->vnode;
        struct afs_call *call;
@@ -2441,8 +2171,8 @@ int yfs_fs_store_opaque_acl2(struct afs_fs_cursor *fc, const struct afs_acl *acl
        }
 
        call->key = fc->key;
-       call->reply[0] = vnode;
-       call->reply[2] = NULL; /* volsync */
+       call->out_scb = scb;
+       call->out_volsync = NULL;
 
        /* marshall the parameters */
        bp = call->request;
index 36a8dc699448c66eead329bea2b1b43ca0966d8e..72f8e131139241fa89f9eeab64b42645079c0b84 100644 (file)
@@ -892,8 +892,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
        int have = ci->i_snap_caps;
 
        if ((have & mask) == mask) {
-               dout("__ceph_caps_issued_mask %p snap issued %s"
-                    " (mask %s)\n", &ci->vfs_inode,
+               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
+                    " (mask %s)\n", ci->vfs_inode.i_ino,
                     ceph_cap_string(have),
                     ceph_cap_string(mask));
                return 1;
@@ -904,8 +904,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
                if (!__cap_is_valid(cap))
                        continue;
                if ((cap->issued & mask) == mask) {
-                       dout("__ceph_caps_issued_mask %p cap %p issued %s"
-                            " (mask %s)\n", &ci->vfs_inode, cap,
+                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
+                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
                             ceph_cap_string(cap->issued),
                             ceph_cap_string(mask));
                        if (touch)
@@ -916,8 +916,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
                /* does a combination of caps satisfy mask? */
                have |= cap->issued;
                if ((have & mask) == mask) {
-                       dout("__ceph_caps_issued_mask %p combo issued %s"
-                            " (mask %s)\n", &ci->vfs_inode,
+                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
+                            " (mask %s)\n", ci->vfs_inode.i_ino,
                             ceph_cap_string(cap->issued),
                             ceph_cap_string(mask));
                        if (touch) {
@@ -2257,8 +2257,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        if (datasync)
                goto out;
 
-       inode_lock(inode);
-
        dirty = try_flush_caps(inode, &flush_tid);
        dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
 
@@ -2273,7 +2271,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
                ret = wait_event_interruptible(ci->i_cap_wq,
                                        caps_are_flushed(inode, flush_tid));
        }
-       inode_unlock(inode);
 out:
        dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
        return ret;
@@ -2528,9 +2525,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
  * to (when applicable), and check against max_size here as well.
  * Note that caller is responsible for ensuring max_size increases are
  * requested from the MDS.
+ *
+ * Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
+ * or a negative error code.
+ *
+ * FIXME: how does a 0 return differ from -EAGAIN?
  */
 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
-                           loff_t endoff, bool nonblock, int *got, int *err)
+                           loff_t endoff, bool nonblock, int *got)
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
@@ -2550,8 +2552,7 @@ again:
        if ((file_wanted & need) != need) {
                dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
                     ceph_cap_string(need), ceph_cap_string(file_wanted));
-               *err = -EBADF;
-               ret = 1;
+               ret = -EBADF;
                goto out_unlock;
        }
 
@@ -2572,10 +2573,8 @@ again:
                if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
                        dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
                             inode, endoff, ci->i_max_size);
-                       if (endoff > ci->i_requested_max_size) {
-                               *err = -EAGAIN;
-                               ret = 1;
-                       }
+                       if (endoff > ci->i_requested_max_size)
+                               ret = -EAGAIN;
                        goto out_unlock;
                }
                /*
@@ -2610,8 +2609,7 @@ again:
                                         * task isn't in TASK_RUNNING state
                                         */
                                        if (nonblock) {
-                                               *err = -EAGAIN;
-                                               ret = 1;
+                                               ret = -EAGAIN;
                                                goto out_unlock;
                                        }
 
@@ -2640,8 +2638,7 @@ again:
                if (session_readonly) {
                        dout("get_cap_refs %p needed %s but mds%d readonly\n",
                             inode, ceph_cap_string(need), ci->i_auth_cap->mds);
-                       *err = -EROFS;
-                       ret = 1;
+                       ret = -EROFS;
                        goto out_unlock;
                }
 
@@ -2650,16 +2647,14 @@ again:
                        if (READ_ONCE(mdsc->fsc->mount_state) ==
                            CEPH_MOUNT_SHUTDOWN) {
                                dout("get_cap_refs %p forced umount\n", inode);
-                               *err = -EIO;
-                               ret = 1;
+                               ret = -EIO;
                                goto out_unlock;
                        }
                        mds_wanted = __ceph_caps_mds_wanted(ci, false);
                        if (need & ~(mds_wanted & need)) {
                                dout("get_cap_refs %p caps were dropped"
                                     " (session killed?)\n", inode);
-                               *err = -ESTALE;
-                               ret = 1;
+                               ret = -ESTALE;
                                goto out_unlock;
                        }
                        if (!(file_wanted & ~mds_wanted))
@@ -2710,7 +2705,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
 int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
                      bool nonblock, int *got)
 {
-       int ret, err = 0;
+       int ret;
 
        BUG_ON(need & ~CEPH_CAP_FILE_RD);
        BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
@@ -2718,15 +2713,8 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
        if (ret < 0)
                return ret;
 
-       ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
-       if (ret) {
-               if (err == -EAGAIN) {
-                       ret = 0;
-               } else if (err < 0) {
-                       ret = err;
-               }
-       }
-       return ret;
+       ret = try_get_cap_refs(ci, need, want, 0, nonblock, got);
+       return ret == -EAGAIN ? 0 : ret;
 }
 
 /*
@@ -2737,7 +2725,7 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
 int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                  loff_t endoff, int *got, struct page **pinned_page)
 {
-       int _got, ret, err = 0;
+       int _got, ret;
 
        ret = ceph_pool_perm_check(ci, need);
        if (ret < 0)
@@ -2747,21 +2735,19 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                if (endoff > 0)
                        check_max_size(&ci->vfs_inode, endoff);
 
-               err = 0;
                _got = 0;
                ret = try_get_cap_refs(ci, need, want, endoff,
-                                      false, &_got, &err);
-               if (ret) {
-                       if (err == -EAGAIN)
-                               continue;
-                       if (err < 0)
-                               ret = err;
-               } else {
+                                      false, &_got);
+               if (ret == -EAGAIN) {
+                       continue;
+               } else if (!ret) {
+                       int err;
+
                        DEFINE_WAIT_FUNC(wait, woken_wake_function);
                        add_wait_queue(&ci->i_cap_wq, &wait);
 
-                       while (!try_get_cap_refs(ci, need, want, endoff,
-                                                true, &_got, &err)) {
+                       while (!(err = try_get_cap_refs(ci, need, want, endoff,
+                                                       true, &_got))) {
                                if (signal_pending(current)) {
                                        ret = -ERESTARTSYS;
                                        break;
@@ -2770,19 +2756,14 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
                        }
 
                        remove_wait_queue(&ci->i_cap_wq, &wait);
-
                        if (err == -EAGAIN)
                                continue;
-                       if (err < 0)
-                               ret = err;
                }
-               if (ret < 0) {
-                       if (err == -ESTALE) {
-                               /* session was killed, try renew caps */
-                               ret = ceph_renew_caps(&ci->vfs_inode);
-                               if (ret == 0)
-                                       continue;
-                       }
+               if (ret == -ESTALE) {
+                       /* session was killed, try renew caps */
+                       ret = ceph_renew_caps(&ci->vfs_inode);
+                       if (ret == 0)
+                               continue;
                        return ret;
                }
 
@@ -4099,7 +4080,7 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
 }
 
 /*
- * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * For a soon-to-be unlinked file, drop the LINK caps. If it
  * looks like the link count will hit 0, drop any other caps (other
  * than PIN) we don't specifically want (due to the file still being
  * open).
index 98365e74cb4aa12cb5b80daa18f4e83956798f27..b3fc5fe26a1ab47a1094194cb1835c6b18d2aefb 100644 (file)
@@ -37,7 +37,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
                struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
                int state = mdsmap->m_info[i].state;
                seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
-                              ceph_pr_addr(&addr->in_addr),
+                              ceph_pr_addr(addr),
                               ceph_mds_state_name(state));
        }
        return 0;
@@ -88,7 +88,7 @@ static int mdsc_show(struct seq_file *s, void *p)
                                   req->r_dentry,
                                   path ? path : "");
                        spin_unlock(&req->r_dentry->d_lock);
-                       kfree(path);
+                       ceph_mdsc_free_path(path, pathlen);
                } else if (req->r_path1) {
                        seq_printf(s, " #%llx/%s", req->r_ino1.ino,
                                   req->r_path1);
@@ -108,7 +108,7 @@ static int mdsc_show(struct seq_file *s, void *p)
                                   req->r_old_dentry,
                                   path ? path : "");
                        spin_unlock(&req->r_old_dentry->d_lock);
-                       kfree(path);
+                       ceph_mdsc_free_path(path, pathlen);
                } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
                        if (req->r_ino2.ino)
                                seq_printf(s, " #%llx/%s", req->r_ino2.ino,
@@ -124,18 +124,48 @@ static int mdsc_show(struct seq_file *s, void *p)
        return 0;
 }
 
+static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
+{
+       struct seq_file *s = p;
+
+       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+                  ceph_cap_string(cap->issued),
+                  ceph_cap_string(cap->implemented));
+       return 0;
+}
+
 static int caps_show(struct seq_file *s, void *p)
 {
        struct ceph_fs_client *fsc = s->private;
-       int total, avail, used, reserved, min;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
+       int total, avail, used, reserved, min, i;
 
        ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
        seq_printf(s, "total\t\t%d\n"
                   "avail\t\t%d\n"
                   "used\t\t%d\n"
                   "reserved\t%d\n"
-                  "min\t%d\n",
+                  "min\t\t%d\n\n",
                   total, avail, used, reserved, min);
+       seq_printf(s, "ino                issued           implemented\n");
+       seq_printf(s, "-----------------------------------------------\n");
+
+       mutex_lock(&mdsc->mutex);
+       for (i = 0; i < mdsc->max_sessions; i++) {
+               struct ceph_mds_session *session;
+
+               session = __ceph_lookup_mds_session(mdsc, i);
+               if (!session)
+                       continue;
+               mutex_unlock(&mdsc->mutex);
+               mutex_lock(&session->s_mutex);
+               ceph_iterate_session_caps(session, caps_show_cb, s);
+               mutex_unlock(&session->s_mutex);
+               ceph_put_mds_session(session);
+               mutex_lock(&mdsc->mutex);
+       }
+       mutex_unlock(&mdsc->mutex);
+
        return 0;
 }
 
index 3c59ad180ef0bb5b9dfb09528e5c1dc363f9b94e..d3ef7ee429ec03d3a4209260df0c396ab433f77d 100644 (file)
@@ -22,18 +22,77 @@ struct ceph_nfs_confh {
        u64 ino, parent_ino;
 } __attribute__ ((packed));
 
+/*
+ * fh for snapped inode
+ */
+struct ceph_nfs_snapfh {
+       u64 ino;
+       u64 snapid;
+       u64 parent_ino;
+       u32 hash;
+} __attribute__ ((packed));
+
+static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
+                             struct inode *parent_inode)
+{
+       const static int snap_handle_length =
+               sizeof(struct ceph_nfs_snapfh) >> 2;
+       struct ceph_nfs_snapfh *sfh = (void *)rawfh;
+       u64 snapid = ceph_snap(inode);
+       int ret;
+       bool no_parent = true;
+
+       if (*max_len < snap_handle_length) {
+               *max_len = snap_handle_length;
+               ret = FILEID_INVALID;
+               goto out;
+       }
+
+       ret =  -EINVAL;
+       if (snapid != CEPH_SNAPDIR) {
+               struct inode *dir;
+               struct dentry *dentry = d_find_alias(inode);
+               if (!dentry)
+                       goto out;
+
+               rcu_read_lock();
+               dir = d_inode_rcu(dentry->d_parent);
+               if (ceph_snap(dir) != CEPH_SNAPDIR) {
+                       sfh->parent_ino = ceph_ino(dir);
+                       sfh->hash = ceph_dentry_hash(dir, dentry);
+                       no_parent = false;
+               }
+               rcu_read_unlock();
+               dput(dentry);
+       }
+
+       if (no_parent) {
+               if (!S_ISDIR(inode->i_mode))
+                       goto out;
+               sfh->parent_ino = sfh->ino;
+               sfh->hash = 0;
+       }
+       sfh->ino = ceph_ino(inode);
+       sfh->snapid = snapid;
+
+       *max_len = snap_handle_length;
+       ret = FILEID_BTRFS_WITH_PARENT;
+out:
+       dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret);
+       return ret;
+}
+
 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
                          struct inode *parent_inode)
 {
+       const static int handle_length =
+               sizeof(struct ceph_nfs_fh) >> 2;
+       const static int connected_handle_length =
+               sizeof(struct ceph_nfs_confh) >> 2;
        int type;
-       struct ceph_nfs_fh *fh = (void *)rawfh;
-       struct ceph_nfs_confh *cfh = (void *)rawfh;
-       int connected_handle_length = sizeof(*cfh)/4;
-       int handle_length = sizeof(*fh)/4;
 
-       /* don't re-export snaps */
        if (ceph_snap(inode) != CEPH_NOSNAP)
-               return -EINVAL;
+               return ceph_encode_snapfh(inode, rawfh, max_len, parent_inode);
 
        if (parent_inode && (*max_len < connected_handle_length)) {
                *max_len = connected_handle_length;
@@ -44,6 +103,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
        }
 
        if (parent_inode) {
+               struct ceph_nfs_confh *cfh = (void *)rawfh;
                dout("encode_fh %llx with parent %llx\n",
                     ceph_ino(inode), ceph_ino(parent_inode));
                cfh->ino = ceph_ino(inode);
@@ -51,6 +111,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
                *max_len = connected_handle_length;
                type = FILEID_INO32_GEN_PARENT;
        } else {
+               struct ceph_nfs_fh *fh = (void *)rawfh;
                dout("encode_fh %llx\n", ceph_ino(inode));
                fh->ino = ceph_ino(inode);
                *max_len = handle_length;
@@ -59,7 +120,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
        return type;
 }
 
-static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
 {
        struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
        struct inode *inode;
@@ -81,7 +142,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
                mask = CEPH_STAT_CAP_INODE;
                if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
                        mask |= CEPH_CAP_XATTR_SHARED;
-               req->r_args.getattr.mask = cpu_to_le32(mask);
+               req->r_args.lookupino.mask = cpu_to_le32(mask);
 
                req->r_ino1 = vino;
                req->r_num_caps = 1;
@@ -91,16 +152,114 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
                        ihold(inode);
                ceph_mdsc_put_request(req);
                if (!inode)
-                       return ERR_PTR(-ESTALE);
-               if (inode->i_nlink == 0) {
-                       iput(inode);
-                       return ERR_PTR(-ESTALE);
-               }
+                       return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
        }
+       return inode;
+}
+
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
+{
+       struct inode *inode = __lookup_inode(sb, ino);
+       if (IS_ERR(inode))
+               return inode;
+       if (inode->i_nlink == 0) {
+               iput(inode);
+               return ERR_PTR(-ESTALE);
+       }
+       return inode;
+}
 
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
+{
+       struct inode *inode = __lookup_inode(sb, ino);
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
+       if (inode->i_nlink == 0) {
+               iput(inode);
+               return ERR_PTR(-ESTALE);
+       }
        return d_obtain_alias(inode);
 }
 
+static struct dentry *__snapfh_to_dentry(struct super_block *sb,
+                                         struct ceph_nfs_snapfh *sfh,
+                                         bool want_parent)
+{
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+       struct ceph_mds_request *req;
+       struct inode *inode;
+       struct ceph_vino vino;
+       int mask;
+       int err;
+       bool unlinked = false;
+
+       if (want_parent) {
+               vino.ino = sfh->parent_ino;
+               if (sfh->snapid == CEPH_SNAPDIR)
+                       vino.snap = CEPH_NOSNAP;
+               else if (sfh->ino == sfh->parent_ino)
+                       vino.snap = CEPH_SNAPDIR;
+               else
+                       vino.snap = sfh->snapid;
+       } else {
+               vino.ino = sfh->ino;
+               vino.snap = sfh->snapid;
+       }
+       inode = ceph_find_inode(sb, vino);
+       if (inode)
+               return d_obtain_alias(inode);
+
+       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+                                      USE_ANY_MDS);
+       if (IS_ERR(req))
+               return ERR_CAST(req);
+
+       mask = CEPH_STAT_CAP_INODE;
+       if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+               mask |= CEPH_CAP_XATTR_SHARED;
+       req->r_args.lookupino.mask = cpu_to_le32(mask);
+       if (vino.snap < CEPH_NOSNAP) {
+               req->r_args.lookupino.snapid = cpu_to_le64(vino.snap);
+               if (!want_parent && sfh->ino != sfh->parent_ino) {
+                       req->r_args.lookupino.parent =
+                                       cpu_to_le64(sfh->parent_ino);
+                       req->r_args.lookupino.hash =
+                                       cpu_to_le32(sfh->hash);
+               }
+       }
+
+       req->r_ino1 = vino;
+       req->r_num_caps = 1;
+       err = ceph_mdsc_do_request(mdsc, NULL, req);
+       inode = req->r_target_inode;
+       if (inode) {
+               if (vino.snap == CEPH_SNAPDIR) {
+                       if (inode->i_nlink == 0)
+                               unlinked = true;
+                       inode = ceph_get_snapdir(inode);
+               } else if (ceph_snap(inode) == vino.snap) {
+                       ihold(inode);
+               } else {
+                       /* mds does not support looking up a snapped inode */
+                       err = -EOPNOTSUPP;
+                       inode = NULL;
+               }
+       }
+       ceph_mdsc_put_request(req);
+
+       if (want_parent) {
+               dout("snapfh_to_parent %llx.%llx\n err=%d\n",
+                    vino.ino, vino.snap, err);
+       } else {
+               dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d",
+                     vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err);
+       }
+       if (!inode)
+               return ERR_PTR(-ESTALE);
+       /* see comments in ceph_get_parent() */
+       return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
+}
+
 /*
  * convert regular fh to dentry
  */
@@ -110,6 +269,11 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
 {
        struct ceph_nfs_fh *fh = (void *)fid->raw;
 
+       if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+               struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+               return __snapfh_to_dentry(sb, sfh, false);
+       }
+
        if (fh_type != FILEID_INO32_GEN  &&
            fh_type != FILEID_INO32_GEN_PARENT)
                return NULL;
@@ -163,13 +327,49 @@ static struct dentry *__get_parent(struct super_block *sb,
 
 static struct dentry *ceph_get_parent(struct dentry *child)
 {
-       /* don't re-export snaps */
-       if (ceph_snap(d_inode(child)) != CEPH_NOSNAP)
-               return ERR_PTR(-EINVAL);
-
-       dout("get_parent %p ino %llx.%llx\n",
-            child, ceph_vinop(d_inode(child)));
-       return __get_parent(child->d_sb, child, 0);
+       struct inode *inode = d_inode(child);
+       struct dentry *dn;
+
+       if (ceph_snap(inode) != CEPH_NOSNAP) {
+               struct inode* dir;
+               bool unlinked = false;
+               /* do not support non-directory */
+               if (!d_is_dir(child)) {
+                       dn = ERR_PTR(-EINVAL);
+                       goto out;
+               }
+               dir = __lookup_inode(inode->i_sb, ceph_ino(inode));
+               if (IS_ERR(dir)) {
+                       dn = ERR_CAST(dir);
+                       goto out;
+               }
+               /* There can be multiple paths to access snapped inode.
+                * For simplicity, treat snapdir of head inode as parent */
+               if (ceph_snap(inode) != CEPH_SNAPDIR) {
+                       struct inode *snapdir = ceph_get_snapdir(dir);
+                       if (dir->i_nlink == 0)
+                               unlinked = true;
+                       iput(dir);
+                       if (IS_ERR(snapdir)) {
+                               dn = ERR_CAST(snapdir);
+                               goto out;
+                       }
+                       dir = snapdir;
+               }
+               /* If directory has already been deleted, further get_parent
+                * will fail. Do not mark snapdir dentry as disconnected,
+                * this prevents exportfs from doing further get_parent. */
+               if (unlinked)
+                       dn = d_obtain_root(dir);
+               else
+                       dn = d_obtain_alias(dir);
+       } else {
+               dn = __get_parent(child->d_sb, child, 0);
+       }
+out:
+       dout("get_parent %p ino %llx.%llx err=%ld\n",
+            child, ceph_vinop(inode), (IS_ERR(dn) ? PTR_ERR(dn) : 0));
+       return dn;
 }
 
 /*
@@ -182,6 +382,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
        struct ceph_nfs_confh *cfh = (void *)fid->raw;
        struct dentry *dentry;
 
+       if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+               struct ceph_nfs_snapfh *sfh = (void *)fid->raw;
+               return __snapfh_to_dentry(sb, sfh, true);
+       }
+
        if (fh_type != FILEID_INO32_GEN_PARENT)
                return NULL;
        if (fh_len < sizeof(*cfh) / 4)
@@ -194,14 +399,115 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
        return dentry;
 }
 
+static int __get_snap_name(struct dentry *parent, char *name,
+                          struct dentry *child)
+{
+       struct inode *inode = d_inode(child);
+       struct inode *dir = d_inode(parent);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_mds_request *req = NULL;
+       char *last_name = NULL;
+       unsigned next_offset = 2;
+       int err = -EINVAL;
+
+       if (ceph_ino(inode) != ceph_ino(dir))
+               goto out;
+       if (ceph_snap(inode) == CEPH_SNAPDIR) {
+               if (ceph_snap(dir) == CEPH_NOSNAP) {
+                       strcpy(name, fsc->mount_options->snapdir_name);
+                       err = 0;
+               }
+               goto out;
+       }
+       if (ceph_snap(dir) != CEPH_SNAPDIR)
+               goto out;
+
+       while (1) {
+               struct ceph_mds_reply_info_parsed *rinfo;
+               struct ceph_mds_reply_dir_entry *rde;
+               int i;
+
+               req = ceph_mdsc_create_request(fsc->mdsc, CEPH_MDS_OP_LSSNAP,
+                                              USE_AUTH_MDS);
+               if (IS_ERR(req)) {
+                       err = PTR_ERR(req);
+                       req = NULL;
+                       goto out;
+               }
+               err = ceph_alloc_readdir_reply_buffer(req, inode);
+               if (err)
+                       goto out;
+
+               req->r_direct_mode = USE_AUTH_MDS;
+               req->r_readdir_offset = next_offset;
+               req->r_args.readdir.flags =
+                               cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
+               if (last_name) {
+                       req->r_path2 = last_name;
+                       last_name = NULL;
+               }
+
+               req->r_inode = dir;
+               ihold(dir);
+               req->r_dentry = dget(parent);
+
+               inode_lock(dir);
+               err = ceph_mdsc_do_request(fsc->mdsc, NULL, req);
+               inode_unlock(dir);
+
+               if (err < 0)
+                       goto out;
+
+                rinfo = &req->r_reply_info;
+                for (i = 0; i < rinfo->dir_nr; i++) {
+                        rde = rinfo->dir_entries + i;
+                        BUG_ON(!rde->inode.in);
+                        if (ceph_snap(inode) ==
+                            le64_to_cpu(rde->inode.in->snapid)) {
+                                memcpy(name, rde->name, rde->name_len);
+                                name[rde->name_len] = '\0';
+                                err = 0;
+                                goto out;
+                        }
+                }
+
+                if (rinfo->dir_end)
+                        break;
+
+                BUG_ON(rinfo->dir_nr <= 0);
+                rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
+                next_offset += rinfo->dir_nr;
+                last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
+                if (!last_name) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+
+                ceph_mdsc_put_request(req);
+                req = NULL;
+       }
+       err = -ENOENT;
+out:
+       if (req)
+               ceph_mdsc_put_request(req);
+       kfree(last_name);
+       dout("get_snap_name %p ino %llx.%llx err=%d\n",
+            child, ceph_vinop(inode), err);
+       return err;
+}
+
 static int ceph_get_name(struct dentry *parent, char *name,
                         struct dentry *child)
 {
        struct ceph_mds_client *mdsc;
        struct ceph_mds_request *req;
+       struct inode *inode = d_inode(child);
        int err;
 
-       mdsc = ceph_inode_to_client(d_inode(child))->mdsc;
+       if (ceph_snap(inode) != CEPH_NOSNAP)
+               return __get_snap_name(parent, name, child);
+
+       mdsc = ceph_inode_to_client(inode)->mdsc;
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
                                       USE_ANY_MDS);
        if (IS_ERR(req))
@@ -209,8 +515,8 @@ static int ceph_get_name(struct dentry *parent, char *name,
 
        inode_lock(d_inode(parent));
 
-       req->r_inode = d_inode(child);
-       ihold(d_inode(child));
+       req->r_inode = inode;
+       ihold(inode);
        req->r_ino2 = ceph_vino(d_inode(parent));
        req->r_parent = d_inode(parent);
        set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
@@ -224,10 +530,10 @@ static int ceph_get_name(struct dentry *parent, char *name,
                memcpy(name, rinfo->dname, rinfo->dname_len);
                name[rinfo->dname_len] = 0;
                dout("get_name %p ino %llx.%llx name %s\n",
-                    child, ceph_vinop(d_inode(child)), name);
+                    child, ceph_vinop(inode), name);
        } else {
                dout("get_name %p ino %llx.%llx err %d\n",
-                    child, ceph_vinop(d_inode(child)), err);
+                    child, ceph_vinop(inode), err);
        }
 
        ceph_mdsc_put_request(req);
index 84725b53ac2190e423c3233aaac7d9adc2a745f4..305daf043eb0eaca81e17dfbd405cd784edaab81 100644 (file)
@@ -929,7 +929,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
        dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
             (write ? "write" : "read"), file, pos, (unsigned)count,
-            snapc, snapc->seq);
+            snapc, snapc ? snapc->seq : 0);
 
        ret = filemap_write_and_wait_range(inode->i_mapping,
                                           pos, pos + count - 1);
index 35dae6d5493a8eb59e51c2a67f8171d444a44319..f85355bf49c4c7b3830822264c16f7b25c31d731 100644 (file)
@@ -2266,43 +2266,72 @@ int ceph_permission(struct inode *inode, int mask)
        return err;
 }
 
+/* Craft a mask of needed caps given a set of requested statx attrs. */
+static int statx_to_caps(u32 want)
+{
+       int mask = 0;
+
+       if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME))
+               mask |= CEPH_CAP_AUTH_SHARED;
+
+       if (want & (STATX_NLINK|STATX_CTIME))
+               mask |= CEPH_CAP_LINK_SHARED;
+
+       if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
+                   STATX_BLOCKS))
+               mask |= CEPH_CAP_FILE_SHARED;
+
+       if (want & (STATX_CTIME))
+               mask |= CEPH_CAP_XATTR_SHARED;
+
+       return mask;
+}
+
 /*
- * Get all attributes.  Hopefully somedata we'll have a statlite()
- * and can limit the fields we require to be accurate.
+ * Get all the attributes. If we have sufficient caps for the requested attrs,
+ * then we can avoid talking to the MDS at all.
  */
 int ceph_getattr(const struct path *path, struct kstat *stat,
                 u32 request_mask, unsigned int flags)
 {
        struct inode *inode = d_inode(path->dentry);
        struct ceph_inode_info *ci = ceph_inode(inode);
-       int err;
+       int err = 0;
 
-       err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
-       if (!err) {
-               generic_fillattr(inode, stat);
-               stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
-               if (ceph_snap(inode) == CEPH_NOSNAP)
-                       stat->dev = inode->i_sb->s_dev;
+       /* Skip the getattr altogether if we're asked not to sync */
+       if (!(flags & AT_STATX_DONT_SYNC)) {
+               err = ceph_do_getattr(inode, statx_to_caps(request_mask),
+                                     flags & AT_STATX_FORCE_SYNC);
+               if (err)
+                       return err;
+       }
+
+       generic_fillattr(inode, stat);
+       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+       if (ceph_snap(inode) == CEPH_NOSNAP)
+               stat->dev = inode->i_sb->s_dev;
+       else
+               stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
+       if (S_ISDIR(inode->i_mode)) {
+               if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+                                       RBYTES))
+                       stat->size = ci->i_rbytes;
                else
-                       stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
-
-               if (S_ISDIR(inode->i_mode)) {
-                       if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
-                                               RBYTES))
-                               stat->size = ci->i_rbytes;
-                       else
-                               stat->size = ci->i_files + ci->i_subdirs;
-                       stat->blocks = 0;
-                       stat->blksize = 65536;
-                       /*
-                        * Some applications rely on the number of st_nlink
-                        * value on directories to be either 0 (if unlinked)
-                        * or 2 + number of subdirectories.
-                        */
-                       if (stat->nlink == 1)
-                               /* '.' + '..' + subdirs */
-                               stat->nlink = 1 + 1 + ci->i_subdirs;
-               }
+                       stat->size = ci->i_files + ci->i_subdirs;
+               stat->blocks = 0;
+               stat->blksize = 65536;
+               /*
+                * Some applications rely on the number of st_nlink
+                * value on directories to be either 0 (if unlinked)
+                * or 2 + number of subdirectories.
+                */
+               if (stat->nlink == 1)
+                       /* '.' + '..' + subdirs */
+                       stat->nlink = 1 + 1 + ci->i_subdirs;
        }
+
+       /* Mask off any higher bits (e.g. btime) until we have support */
+       stat->result_mask = request_mask & STATX_BASIC_STATS;
        return err;
 }
index 9dae2ec7e1fa89705f649b1c3349520b8bb23543..ac9b53b893650270b80ccf5a69a954e55346922f 100644 (file)
@@ -237,15 +237,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
        spin_lock(&ci->i_ceph_lock);
        if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
                err = -EIO;
-       } else if (op == CEPH_MDS_OP_SETFILELOCK) {
-               /*
-                * increasing i_filelock_ref closes race window between
-                * handling request reply and adding file_lock struct to
-                * inode. Otherwise, i_auth_cap may get trimmed in the
-                * window. Caller function will decrease the counter.
-                */
-               fl->fl_ops = &ceph_fl_lock_ops;
-               atomic_inc(&ci->i_filelock_ref);
        }
        spin_unlock(&ci->i_ceph_lock);
        if (err < 0) {
@@ -299,10 +290,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
        spin_lock(&ci->i_ceph_lock);
        if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
                err = -EIO;
-       } else {
-               /* see comment in ceph_lock */
-               fl->fl_ops = &ceph_fl_lock_ops;
-               atomic_inc(&ci->i_filelock_ref);
        }
        spin_unlock(&ci->i_ceph_lock);
        if (err < 0) {
index 9049c2a3e972f499ea1371e8c4b8112ead98a6e3..959b1bf7c327d0e66454b570c31c36db076cfa0a 100644 (file)
@@ -550,15 +550,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
                                                   int mds)
 {
-       struct ceph_mds_session *session;
-
        if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
                return NULL;
-       session = mdsc->sessions[mds];
-       dout("lookup_mds_session %p %d\n", session,
-            refcount_read(&session->s_ref));
-       get_session(session);
-       return session;
+       return get_session(mdsc->sessions[mds]);
 }
 
 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
@@ -1284,9 +1278,9 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
  *
  * Caller must hold session s_mutex.
  */
-static int iterate_session_caps(struct ceph_mds_session *session,
-                                int (*cb)(struct inode *, struct ceph_cap *,
-                                           void *), void *arg)
+int ceph_iterate_session_caps(struct ceph_mds_session *session,
+                             int (*cb)(struct inode *, struct ceph_cap *,
+                                       void *), void *arg)
 {
        struct list_head *p;
        struct ceph_cap *cap;
@@ -1451,7 +1445,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
        LIST_HEAD(dispose);
 
        dout("remove_session_caps on %p\n", session);
-       iterate_session_caps(session, remove_session_caps_cb, fsc);
+       ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
 
        wake_up_all(&fsc->mdsc->cap_flushing_wq);
 
@@ -1534,8 +1528,8 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 static void wake_up_session_caps(struct ceph_mds_session *session, int ev)
 {
        dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
-       iterate_session_caps(session, wake_up_session_cb,
-                            (void *)(unsigned long)ev);
+       ceph_iterate_session_caps(session, wake_up_session_cb,
+                                 (void *)(unsigned long)ev);
 }
 
 /*
@@ -1768,7 +1762,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
             session->s_mds, session->s_nr_caps, max_caps, trim_caps);
        if (trim_caps > 0) {
                session->s_trim_caps = trim_caps;
-               iterate_session_caps(session, trim_caps_cb, session);
+               ceph_iterate_session_caps(session, trim_caps_cb, session);
                dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
                     session->s_mds, session->s_nr_caps, max_caps,
                        trim_caps - session->s_trim_caps);
@@ -1861,7 +1855,8 @@ again:
                num_cap_releases--;
 
                head = msg->front.iov_base;
-               le32_add_cpu(&head->num, 1);
+               put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
+                                  &head->num);
                item = msg->front.iov_base + msg->front.iov_len;
                item->ino = cpu_to_le64(cap->cap_ino);
                item->cap_id = cpu_to_le64(cap->cap_id);
@@ -2089,43 +2084,29 @@ static inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
  * Encode hidden .snap dirs as a double /, i.e.
  *   foo/.snap/bar -> foo//bar
  */
-char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
                           int stop_on_nosnap)
 {
        struct dentry *temp;
        char *path;
-       int len, pos;
+       int pos;
        unsigned seq;
+       u64 base;
 
        if (!dentry)
                return ERR_PTR(-EINVAL);
 
-retry:
-       len = 0;
-       seq = read_seqbegin(&rename_lock);
-       rcu_read_lock();
-       for (temp = dentry; !IS_ROOT(temp);) {
-               struct inode *inode = d_inode(temp);
-               if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
-                       len++;  /* slash only */
-               else if (stop_on_nosnap && inode &&
-                        ceph_snap(inode) == CEPH_NOSNAP)
-                       break;
-               else
-                       len += 1 + temp->d_name.len;
-               temp = temp->d_parent;
-       }
-       rcu_read_unlock();
-       if (len)
-               len--;  /* no leading '/' */
-
-       path = kmalloc(len+1, GFP_NOFS);
+       path = __getname();
        if (!path)
                return ERR_PTR(-ENOMEM);
-       pos = len;
-       path[pos] = 0;  /* trailing null */
+retry:
+       pos = PATH_MAX - 1;
+       path[pos] = '\0';
+
+       seq = read_seqbegin(&rename_lock);
        rcu_read_lock();
-       for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+       temp = dentry;
+       for (;;) {
                struct inode *inode;
 
                spin_lock(&temp->d_lock);
@@ -2143,83 +2124,54 @@ retry:
                                spin_unlock(&temp->d_lock);
                                break;
                        }
-                       strncpy(path + pos, temp->d_name.name,
-                               temp->d_name.len);
+                       memcpy(path + pos, temp->d_name.name, temp->d_name.len);
                }
                spin_unlock(&temp->d_lock);
-               if (pos)
-                       path[--pos] = '/';
                temp = temp->d_parent;
+
+               /* Are we at the root? */
+               if (IS_ROOT(temp))
+                       break;
+
+               /* Are we out of buffer? */
+               if (--pos < 0)
+                       break;
+
+               path[pos] = '/';
        }
+       base = ceph_ino(d_inode(temp));
        rcu_read_unlock();
-       if (pos != 0 || read_seqretry(&rename_lock, seq)) {
+       if (pos < 0 || read_seqretry(&rename_lock, seq)) {
                pr_err("build_path did not end path lookup where "
-                      "expected, namelen is %d, pos is %d\n", len, pos);
+                      "expected, pos is %d\n", pos);
                /* presumably this is only possible if racing with a
                   rename of one of the parent directories (we can not
                   lock the dentries above us to prevent this, but
                   retrying should be harmless) */
-               kfree(path);
                goto retry;
        }
 
-       *base = ceph_ino(d_inode(temp));
-       *plen = len;
+       *pbase = base;
+       *plen = PATH_MAX - 1 - pos;
        dout("build_path on %p %d built %llx '%.*s'\n",
-            dentry, d_count(dentry), *base, len, path);
-       return path;
-}
-
-/* Duplicate the dentry->d_name.name safely */
-static int clone_dentry_name(struct dentry *dentry, const char **ppath,
-                            int *ppathlen)
-{
-       u32 len;
-       char *name;
-
-retry:
-       len = READ_ONCE(dentry->d_name.len);
-       name = kmalloc(len + 1, GFP_NOFS);
-       if (!name)
-               return -ENOMEM;
-
-       spin_lock(&dentry->d_lock);
-       if (dentry->d_name.len != len) {
-               spin_unlock(&dentry->d_lock);
-               kfree(name);
-               goto retry;
-       }
-       memcpy(name, dentry->d_name.name, len);
-       spin_unlock(&dentry->d_lock);
-
-       name[len] = '\0';
-       *ppath = name;
-       *ppathlen = len;
-       return 0;
+            dentry, d_count(dentry), base, *plen, path + pos);
+       return path + pos;
 }
 
 static int build_dentry_path(struct dentry *dentry, struct inode *dir,
                             const char **ppath, int *ppathlen, u64 *pino,
                             bool *pfreepath, bool parent_locked)
 {
-       int ret;
        char *path;
 
        rcu_read_lock();
        if (!dir)
                dir = d_inode_rcu(dentry->d_parent);
-       if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+       if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) {
                *pino = ceph_ino(dir);
                rcu_read_unlock();
-               if (parent_locked) {
-                       *ppath = dentry->d_name.name;
-                       *ppathlen = dentry->d_name.len;
-               } else {
-                       ret = clone_dentry_name(dentry, ppath, ppathlen);
-                       if (ret)
-                               return ret;
-                       *pfreepath = true;
-               }
+               *ppath = dentry->d_name.name;
+               *ppathlen = dentry->d_name.len;
                return 0;
        }
        rcu_read_unlock();
@@ -2331,9 +2283,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
                (!!req->r_inode_drop + !!req->r_dentry_drop +
                 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
        if (req->r_dentry_drop)
-               len += req->r_dentry->d_name.len;
+               len += pathlen1;
        if (req->r_old_dentry_drop)
-               len += req->r_old_dentry->d_name.len;
+               len += pathlen2;
 
        msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
        if (!msg) {
@@ -2410,10 +2362,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 out_free2:
        if (freepath2)
-               kfree((char *)path2);
+               ceph_mdsc_free_path((char *)path2, pathlen2);
 out_free1:
        if (freepath1)
-               kfree((char *)path1);
+               ceph_mdsc_free_path((char *)path1, pathlen1);
 out:
        return msg;
 }
@@ -2427,8 +2379,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
 {
        if (req->r_callback)
                req->r_callback(mdsc, req);
-       else
-               complete_all(&req->r_completion);
+       complete_all(&req->r_completion);
 }
 
 /*
@@ -2670,28 +2621,11 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
        }
 }
 
-void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
                              struct ceph_mds_request *req)
-{
-       dout("submit_request on %p\n", req);
-       mutex_lock(&mdsc->mutex);
-       __register_request(mdsc, req, NULL);
-       __do_request(mdsc, req);
-       mutex_unlock(&mdsc->mutex);
-}
-
-/*
- * Synchrously perform an mds request.  Take care of all of the
- * session setup, forwarding, retry details.
- */
-int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
-                        struct inode *dir,
-                        struct ceph_mds_request *req)
 {
        int err;
 
-       dout("do_request on %p\n", req);
-
        /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
        if (req->r_inode)
                ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
@@ -2701,18 +2635,21 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
                                  CEPH_CAP_PIN);
 
-       /* issue */
+       dout("submit_request on %p for inode %p\n", req, dir);
        mutex_lock(&mdsc->mutex);
        __register_request(mdsc, req, dir);
        __do_request(mdsc, req);
+       err = req->r_err;
+       mutex_unlock(&mdsc->mutex);
+       return err;
+}
 
-       if (req->r_err) {
-               err = req->r_err;
-               goto out;
-       }
+static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+                                 struct ceph_mds_request *req)
+{
+       int err;
 
        /* wait */
-       mutex_unlock(&mdsc->mutex);
        dout("do_request waiting\n");
        if (!req->r_timeout && req->r_wait_for_completion) {
                err = req->r_wait_for_completion(mdsc, req);
@@ -2753,8 +2690,26 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                err = req->r_err;
        }
 
-out:
        mutex_unlock(&mdsc->mutex);
+       return err;
+}
+
+/*
+ * Synchronously perform an mds request.  Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+                        struct inode *dir,
+                        struct ceph_mds_request *req)
+{
+       int err;
+
+       dout("do_request on %p\n", req);
+
+       /* issue */
+       err = ceph_mdsc_submit_request(mdsc, dir, req);
+       if (!err)
+               err = ceph_mdsc_wait_request(mdsc, req);
        dout("do_request %p done, result %d\n", req, err);
        return err;
 }
@@ -3485,7 +3440,7 @@ out_freeflocks:
                ceph_pagelist_encode_string(pagelist, path, pathlen);
                ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
 out_freepath:
-               kfree(path);
+               ceph_mdsc_free_path(path, pathlen);
        }
 
 out_err:
@@ -3642,7 +3597,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
                recon_state.msg_version = 2;
        }
        /* trsaverse this session's caps */
-       err = iterate_session_caps(session, encode_caps_cb, &recon_state);
+       err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
 
        spin_lock(&session->s_cap_lock);
        session->s_cap_reconnect = 0;
@@ -4125,6 +4080,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
        mdsc->max_sessions = 0;
        mdsc->stopping = 0;
        atomic64_set(&mdsc->quotarealms_count, 0);
+       mdsc->quotarealms_inodes = RB_ROOT;
+       mutex_init(&mdsc->quotarealms_inodes_mutex);
        mdsc->last_snap_seq = 0;
        init_rwsem(&mdsc->snap_rwsem);
        mdsc->snap_realms = RB_ROOT;
@@ -4216,6 +4173,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
         * their inode/dcache refs
         */
        ceph_msgr_flush();
+
+       ceph_cleanup_quotarealms_inodes(mdsc);
 }
 
 /*
index 50385a481fdbb9b2764257ddec68cc5acb4a9ff3..a83f28bc23870ee1c65c7ac50d265c52e8f2a08a 100644 (file)
@@ -325,6 +325,18 @@ struct ceph_snapid_map {
        unsigned long last_used;
 };
 
+/*
+ * Node in the mdsc->quotarealms_inodes rb-tree (sorted by inode number) of
+ * quotarealm inodes that are not visible from the filesystem mountpoint, but
+ * required to handle, e.g. quotas.
+ */
+struct ceph_quotarealm_inode {
+       /* linkage into mdsc->quotarealms_inodes */
+       struct rb_node node;
+       /* inode number; the key the tree is sorted by */
+       u64 ino;
+       /*
+        * jiffies deadline set when a lookup fails: no new lookup is issued
+        * for this inode until it has passed.  0 means no failure recorded.
+        */
+       unsigned long timeout;
+       /* taken by lookup_quotarealm_inode() around timeout/inode accesses */
+       struct mutex mutex;
+       /* result of a successful lookup; NULL until one has succeeded */
+       struct inode *inode;
+};
+
 /*
  * mds client state
  */
@@ -344,6 +356,12 @@ struct ceph_mds_client {
        int                     stopping;      /* true if shutting down */
 
        atomic64_t              quotarealms_count; /* # realms with quota */
+       /*
+        * We keep an rb-tree of inodes that are not visible from the
+        * mountpoint but that we need in order to track quota realms.
+        */
+       struct rb_root          quotarealms_inodes;
+       struct mutex            quotarealms_inodes_mutex;
 
        /*
         * snap_rwsem will cover cap linkage into snaprealms, and
@@ -447,8 +465,9 @@ extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
                                           struct inode *dir);
 extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
-extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
-                                    struct ceph_mds_request *req);
+extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+                                   struct inode *dir,
+                                   struct ceph_mds_request *req);
 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
                                struct inode *dir,
                                struct ceph_mds_request *req);
@@ -468,8 +487,18 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
                                    struct ceph_mds_session *session);
 extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
 extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
+                                    int (*cb)(struct inode *,
+                                              struct ceph_cap *, void *),
+                                    void *arg);
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
+/*
+ * Release a path string.  The pointer handed to __putname() lies
+ * (PATH_MAX - 1 - @len) bytes before @path; presumably the string was built
+ * from the end of a PATH_MAX-sized name buffer by ceph_mdsc_build_path() --
+ * confirm against that function.  A NULL @path is ignored.
+ */
+static inline void ceph_mdsc_free_path(char *path, int len)
+{
+       char *buf;
+
+       if (!path)
+               return;
+
+       /* step back from the string to the pointer that gets released */
+       buf = path - (PATH_MAX - 1 - len);
+       __putname(buf);
+}
+
 extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
                                  int stop_on_nosnap);
 
index 1a2c5d390f7f184705b3bde1f68f36074b6bf39e..701b4fb0fb5a4cd576880139ca2e183dc0b39e98 100644 (file)
@@ -205,7 +205,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
                dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
                     i+1, n, global_id, mds, inc,
-                    ceph_pr_addr(&addr.in_addr),
+                    ceph_pr_addr(&addr),
                     ceph_mds_state_name(state));
 
                if (mds < 0 || state <= 0)
index 9455d3aef0c3c1b50f5be96e3a804424e55597b4..c4522212872c9fe5f5a3dd4515b19e688cce805a 100644 (file)
@@ -22,7 +22,16 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
 static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
        struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-       return atomic64_read(&mdsc->quotarealms_count) > 0;
+       struct super_block *sb = mdsc->fsc->sb;
+
+       if (atomic64_read(&mdsc->quotarealms_count) > 0)
+               return true;
+       /* if root is the real CephFS root, we don't have quota realms */
+       if (sb->s_root->d_inode &&
+           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+               return false;
+       /* otherwise, we can't know for sure */
+       return true;
 }
 
 void ceph_handle_quota(struct ceph_mds_client *mdsc,
@@ -68,6 +77,108 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
        iput(inode);
 }
 
+/*
+ * Return the mdsc->quotarealms_inodes tree entry for @ino, inserting a fresh
+ * one (no inode attached, no failure timeout) if none exists yet.  Both the
+ * search and the insertion run under mdsc->quotarealms_inodes_mutex.
+ *
+ * Returns NULL only when a new entry was needed and could not be allocated.
+ */
+static struct ceph_quotarealm_inode *
+find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino)
+{
+       struct ceph_quotarealm_inode *entry;
+       struct rb_node **link, *parent = NULL;
+
+       mutex_lock(&mdsc->quotarealms_inodes_mutex);
+
+       /* descend to the matching node, or to the empty slot for @ino */
+       link = &mdsc->quotarealms_inodes.rb_node;
+       while (*link) {
+               parent = *link;
+               entry = container_of(parent, struct ceph_quotarealm_inode,
+                                    node);
+               if (ino == entry->ino)
+                       goto out_unlock;
+               if (ino < entry->ino)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       /* Not found, create a new one and insert it */
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry) {
+               pr_warn("Failed to alloc quotarealms_inode\n");
+               goto out_unlock;
+       }
+       entry->ino = ino;
+       entry->inode = NULL;
+       entry->timeout = 0;
+       mutex_init(&entry->mutex);
+       rb_link_node(&entry->node, parent, link);
+       rb_insert_color(&entry->node, &mdsc->quotarealms_inodes);
+
+out_unlock:
+       mutex_unlock(&mdsc->quotarealms_inodes_mutex);
+       return entry;
+}
+
+/*
+ * This function will try to lookup a realm inode which isn't visible in the
+ * filesystem mountpoint.  A tree of these kinds of inodes (not visible) is
+ * maintained in the mdsc and freed only when the filesystem is umounted.
+ *
+ * Note that these inodes are kept in this tree even if the lookup fails, which
+ * allows to prevent useless lookup requests: after a failure, no new lookup is
+ * issued for the same inode for 60 seconds.
+ *
+ * Returns the inode cached in the tree entry (no additional reference is taken
+ * here for the caller), NULL if no entry could be allocated or a lookup failed
+ * recently, or the ERR_PTR() from a lookup that has just failed.
+ */
+static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
+                                            struct super_block *sb,
+                                            struct ceph_snap_realm *realm)
+{
+       struct ceph_quotarealm_inode *qri;
+       struct inode *in;
+
+       qri = find_quotarealm_inode(mdsc, realm->ino);
+       if (!qri)
+               return NULL;
+
+       mutex_lock(&qri->mutex);
+       /*
+        * Snapshot the cached inode while the mutex is held, so the value
+        * returned is the one examined under the lock rather than a second,
+        * unlocked read of qri->inode.
+        */
+       in = qri->inode;
+       if (in) {
+               /* A request has already returned the inode */
+               goto out_unlock;
+       }
+       /* Check if this inode lookup has failed recently ('in' is NULL here) */
+       if (qri->timeout && time_before_eq(jiffies, qri->timeout))
+               goto out_unlock;
+
+       in = ceph_lookup_inode(sb, realm->ino);
+       if (IS_ERR(in)) {
+               pr_warn("Can't lookup inode %llx (err: %ld)\n",
+                       realm->ino, PTR_ERR(in));
+               /* back off for 60s before this inode is looked up again */
+               qri->timeout = jiffies + msecs_to_jiffies(60 * 1000);
+       } else {
+               qri->timeout = 0;
+               qri->inode = in;
+       }
+
+out_unlock:
+       mutex_unlock(&qri->mutex);
+       return in;
+}
+
+/*
+ * Empty the quotarealms_inodes tree: unlink every entry, iput() the inode it
+ * caches and free the entry.  Called from ceph_mdsc_pre_umount().
+ */
+void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
+{
+       struct rb_root *tree = &mdsc->quotarealms_inodes;
+       struct rb_node *first;
+
+       /*
+        * The tree is presumably quiescent by now, but the mutex is held
+        * anyway so that teardown follows the same locking rule as
+        * find_quotarealm_inode().
+        */
+       mutex_lock(&mdsc->quotarealms_inodes_mutex);
+       for (first = rb_first(tree); first; first = rb_first(tree)) {
+               struct ceph_quotarealm_inode *entry =
+                       rb_entry(first, struct ceph_quotarealm_inode, node);
+
+               rb_erase(first, tree);
+               /* entry->inode is still NULL if no lookup ever succeeded */
+               iput(entry->inode);
+               kfree(entry);
+       }
+       mutex_unlock(&mdsc->quotarealms_inodes_mutex);
+}
+
 /*
  * This function walks through the snaprealm for an inode and returns the
  * ceph_snap_realm for the first snaprealm that has quotas set (either max_files
@@ -76,9 +187,15 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
  *
  * Note that the caller is responsible for calling ceph_put_snap_realm() on the
  * returned realm.
+ *
+ * Callers of this function need to hold mdsc->snap_rwsem.  However, if there's
+ * a need to do an inode lookup, this rwsem will be temporarily dropped.  Hence
+ * the 'retry' argument: if rwsem needs to be dropped and 'retry' is 'false'
+ * this function will return -EAGAIN; otherwise, the snaprealms walk-through
+ * will be restarted.
  */
 static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
-                                              struct inode *inode)
+                                              struct inode *inode, bool retry)
 {
        struct ceph_inode_info *ci = NULL;
        struct ceph_snap_realm *realm, *next;
@@ -88,6 +205,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return NULL;
 
+restart:
        realm = ceph_inode(inode)->i_snap_realm;
        if (realm)
                ceph_get_snap_realm(mdsc, realm);
@@ -95,11 +213,25 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
                pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) "
                                   "null i_snap_realm\n", ceph_vinop(inode));
        while (realm) {
+               bool has_inode;
+
                spin_lock(&realm->inodes_with_caps_lock);
-               in = realm->inode ? igrab(realm->inode) : NULL;
+               has_inode = realm->inode;
+               in = has_inode ? igrab(realm->inode) : NULL;
                spin_unlock(&realm->inodes_with_caps_lock);
-               if (!in)
+               if (has_inode && !in)
                        break;
+               if (!in) {
+                       up_read(&mdsc->snap_rwsem);
+                       in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+                       down_read(&mdsc->snap_rwsem);
+                       if (IS_ERR_OR_NULL(in))
+                               break;
+                       ceph_put_snap_realm(mdsc, realm);
+                       if (!retry)
+                               return ERR_PTR(-EAGAIN);
+                       goto restart;
+               }
 
                ci = ceph_inode(in);
                has_quota = __ceph_has_any_quota(ci);
@@ -125,9 +257,22 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
        struct ceph_snap_realm *old_realm, *new_realm;
        bool is_same;
 
+restart:
+       /*
+        * We need to lookup 2 quota realms atomically, i.e. with snap_rwsem.
+        * However, get_quota_realm may drop it temporarily.  By setting the
+        * 'retry' parameter to 'false', we'll get -EAGAIN if the rwsem was
+        * dropped and we can then restart the whole operation.
+        */
        down_read(&mdsc->snap_rwsem);
-       old_realm = get_quota_realm(mdsc, old);
-       new_realm = get_quota_realm(mdsc, new);
+       old_realm = get_quota_realm(mdsc, old, true);
+       new_realm = get_quota_realm(mdsc, new, false);
+       if (PTR_ERR(new_realm) == -EAGAIN) {
+               up_read(&mdsc->snap_rwsem);
+               if (old_realm)
+                       ceph_put_snap_realm(mdsc, old_realm);
+               goto restart;
+       }
        is_same = (old_realm == new_realm);
        up_read(&mdsc->snap_rwsem);
 
@@ -166,6 +311,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
                return false;
 
        down_read(&mdsc->snap_rwsem);
+restart:
        realm = ceph_inode(inode)->i_snap_realm;
        if (realm)
                ceph_get_snap_realm(mdsc, realm);
@@ -173,12 +319,23 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
                pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) "
                                   "null i_snap_realm\n", ceph_vinop(inode));
        while (realm) {
+               bool has_inode;
+
                spin_lock(&realm->inodes_with_caps_lock);
-               in = realm->inode ? igrab(realm->inode) : NULL;
+               has_inode = realm->inode;
+               in = has_inode ? igrab(realm->inode) : NULL;
                spin_unlock(&realm->inodes_with_caps_lock);
-               if (!in)
+               if (has_inode && !in)
                        break;
-
+               if (!in) {
+                       up_read(&mdsc->snap_rwsem);
+                       in = lookup_quotarealm_inode(mdsc, inode->i_sb, realm);
+                       down_read(&mdsc->snap_rwsem);
+                       if (IS_ERR_OR_NULL(in))
+                               break;
+                       ceph_put_snap_realm(mdsc, realm);
+                       goto restart;
+               }
                ci = ceph_inode(in);
                spin_lock(&ci->i_ceph_lock);
                if (op == QUOTA_CHECK_MAX_FILES_OP) {
@@ -314,7 +471,7 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
        bool is_updated = false;
 
        down_read(&mdsc->snap_rwsem);
-       realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root));
+       realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true);
        up_read(&mdsc->snap_rwsem);
        if (!realm)
                return false;
index 285edda4fc3b4ec122975062f09d6497e41a0cbe..c864b44c8341d4ec211175d9efe0c75905067e14 100644 (file)
@@ -845,6 +845,12 @@ static void ceph_umount_begin(struct super_block *sb)
        return;
 }
 
+/*
+ * ->remount_fs handler (see ceph_super_ops): flushes the filesystem with
+ * sync_filesystem() and reports success.  @flags and @data are not examined,
+ * and the return value of sync_filesystem() is not propagated.
+ */
+static int ceph_remount(struct super_block *sb, int *flags, char *data)
+{
+       sync_filesystem(sb);
+       return 0;
+}
+
 static const struct super_operations ceph_super_ops = {
        .alloc_inode    = ceph_alloc_inode,
        .destroy_inode  = ceph_destroy_inode,
@@ -853,6 +859,7 @@ static const struct super_operations ceph_super_ops = {
        .drop_inode     = ceph_drop_inode,
        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
+       .remount_fs     = ceph_remount,
        .show_options   = ceph_show_options,
        .statfs         = ceph_statfs,
        .umount_begin   = ceph_umount_begin,
index c5b4a05905c01f7ce738a7123fe6db23945c2c7e..6edab9a750f8a00211f7ccf2aa659982c684330a 100644 (file)
@@ -1083,6 +1083,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /* export.c */
 extern const struct export_operations ceph_export_ops;
+struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino);
 
 /* locks.c */
 extern __init void ceph_flock_init(void);
@@ -1133,5 +1134,6 @@ extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
                                                loff_t newlen);
 extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
                                     struct kstatfs *buf);
+extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
 
 #endif /* _FS_CEPH_SUPER_H */
index 7ede7306599f47449bdd02533db47bfede2c81df..1e21b2528cfb3e8397db38ab0d927ef047ab3e37 100644 (file)
@@ -77,7 +77,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
                goto name_is_IP_address;
 
        /* Perform the upcall */
-       rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+       rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL, false);
        if (rc < 0)
                cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
                         __func__, len, len, hostname);
index 591e82ba443cd36bae59cb621acd2fad178d23ac..5e7932d668ab74b87e063af86694ff4de3cdc924 100644 (file)
@@ -1757,12 +1757,19 @@ int configfs_register_group(struct config_group *parent_group,
 
        inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
        ret = create_default_group(parent_group, group);
-       if (!ret) {
-               spin_lock(&configfs_dirent_lock);
-               configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
-               spin_unlock(&configfs_dirent_lock);
-       }
+       if (ret)
+               goto err_out;
+
+       spin_lock(&configfs_dirent_lock);
+       configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
+       spin_unlock(&configfs_dirent_lock);
+       inode_unlock(d_inode(parent));
+       return 0;
+err_out:
        inode_unlock(d_inode(parent));
+       mutex_lock(&subsys->su_mutex);
+       unlink_group(group);
+       mutex_unlock(&subsys->su_mutex);
        return ret;
 }
 EXPORT_SYMBOL(configfs_register_group);
index a7d3df85736dfba49c461ed54a17b5db0d3bfa44..e6a700f014528088ef0e15260b9a5d4d95822a64 100644 (file)
@@ -22,7 +22,7 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
        char *ip_addr = NULL;
        int ip_len;
 
-       ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+       ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL, false);
        if (ip_len > 0)
                ret = rpc_pton(net, ip_addr, ip_len, sa, salen);
        else
diff --git a/include/asm-generic/segment.h b/include/asm-generic/segment.h
deleted file mode 100644 (file)
index 5580eac..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_GENERIC_SEGMENT_H
-#define __ASM_GENERIC_SEGMENT_H
-/*
- * Only here because we have some old header files that expect it...
- *
- * New architectures probably don't want to have their own version.
- */
-
-#endif /* __ASM_GENERIC_SEGMENT_H */
index b3d2241e03f81c5d11d8bfb9bacb8f2e6676292f..e935318804f8aedb6319c4f63f89e5bf8a46c5b0 100644 (file)
@@ -9,7 +9,63 @@
  */
 #include <linux/string.h>
 
-#include <asm/segment.h>
+#ifdef CONFIG_UACCESS_MEMCPY
+/*
+ * CONFIG_UACCESS_MEMCPY flavour of raw_copy_from_user(): the user pointer is
+ * dereferenced directly.  When @n is a compile-time constant of 1, 2 or 4
+ * bytes (or 8 with CONFIG_64BIT) a single load/store is used, otherwise
+ * memcpy().  Always returns 0.
+ */
+static inline __must_check unsigned long
+raw_copy_from_user(void *to, const void __user * from, unsigned long n)
+{
+       if (__builtin_constant_p(n)) {
+               /* the casts keep 'const': the source is only ever read */
+               switch(n) {
+               case 1:
+                       *(u8 *)to = *(const u8 __force *)from;
+                       return 0;
+               case 2:
+                       *(u16 *)to = *(const u16 __force *)from;
+                       return 0;
+               case 4:
+                       *(u32 *)to = *(const u32 __force *)from;
+                       return 0;
+#ifdef CONFIG_64BIT
+               case 8:
+                       *(u64 *)to = *(const u64 __force *)from;
+                       return 0;
+#endif
+               default:
+                       /* any other constant size takes the memcpy() path */
+                       break;
+               }
+       }
+
+       memcpy(to, (const void __force *)from, n);
+       return 0;
+}
+
+/*
+ * CONFIG_UACCESS_MEMCPY flavour of raw_copy_to_user(): the user pointer is
+ * written directly.  When @n is a compile-time constant of 1, 2 or 4 bytes
+ * (or 8 with CONFIG_64BIT) a single load/store is used, otherwise memcpy().
+ * Always returns 0.
+ */
+static inline __must_check unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+       if (__builtin_constant_p(n)) {
+               /* the casts keep 'const': @from is only ever read */
+               switch(n) {
+               case 1:
+                       *(u8 __force *)to = *(const u8 *)from;
+                       return 0;
+               case 2:
+                       *(u16 __force *)to = *(const u16 *)from;
+                       return 0;
+               case 4:
+                       *(u32 __force *)to = *(const u32 *)from;
+                       return 0;
+#ifdef CONFIG_64BIT
+               case 8:
+                       *(u64 __force *)to = *(const u64 *)from;
+                       return 0;
+#endif
+               default:
+                       /* any other constant size takes the memcpy() path */
+                       break;
+               }
+       }
+
+       memcpy((void __force *)to, from, n);
+       return 0;
+}
+#define INLINE_COPY_FROM_USER
+#define INLINE_COPY_TO_USER
+#endif /* CONFIG_UACCESS_MEMCPY */
 
 #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
 
index 4903deb0777a746068ecd7f5cc5a472276885cfe..3ac0feaf2b5ee044f5c457a8c6642bb4037069b6 100644 (file)
@@ -436,6 +436,12 @@ union ceph_mds_request_args {
                __le64 length; /* num bytes to lock from start */
                __u8 wait; /* will caller wait for lock to become available? */
        } __attribute__ ((packed)) filelock_change;
+       struct {
+               __le32 mask;                 /* CEPH_CAP_* */
+               __le64 snapid;
+               __le64 parent;
+               __le32 hash;
+       } __attribute__ ((packed)) lookupino;
 } __attribute__ ((packed));
 
 #define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
index 800a2128d411b3a1f39ec0ab4f2068a4b4d76a52..23895d178149bd2ec5fb7984419976e8e13cbc1a 100644 (file)
@@ -323,7 +323,8 @@ struct ceph_connection {
 };
 
 
-extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
+extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
+
 extern int ceph_parse_ips(const char *c, const char *end,
                          struct ceph_entity_addr *addr,
                          int max_count, int *count);
index 5675b1f09bc5c2fa813f4e8e714a791f2666ab1c..e081b56f1c1dacf60a850cbafface5e8e481f7e3 100644 (file)
@@ -110,17 +110,16 @@ struct ceph_object_id {
        int name_len;
 };
 
+/*
+ * Initializer for a struct ceph_object_id: points ->name at the object's own
+ * inline_name buffer.  Members not named here (e.g. name_len) are
+ * zero-initialized.
+ */
+#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name }
+
+/* Declare a struct ceph_object_id variable @oid, initialized as above. */
+#define CEPH_DEFINE_OID_ONSTACK(oid)                           \
+       struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
+
 static inline void ceph_oid_init(struct ceph_object_id *oid)
 {
-       oid->name = oid->inline_name;
-       oid->name_len = 0;
+       *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
 }
 
-#define CEPH_OID_INIT_ONSTACK(oid)                                     \
-    ({ ceph_oid_init(&oid); oid; })
-#define CEPH_DEFINE_OID_ONSTACK(oid)                                   \
-       struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid)
-
 static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
 {
        return oid->name == oid->inline_name && !oid->name_len;
index b0672756d0562697238cb9cf53775c3c48d87a4a..e1f51d607cc541924cc54c43aaa7b80c06a6322e 100644 (file)
@@ -62,7 +62,8 @@ typedef int (*dm_clone_and_map_request_fn) (struct dm_target *ti,
                                            struct request *rq,
                                            union map_info *map_context,
                                            struct request **clone);
-typedef void (*dm_release_clone_request_fn) (struct request *clone);
+typedef void (*dm_release_clone_request_fn) (struct request *clone,
+                                            union map_info *map_context);
 
 /*
  * Returns:
index 34a744a1bafcbc84c4e2992435f14b71610a4c88..f2b3ae22e6b77bf3ba37ed7b42abc94ca95222cb 100644 (file)
@@ -27,6 +27,7 @@
 #include <uapi/linux/dns_resolver.h>
 
 extern int dns_query(const char *type, const char *name, size_t namelen,
-                    const char *options, char **_result, time64_t *_expiry);
+                    const char *options, char **_result, time64_t *_expiry,
+                    bool invalidate);
 
 #endif /* _LINUX_DNS_RESOLVER_H */
index d3b4db8953408184b1a5213a31f871da0663a0a7..e951228db4b2c64f68a05f73bd6e058652699730 100644 (file)
@@ -789,7 +789,7 @@ static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
 {
        n->next = prev->next;
-       WRITE_ONCE(prev->next, n);
+       prev->next = n;
        n->pprev = &prev->next;
 
        if (n->next)
index 3fc2cc57ba1bc6d5badfd24ef67982683457507e..ae1b541446c906158cb057538b62241161e4b361 100644 (file)
@@ -86,6 +86,32 @@ static inline void hlist_bl_add_head(struct hlist_bl_node *n,
        hlist_bl_set_first(h, n);
 }
 
+/*
+ * Insert @n immediately before @next.  @next must already be on a list, as
+ * its ->pprev is dereferenced.
+ */
+static inline void hlist_bl_add_before(struct hlist_bl_node *n,
+                                      struct hlist_bl_node *next)
+{
+       struct hlist_bl_node **pprev = next->pprev;
+
+       /* set up the new node and @next's back-pointer first ... */
+       n->pprev = pprev;
+       n->next = next;
+       next->pprev = &n->next;
+
+       /* ... then publish @n through the predecessor's forward pointer */
+       /* pprev may be `first`, so be careful not to lose the lock bit */
+       WRITE_ONCE(*pprev,
+                  (struct hlist_bl_node *)
+                       ((uintptr_t)n | ((uintptr_t)*pprev & LIST_BL_LOCKMASK)));
+}
+
+/*
+ * Insert @n immediately after @prev.  Only node ->next pointers are written
+ * here, never the list head's `first`, and no LIST_BL_LOCKMASK handling is
+ * done.
+ */
+static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
+                                      struct hlist_bl_node *prev)
+{
+       n->next = prev->next;
+       n->pprev = &prev->next;
+       prev->next = n;
+
+       /* fix up the back-pointer of the old successor, if there was one */
+       if (n->next)
+               n->next->pprev = &n->next;
+}
+
 static inline void __hlist_bl_del(struct hlist_bl_node *n)
 {
        struct hlist_bl_node *next = n->next;
index 15eb85de92269e90d725c6da97f5615bd883dbff..659045046468fec25dfacbab285b95db5c3b028d 100644 (file)
@@ -284,11 +284,15 @@ static inline __must_check size_t array3_size(size_t a, size_t b, size_t c)
        return bytes;
 }
 
-static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c)
+/*
+ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for
+ * struct_size() below.
+ */
+static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c)
 {
        size_t bytes;
 
-       if (check_mul_overflow(n, size, &bytes))
+       if (check_mul_overflow(a, b, &bytes))
                return SIZE_MAX;
        if (check_add_overflow(bytes, c, &bytes))
                return SIZE_MAX;
index 9a5eafb7145bb1733fa8ffeaf26ff15445ee013c..abc7de77b9881a0a3ab81c50f103f0b1b782fc68 100644 (file)
@@ -61,9 +61,6 @@ struct kmem_cache {
        atomic_t allocmiss;
        atomic_t freehit;
        atomic_t freemiss;
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-       atomic_t store_user_clean;
-#endif
 
        /*
         * If debugging is enabled, then the allocator can add additional
index 4a22099ed8c01fc7c152ef37b6ecc6a52582e46d..15a4ca5d709995b55c81d08615ce7f8f68c223e7 100644 (file)
@@ -442,10 +442,10 @@ void thermal_zone_device_update(struct thermal_zone_device *,
                                enum thermal_notify_event);
 void thermal_zone_set_trips(struct thermal_zone_device *);
 
-struct thermal_cooling_device *thermal_cooling_device_register(char *, void *,
-               const struct thermal_cooling_device_ops *);
+struct thermal_cooling_device *thermal_cooling_device_register(const char *,
+               void *, const struct thermal_cooling_device_ops *);
 struct thermal_cooling_device *
-thermal_of_cooling_device_register(struct device_node *np, char *, void *,
+thermal_of_cooling_device_register(struct device_node *np, const char *, void *,
                                   const struct thermal_cooling_device_ops *);
 struct thermal_cooling_device *
 devm_thermal_of_cooling_device_register(struct device *dev,
index 2b0072fa5e92d2eb4ef6d779eb95be3f4d7e9ae3..7dec36aecbd9fe239ba1b94f81c729e6665d6bd4 100644 (file)
@@ -305,6 +305,19 @@ do {                                                                       \
        __ret;                                                          \
 })
 
+/*
+ * Slow path of wait_var_event_interruptible(): waits through
+ * ___wait_var_event() in TASK_INTERRUPTIBLE state, using schedule() to sleep.
+ */
+#define __wait_var_event_interruptible(var, condition)                 \
+       ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0,     \
+                         schedule())
+
+/*
+ * Wait on @var until @condition is true.  Evaluates to 0 if @condition already
+ * holds; otherwise to the result of __wait_var_event_interruptible().
+ * NOTE(review): presumably that is a negative error code when a signal
+ * interrupts the wait -- confirm against ___wait_var_event().
+ * @condition appears more than once in the expansion, so it may be evaluated
+ * several times.
+ */
+#define wait_var_event_interruptible(var, condition)                   \
+({                                                                     \
+       int __ret = 0;                                                  \
+       might_sleep();                                                  \
+       if (!(condition))                                               \
+               __ret = __wait_var_event_interruptible(var, condition); \
+       __ret;                                                          \
+})
+
 /**
  * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
  *
index 5c31a768249201d48459e6f603d9a00df5e8cf14..f76d2f25a82474f1ab80eb7dd7004b94f288900d 100644 (file)
@@ -92,7 +92,7 @@ struct vpbe_config {
        struct encoder_config_info *ext_encoders;
        /* amplifier information goes here */
        struct amp_config_info *amp;
-       int num_outputs;
+       unsigned int num_outputs;
        /* Order is venc outputs followed by LCD and then external encoders */
        struct vpbe_output *outputs;
 };
index 78c856cba4f538c078fada09ef3238c2bc220069..93358bfc0e1b997c1c58d536dd02ff452c26cb0b 100644 (file)
@@ -45,6 +45,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *,
                                           gfp_t,
                                           rxrpc_notify_rx_t,
                                           bool,
+                                          bool,
                                           unsigned int);
 int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *,
                           struct msghdr *, size_t,
@@ -68,5 +69,7 @@ u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
 bool rxrpc_kernel_get_reply_time(struct socket *, struct rxrpc_call *,
                                 ktime_t *);
 bool rxrpc_kernel_call_is_complete(struct rxrpc_call *);
+void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
+                              unsigned long);
 
 #endif /* _NET_RXRPC_H */
index 6b3ee9948bf17a37f5be8a730064f3e5d7374774..0b1f779572402d4736783cbc890bfbf748416b38 100644 (file)
@@ -130,6 +130,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 {
        struct rwsem_waiter *waiter, *tmp;
        long oldcount, woken = 0, adjustment = 0;
+       struct list_head wlist;
 
        /*
         * Take a peek at the queue head waiter such that we can determine
@@ -188,18 +189,43 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
         * of the queue. We know that woken will be at least 1 as we accounted
         * for above. Note we increment the 'active part' of the count by the
         * number of readers before waking any processes up.
+        *
+        * We have to do wakeup in 2 passes to prevent the possibility that
+        * the reader count may be decremented before it is incremented. It
+        * is because the to-be-woken waiter may not have slept yet. So it
+        * may see waiter->task got cleared, finish its critical section and
+        * do an unlock before the reader count increment.
+        *
+        * 1) Collect the read-waiters in a separate list, count them and
+        *    fully increment the reader count in rwsem.
+        * 2) For each waiter in the new list, clear waiter->task and
+        *    put them into wake_q to be woken up later.
         */
-       list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
-               struct task_struct *tsk;
-
+       list_for_each_entry(waiter, &sem->wait_list, list) {
                if (waiter->type == RWSEM_WAITING_FOR_WRITE)
                        break;
 
                woken++;
-               tsk = waiter->task;
+       }
+       list_cut_before(&wlist, &sem->wait_list, &waiter->list);
+
+       adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
+       lockevent_cond_inc(rwsem_wake_reader, woken);
+       if (list_empty(&sem->wait_list)) {
+               /* hit end of list above */
+               adjustment -= RWSEM_WAITING_BIAS;
+       }
+
+       if (adjustment)
+               atomic_long_add(adjustment, &sem->count);
+
+       /* 2nd pass */
+       list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+               struct task_struct *tsk;
 
+               tsk = waiter->task;
                get_task_struct(tsk);
-               list_del(&waiter->list);
+
                /*
                 * Ensure calling get_task_struct() before setting the reader
                 * waiter to nil such that rwsem_down_read_failed() cannot
@@ -213,16 +239,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
                 */
                wake_q_add_safe(wake_q, tsk);
        }
-
-       adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
-       lockevent_cond_inc(rwsem_wake_reader, woken);
-       if (list_empty(&sem->wait_list)) {
-               /* hit end of list above */
-               adjustment -= RWSEM_WAITING_BIAS;
-       }
-
-       if (adjustment)
-               atomic_long_add(adjustment, &sem->count);
 }
 
 /*
index ac5555e257334b7a33bab652e37fc43dfb0f9bac..8de4f789dc1b59cd45383a225487e5510580a023 100644 (file)
@@ -691,7 +691,7 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
                time_constant = max(time_constant, 0l);
        }
 
-       if (txc->modes & ADJ_TAI && txc->constant > 0)
+       if (txc->modes & ADJ_TAI && txc->constant >= 0)
                *time_tai = txc->constant;
 
        if (txc->modes & ADJ_OFFSET)
index 3577609b61bea447a73248bf225ba8e5a67cd1e1..8d9239a4156c65288c12ef5a5626446335315b94 100644 (file)
@@ -601,6 +601,10 @@ config ARCH_NO_SG_CHAIN
 config ARCH_HAS_PMEM_API
        bool
 
+# use memcpy to implement user copies for nommu architectures
+config UACCESS_MEMCPY
+       bool
+
 config ARCH_HAS_UACCESS_FLUSHCACHE
        bool
 
index fdfa173651ebe8d430d24a4e819d12a4390351cf..eae43952902ebfb487817b081f52685a47154464 100644 (file)
@@ -542,10 +542,6 @@ config DEBUG_SLAB
          allocation as well as poisoning memory on free to catch use of freed
          memory. This can make kmalloc/kfree-intensive workloads much slower.
 
-config DEBUG_SLAB_LEAK
-       bool "Memory leak debugging"
-       depends on DEBUG_SLAB
-
 config SLUB_DEBUG_ON
        bool "SLUB debugging on by default"
        depends on SLUB && SLUB_DEBUG
index 7660d88fd4960c8571d16f035d4d92b76fd17f2c..c94586b6255187906cfde38fda8b84f33c9fc21f 100644 (file)
@@ -10,7 +10,6 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
-#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -27,7 +26,6 @@ unsigned int __sw_hweight32(unsigned int w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight32);
-#endif
 
 unsigned int __sw_hweight16(unsigned int w)
 {
@@ -46,7 +44,6 @@ unsigned int __sw_hweight8(unsigned int w)
 }
 EXPORT_SYMBOL(__sw_hweight8);
 
-#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
@@ -69,4 +66,3 @@ unsigned long __sw_hweight64(__u64 w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight64);
-#endif
index 2915d912e89a4c5bb0f5edabc2ee8e768960263d..f7117ad9b3a34ddf3ce6cc12689eef6ebd155763 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -362,29 +362,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static inline bool is_store_user_clean(struct kmem_cache *cachep)
-{
-       return atomic_read(&cachep->store_user_clean) == 1;
-}
-
-static inline void set_store_user_clean(struct kmem_cache *cachep)
-{
-       atomic_set(&cachep->store_user_clean, 1);
-}
-
-static inline void set_store_user_dirty(struct kmem_cache *cachep)
-{
-       if (is_store_user_clean(cachep))
-               atomic_set(&cachep->store_user_clean, 0);
-}
-
-#else
-static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
-
-#endif
-
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -2552,11 +2529,6 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
        objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
        page->active++;
 
-#if DEBUG
-       if (cachep->flags & SLAB_STORE_USER)
-               set_store_user_dirty(cachep);
-#endif
-
        return objp;
 }
 
@@ -2762,10 +2734,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
        }
-       if (cachep->flags & SLAB_STORE_USER) {
-               set_store_user_dirty(cachep);
+       if (cachep->flags & SLAB_STORE_USER)
                *dbg_userword(cachep, objp) = (void *)caller;
-       }
 
        objnr = obj_to_index(cachep, page, objp);
 
@@ -4184,200 +4154,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
        return res;
 }
 
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static inline int add_caller(unsigned long *n, unsigned long v)
-{
-       unsigned long *p;
-       int l;
-       if (!v)
-               return 1;
-       l = n[1];
-       p = n + 2;
-       while (l) {
-               int i = l/2;
-               unsigned long *q = p + 2 * i;
-               if (*q == v) {
-                       q[1]++;
-                       return 1;
-               }
-               if (*q > v) {
-                       l = i;
-               } else {
-                       p = q + 2;
-                       l -= i + 1;
-               }
-       }
-       if (++n[1] == n[0])
-               return 0;
-       memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
-       p[0] = v;
-       p[1] = 1;
-       return 1;
-}
-
-static void handle_slab(unsigned long *n, struct kmem_cache *c,
-                                               struct page *page)
-{
-       void *p;
-       int i, j;
-       unsigned long v;
-
-       if (n[0] == n[1])
-               return;
-       for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-               bool active = true;
-
-               for (j = page->active; j < c->num; j++) {
-                       if (get_free_obj(page, j) == i) {
-                               active = false;
-                               break;
-                       }
-               }
-
-               if (!active)
-                       continue;
-
-               /*
-                * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
-                * mapping is established when actual object allocation and
-                * we could mistakenly access the unmapped object in the cpu
-                * cache.
-                */
-               if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
-                       continue;
-
-               if (!add_caller(n, v))
-                       return;
-       }
-}
-
-static void show_symbol(struct seq_file *m, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
-       unsigned long offset, size;
-       char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
-
-       if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
-               seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
-               if (modname[0])
-                       seq_printf(m, " [%s]", modname);
-               return;
-       }
-#endif
-       seq_printf(m, "%px", (void *)address);
-}
-
-static int leaks_show(struct seq_file *m, void *p)
-{
-       struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
-                                              root_caches_node);
-       struct page *page;
-       struct kmem_cache_node *n;
-       const char *name;
-       unsigned long *x = m->private;
-       int node;
-       int i;
-
-       if (!(cachep->flags & SLAB_STORE_USER))
-               return 0;
-       if (!(cachep->flags & SLAB_RED_ZONE))
-               return 0;
-
-       /*
-        * Set store_user_clean and start to grab stored user information
-        * for all objects on this cache. If some alloc/free requests comes
-        * during the processing, information would be wrong so restart
-        * whole processing.
-        */
-       do {
-               drain_cpu_caches(cachep);
-               /*
-                * drain_cpu_caches() could make kmemleak_object and
-                * debug_objects_cache dirty, so reset afterwards.
-                */
-               set_store_user_clean(cachep);
-
-               x[1] = 0;
-
-               for_each_kmem_cache_node(cachep, node, n) {
-
-                       check_irq_on();
-                       spin_lock_irq(&n->list_lock);
-
-                       list_for_each_entry(page, &n->slabs_full, slab_list)
-                               handle_slab(x, cachep, page);
-                       list_for_each_entry(page, &n->slabs_partial, slab_list)
-                               handle_slab(x, cachep, page);
-                       spin_unlock_irq(&n->list_lock);
-               }
-       } while (!is_store_user_clean(cachep));
-
-       name = cachep->name;
-       if (x[0] == x[1]) {
-               /* Increase the buffer size */
-               mutex_unlock(&slab_mutex);
-               m->private = kcalloc(x[0] * 4, sizeof(unsigned long),
-                                    GFP_KERNEL);
-               if (!m->private) {
-                       /* Too bad, we are really out */
-                       m->private = x;
-                       mutex_lock(&slab_mutex);
-                       return -ENOMEM;
-               }
-               *(unsigned long *)m->private = x[0] * 2;
-               kfree(x);
-               mutex_lock(&slab_mutex);
-               /* Now make sure this entry will be retried */
-               m->count = m->size;
-               return 0;
-       }
-       for (i = 0; i < x[1]; i++) {
-               seq_printf(m, "%s: %lu ", name, x[2*i+3]);
-               show_symbol(m, x[2*i+2]);
-               seq_putc(m, '\n');
-       }
-
-       return 0;
-}
-
-static const struct seq_operations slabstats_op = {
-       .start = slab_start,
-       .next = slab_next,
-       .stop = slab_stop,
-       .show = leaks_show,
-};
-
-static int slabstats_open(struct inode *inode, struct file *file)
-{
-       unsigned long *n;
-
-       n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
-       if (!n)
-               return -ENOMEM;
-
-       *n = PAGE_SIZE / (2 * sizeof(unsigned long));
-
-       return 0;
-}
-
-static const struct file_operations proc_slabstats_operations = {
-       .open           = slabstats_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = seq_release_private,
-};
-#endif
-
-static int __init slab_proc_init(void)
-{
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-       proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
-#endif
-       return 0;
-}
-module_init(slab_proc_init);
-
 #ifdef CONFIG_HARDENED_USERCOPY
 /*
  * Rejects incorrectly sized objects and objects that are to be copied
index 2105a6eaa66cdaa038e43363341d3dff2896d1a9..4cc28541281befa29f67317c399e376b33c99624 100644 (file)
@@ -271,7 +271,7 @@ static int decode_locker(void **p, void *end, struct ceph_locker *locker)
 
        dout("%s %s%llu cookie %s addr %s\n", __func__,
             ENTITY_NAME(locker->id.name), locker->id.cookie,
-            ceph_pr_addr(&locker->info.addr.in_addr));
+            ceph_pr_addr(&locker->info.addr));
        return 0;
 }
 
index 46f65709a6ff8556f00517d462c1abc053aae8a0..63aef9915f759bd447fc6ae365367d2e770afad5 100644 (file)
@@ -46,7 +46,7 @@ static int monmap_show(struct seq_file *s, void *p)
 
                seq_printf(s, "\t%s%lld\t%s\n",
                           ENTITY_NAME(inst->name),
-                          ceph_pr_addr(&inst->addr.in_addr));
+                          ceph_pr_addr(&inst->addr));
        }
        return 0;
 }
@@ -82,7 +82,7 @@ static int osdmap_show(struct seq_file *s, void *p)
                char sb[64];
 
                seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
-                          i, ceph_pr_addr(&addr->in_addr),
+                          i, ceph_pr_addr(addr),
                           ((map->osd_weight[i]*100) >> 16),
                           ceph_osdmap_state_str(sb, sizeof(sb), state),
                           ((ceph_get_primary_affinity(map, i)*100) >> 16));
index 3083988ce729dbe01771e9433b7de72e484394f9..cd0b094468b612b7bdb8231ddef8fa9847cf2dd5 100644 (file)
@@ -186,17 +186,18 @@ static atomic_t addr_str_seq = ATOMIC_INIT(0);
 
 static struct page *zero_page;         /* used in certain error cases */
 
-const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
 {
        int i;
        char *s;
-       struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
-       struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+       struct sockaddr_storage ss = addr->in_addr; /* align */
+       struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
+       struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss;
 
        i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
        s = addr_str[i];
 
-       switch (ss->ss_family) {
+       switch (ss.ss_family) {
        case AF_INET:
                snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
                         ntohs(in4->sin_port));
@@ -209,7 +210,7 @@ const char *ceph_pr_addr(const struct sockaddr_storage *ss)
 
        default:
                snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
-                        ss->ss_family);
+                        ss.ss_family);
        }
 
        return s;
@@ -449,7 +450,7 @@ static void set_sock_callbacks(struct socket *sock,
  */
 static int ceph_tcp_connect(struct ceph_connection *con)
 {
-       struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+       struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
        struct socket *sock;
        unsigned int noio_flag;
        int ret;
@@ -458,7 +459,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 
        /* sock_create_kern() allocates with GFP_KERNEL */
        noio_flag = memalloc_noio_save();
-       ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
+       ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family,
                               SOCK_STREAM, IPPROTO_TCP, &sock);
        memalloc_noio_restore(noio_flag);
        if (ret)
@@ -471,18 +472,18 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 
        set_sock_callbacks(sock, con);
 
-       dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
+       dout("connect %s\n", ceph_pr_addr(&con->peer_addr));
 
        con_sock_state_connecting(con);
-       ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+       ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
                                 O_NONBLOCK);
        if (ret == -EINPROGRESS) {
                dout("connect %s EINPROGRESS sk_state = %u\n",
-                    ceph_pr_addr(&con->peer_addr.in_addr),
+                    ceph_pr_addr(&con->peer_addr),
                     sock->sk->sk_state);
        } else if (ret < 0) {
                pr_err("connect %s error %d\n",
-                      ceph_pr_addr(&con->peer_addr.in_addr), ret);
+                      ceph_pr_addr(&con->peer_addr), ret);
                sock_release(sock);
                return ret;
        }
@@ -669,8 +670,7 @@ static void reset_connection(struct ceph_connection *con)
 void ceph_con_close(struct ceph_connection *con)
 {
        mutex_lock(&con->mutex);
-       dout("con_close %p peer %s\n", con,
-            ceph_pr_addr(&con->peer_addr.in_addr));
+       dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
        con->state = CON_STATE_CLOSED;
 
        con_flag_clear(con, CON_FLAG_LOSSYTX);  /* so we retry next connect */
@@ -694,7 +694,7 @@ void ceph_con_open(struct ceph_connection *con,
                   struct ceph_entity_addr *addr)
 {
        mutex_lock(&con->mutex);
-       dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+       dout("con_open %p %s\n", con, ceph_pr_addr(addr));
 
        WARN_ON(con->state != CON_STATE_CLOSED);
        con->state = CON_STATE_PREOPEN;
@@ -1788,21 +1788,22 @@ static int verify_hello(struct ceph_connection *con)
 {
        if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
                pr_err("connect to %s got bad banner\n",
-                      ceph_pr_addr(&con->peer_addr.in_addr));
+                      ceph_pr_addr(&con->peer_addr));
                con->error_msg = "protocol error, bad banner";
                return -1;
        }
        return 0;
 }
 
-static bool addr_is_blank(struct sockaddr_storage *ss)
+static bool addr_is_blank(struct ceph_entity_addr *addr)
 {
-       struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr;
-       struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr;
+       struct sockaddr_storage ss = addr->in_addr; /* align */
+       struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
+       struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr;
 
-       switch (ss->ss_family) {
+       switch (ss.ss_family) {
        case AF_INET:
-               return addr->s_addr == htonl(INADDR_ANY);
+               return addr4->s_addr == htonl(INADDR_ANY);
        case AF_INET6:
                return ipv6_addr_any(addr6);
        default:
@@ -1810,25 +1811,25 @@ static bool addr_is_blank(struct sockaddr_storage *ss)
        }
 }
 
-static int addr_port(struct sockaddr_storage *ss)
+static int addr_port(struct ceph_entity_addr *addr)
 {
-       switch (ss->ss_family) {
+       switch (get_unaligned(&addr->in_addr.ss_family)) {
        case AF_INET:
-               return ntohs(((struct sockaddr_in *)ss)->sin_port);
+               return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port));
        case AF_INET6:
-               return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+               return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port));
        }
        return 0;
 }
 
-static void addr_set_port(struct sockaddr_storage *ss, int p)
+static void addr_set_port(struct ceph_entity_addr *addr, int p)
 {
-       switch (ss->ss_family) {
+       switch (get_unaligned(&addr->in_addr.ss_family)) {
        case AF_INET:
-               ((struct sockaddr_in *)ss)->sin_port = htons(p);
+               put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port);
                break;
        case AF_INET6:
-               ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+               put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port);
                break;
        }
 }
@@ -1836,21 +1837,18 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)
 /*
  * Unlike other *_pton function semantics, zero indicates success.
  */
-static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
+static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr,
                char delim, const char **ipend)
 {
-       struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
-       struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+       memset(&addr->in_addr, 0, sizeof(addr->in_addr));
 
-       memset(ss, 0, sizeof(*ss));
-
-       if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) {
-               ss->ss_family = AF_INET;
+       if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) {
+               put_unaligned(AF_INET, &addr->in_addr.ss_family);
                return 0;
        }
 
-       if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) {
-               ss->ss_family = AF_INET6;
+       if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) {
+               put_unaligned(AF_INET6, &addr->in_addr.ss_family);
                return 0;
        }
 
@@ -1862,7 +1860,7 @@ static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
  */
 #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
 static int ceph_dns_resolve_name(const char *name, size_t namelen,
-               struct sockaddr_storage *ss, char delim, const char **ipend)
+               struct ceph_entity_addr *addr, char delim, const char **ipend)
 {
        const char *end, *delim_p;
        char *colon_p, *ip_addr = NULL;
@@ -1889,9 +1887,9 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
                return -EINVAL;
 
        /* do dns_resolve upcall */
-       ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL);
+       ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL, false);
        if (ip_len > 0)
-               ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL);
+               ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
        else
                ret = -ESRCH;
 
@@ -1900,13 +1898,13 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
        *ipend = end;
 
        pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
-                       ret, ret ? "failed" : ceph_pr_addr(ss));
+                       ret, ret ? "failed" : ceph_pr_addr(addr));
 
        return ret;
 }
 #else
 static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
-               struct sockaddr_storage *ss, char delim, const char **ipend)
+               struct ceph_entity_addr *addr, char delim, const char **ipend)
 {
        return -EINVAL;
 }
@@ -1917,13 +1915,13 @@ static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
  * then try to extract a hostname to resolve using userspace DNS upcall.
  */
 static int ceph_parse_server_name(const char *name, size_t namelen,
-                       struct sockaddr_storage *ss, char delim, const char **ipend)
+               struct ceph_entity_addr *addr, char delim, const char **ipend)
 {
        int ret;
 
-       ret = ceph_pton(name, namelen, ss, delim, ipend);
+       ret = ceph_pton(name, namelen, addr, delim, ipend);
        if (ret)
-               ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend);
+               ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend);
 
        return ret;
 }
@@ -1942,7 +1940,6 @@ int ceph_parse_ips(const char *c, const char *end,
        dout("parse_ips on '%.*s'\n", (int)(end-c), c);
        for (i = 0; i < max_count; i++) {
                const char *ipend;
-               struct sockaddr_storage *ss = &addr[i].in_addr;
                int port;
                char delim = ',';
 
@@ -1951,7 +1948,7 @@ int ceph_parse_ips(const char *c, const char *end,
                        p++;
                }
 
-               ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend);
+               ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend);
                if (ret)
                        goto bad;
                ret = -EINVAL;
@@ -1982,9 +1979,9 @@ int ceph_parse_ips(const char *c, const char *end,
                        port = CEPH_MON_PORT;
                }
 
-               addr_set_port(ss, port);
+               addr_set_port(&addr[i], port);
 
-               dout("parse_ips got %s\n", ceph_pr_addr(ss));
+               dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
 
                if (p == end)
                        break;
@@ -2023,12 +2020,12 @@ static int process_banner(struct ceph_connection *con)
         */
        if (memcmp(&con->peer_addr, &con->actual_peer_addr,
                   sizeof(con->peer_addr)) != 0 &&
-           !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+           !(addr_is_blank(&con->actual_peer_addr) &&
              con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
                pr_warn("wrong peer, want %s/%d, got %s/%d\n",
-                       ceph_pr_addr(&con->peer_addr.in_addr),
+                       ceph_pr_addr(&con->peer_addr),
                        (int)le32_to_cpu(con->peer_addr.nonce),
-                       ceph_pr_addr(&con->actual_peer_addr.in_addr),
+                       ceph_pr_addr(&con->actual_peer_addr),
                        (int)le32_to_cpu(con->actual_peer_addr.nonce));
                con->error_msg = "wrong peer at address";
                return -1;
@@ -2037,16 +2034,16 @@ static int process_banner(struct ceph_connection *con)
        /*
         * did we learn our address?
         */
-       if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
-               int port = addr_port(&con->msgr->inst.addr.in_addr);
+       if (addr_is_blank(&con->msgr->inst.addr)) {
+               int port = addr_port(&con->msgr->inst.addr);
 
                memcpy(&con->msgr->inst.addr.in_addr,
                       &con->peer_addr_for_me.in_addr,
                       sizeof(con->peer_addr_for_me.in_addr));
-               addr_set_port(&con->msgr->inst.addr.in_addr, port);
+               addr_set_port(&con->msgr->inst.addr, port);
                encode_my_addr(con->msgr);
                dout("process_banner learned my addr is %s\n",
-                    ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+                    ceph_pr_addr(&con->msgr->inst.addr));
        }
 
        return 0;
@@ -2097,7 +2094,7 @@ static int process_connect(struct ceph_connection *con)
                pr_err("%s%lld %s feature set mismatch,"
                       " my %llx < server's %llx, missing %llx\n",
                       ENTITY_NAME(con->peer_name),
-                      ceph_pr_addr(&con->peer_addr.in_addr),
+                      ceph_pr_addr(&con->peer_addr),
                       sup_feat, server_feat, server_feat & ~sup_feat);
                con->error_msg = "missing required protocol features";
                reset_connection(con);
@@ -2107,7 +2104,7 @@ static int process_connect(struct ceph_connection *con)
                pr_err("%s%lld %s protocol version mismatch,"
                       " my %d != server's %d\n",
                       ENTITY_NAME(con->peer_name),
-                      ceph_pr_addr(&con->peer_addr.in_addr),
+                      ceph_pr_addr(&con->peer_addr),
                       le32_to_cpu(con->out_connect.protocol_version),
                       le32_to_cpu(con->in_reply.protocol_version));
                con->error_msg = "protocol version mismatch";
@@ -2141,7 +2138,7 @@ static int process_connect(struct ceph_connection *con)
                     le32_to_cpu(con->in_reply.connect_seq));
                pr_err("%s%lld %s connection reset\n",
                       ENTITY_NAME(con->peer_name),
-                      ceph_pr_addr(&con->peer_addr.in_addr));
+                      ceph_pr_addr(&con->peer_addr));
                reset_connection(con);
                con_out_kvec_reset(con);
                ret = prepare_write_connect(con);
@@ -2198,7 +2195,7 @@ static int process_connect(struct ceph_connection *con)
                        pr_err("%s%lld %s protocol feature mismatch,"
                               " my required %llx > server's %llx, need %llx\n",
                               ENTITY_NAME(con->peer_name),
-                              ceph_pr_addr(&con->peer_addr.in_addr),
+                              ceph_pr_addr(&con->peer_addr),
                               req_feat, server_feat, req_feat & ~server_feat);
                        con->error_msg = "missing required protocol features";
                        reset_connection(con);
@@ -2405,7 +2402,7 @@ static int read_partial_message(struct ceph_connection *con)
        if ((s64)seq - (s64)con->in_seq < 1) {
                pr_info("skipping %s%lld %s seq %lld expected %lld\n",
                        ENTITY_NAME(con->peer_name),
-                       ceph_pr_addr(&con->peer_addr.in_addr),
+                       ceph_pr_addr(&con->peer_addr),
                        seq, con->in_seq + 1);
                con->in_base_pos = -front_len - middle_len - data_len -
                        sizeof_footer(con);
@@ -2984,10 +2981,10 @@ static void ceph_con_workfn(struct work_struct *work)
 static void con_fault(struct ceph_connection *con)
 {
        dout("fault %p state %lu to peer %s\n",
-            con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+            con, con->state, ceph_pr_addr(&con->peer_addr));
 
        pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-               ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+               ceph_pr_addr(&con->peer_addr), con->error_msg);
        con->error_msg = NULL;
 
        WARN_ON(con->state != CON_STATE_CONNECTING &&
index a53e4fbb631918ccf94536849e5e25acad2dfdc6..895679d3529b8e3b77fcbcc3f9b88c5204239087 100644 (file)
@@ -76,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
             m->num_mon);
        for (i = 0; i < m->num_mon; i++)
                dout("monmap_decode  mon%d is %s\n", i,
-                    ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+                    ceph_pr_addr(&m->mon_inst[i].addr));
        return m;
 
 bad:
@@ -203,7 +203,7 @@ static void reopen_session(struct ceph_mon_client *monc)
 {
        if (!monc->hunting)
                pr_info("mon%d %s session lost, hunting for new mon\n",
-                   monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
+                   monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr));
 
        __close_session(monc);
        __open_session(monc);
@@ -1178,7 +1178,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
                __resend_generic_request(monc);
 
                pr_info("mon%d %s session established\n", monc->cur_mon,
-                       ceph_pr_addr(&monc->con.peer_addr.in_addr));
+                       ceph_pr_addr(&monc->con.peer_addr));
        }
 
 out:
index 6f739de28918638f737bc13f6ed53a786883781e..9a8eca5eda654d8bdc9ba1480d826453917078e1 100644 (file)
@@ -4926,7 +4926,7 @@ static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
 
        dout("%s %s%llu cookie %llu addr %s\n", __func__,
             ENTITY_NAME(item->name), item->cookie,
-            ceph_pr_addr(&item->addr.in_addr));
+            ceph_pr_addr(&item->addr));
        return 0;
 }
 
index 19aa32fc1802b44a971eecd6f7b9f967ec2900d9..2d260432b3be0a7ddefbd6cb68c2bb2b18163525 100644 (file)
@@ -54,6 +54,7 @@
  * @options: Request options (or NULL if no options)
  * @_result: Where to place the returned data (or NULL)
  * @_expiry: Where to store the result expiry time (or NULL)
+ * @invalidate: Always invalidate the key after use
  *
  * The data will be returned in the pointer at *result, if provided, and the
  * caller is responsible for freeing it.
@@ -69,7 +70,8 @@
  * Returns the size of the result on success, -ve error code otherwise.
  */
 int dns_query(const char *type, const char *name, size_t namelen,
-             const char *options, char **_result, time64_t *_expiry)
+             const char *options, char **_result, time64_t *_expiry,
+             bool invalidate)
 {
        struct key *rkey;
        struct user_key_payload *upayload;
@@ -157,6 +159,8 @@ int dns_query(const char *type, const char *name, size_t namelen,
        ret = len;
 put:
        up_read(&rkey->sem);
+       if (invalidate)
+               key_invalidate(rkey);
        key_put(rkey);
 out:
        kleave(" = %d", ret);
index ae8c5d7f3bf1e29460e5b96b05b7b1b1ecd4ce15..ffde5b187f5d19d9975594d1826a2ace1a9fe71c 100644 (file)
@@ -270,6 +270,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
  * @gfp: The allocation constraints
  * @notify_rx: Where to send notifications instead of socket queue
  * @upgrade: Request service upgrade for call
+ * @intr: The call is interruptible
  * @debug_id: The debug ID for tracing to be assigned to the call
  *
  * Allow a kernel service to begin a call on the nominated socket.  This just
@@ -287,6 +288,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
                                           gfp_t gfp,
                                           rxrpc_notify_rx_t notify_rx,
                                           bool upgrade,
+                                          bool intr,
                                           unsigned int debug_id)
 {
        struct rxrpc_conn_parameters cp;
@@ -311,6 +313,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
        memset(&p, 0, sizeof(p));
        p.user_call_ID = user_call_ID;
        p.tx_total_len = tx_total_len;
+       p.intr = intr;
 
        memset(&cp, 0, sizeof(cp));
        cp.local                = rx->local;
@@ -443,6 +446,31 @@ void rxrpc_kernel_new_call_notification(
 }
 EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
 
+/**
+ * rxrpc_kernel_set_max_life - Set maximum lifespan on a call
+ * @sock: The socket the call is on
+ * @call: The call to configure
+ * @hard_timeout: The maximum lifespan of the call in jiffies
+ *
+ * Set the maximum lifespan of a call.  The call will end with ETIME or
+ * ETIMEDOUT if it takes longer than this.
+ */
+void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call,
+                              unsigned long hard_timeout)
+{
+       unsigned long now;
+
+       mutex_lock(&call->user_mutex);
+
+       now = jiffies;
+       hard_timeout += now;
+       WRITE_ONCE(call->expect_term_by, hard_timeout);
+       rxrpc_reduce_call_timer(call, hard_timeout, now, rxrpc_timer_set_for_hard);
+
+       mutex_unlock(&call->user_mutex);
+}
+EXPORT_SYMBOL(rxrpc_kernel_set_max_life);
+
 /*
  * connect an RxRPC socket
  * - this just targets it at a specific destination; no actual connection
index 062ca9dc29b8ab2fa7381c606791d4fd39657962..07fc1dfa487890e800598755d2c2c9202be86ccf 100644 (file)
@@ -482,6 +482,7 @@ enum rxrpc_call_flag {
        RXRPC_CALL_BEGAN_RX_TIMER,      /* We began the expect_rx_by timer */
        RXRPC_CALL_RX_HEARD,            /* The peer responded at least once to this call */
        RXRPC_CALL_RX_UNDERRUN,         /* Got data underrun */
+       RXRPC_CALL_IS_INTR,             /* The call is interruptible */
 };
 
 /*
@@ -711,6 +712,7 @@ struct rxrpc_call_params {
                u32             normal;         /* Max time since last call packet (msec) */
        } timeouts;
        u8                      nr_timeouts;    /* Number of timeouts specified */
+       bool                    intr;           /* The call is interruptible */
 };
 
 struct rxrpc_send_params {
index fe96881a334daff644a1f9d01497771f745e9fc8..d0ca98d7aef57691664b2c91350bcae210252332 100644 (file)
@@ -241,6 +241,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
                return call;
        }
 
+       if (p->intr)
+               __set_bit(RXRPC_CALL_IS_INTR, &call->flags);
        call->tx_total_len = p->tx_total_len;
        trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
                         here, (const void *)p->user_call_ID);
index 83797b3949e2f0122d87d15a48651b3b540ab156..5cf5595a14d8e73408de49bab6765b4c7057a69a 100644 (file)
@@ -656,10 +656,14 @@ static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
 
                add_wait_queue_exclusive(&call->waitq, &myself);
                for (;;) {
-                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (test_bit(RXRPC_CALL_IS_INTR, &call->flags))
+                               set_current_state(TASK_INTERRUPTIBLE);
+                       else
+                               set_current_state(TASK_UNINTERRUPTIBLE);
                        if (call->call_id)
                                break;
-                       if (signal_pending(current)) {
+                       if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) &&
+                           signal_pending(current)) {
                                ret = -ERESTARTSYS;
                                break;
                        }
index bec64deb7b0a2794345c896827846fa8bac57e19..45a05d9a27fa122dc03440d6734b229be7c15991 100644 (file)
@@ -80,7 +80,8 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
                if (call->state >= RXRPC_CALL_COMPLETE)
                        return call->error;
 
-               if (timeout == 0 &&
+               if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) &&
+                   timeout == 0 &&
                    tx_win == tx_start && signal_pending(current))
                        return -EINTR;
 
@@ -620,6 +621,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
                .call.tx_total_len      = -1,
                .call.user_call_ID      = 0,
                .call.nr_timeouts       = 0,
+               .call.intr              = true,
                .abort_code             = 0,
                .command                = RXRPC_CMD_SEND_DATA,
                .exclusive              = false,
index 8df526c80b6575b97c8bfe72317cc628633c323a..4dd11a554b9b2f7509c1bb8725e7217df33e898f 100644 (file)
@@ -306,7 +306,7 @@ ignore it:
 
 - To skip validation of a file, add
 
-    OBJECT_FILES_NON_STANDARD_filename.o := n
+    OBJECT_FILES_NON_STANDARD_filename.o := y
 
   to the Makefile.
 
index ac743a1d53ab321a8a664fab6ef9a4f3a04b6dfa..7325d89ccad93dec63603d850da0d7a6f50f863a 100644 (file)
@@ -28,6 +28,8 @@
 #include <linux/hashtable.h>
 #include <linux/kernel.h>
 
+#define FAKE_JUMP_OFFSET -1
+
 struct alternative {
        struct list_head list;
        struct instruction *insn;
@@ -568,7 +570,7 @@ static int add_jump_destinations(struct objtool_file *file)
                    insn->type != INSN_JUMP_UNCONDITIONAL)
                        continue;
 
-               if (insn->ignore)
+               if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET)
                        continue;
 
                rela = find_rela_by_dest_range(insn->sec, insn->offset,
@@ -745,10 +747,10 @@ static int handle_group_alt(struct objtool_file *file,
                clear_insn_state(&fake_jump->state);
 
                fake_jump->sec = special_alt->new_sec;
-               fake_jump->offset = -1;
+               fake_jump->offset = FAKE_JUMP_OFFSET;
                fake_jump->type = INSN_JUMP_UNCONDITIONAL;
                fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
-               fake_jump->ignore = true;
+               fake_jump->func = orig_insn->func;
        }
 
        if (!special_alt->new_len) {
@@ -1957,7 +1959,8 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
                        return 1;
                }
 
-               func = insn->func ? insn->func->pfunc : NULL;
+               if (insn->func)
+                       func = insn->func->pfunc;
 
                if (func && insn->ignore) {
                        WARN_FUNC("BUG: why am I validating an ignored function?",
index 91750352459dfd9c3cc2bb81f99f5f9ef74ed7f3..8059ce8342477fce62e68f1205554bf7e9189525 100644 (file)
@@ -1,4 +1,3 @@
-kselftest
 gpiogpio-event-mon
 gpiogpio-hammer
 gpioinclude/
index f2ebf8cf46869fb151dd4feb96477c9374563a68..9781ca79794af774ca86b3ef0708283cfa73db0a 100644 (file)
@@ -71,6 +71,9 @@ override LDFLAGS =
 override MAKEFLAGS =
 endif
 
+# Append kselftest to KBUILD_OUTPUT to avoid cluttering
+# KBUILD_OUTPUT with selftest objects and headers installed
+# by selftests Makefile or lib.mk.
 ifneq ($(KBUILD_SRC),)
 override LDFLAGS =
 endif
@@ -79,19 +82,13 @@ ifneq ($(O),)
        BUILD := $(O)
 else
        ifneq ($(KBUILD_OUTPUT),)
-               BUILD := $(KBUILD_OUTPUT)
+               BUILD := $(KBUILD_OUTPUT)/kselftest
        else
                BUILD := $(shell pwd)
                DEFAULT_INSTALL_HDR_PATH := 1
        endif
 endif
 
-# KSFT_TAP_LEVEL is used from KSFT framework to prevent nested TAP header
-# printing from tests. Applicable to run_tests case where run_tests adds
-# TAP header prior running tests and when a test program invokes another
-# with system() call. Export it here to cover override RUN_TESTS defines.
-export KSFT_TAP_LEVEL=`echo 1`
-
 # Prepare for headers install
 top_srcdir ?= ../../..
 include $(top_srcdir)/scripts/subarch.include
@@ -169,14 +166,22 @@ clean_hotplug:
 run_pstore_crash:
        make -C pstore run_crash
 
-INSTALL_PATH ?= install
+# Use $BUILD as the default install root. $BUILD points to the
+# right output location for the following cases:
+# 1. output_dir=kernel_src
+# 2. a separate output directory is specified using O= KBUILD_OUTPUT
+# 3. a separate output directory is specified using KBUILD_OUTPUT
+#
+INSTALL_PATH ?= $(BUILD)/install
 INSTALL_PATH := $(abspath $(INSTALL_PATH))
 ALL_SCRIPT := $(INSTALL_PATH)/run_kselftest.sh
 
-install:
+install: all
 ifdef INSTALL_PATH
        @# Ask all targets to install their files
-       mkdir -p $(INSTALL_PATH)
+       mkdir -p $(INSTALL_PATH)/kselftest
+       install -m 744 kselftest/runner.sh $(INSTALL_PATH)/kselftest/
+       install -m 744 kselftest/prefix.pl $(INSTALL_PATH)/kselftest/
        @for TARGET in $(TARGETS); do \
                BUILD_TARGET=$$BUILD/$$TARGET;  \
                make OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \
@@ -186,24 +191,20 @@ ifdef INSTALL_PATH
        echo "#!/bin/sh" > $(ALL_SCRIPT)
        echo "BASE_DIR=\$$(realpath \$$(dirname \$$0))" >> $(ALL_SCRIPT)
        echo "cd \$$BASE_DIR" >> $(ALL_SCRIPT)
+       echo ". ./kselftest/runner.sh" >> $(ALL_SCRIPT)
        echo "ROOT=\$$PWD" >> $(ALL_SCRIPT)
        echo "if [ \"\$$1\" = \"--summary\" ]; then" >> $(ALL_SCRIPT)
-       echo "  OUTPUT=\$$BASE_DIR/output.log" >> $(ALL_SCRIPT)
-       echo "  cat /dev/null > \$$OUTPUT" >> $(ALL_SCRIPT)
-       echo "else" >> $(ALL_SCRIPT)
-       echo "  OUTPUT=/dev/stdout" >> $(ALL_SCRIPT)
+       echo "  logfile=\$$BASE_DIR/output.log" >> $(ALL_SCRIPT)
+       echo "  cat /dev/null > \$$logfile" >> $(ALL_SCRIPT)
        echo "fi" >> $(ALL_SCRIPT)
-       echo "export KSFT_TAP_LEVEL=1" >> $(ALL_SCRIPT)
-       echo "export skip=4" >> $(ALL_SCRIPT)
 
        for TARGET in $(TARGETS); do \
                BUILD_TARGET=$$BUILD/$$TARGET;  \
-               echo "echo ; echo TAP version 13" >> $(ALL_SCRIPT);     \
-               echo "echo Running tests in $$TARGET" >> $(ALL_SCRIPT); \
-               echo "echo ========================================" >> $(ALL_SCRIPT); \
                echo "[ -w /dev/kmsg ] && echo \"kselftest: Running tests in $$TARGET\" >> /dev/kmsg" >> $(ALL_SCRIPT); \
                echo "cd $$TARGET" >> $(ALL_SCRIPT); \
+               echo -n "run_many" >> $(ALL_SCRIPT); \
                make -s --no-print-directory OUTPUT=$$BUILD_TARGET -C $$TARGET emit_tests >> $(ALL_SCRIPT); \
+               echo "" >> $(ALL_SCRIPT);           \
                echo "cd \$$ROOT" >> $(ALL_SCRIPT); \
        done;
 
index 901b85ea6a590c1c0e042fd784ea60f75d676a6f..8f3655e590201458055af9380f271709f1502440 100644 (file)
@@ -21,6 +21,8 @@
 
 #include "../kselftest.h"
 
+#define COUNT_ISN_BPS  4
+#define COUNT_WPS      4
 
 /* Breakpoint access modes */
 enum {
@@ -220,7 +222,7 @@ static void trigger_tests(void)
                        if (!local && !global)
                                continue;
 
-                       for (i = 0; i < 4; i++) {
+                       for (i = 0; i < COUNT_ISN_BPS; i++) {
                                dummy_funcs[i]();
                                check_trapped();
                        }
@@ -292,7 +294,7 @@ static void launch_instruction_breakpoints(char *buf, int local, int global)
 {
        int i;
 
-       for (i = 0; i < 4; i++) {
+       for (i = 0; i < COUNT_ISN_BPS; i++) {
                set_breakpoint_addr(dummy_funcs[i], i);
                toggle_breakpoint(i, BP_X, 1, local, global, 1);
                ptrace(PTRACE_CONT, child_pid, NULL, 0);
@@ -314,7 +316,7 @@ static void launch_watchpoints(char *buf, int mode, int len,
        else
                mode_str = "read";
 
-       for (i = 0; i < 4; i++) {
+       for (i = 0; i < COUNT_WPS; i++) {
                set_breakpoint_addr(&dummy_var[i], i);
                toggle_breakpoint(i, mode, len, local, global, 1);
                ptrace(PTRACE_CONT, child_pid, NULL, 0);
@@ -330,8 +332,15 @@ static void launch_watchpoints(char *buf, int mode, int len,
 static void launch_tests(void)
 {
        char buf[1024];
+       unsigned int tests = 0;
        int len, local, global, i;
 
+       tests += 3 * COUNT_ISN_BPS;
+       tests += sizeof(long) / 2 * 3 * COUNT_WPS;
+       tests += sizeof(long) / 2 * 3 * COUNT_WPS;
+       tests += 2;
+       ksft_set_plan(tests);
+
        /* Instruction breakpoints */
        for (local = 0; local < 2; local++) {
                for (global = 0; global < 2; global++) {
index 2d95e5adde726fb388b26de8e0db87343110d036..ab59d814341a8a33ae66676c1233b0c837016892 100644 (file)
@@ -118,7 +118,7 @@ static bool set_watchpoint(pid_t pid, int size, int wp)
        return false;
 }
 
-static bool run_test(int wr_size, int wp_size, int wr, int wp)
+static bool arun_test(int wr_size, int wp_size, int wr, int wp)
 {
        int status;
        siginfo_t siginfo;
@@ -214,6 +214,7 @@ int main(int argc, char **argv)
        bool result;
 
        ksft_print_header();
+       ksft_set_plan(213);
 
        act.sa_handler = sigalrm;
        sigemptyset(&act.sa_mask);
index f82dcc1f8841e74b950257e2b96c3b0914987a12..cf868b5e00f79fec541e86c4c9320168449e8ca2 100644 (file)
@@ -173,6 +173,7 @@ int main(int argc, char **argv)
        int opt;
        bool do_suspend = true;
        bool succeeded = true;
+       unsigned int tests = 0;
        cpu_set_t available_cpus;
        int err;
        int cpu;
@@ -191,6 +192,13 @@ int main(int argc, char **argv)
                }
        }
 
+       for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+               if (!CPU_ISSET(cpu, &available_cpus))
+                       continue;
+               tests++;
+       }
+       ksft_set_plan(tests);
+
        if (do_suspend)
                suspend();
 
index 3ab39a61b95bdcc0bb5dc7d70e7fd0bb563e26d8..df0ef02b403670925ae14e42fc396fdd5945aa3f 100644 (file)
@@ -430,8 +430,6 @@ int main(int argc, char **argv)
 {
        char *tmp1, *tmp2, *our_path;
 
-       ksft_print_header();
-
        /* Find our path */
        tmp1 = strdup(argv[0]);
        if (!tmp1)
@@ -445,6 +443,8 @@ int main(int argc, char **argv)
        mpid = getpid();
 
        if (fork_wait()) {
+               ksft_print_header();
+               ksft_set_plan(12);
                ksft_print_msg("[RUN]\t+++ Tests with uid == 0 +++\n");
                return do_tests(0, our_path);
        }
@@ -452,6 +452,8 @@ int main(int argc, char **argv)
        ksft_print_msg("==================================================\n");
 
        if (fork_wait()) {
+               ksft_print_header();
+               ksft_set_plan(9);
                ksft_print_msg("[RUN]\t+++ Tests with uid != 0 +++\n");
                return do_tests(1, our_path);
        }
diff --git a/tools/testing/selftests/drivers/.gitignore b/tools/testing/selftests/drivers/.gitignore
new file mode 100644 (file)
index 0000000..f6aebcc
--- /dev/null
@@ -0,0 +1 @@
+/dma-buf/udmabuf
index 54cd5c414e82c487f3c57094b84f5ab5cf960604..8d20957f758695fde13d78e96536e127ad32d8f6 100644 (file)
@@ -395,6 +395,7 @@ int main(int argc, char *argv[])
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg("%s: Test requeue functionality\n", basename(argv[0]));
        ksft_print_msg(
                "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n",
index 08187a16507ff8afc3dd02c958e22327024fbcf9..742624c59ba7d96a2e30571d20f942d674ef4c24 100644 (file)
@@ -79,6 +79,7 @@ int main(int argc, char *argv[])
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg("%s: Detect mismatched requeue_pi operations\n",
               basename(argv[0]));
 
index f0542a344d95fa9656b325623473b851149f9cd6..a0f5934707ffcb0f9f622e43bcfb930b13461820 100644 (file)
@@ -144,6 +144,7 @@ int main(int argc, char *argv[])
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg("%s: Test signal handling during requeue_pi\n",
               basename(argv[0]));
        ksft_print_msg("\tArguments: <none>\n");
index 6216de828093a079a3afeafe06ff21d9bd6e15ee..a458d42ff86ec5f5b98bb75762b67edd90e3e141 100644 (file)
@@ -98,6 +98,7 @@ int main(int argc, char **argv)
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg(
                "%s: Test the futex value of private file mappings in FUTEX_WAIT\n",
                basename(argv[0]));
index bab3dfe1787f9bd724bd75f02625efd0d08b7d50..04b95478059cdec40890e35c3fe053117e86f3b9 100644 (file)
@@ -69,6 +69,7 @@ int main(int argc, char *argv[])
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg("%s: Block on a futex and wait for timeout\n",
               basename(argv[0]));
        ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns);
index 26975322545b4173083d082311651210501989c2..3a1d12a14921d77f977b154dbe309beaddb91f08 100644 (file)
@@ -100,6 +100,7 @@ int main(int argc, char **argv)
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg("%s: Test the uninitialized futex value in FUTEX_WAIT\n",
               basename(argv[0]));
 
index da15a63269b4c7f3059bcac284ca2fdb89602847..a34a6bbc30cecbce095299e113bda441c84cb849 100644 (file)
@@ -65,6 +65,7 @@ int main(int argc, char *argv[])
        }
 
        ksft_print_header();
+       ksft_set_plan(1);
        ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n",
               basename(argv[0]));
 
index 47e1d995c1822903499e6597bc863f3b41402e6a..ec15c4f6af552d57a5584e18abb7cb9bb17cc90f 100644 (file)
@@ -33,6 +33,7 @@ struct ksft_count {
 };
 
 static struct ksft_count ksft_cnt;
+static unsigned int ksft_plan;
 
 static inline int ksft_test_num(void)
 {
@@ -61,13 +62,26 @@ static inline void ksft_print_header(void)
                printf("TAP version 13\n");
 }
 
+/*
+ * Record how many tests this program plans to run and print the TAP plan
+ * line ("1..N") for that count.
+ */
+static inline void ksft_set_plan(unsigned int plan)
+{
+       ksft_plan = plan;
+       /* ksft_plan is unsigned int, so it must be printed with %u. */
+       printf("1..%u\n", ksft_plan);
+}
+
 static inline void ksft_print_cnts(void)
 {
-       printf("Pass %d Fail %d Xfail %d Xpass %d Skip %d Error %d\n",
+       if (ksft_plan != ksft_test_num())
+               printf("# Planned tests != run tests (%u != %u)\n",
+                       ksft_plan, ksft_test_num());
+       printf("# Pass %d Fail %d Xfail %d Xpass %d Skip %d Error %d\n",
                ksft_cnt.ksft_pass, ksft_cnt.ksft_fail,
                ksft_cnt.ksft_xfail, ksft_cnt.ksft_xpass,
                ksft_cnt.ksft_xskip, ksft_cnt.ksft_error);
-       printf("1..%d\n", ksft_test_num());
 }
 
 static inline void ksft_print_msg(const char *msg, ...)
@@ -111,7 +120,7 @@ static inline void ksft_test_result_skip(const char *msg, ...)
        ksft_cnt.ksft_xskip++;
 
        va_start(args, msg);
-       printf("ok %d # skip ", ksft_test_num());
+       printf("not ok %d # SKIP ", ksft_test_num());
        vprintf(msg, args);
        va_end(args);
 }
@@ -172,7 +181,7 @@ static inline int ksft_exit_skip(const char *msg, ...)
                va_list args;
 
                va_start(args, msg);
-               printf("1..%d # Skipped: ", ksft_test_num());
+               printf("not ok %d # SKIP ", 1 + ksft_test_num());
                vprintf(msg, args);
                va_end(args);
        } else {
diff --git a/tools/testing/selftests/kselftest/prefix.pl b/tools/testing/selftests/kselftest/prefix.pl
new file mode 100755 (executable)
index 0000000..ec7e481
--- /dev/null
@@ -0,0 +1,23 @@
+#!/usr/bin/perl
+# SPDX-License-Identifier: GPL-2.0
+# Prefix all lines with "# ", unbuffered. Command being piped in may need
+# to have unbuffering forced with "stdbuf -i0 -o0 -e0 $cmd".
+use strict;
+
+binmode STDIN;
+binmode STDOUT;
+
+STDOUT->autoflush(1);
+
+my $needed = 1;
+while (1) {
+       my $char;
+       my $bytes = sysread(STDIN, $char, 1);
+       exit 0 if ($bytes == 0);
+       if ($needed) {
+               print "# ";
+               $needed = 0;
+       }
+       print $char;
+       $needed = 1 if ($char eq "\n");
+}
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
new file mode 100644 (file)
index 0000000..eff3ee3
--- /dev/null
@@ -0,0 +1,86 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Runs a set of tests in a given subdirectory.
+export skip_rc=4
+export logfile=/dev/stdout
+export per_test_logging=
+
+# There isn't a shell-agnostic way to find the path of a sourced file,
+# so we must rely on BASE_DIR being set to find other tools.
+if [ -z "$BASE_DIR" ]; then
+       echo "Error: BASE_DIR must be set before sourcing." >&2
+       exit 1
+fi
+
+# If Perl is unavailable, we must fall back to line-at-a-time prefixing
+# with sed instead of unbuffered output.
+tap_prefix()
+{
+       if [ ! -x /usr/bin/perl ]; then
+               sed -e 's/^/# /'
+       else
+               "$BASE_DIR"/kselftest/prefix.pl
+       fi
+}
+
+# If stdbuf is unavailable, we must fall back to line-at-a-time piping.
+tap_unbuffer()
+{
+       if ! which stdbuf >/dev/null ; then
+               "$@"
+       else
+               stdbuf -i0 -o0 -e0 "$@"
+       fi
+}
+
+run_one()      # Run a single test and print its TAP result line.
+{
+       DIR="$1"        # label printed in the header message
+       TEST="$2"       # path of the test program to run
+       NUM="$3"        # number printed in this test's "ok"/"not ok" line
+
+       BASENAME_TEST=$(basename "$TEST")
+
+       TEST_HDR_MSG="selftests: $DIR: $BASENAME_TEST"
+       echo "# $TEST_HDR_MSG"
+       if [ ! -x "$TEST" ]; then
+               echo -n "# Warning: file $TEST is "
+               if [ ! -e "$TEST" ]; then
+                       echo "missing!"
+               else
+                       echo "not executable, correct this."
+               fi
+               echo "not ok $NUM $TEST_HDR_MSG"
+       else
+               cd "$(dirname "$TEST")" > /dev/null
+               (((((tap_unbuffer ./$BASENAME_TEST 2>&1; echo $? >&3) |
+                       tap_prefix >&4) 3>&1) |
+                       (read xs; exit $xs)) 4>>"$logfile" &&
+               echo "ok $NUM $TEST_HDR_MSG") ||
+               (if [ $? -eq $skip_rc ]; then   \
+                       echo "not ok $NUM $TEST_HDR_MSG # SKIP"
+               else
+                       echo "not ok $NUM $TEST_HDR_MSG"
+               fi)
+               cd - >/dev/null
+       fi
+}
+
+run_many()
+{
+       echo "TAP version 13"
+       DIR=$(basename "$PWD")
+       test_num=0
+       total=$(echo "$@" | wc -w)
+       echo "1..$total"
+       for TEST in "$@"; do
+               BASENAME_TEST=$(basename $TEST)
+               test_num=$(( test_num + 1 ))
+               if [ -n "$per_test_logging" ]; then
+                       logfile="/tmp/$BASENAME_TEST"
+                       cat /dev/null > "$logfile"
+               fi
+               run_one "$DIR" "$TEST" "$test_num"
+       done
+}
index 5979fdc4f36cf3729373509a2bcce8a8576aa362..07733719578358bf6c983eabe192dd38ae59cf58 100644 (file)
@@ -3,17 +3,12 @@
 CC := $(CROSS_COMPILE)gcc
 
 ifeq (0,$(MAKELEVEL))
-    ifneq ($(O),)
-       OUTPUT := $(O)
-    else
-       ifneq ($(KBUILD_OUTPUT),)
-               OUTPUT := $(KBUILD_OUTPUT)
-       else
-               OUTPUT := $(shell pwd)
-               DEFAULT_INSTALL_HDR_PATH := 1
-       endif
+    ifeq ($(OUTPUT),)
+       OUTPUT := $(shell pwd)
+       DEFAULT_INSTALL_HDR_PATH := 1
     endif
 endif
+selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST))))
 
 # The following are built by lib.mk common compile rules.
 # TEST_CUSTOM_PROGS should be used by tests that require
@@ -65,44 +60,13 @@ all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
 endif
 
 .ONESHELL:
-define RUN_TEST_PRINT_RESULT
-       TEST_HDR_MSG="selftests: "`basename $$PWD`:" $$BASENAME_TEST";  \
-       echo $$TEST_HDR_MSG;                                    \
-       echo "========================================";        \
-       if [ ! -x $$TEST ]; then        \
-               echo "$$TEST_HDR_MSG: Warning: file $$BASENAME_TEST is not executable, correct this.";\
-               echo "not ok 1..$$test_num $$TEST_HDR_MSG [FAIL]"; \
-       else                                    \
-               cd `dirname $$TEST` > /dev/null; \
-               if [ "X$(summary)" != "X" ]; then       \
-                       (./$$BASENAME_TEST > /tmp/$$BASENAME_TEST 2>&1 && \
-                       echo "ok 1..$$test_num $$TEST_HDR_MSG [PASS]") || \
-                       (if [ $$? -eq $$skip ]; then    \
-                               echo "not ok 1..$$test_num $$TEST_HDR_MSG [SKIP]";                              \
-                       else echo "not ok 1..$$test_num $$TEST_HDR_MSG [FAIL]";                                 \
-                       fi;)                    \
-               else                            \
-                       (./$$BASENAME_TEST &&   \
-                       echo "ok 1..$$test_num $$TEST_HDR_MSG [PASS]") ||                                               \
-                       (if [ $$? -eq $$skip ]; then \
-                               echo "not ok 1..$$test_num $$TEST_HDR_MSG [SKIP]"; \
-                       else echo "not ok 1..$$test_num $$TEST_HDR_MSG [FAIL]";                         \
-                       fi;)            \
-               fi;                             \
-               cd - > /dev/null;               \
-       fi;
-endef
-
 define RUN_TESTS
-       @export KSFT_TAP_LEVEL=`echo 1`;                \
-       test_num=`echo 0`;                              \
-       skip=`echo 4`;                                  \
-       echo "TAP version 13";                          \
-       for TEST in $(1); do                            \
-               BASENAME_TEST=`basename $$TEST`;        \
-               test_num=`echo $$test_num+1 | bc`;      \
-               $(call RUN_TEST_PRINT_RESULT,$(TEST),$(BASENAME_TEST),$(test_num),$(skip))                                              \
-       done;
+       @BASE_DIR="$(selfdir)";                 \
+       . $(selfdir)/kselftest/runner.sh;       \
+       if [ "X$(summary)" != "X" ]; then       \
+               per_test_logging=1;             \
+       fi;                                     \
+       run_many $(1)
 endef
 
 run_tests: all
@@ -139,24 +103,12 @@ else
        $(error Error: set INSTALL_PATH to use install)
 endif
 
-define EMIT_TESTS
-       @test_num=`echo 0`;                             \
+emit_tests:
        for TEST in $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS); do \
                BASENAME_TEST=`basename $$TEST`;        \
-               test_num=`echo $$test_num+1 | bc`;      \
-               TEST_HDR_MSG="selftests: "`basename $$PWD`:" $$BASENAME_TEST";  \
-               echo "echo $$TEST_HDR_MSG";     \
-               if [ ! -x $$TEST ]; then        \
-                       echo "echo \"$$TEST_HDR_MSG: Warning: file $$BASENAME_TEST is not executable, correct this.\"";         \
-                       echo "echo \"not ok 1..$$test_num $$TEST_HDR_MSG [FAIL]\""; \
-               else
-                       echo "(./$$BASENAME_TEST >> \$$OUTPUT 2>&1 && echo \"ok 1..$$test_num $$TEST_HDR_MSG [PASS]\") || (if [ \$$? -eq \$$skip ]; then echo \"not ok 1..$$test_num $$TEST_HDR_MSG [SKIP]\"; else echo \"not ok 1..$$test_num $$TEST_HDR_MSG [FAIL]\"; fi;)"; \
-               fi;             \
-       done;
-endef
-
-emit_tests:
-       $(EMIT_TESTS)
+               echo "  \\";                            \
+               echo -n "       \"$$BASENAME_TEST\"";   \
+       done;                                           \
 
 # define if isn't already. It is undefined in make O= case.
 ifeq ($(RM),)
index 6793f8ecc8e7fc55d2eb1f16bdc04449f7df0a54..70b4ddbf126b059e3336d0e25f74409b59fe52fb 100644 (file)
@@ -304,6 +304,7 @@ static int test_membarrier_query(void)
 int main(int argc, char **argv)
 {
        ksft_print_header();
+       ksft_set_plan(13);
 
        test_membarrier_query();
        test_membarrier();
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
new file mode 100644 (file)
index 0000000..822a1e6
--- /dev/null
@@ -0,0 +1 @@
+pidfd_test
index d59378a93782ed4edc2c80e737b178baaa816736..5bae1792e3d6372570e60c733506c2df9ea0ab50 100644 (file)
@@ -371,6 +371,7 @@ static int test_pidfd_send_signal_syscall_support(void)
 int main(int argc, char **argv)
 {
        ksft_print_header();
+       ksft_set_plan(4);
 
        test_pidfd_send_signal_syscall_support();
        test_pidfd_send_signal_simple_success();
index c30c52e1d0d28e28f9743bbe4e043122cf075414..d6469535630af888051a9c50e8059a8a2abfac8a 100644 (file)
@@ -1,5 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0+ OR MIT
-CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./
+
+ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
+CLANG_FLAGS += -no-integrated-as
+endif
+
+CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./ \
+         $(CLANG_FLAGS)
 LDLIBS += -lpthread
 
 # Own dependencies because we only want to build against 1st prerequisite, but
index 3cea19877227a03c4c501bffa6a687cbe32ad126..84f28f147fb6696a55a4f6bb7bd194368bb1bded 100644 (file)
@@ -5,7 +5,54 @@
  * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
-#define RSEQ_SIG       0x53053053
+/*
+ * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
+ * value 0x5de3. This traps if user-space reaches this instruction by mistake,
+ * and the uncommon operand ensures the kernel does not move the instruction
+ * pointer to attacker-controlled code on rseq abort.
+ *
+ * The instruction pattern in the A32 instruction set is:
+ *
+ * e7f5def3    udf    #24035    ; 0x5de3
+ *
+ * This translates to the following instruction pattern in the T16 instruction
+ * set:
+ *
+ * little endian:
+ * def3        udf    #243      ; 0xf3
+ * e7f5        b.n    <7f5>
+ *
+ * pre-ARMv6 big endian code:
+ * e7f5        b.n    <7f5>
+ * def3        udf    #243      ; 0xf3
+ *
+ * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
+ * code and big-endian data. Ensure the RSEQ_SIG data signature matches code
+ * endianness. Prior to ARMv6, -mbig-endian generates big-endian code and data
+ * (which match), so there is no need to reverse the endianness of the data
+ * representation of the signature. However, the choice between BE32 and BE8
+ * is done by the linker, so we cannot know whether code and data endianness
+ * will be mixed before the linker is invoked.
+ */
+
+#define RSEQ_SIG_CODE  0xe7f5def3
+
+#ifndef __ASSEMBLER__
+
+#define RSEQ_SIG_DATA                                                  \
+       ({                                                              \
+               int sig;                                                \
+               asm volatile ("b 2f\n\t"                                \
+                             "1: .inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
+                             "2:\n\t"                                  \
+                             "ldr %[sig], 1b\n\t"                      \
+                             : [sig] "=r" (sig));                      \
+               sig;                                                    \
+       })
+
+#define RSEQ_SIG       RSEQ_SIG_DATA
+
+#endif
 
 #define rseq_smp_mb()  __asm__ __volatile__ ("dmb" ::: "memory", "cc")
 #define rseq_smp_rmb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
@@ -30,18 +77,35 @@ do {                                                                        \
 #include "rseq-skip.h"
 #else /* !RSEQ_SKIP_FASTPATH */
 
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags,        start_ip,               \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,       \
                                post_commit_offset, abort_ip)           \
-               ".pushsection __rseq_table, \"aw\"\n\t"                 \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                    \
                ".balign 32\n\t"                                        \
+               __rseq_str(label) ":\n\t"                                       \
                ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+               ".popsection\n\t"                                       \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
+               ".word " __rseq_str(label) "b, 0x0\n\t"                 \
                ".popsection\n\t"
 
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip)      \
-       __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip,                     \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+       __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,              \
                                (post_commit_ip - start_ip), abort_ip)
 
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                  \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"      \
+               ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) ", 0x0\n\t" \
+               ".popsection\n\t"
+
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)               \
                RSEQ_INJECT_ASM(1)                                      \
                "adr r0, " __rseq_str(cs_label) "\n\t"                  \
@@ -61,7 +125,8 @@ do {                                                                 \
                __rseq_str(table_label) ":\n\t"                         \
                ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
-               ".word " __rseq_str(RSEQ_SIG) "\n\t"                    \
+               ".arm\n\t"                                              \
+               ".inst " __rseq_str(RSEQ_SIG_CODE) "\n\t"               \
                __rseq_str(label) ":\n\t"                               \
                teardown                                                \
                "b %l[" __rseq_str(abort_label) "]\n\t"
@@ -86,7 +151,12 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -148,7 +218,12 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -214,7 +289,10 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -266,7 +344,12 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -336,7 +419,12 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -407,7 +495,13 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -485,7 +579,12 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                "str %[src], %[rseq_scratch0]\n\t"
                "str %[dst], %[rseq_scratch1]\n\t"
                "str %[len], %[rseq_scratch2]\n\t"
@@ -604,7 +703,12 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                "str %[src], %[rseq_scratch0]\n\t"
                "str %[dst], %[rseq_scratch1]\n\t"
                "str %[len], %[rseq_scratch2]\n\t"
index 954f34671ca6bec6cbebc7de44d4a3653d573a71..200dae9e4208c49fc3262d3cc4a5a96cefecf305 100644 (file)
@@ -6,7 +6,20 @@
  * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
  */
 
-#define RSEQ_SIG       0xd428bc00      /* BRK #0x45E0 */
+/*
+ * aarch64 -mbig-endian generates mixed endianness code vs data:
+ * little-endian code and big-endian data. Ensure the RSEQ_SIG signature
+ * matches code endianness.
+ */
+#define RSEQ_SIG_CODE  0xd428bc00      /* BRK #0x45E0.  */
+
+#ifdef __AARCH64EB__
+#define RSEQ_SIG_DATA  0x00bc28d4      /* BRK #0x45E0.  */
+#else
+#define RSEQ_SIG_DATA  RSEQ_SIG_CODE
+#endif
+
+#define RSEQ_SIG       RSEQ_SIG_DATA
 
 #define rseq_smp_mb()  __asm__ __volatile__ ("dmb ish" ::: "memory")
 #define rseq_smp_rmb() __asm__ __volatile__ ("dmb ishld" ::: "memory")
@@ -82,19 +95,35 @@ do {                                                                                \
 
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip,               \
                                post_commit_offset, abort_ip)                   \
-       "       .pushsection    __rseq_table, \"aw\"\n"                         \
+       "       .pushsection    __rseq_cs, \"aw\"\n"                            \
        "       .balign 32\n"                                                   \
        __rseq_str(label) ":\n"                                                 \
        "       .long   " __rseq_str(version) ", " __rseq_str(flags) "\n"       \
        "       .quad   " __rseq_str(start_ip) ", "                             \
                          __rseq_str(post_commit_offset) ", "                   \
                          __rseq_str(abort_ip) "\n"                             \
+       "       .popsection\n\t"                                                \
+       "       .pushsection __rseq_cs_ptr_array, \"aw\"\n"                             \
+       "       .quad " __rseq_str(label) "b\n"                                 \
        "       .popsection\n"
 
 #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)       \
        __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,                      \
                                (post_commit_ip - start_ip), abort_ip)
 
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                          \
+       "       .pushsection __rseq_exit_point_array, \"aw\"\n"                 \
+       "       .quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n"      \
+       "       .popsection\n"
+
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)                       \
        RSEQ_INJECT_ASM(1)                                                      \
        "       adrp    " RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n"       \
@@ -105,7 +134,7 @@ do {                                                                                \
 
 #define RSEQ_ASM_DEFINE_ABORT(label, abort_label)                              \
        "       b       222f\n"                                                 \
-       "       .inst   "       __rseq_str(RSEQ_SIG) "\n"                       \
+       "       .inst   "       __rseq_str(RSEQ_SIG_CODE) "\n"                  \
        __rseq_str(label) ":\n"                                                 \
        "       b       %l[" __rseq_str(abort_label) "]\n"                      \
        "222:\n"
@@ -182,6 +211,11 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -231,6 +265,11 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -282,6 +321,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -325,6 +367,11 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -379,6 +426,11 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -433,6 +485,12 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error3])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -490,6 +548,11 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
@@ -545,6 +608,11 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
                RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                RSEQ_INJECT_ASM(3)
index 7f48ecf469941299c68e40bafdc35fae815f5f43..e989e7c14b0972e52c66aba3a676d0e2cc41b448 100644 (file)
@@ -7,7 +7,39 @@
  * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
-#define RSEQ_SIG       0x53053053
+/*
+ * RSEQ_SIG uses the break instruction. The instruction pattern is:
+ *
+ * On MIPS:
+ *     0350000d        break     0x350
+ *
+ * On nanoMIPS:
+ *      00100350        break     0x350
+ *
+ * On microMIPS:
+ *      0000d407        break     0x350
+ *
+ * For nanoMIPS32 and microMIPS, the instruction stream is encoded as 16-bit
+ * halfwords, so the signature halfwords need to be swapped accordingly for
+ * little-endian.
+ */
+#if defined(__nanomips__)
+# ifdef __MIPSEL__
+#  define RSEQ_SIG     0x03500010
+# else
+#  define RSEQ_SIG     0x00100350
+# endif
+#elif defined(__mips_micromips)
+# ifdef __MIPSEL__
+#  define RSEQ_SIG     0xd4070000
+# else
+#  define RSEQ_SIG     0x0000d407
+# endif
+#elif defined(__mips__)
+# define RSEQ_SIG      0x0350000d
+#else
+/* Unknown MIPS architecture. */
+#endif
 
 #define rseq_smp_mb()  __asm__ __volatile__ ("sync" ::: "memory")
 #define rseq_smp_rmb() rseq_smp_mb()
@@ -54,20 +86,38 @@ do {                                                                        \
 # error unsupported _MIPS_SZLONG
 #endif
 
-#define __RSEQ_ASM_DEFINE_TABLE(version, flags,        start_ip, \
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
                                post_commit_offset, abort_ip) \
-               ".pushsection __rseq_table, \"aw\"\n\t" \
+               ".pushsection __rseq_cs, \"aw\"\n\t" \
                ".balign 32\n\t" \
+               __rseq_str(label) ":\n\t"                                       \
                ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
                LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \
                LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \
+               ".popsection\n\t" \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+               LONG " " U32_U64_PAD(__rseq_str(label) "b") "\n\t" \
                ".popsection\n\t"
 
-#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \
-       __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+       __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
                                (post_commit_ip - start_ip), abort_ip)
 
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+               LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
+               LONG " " U32_U64_PAD(__rseq_str(exit_ip)) "\n\t" \
+               ".popsection\n\t"
+
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
                RSEQ_INJECT_ASM(1) \
                LONG_LA " $4, " __rseq_str(cs_label) "\n\t" \
@@ -113,7 +163,12 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -173,7 +228,12 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -237,7 +297,10 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -289,7 +352,12 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -357,7 +425,12 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -426,7 +499,13 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -500,7 +579,12 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                LONG_S " %[src], %[rseq_scratch0]\n\t"
                LONG_S "  %[dst], %[rseq_scratch1]\n\t"
                LONG_S " %[len], %[rseq_scratch2]\n\t"
@@ -616,7 +700,12 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 
        rseq_workaround_gcc_asm_size_guess();
        __asm__ __volatile__ goto (
-               RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                LONG_S " %[src], %[rseq_scratch0]\n\t"
                LONG_S " %[dst], %[rseq_scratch1]\n\t"
                LONG_S " %[len], %[rseq_scratch2]\n\t"
index 52630c9f42be2998f04a77a9cba702cb72bd16b6..76be90196fe4f1fc09038b5f1a19fa93af91793c 100644 (file)
@@ -6,7 +6,15 @@
  * (C) Copyright 2016-2018 - Boqun Feng <boqun.feng@gmail.com>
  */
 
-#define RSEQ_SIG       0x53053053
+/*
+ * RSEQ_SIG is used with the following trap instruction:
+ *
+ * powerpc-be:    0f e5 00 0b           twui   r5,11
+ * powerpc64-le:  0b 00 e5 0f           twui   r5,11
+ * powerpc64-be:  0f e5 00 0b           twui   r5,11
+ */
+
+#define RSEQ_SIG       0x0fe5000b
 
 #define rseq_smp_mb()          __asm__ __volatile__ ("sync"    ::: "memory", "cc")
 #define rseq_smp_lwsync()      __asm__ __volatile__ ("lwsync"  ::: "memory", "cc")
@@ -33,8 +41,8 @@ do {                                                                  \
 #else /* !RSEQ_SKIP_FASTPATH */
 
 /*
- * The __rseq_table section can be used by debuggers to better handle
- * single-stepping through the restartable critical sections.
+ * The __rseq_cs_ptr_array and __rseq_cs sections can be used by debuggers to
+ * better handle single-stepping through the restartable critical sections.
  */
 
 #ifdef __PPC64__
@@ -46,11 +54,14 @@ do {                                                                        \
 
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                         \
                        start_ip, post_commit_offset, abort_ip)                 \
-               ".pushsection __rseq_table, \"aw\"\n\t"                         \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                            \
                ".balign 32\n\t"                                                \
                __rseq_str(label) ":\n\t"                                       \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t"      \
                ".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+               ".popsection\n\t"                                               \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"                  \
+               ".quad " __rseq_str(label) "b\n\t"                              \
                ".popsection\n\t"
 
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)                       \
@@ -63,6 +74,19 @@ do {                                                                 \
                "std %%r17, %[" __rseq_str(rseq_cs) "]\n\t"                     \
                __rseq_str(label) ":\n\t"
 
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                  \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"      \
+               ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
+               ".popsection\n\t"
+
 #else /* #ifdef __PPC64__ */
 
 #define STORE_WORD     "stw "
@@ -72,12 +96,29 @@ do {                                                                        \
 
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                         \
                        start_ip, post_commit_offset, abort_ip)                 \
-               ".pushsection __rseq_table, \"aw\"\n\t"                         \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                            \
                ".balign 32\n\t"                                                \
                __rseq_str(label) ":\n\t"                                       \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t"      \
                /* 32-bit only supported on BE */                               \
                ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+               ".popsection\n\t"                                       \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
+               ".long 0x0, " __rseq_str(label) "b\n\t"                 \
+               ".popsection\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                          \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"              \
+               /* 32-bit only supported on BE */                               \
+               ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
                ".popsection\n\t"
 
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)                       \
@@ -169,6 +210,11 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
@@ -224,6 +270,11 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
@@ -286,6 +337,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
@@ -337,6 +391,11 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
@@ -400,6 +459,11 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
@@ -465,6 +529,12 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
@@ -532,6 +602,11 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* setup for mempcy */
                "mr %%r19, %[len]\n\t"
                "mr %%r20, %[src]\n\t"
@@ -601,6 +676,11 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* setup for mempcy */
                "mr %%r19, %[len]\n\t"
                "mr %%r20, %[src]\n\t"
index 0afdf795797455f8ac61c5e186931a20e588e570..8ef94ad1cbb45224f7eef950a9d51a3e4816f1cf 100644 (file)
@@ -44,22 +44,54 @@ do {                                                                        \
 
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                 \
                                start_ip, post_commit_offset, abort_ip) \
-               ".pushsection __rseq_table, \"aw\"\n\t"                 \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                    \
                ".balign 32\n\t"                                        \
                __rseq_str(label) ":\n\t"                               \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                ".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+               ".popsection\n\t"                                       \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
+               ".quad " __rseq_str(label) "b\n\t"                      \
+               ".popsection\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                  \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"      \
+               ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
                ".popsection\n\t"
 
 #elif __s390__
 
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                 \
                                start_ip, post_commit_offset, abort_ip) \
-               ".pushsection __rseq_table, \"aw\"\n\t"                 \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                    \
                ".balign 32\n\t"                                        \
                __rseq_str(label) ":\n\t"                               \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+               ".popsection\n\t"                                       \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
+               ".long 0x0, " __rseq_str(label) "b\n\t"                 \
+               ".popsection\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                  \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"      \
+               ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
                ".popsection\n\t"
 
 #define LONG_L                 "l"
@@ -92,14 +124,14 @@ do {                                                                       \
                ".long " __rseq_str(RSEQ_SIG) "\n\t"                    \
                __rseq_str(label) ":\n\t"                               \
                teardown                                                \
-               "j %l[" __rseq_str(abort_label) "]\n\t"                 \
+               "jg %l[" __rseq_str(abort_label) "]\n\t"                \
                ".popsection\n\t"
 
 #define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label)                \
                ".pushsection __rseq_failure, \"ax\"\n\t"               \
                __rseq_str(label) ":\n\t"                               \
                teardown                                                \
-               "j %l[" __rseq_str(cmpfail_label) "]\n\t"               \
+               "jg %l[" __rseq_str(cmpfail_label) "]\n\t"              \
                ".popsection\n\t"
 
 static inline __attribute__((always_inline))
@@ -109,6 +141,11 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -167,6 +204,11 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -227,6 +269,9 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -275,6 +320,11 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -346,6 +396,12 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
@@ -414,6 +470,11 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                LONG_S " %[src], %[rseq_scratch0]\n\t"
                LONG_S " %[dst], %[rseq_scratch1]\n\t"
                LONG_S " %[len], %[rseq_scratch2]\n\t"
index 089410a314e9df814a0e94cc08cdc2ddfe1f019d..b2da6004fe307931a500d0b50eba52cb46a06645 100644 (file)
@@ -7,8 +7,25 @@
 
 #include <stdint.h>
 
+/*
+ * RSEQ_SIG is used with the following reserved undefined instructions, which
+ * trap in user-space:
+ *
+ * x86-32:    0f b9 3d 53 30 05 53      ud1    0x53053053,%edi
+ * x86-64:    0f b9 3d 53 30 05 53      ud1    0x53053053(%rip),%edi
+ */
 #define RSEQ_SIG       0x53053053
 
+/*
+ * Due to a compiler optimization bug in gcc-8 with asm goto and TLS asm input
+ * operands, we cannot use "m" input operands and instead pass the __rseq_abi
+ * address through an "r" input operand.
+ */
+
+/* Offsets of the cpu_id and rseq_cs fields in struct rseq. */
+#define RSEQ_CPU_ID_OFFSET     4
+#define RSEQ_CS_OFFSET         8
+
 #ifdef __x86_64__
 
 #define rseq_smp_mb()  \
@@ -37,32 +54,49 @@ do {                                                                        \
 
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                 \
                                start_ip, post_commit_offset, abort_ip) \
-               ".pushsection __rseq_table, \"aw\"\n\t"                 \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                    \
                ".balign 32\n\t"                                        \
                __rseq_str(label) ":\n\t"                               \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                ".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+               ".popsection\n\t"                                       \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
+               ".quad " __rseq_str(label) "b\n\t"                      \
                ".popsection\n\t"
 
+
 #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
        __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,              \
                                (post_commit_ip - start_ip), abort_ip)
 
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                  \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"      \
+               ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
+               ".popsection\n\t"
+
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)               \
                RSEQ_INJECT_ASM(1)                                      \
                "leaq " __rseq_str(cs_label) "(%%rip), %%rax\n\t"       \
-               "movq %%rax, %[" __rseq_str(rseq_cs) "]\n\t"            \
+               "movq %%rax, " __rseq_str(rseq_cs) "\n\t"               \
                __rseq_str(label) ":\n\t"
 
 #define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label)             \
                RSEQ_INJECT_ASM(2)                                      \
-               "cmpl %[" __rseq_str(cpu_id) "], %[" __rseq_str(current_cpu_id) "]\n\t" \
+               "cmpl %[" __rseq_str(cpu_id) "], " __rseq_str(current_cpu_id) "\n\t" \
                "jnz " __rseq_str(label) "\n\t"
 
 #define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label)            \
                ".pushsection __rseq_failure, \"ax\"\n\t"               \
-               /* Disassembler-friendly signature: nopl <sig>(%rip). */\
-               ".byte 0x0f, 0x1f, 0x05\n\t"                            \
+               /* Disassembler-friendly signature: ud1 <sig>(%rip),%edi. */ \
+               ".byte 0x0f, 0xb9, 0x3d\n\t"                            \
                ".long " __rseq_str(RSEQ_SIG) "\n\t"                    \
                __rseq_str(label) ":\n\t"                               \
                teardown                                                \
@@ -83,15 +117,20 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpq %[v], %[expect]\n\t"
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "cmpq %[v], %[expect]\n\t"
                "jnz %l[error2]\n\t"
 #endif
@@ -102,8 +141,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  [v]                   "m" (*v),
                  [expect]              "r" (expect),
                  [newv]                "r" (newv)
@@ -140,16 +178,21 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "movq %[v], %%rbx\n\t"
                "cmpq %%rbx, %[expectnot]\n\t"
                "je %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "movq %[v], %%rbx\n\t"
                "cmpq %%rbx, %[expectnot]\n\t"
                "je %l[error2]\n\t"
@@ -164,8 +207,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [expectnot]           "r" (expectnot),
@@ -199,12 +241,15 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
 #endif
                /* final store */
                "addq %[count], %[v]\n\t"
@@ -213,8 +258,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [count]               "er" (count)
@@ -244,15 +288,20 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpq %[v], %[expect]\n\t"
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "cmpq %[v], %[expect]\n\t"
                "jnz %l[error2]\n\t"
 #endif
@@ -266,8 +315,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* try store input */
                  [v2]                  "m" (*v2),
                  [newv2]               "r" (newv2),
@@ -314,9 +362,15 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpq %[v], %[expect]\n\t"
                "jnz %l[cmpfail]\n\t"
@@ -325,7 +379,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(5)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "cmpq %[v], %[expect]\n\t"
                "jnz %l[error2]\n\t"
                "cmpq %[v2], %[expect2]\n\t"
@@ -338,8 +392,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* cmp2 input */
                  [v2]                  "m" (*v2),
                  [expect2]             "r" (expect2),
@@ -381,18 +434,23 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                "movq %[src], %[rseq_scratch0]\n\t"
                "movq %[dst], %[rseq_scratch1]\n\t"
                "movq %[len], %[rseq_scratch2]\n\t"
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpq %[v], %[expect]\n\t"
                "jnz 5f\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 6f)
                "cmpq %[v], %[expect]\n\t"
                "jnz 7f\n\t"
 #endif
@@ -440,8 +498,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 #endif
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [expect]              "r" (expect),
@@ -520,31 +577,47 @@ do {                                                                      \
  */
 #define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                 \
                                start_ip, post_commit_offset, abort_ip) \
-               ".pushsection __rseq_table, \"aw\"\n\t"                 \
+               ".pushsection __rseq_cs, \"aw\"\n\t"                    \
                ".balign 32\n\t"                                        \
                __rseq_str(label) ":\n\t"                               \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
                ".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+               ".popsection\n\t"                                       \
+               ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
+               ".long " __rseq_str(label) "b, 0x0\n\t"                 \
                ".popsection\n\t"
 
 #define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
        __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,              \
                                (post_commit_ip - start_ip), abort_ip)
 
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section that the critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                  \
+               ".pushsection __rseq_exit_point_array, \"aw\"\n\t"      \
+               ".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) ", 0x0\n\t" \
+               ".popsection\n\t"
+
 #define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)               \
                RSEQ_INJECT_ASM(1)                                      \
-               "movl $" __rseq_str(cs_label) ", %[rseq_cs]\n\t"        \
+               "movl $" __rseq_str(cs_label) ", " __rseq_str(rseq_cs) "\n\t"   \
                __rseq_str(label) ":\n\t"
 
 #define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label)             \
                RSEQ_INJECT_ASM(2)                                      \
-               "cmpl %[" __rseq_str(cpu_id) "], %[" __rseq_str(current_cpu_id) "]\n\t" \
+               "cmpl %[" __rseq_str(cpu_id) "], " __rseq_str(current_cpu_id) "\n\t" \
                "jnz " __rseq_str(label) "\n\t"
 
 #define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label)            \
                ".pushsection __rseq_failure, \"ax\"\n\t"               \
-               /* Disassembler-friendly signature: nopl <sig>. */      \
-               ".byte 0x0f, 0x1f, 0x05\n\t"                            \
+               /* Disassembler-friendly signature: ud1 <sig>,%edi. */  \
+               ".byte 0x0f, 0xb9, 0x3d\n\t"                            \
                ".long " __rseq_str(RSEQ_SIG) "\n\t"                    \
                __rseq_str(label) ":\n\t"                               \
                teardown                                                \
@@ -565,15 +638,20 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpl %[v], %[expect]\n\t"
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "cmpl %[v], %[expect]\n\t"
                "jnz %l[error2]\n\t"
 #endif
@@ -584,8 +662,7 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  [v]                   "m" (*v),
                  [expect]              "r" (expect),
                  [newv]                "r" (newv)
@@ -622,16 +699,21 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "movl %[v], %%ebx\n\t"
                "cmpl %%ebx, %[expectnot]\n\t"
                "je %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "movl %[v], %%ebx\n\t"
                "cmpl %%ebx, %[expectnot]\n\t"
                "je %l[error2]\n\t"
@@ -646,8 +728,7 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [expectnot]           "r" (expectnot),
@@ -681,12 +762,15 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
 #endif
                /* final store */
                "addl %[count], %[v]\n\t"
@@ -695,8 +779,7 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [count]               "ir" (count)
@@ -726,15 +809,20 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpl %[v], %[expect]\n\t"
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "cmpl %[v], %[expect]\n\t"
                "jnz %l[error2]\n\t"
 #endif
@@ -749,8 +837,7 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* try store input */
                  [v2]                  "m" (*v2),
                  [newv2]               "m" (newv2),
@@ -788,16 +875,21 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "movl %[expect], %%eax\n\t"
                "cmpl %[v], %%eax\n\t"
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "movl %[expect], %%eax\n\t"
                "cmpl %[v], %%eax\n\t"
                "jnz %l[error2]\n\t"
@@ -813,8 +905,7 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* try store input */
                  [v2]                  "m" (*v2),
                  [newv2]               "r" (newv2),
@@ -853,9 +944,15 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "cmpl %[v], %[expect]\n\t"
                "jnz %l[cmpfail]\n\t"
@@ -864,7 +961,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
                "jnz %l[cmpfail]\n\t"
                RSEQ_INJECT_ASM(5)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
                "cmpl %[v], %[expect]\n\t"
                "jnz %l[error2]\n\t"
                "cmpl %[expect2], %[v2]\n\t"
@@ -878,8 +975,7 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
                RSEQ_ASM_DEFINE_ABORT(4, "", abort)
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* cmp2 input */
                  [v2]                  "m" (*v2),
                  [expect2]             "r" (expect2),
@@ -922,19 +1018,24 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                "movl %[src], %[rseq_scratch0]\n\t"
                "movl %[dst], %[rseq_scratch1]\n\t"
                "movl %[len], %[rseq_scratch2]\n\t"
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "movl %[expect], %%eax\n\t"
                "cmpl %%eax, %[v]\n\t"
                "jnz 5f\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 6f)
                "movl %[expect], %%eax\n\t"
                "cmpl %%eax, %[v]\n\t"
                "jnz 7f\n\t"
@@ -984,8 +1085,7 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
 #endif
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [expect]              "m" (expect),
@@ -1030,19 +1130,24 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+               RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
                "movl %[src], %[rseq_scratch0]\n\t"
                "movl %[dst], %[rseq_scratch1]\n\t"
                "movl %[len], %[rseq_scratch2]\n\t"
                /* Start rseq by storing table entry pointer into rseq_cs. */
-               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+               RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_CS_OFFSET(%[rseq_abi]))
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 4f)
                RSEQ_INJECT_ASM(3)
                "movl %[expect], %%eax\n\t"
                "cmpl %%eax, %[v]\n\t"
                "jnz 5f\n\t"
                RSEQ_INJECT_ASM(4)
 #ifdef RSEQ_COMPARE_TWICE
-               RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+               RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), 6f)
                "movl %[expect], %%eax\n\t"
                "cmpl %%eax, %[v]\n\t"
                "jnz 7f\n\t"
@@ -1093,8 +1198,7 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
 #endif
                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
-                 [current_cpu_id]      "m" (__rseq_abi.cpu_id),
-                 [rseq_cs]             "m" (__rseq_abi.rseq_cs),
+                 [rseq_abi]            "r" (&__rseq_abi),
                  /* final store input */
                  [v]                   "m" (*v),
                  [expect]              "m" (expect),
index 4847e97ed0498d7dfc3dc8b5c1236d323fa9b454..7159eb777fd34ab7cf128ff2d00744eae1ac0b03 100644 (file)
 #include <syscall.h>
 #include <assert.h>
 #include <signal.h>
+#include <limits.h>
 
 #include "rseq.h"
 
 #define ARRAY_SIZE(arr)        (sizeof(arr) / sizeof((arr)[0]))
 
-__attribute__((tls_model("initial-exec"))) __thread
-volatile struct rseq __rseq_abi = {
+__thread volatile struct rseq __rseq_abi = {
        .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
 };
 
-static __attribute__((tls_model("initial-exec"))) __thread
-volatile int refcount;
+/*
+ * Shared with other libraries. This library may take rseq ownership if it is
+ * still 0 when executing the library constructor. Set to 1 by library
+ * constructor when handling rseq. Set to 0 in destructor if handling rseq.
+ */
+int __rseq_handled;
+
+/* Whether this library has ownership of rseq registration. */
+static int rseq_ownership;
+
+static __thread volatile uint32_t __rseq_refcount;
 
 static void signal_off_save(sigset_t *oldset)
 {
@@ -69,8 +78,14 @@ int rseq_register_current_thread(void)
        int rc, ret = 0;
        sigset_t oldset;
 
+       if (!rseq_ownership)
+               return 0;
        signal_off_save(&oldset);
-       if (refcount++)
+       if (__rseq_refcount == UINT_MAX) {
+               ret = -1;
+               goto end;
+       }
+       if (__rseq_refcount++)
                goto end;
        rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG);
        if (!rc) {
@@ -78,9 +93,9 @@ int rseq_register_current_thread(void)
                goto end;
        }
        if (errno != EBUSY)
-               __rseq_abi.cpu_id = -2;
+               __rseq_abi.cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED;
        ret = -1;
-       refcount--;
+       __rseq_refcount--;
 end:
        signal_restore(oldset);
        return ret;
@@ -91,13 +106,20 @@ int rseq_unregister_current_thread(void)
        int rc, ret = 0;
        sigset_t oldset;
 
+       if (!rseq_ownership)
+               return 0;
        signal_off_save(&oldset);
-       if (--refcount)
+       if (!__rseq_refcount) {
+               ret = -1;
+               goto end;
+       }
+       if (--__rseq_refcount)
                goto end;
        rc = sys_rseq(&__rseq_abi, sizeof(struct rseq),
                      RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
        if (!rc)
                goto end;
+       __rseq_refcount = 1;
        ret = -1;
 end:
        signal_restore(oldset);
@@ -115,3 +137,20 @@ int32_t rseq_fallback_current_cpu(void)
        }
        return cpu;
 }
+
+void __attribute__((constructor)) rseq_init(void)
+{
+       /* Check whether rseq is handled by another library. */
+       if (__rseq_handled)
+               return;
+       __rseq_handled = 1;
+       rseq_ownership = 1;
+}
+
+void __attribute__((destructor)) rseq_fini(void)
+{
+       if (!rseq_ownership)
+               return;
+       __rseq_handled = 0;
+       rseq_ownership = 0;
+}
index 6c1126e7f685f6d538a0204bffddb6da1d7bdde8..d40d60e7499e8aa2f814f7282092692db84fcf73 100644 (file)
@@ -44,6 +44,7 @@
 #endif
 
 extern __thread volatile struct rseq __rseq_abi;
+extern int __rseq_handled;
 
 #define rseq_likely(x)         __builtin_expect(!!(x), 1)
 #define rseq_unlikely(x)       __builtin_expect(!!(x), 0)
index 228c2ae47687dd753250e696456f7fcedb99b17a..ad0f8df2ca0af7bccd6a6bd46d36d22480503675 100644 (file)
@@ -109,6 +109,7 @@ int main(void)
        int err;
 
        ksft_print_header();
+       ksft_set_plan(3);
 
        sigemptyset(&act.sa_mask);
        act.sa_flags = SA_ONSTACK | SA_SIGINFO;
index 7f7938263c5c5d84ac0914f7ab4e012cd882f4ed..3824b66f41a095d6bb84a82048105ca8b4343df9 100644 (file)
@@ -86,6 +86,7 @@ int main(void)
        int err;
 
        ksft_print_header();
+       ksft_set_plan(3 + 7);
 
        sync_api_supported();