Merge tag 'for-4.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 14 Sep 2017 20:43:16 +0000 (13:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 14 Sep 2017 20:43:16 +0000 (13:43 -0700)
Pull device mapper updates from Mike Snitzer:

 - Some request-based DM core and DM multipath fixes and cleanups

 - Constify a few variables in DM core and DM integrity

 - Add bufio optimization and checksum failure accounting to DM
   integrity (the partial-buffer write rounding behind the bufio
   optimization is sketched after the shortlog below)

 - Fix DM integrity to avoid checking integrity of failed reads

 - Fix DM integrity to use init_completion

 - A couple of DM log-writes target fixes

 - Simplify DAX flushing by eliminating the unnecessary flush
   abstraction that was stood up for DM's use (sketched just below).

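The interface change behind that last item shows up in the drivers/dax/super.c, DM target and drivers/nvdimm/pmem.c hunks further down: dax_flush() loses its pgoff argument and the pmem_dax_ops->flush indirection, and dm-linear, dm-stripe and DM core drop their per-target flush wrappers. A condensed sketch of the before/after prototypes (not verbatim kernel source):

    /* before this series: callers translated a pgoff and the call was routed
     * through dax_dev->ops->flush(), so dm-linear, dm-stripe and dm core each
     * carried a ->dax_flush wrapper of their own */
    void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t size);

    /* after this series: no pgoff and no op; when CONFIG_ARCH_HAS_PMEM_API is
     * set the generic helper calls arch_wb_cache_pmem(addr, size) directly,
     * otherwise it compiles to an empty stub */
    void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
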
* tag 'for-4.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dax: remove the pmem_dax_ops->flush abstraction
  dm integrity: use init_completion instead of COMPLETION_INITIALIZER_ONSTACK
  dm integrity: make blk_integrity_profile structure const
  dm integrity: do not check integrity for failed read operations
  dm log writes: fix >512b sectorsize support
  dm log writes: don't use all the cpu while waiting to log blocks
  dm ioctl: constify ioctl lookup table
  dm: constify argument arrays
  dm integrity: count and display checksum failures
  dm integrity: optimize writing dm-bufio buffers that are partially changed
  dm rq: do not update rq partially in each ending bio
  dm rq: make dm-sq requeuing behavior consistent with dm-mq behavior
  dm mpath: complain about unsupported __multipath_map_bio() return values
  dm mpath: avoid that building with W=1 causes gcc 7 to complain about fall-through

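The bufio optimization called out above ("dm integrity: optimize writing dm-bufio buffers that are partially changed" in the shortlog) has dm-bufio track a per-buffer dirty byte range via dm_bufio_mark_partial_buffer_dirty(b, start, end); submit_io() then writes only the DM_BUFIO_WRITE_ALIGN-aligned (4k) window covering that range instead of the whole buffer. A minimal userspace illustration of the rounding, using hypothetical start/end/block-size values (this is not kernel code, just the arithmetic from the dm-bufio hunk):

    #include <stdio.h>

    #define WRITE_ALIGN  4096u   /* DM_BUFIO_WRITE_ALIGN */
    #define SECTOR_SHIFT 9

    int main(void)
    {
            unsigned block_size = 65536;         /* hypothetical buffer size */
            unsigned start = 5000, end = 9000;   /* hypothetical dirty byte range */

            /* round the range out to aligned boundaries and clamp to the block;
             * equivalent to the "offset &= -DM_BUFIO_WRITE_ALIGN" masking that
             * submit_io() uses */
            unsigned offset = start & ~(WRITE_ALIGN - 1);
            unsigned limit  = (end + WRITE_ALIGN - 1) & ~(WRITE_ALIGN - 1);
            if (limit > block_size)
                    limit = block_size;

            printf("write offset sector %u, n_sectors %u\n",
                   offset >> SECTOR_SHIFT, (limit - offset) >> SECTOR_SHIFT);
            return 0;
    }

With these example numbers only the two 4k blocks spanning bytes 4096-12288 are rewritten (16 sectors starting at sector 8) rather than the full 64k buffer.
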
17 files changed:
drivers/dax/super.c
drivers/md/dm-bufio.c
drivers/md/dm-cache-target.c
drivers/md/dm-crypt.c
drivers/md/dm-flakey.c
drivers/md/dm-integrity.c
drivers/md/dm-linear.c
drivers/md/dm-log-writes.c
drivers/md/dm-mpath.c
drivers/md/dm-stripe.c
drivers/md/dm-switch.c
drivers/md/dm-thin.c
drivers/md/dm-verity-target.c
drivers/md/dm.c
drivers/nvdimm/pmem.c
fs/dax.c
include/linux/dax.h

diff --combined drivers/dax/super.c
index 3600ff7866462254d07a87741269231688b9262b,8b458f1b30c786fb8768e81f1153eaf06873c94d..557b937035328e9376e8f4eee71beeaecefa0dc2
@@@ -46,8 -46,6 +46,8 @@@ void dax_read_unlock(int id
  EXPORT_SYMBOL_GPL(dax_read_unlock);
  
  #ifdef CONFIG_BLOCK
 +#include <linux/blkdev.h>
 +
  int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
                pgoff_t *pgoff)
  {
  }
  EXPORT_SYMBOL(bdev_dax_pgoff);
  
 +#if IS_ENABLED(CONFIG_FS_DAX)
 +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 +{
 +      if (!blk_queue_dax(bdev->bd_queue))
 +              return NULL;
 +      return fs_dax_get_by_host(bdev->bd_disk->disk_name);
 +}
 +EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
 +#endif
 +
  /**
   * __bdev_dax_supported() - Check if the device supports dax for filesystem
   * @sb: The superblock of the device
@@@ -201,8 -189,10 +201,10 @@@ static umode_t dax_visible(struct kobje
        if (!dax_dev)
                return 0;
  
-       if (a == &dev_attr_write_cache.attr && !dax_dev->ops->flush)
+ #ifndef CONFIG_ARCH_HAS_PMEM_API
+       if (a == &dev_attr_write_cache.attr)
                return 0;
+ #endif
        return a->mode;
  }
  
@@@ -267,18 -257,23 +269,23 @@@ size_t dax_copy_from_iter(struct dax_de
  }
  EXPORT_SYMBOL_GPL(dax_copy_from_iter);
  
- void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
-               size_t size)
+ #ifdef CONFIG_ARCH_HAS_PMEM_API
+ void arch_wb_cache_pmem(void *addr, size_t size);
+ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
  {
-       if (!dax_alive(dax_dev))
+       if (unlikely(!dax_alive(dax_dev)))
                return;
  
-       if (!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))
+       if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
                return;
  
-       if (dax_dev->ops->flush)
-               dax_dev->ops->flush(dax_dev, pgoff, addr, size);
+       arch_wb_cache_pmem(addr, size);
  }
+ #else
+ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
+ {
+ }
+ #endif
  EXPORT_SYMBOL_GPL(dax_flush);
  
  void dax_write_cache(struct dax_device *dax_dev, bool wc)
diff --combined drivers/md/dm-bufio.c
index 9601225e0ae9add198ed874598fb91121e6c0ede,94e050b395df8108aca955c1e71462cc499b139b..d216a8f7bc224c815c383cc588dd24b04c94ea1c
  #define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT        (PAGE_SIZE >> 1)
  #define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
  
+ /*
+  * Align buffer writes to this boundary.
+  * Tests show that SSDs have the highest IOPS when using 4k writes.
+  */
+ #define DM_BUFIO_WRITE_ALIGN          4096
  /*
   * dm_buffer->list_mode
   */
@@@ -149,6 -155,10 +155,10 @@@ struct dm_buffer 
        blk_status_t write_error;
        unsigned long state;
        unsigned long last_accessed;
+       unsigned dirty_start;
+       unsigned dirty_end;
+       unsigned write_start;
+       unsigned write_end;
        struct dm_bufio_client *c;
        struct list_head write_list;
        struct bio bio;
@@@ -560,7 -570,7 +570,7 @@@ static void dmio_complete(unsigned lon
  }
  
  static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
-                    unsigned n_sectors, bio_end_io_t *end_io)
+                    unsigned n_sectors, unsigned offset, bio_end_io_t *end_io)
  {
        int r;
        struct dm_io_request io_req = {
  
        if (b->data_mode != DATA_MODE_VMALLOC) {
                io_req.mem.type = DM_IO_KMEM;
-               io_req.mem.ptr.addr = b->data;
+               io_req.mem.ptr.addr = (char *)b->data + offset;
        } else {
                io_req.mem.type = DM_IO_VMA;
-               io_req.mem.ptr.vma = b->data;
+               io_req.mem.ptr.vma = (char *)b->data + offset;
        }
  
        b->bio.bi_end_io = end_io;
@@@ -609,14 -619,14 +619,14 @@@ static void inline_endio(struct bio *bi
  }
  
  static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector,
-                          unsigned n_sectors, bio_end_io_t *end_io)
+                          unsigned n_sectors, unsigned offset, bio_end_io_t *end_io)
  {
        char *ptr;
-       int len;
+       unsigned len;
  
        bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
        b->bio.bi_iter.bi_sector = sector;
 -      b->bio.bi_bdev = b->c->bdev;
 +      bio_set_dev(&b->bio, b->c->bdev);
        b->bio.bi_end_io = inline_endio;
        /*
         * Use of .bi_private isn't a problem here because
        b->bio.bi_private = end_io;
        bio_set_op_attrs(&b->bio, rw, 0);
  
-       /*
-        * We assume that if len >= PAGE_SIZE ptr is page-aligned.
-        * If len < PAGE_SIZE the buffer doesn't cross page boundary.
-        */
-       ptr = b->data;
+       ptr = (char *)b->data + offset;
        len = n_sectors << SECTOR_SHIFT;
  
-       if (len >= PAGE_SIZE)
-               BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
-       else
-               BUG_ON((unsigned long)ptr & (len - 1));
        do {
-               if (!bio_add_page(&b->bio, virt_to_page(ptr),
-                                 len < PAGE_SIZE ? len : PAGE_SIZE,
+               unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
+               if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step,
                                  offset_in_page(ptr))) {
                        BUG_ON(b->c->block_size <= PAGE_SIZE);
-                       use_dmio(b, rw, sector, n_sectors, end_io);
+                       use_dmio(b, rw, sector, n_sectors, offset, end_io);
                        return;
                }
  
-               len -= PAGE_SIZE;
-               ptr += PAGE_SIZE;
+               len -= this_step;
+               ptr += this_step;
        } while (len > 0);
  
        submit_bio(&b->bio);
@@@ -657,18 -658,33 +658,33 @@@ static void submit_io(struct dm_buffer 
  {
        unsigned n_sectors;
        sector_t sector;
-       if (rw == WRITE && b->c->write_callback)
-               b->c->write_callback(b);
+       unsigned offset, end;
  
        sector = (b->block << b->c->sectors_per_block_bits) + b->c->start;
-       n_sectors = 1 << b->c->sectors_per_block_bits;
+       if (rw != WRITE) {
+               n_sectors = 1 << b->c->sectors_per_block_bits;
+               offset = 0;
+       } else {
+               if (b->c->write_callback)
+                       b->c->write_callback(b);
+               offset = b->write_start;
+               end = b->write_end;
+               offset &= -DM_BUFIO_WRITE_ALIGN;
+               end += DM_BUFIO_WRITE_ALIGN - 1;
+               end &= -DM_BUFIO_WRITE_ALIGN;
+               if (unlikely(end > b->c->block_size))
+                       end = b->c->block_size;
+               sector += offset >> SECTOR_SHIFT;
+               n_sectors = (end - offset) >> SECTOR_SHIFT;
+       }
  
        if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) &&
            b->data_mode != DATA_MODE_VMALLOC)
-               use_inline_bio(b, rw, sector, n_sectors, end_io);
+               use_inline_bio(b, rw, sector, n_sectors, offset, end_io);
        else
-               use_dmio(b, rw, sector, n_sectors, end_io);
+               use_dmio(b, rw, sector, n_sectors, offset, end_io);
  }
  
  /*----------------------------------------------------------------
@@@ -720,6 -736,9 +736,9 @@@ static void __write_dirty_buffer(struc
        clear_bit(B_DIRTY, &b->state);
        wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
  
+       b->write_start = b->dirty_start;
+       b->write_end = b->dirty_end;
        if (!write_list)
                submit_io(b, WRITE, write_endio);
        else
@@@ -1221,19 -1240,37 +1240,37 @@@ void dm_bufio_release(struct dm_buffer 
  }
  EXPORT_SYMBOL_GPL(dm_bufio_release);
  
- void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+ void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
+                                       unsigned start, unsigned end)
  {
        struct dm_bufio_client *c = b->c;
  
+       BUG_ON(start >= end);
+       BUG_ON(end > b->c->block_size);
        dm_bufio_lock(c);
  
        BUG_ON(test_bit(B_READING, &b->state));
  
-       if (!test_and_set_bit(B_DIRTY, &b->state))
+       if (!test_and_set_bit(B_DIRTY, &b->state)) {
+               b->dirty_start = start;
+               b->dirty_end = end;
                __relink_lru(b, LIST_DIRTY);
+       } else {
+               if (start < b->dirty_start)
+                       b->dirty_start = start;
+               if (end > b->dirty_end)
+                       b->dirty_end = end;
+       }
  
        dm_bufio_unlock(c);
  }
+ EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
+ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+ {
+       dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
+ }
  EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
  
  void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
@@@ -1398,6 -1435,8 +1435,8 @@@ retry
                wait_on_bit_io(&b->state, B_WRITING,
                               TASK_UNINTERRUPTIBLE);
                set_bit(B_DIRTY, &b->state);
+               b->dirty_start = 0;
+               b->dirty_end = c->block_size;
                __unlink_buffer(b);
                __link_buffer(b, new_block, LIST_DIRTY);
        } else {
diff --combined drivers/md/dm-cache-target.c
index dcac25c2be7a25ef6ba0d67f24d69362e1abe79b,b0a5503a2fd318567d9302848646ee40eef64341..8785134c9f1f1aee8db79506a3f0384d6cf92bfa
@@@ -833,7 -833,7 +833,7 @@@ static bool is_discarded_oblock(struct 
   *--------------------------------------------------------------*/
  static void remap_to_origin(struct cache *cache, struct bio *bio)
  {
 -      bio->bi_bdev = cache->origin_dev->bdev;
 +      bio_set_dev(bio, cache->origin_dev->bdev);
  }
  
  static void remap_to_cache(struct cache *cache, struct bio *bio,
        sector_t bi_sector = bio->bi_iter.bi_sector;
        sector_t block = from_cblock(cblock);
  
 -      bio->bi_bdev = cache->cache_dev->bdev;
 +      bio_set_dev(bio, cache->cache_dev->bdev);
        if (!block_size_is_power_of_two(cache))
                bio->bi_iter.bi_sector =
                        (block * cache->sectors_per_block) +
@@@ -2306,7 -2306,7 +2306,7 @@@ static void init_features(struct cache_
  static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
                          char **error)
  {
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 2, "Invalid number of cache feature arguments"},
        };
  
  static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
                        char **error)
  {
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 1024, "Invalid number of policy arguments"},
        };
  
diff --combined drivers/md/dm-crypt.c
index 54aef8ed97db22fb1b28f47d4be95721aea24d0d,abf16559ed49543a728f793dfd9d244b272c7374..a55ffd4f5933fc1247b6729dcc0344964c9f01b8
@@@ -758,8 -758,9 +758,8 @@@ static int crypt_iv_tcw_whitening(struc
        int i, r;
  
        /* xor whitening with sector number */
 -      memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
 -      crypto_xor(buf, (u8 *)&sector, 8);
 -      crypto_xor(&buf[8], (u8 *)&sector, 8);
 +      crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
 +      crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
  
        /* calculate crc32 for every 32bit part and xor it */
        desc->tfm = tcw->crc32_tfm;
@@@ -804,10 -805,10 +804,10 @@@ static int crypt_iv_tcw_gen(struct cryp
        }
  
        /* Calculate IV */
 -      memcpy(iv, tcw->iv_seed, cc->iv_size);
 -      crypto_xor(iv, (u8 *)&sector, 8);
 +      crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)&sector, 8);
        if (cc->iv_size > 8)
 -              crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
 +              crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
 +                             cc->iv_size - 8);
  
        return r;
  }
@@@ -932,6 -933,9 +932,6 @@@ static int dm_crypt_integrity_io_alloc(
        bip->bip_iter.bi_size = tag_len;
        bip->bip_iter.bi_sector = io->cc->start + io->sector;
  
 -      /* We own the metadata, do not let bio_free to release it */
 -      bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
 -
        ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
                                     tag_len, offset_in_page(io->integrity_metadata));
        if (unlikely(ret != tag_len))
@@@ -1543,7 -1547,7 +1543,7 @@@ static void clone_init(struct dm_crypt_
  
        clone->bi_private = io;
        clone->bi_end_io  = crypt_endio;
 -      clone->bi_bdev    = cc->dev->bdev;
 +      bio_set_dev(clone, cc->dev->bdev);
        clone->bi_opf     = io->base_bio->bi_opf;
  }
  
@@@ -2529,7 -2533,7 +2529,7 @@@ static int crypt_ctr_optional(struct dm
  {
        struct crypt_config *cc = ti->private;
        struct dm_arg_set as;
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 6, "Invalid number of feature args"},
        };
        unsigned int opt_params, val;
@@@ -2792,7 -2796,7 +2792,7 @@@ static int crypt_map(struct dm_target *
         */
        if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
            bio_op(bio) == REQ_OP_DISCARD)) {
 -              bio->bi_bdev = cc->dev->bdev;
 +              bio_set_dev(bio, cc->dev->bdev);
                if (bio_sectors(bio))
                        bio->bi_iter.bi_sector = cc->start +
                                dm_target_offset(ti, bio->bi_iter.bi_sector);
diff --combined drivers/md/dm-flakey.c
index 7146c2d9762dfdc14f9815b651d59b992c0583e0,d8bb371e63d7c0dfde61234e7e0e28a97abe3b21..b82cb1ab1eaa338a67800bfdb0c38d36f7913ef2
@@@ -51,7 -51,7 +51,7 @@@ static int parse_features(struct dm_arg
        unsigned argc;
        const char *arg_name;
  
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 6, "Invalid number of feature args"},
                {1, UINT_MAX, "Invalid corrupt bio byte"},
                {0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
   */
  static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
  {
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, UINT_MAX, "Invalid up interval"},
                {0, UINT_MAX, "Invalid down interval"},
        };
@@@ -274,7 -274,7 +274,7 @@@ static void flakey_map_bio(struct dm_ta
  {
        struct flakey_c *fc = ti->private;
  
 -      bio->bi_bdev = fc->dev->bdev;
 +      bio_set_dev(bio, fc->dev->bdev);
        if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
                bio->bi_iter.bi_sector =
                        flakey_map_sector(ti, bio->bi_iter.bi_sector);
diff --combined drivers/md/dm-integrity.c
index 27c0f223f8ea8f6164293283f2da8ad50d2d034d,ac0d7759594bb05f70c2a5980c64111c2f538861..096fe9b66c50749cb841bcbafc4bc6cb6ca63be7
@@@ -225,6 -225,8 +225,8 @@@ struct dm_integrity_c 
        struct alg_spec internal_hash_alg;
        struct alg_spec journal_crypt_alg;
        struct alg_spec journal_mac_alg;
+       atomic64_t number_of_mismatches;
  };
  
  struct dm_integrity_range {
@@@ -250,8 -252,7 +252,8 @@@ struct dm_integrity_io 
  
        struct completion *completion;
  
 -      struct block_device *orig_bi_bdev;
 +      struct gendisk *orig_bi_disk;
 +      u8 orig_bi_partno;
        bio_end_io_t *orig_bi_end_io;
        struct bio_integrity_payload *orig_bi_integrity;
        struct bvec_iter orig_bi_iter;
@@@ -298,7 -299,7 +300,7 @@@ static void __DEBUG_bytes(__u8 *bytes, 
  /*
   * DM Integrity profile, protection is performed layer above (dm-crypt)
   */
- static struct blk_integrity_profile dm_integrity_profile = {
+ static const struct blk_integrity_profile dm_integrity_profile = {
        .name                   = "DM-DIF-EXT-TAG",
        .generate_fn            = NULL,
        .verify_fn              = NULL,
@@@ -310,6 -311,8 +312,8 @@@ static void dm_integrity_dtr(struct dm_
  
  static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
  {
+       if (err == -EILSEQ)
+               atomic64_inc(&ic->number_of_mismatches);
        if (!cmpxchg(&ic->failed, 0, err))
                DMERR("Error on %s: %d", msg, err);
  }
@@@ -770,13 -773,13 +774,13 @@@ static void write_journal(struct dm_int
        unsigned i;
  
        io_comp.ic = ic;
-       io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
+       init_completion(&io_comp.comp);
  
        if (commit_start + commit_sections <= ic->journal_sections) {
                io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
                if (ic->journal_io) {
                        crypt_comp_1.ic = ic;
-                       crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+                       init_completion(&crypt_comp_1.comp);
                        crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
                        encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
                        wait_for_completion_io(&crypt_comp_1.comp);
                to_end = ic->journal_sections - commit_start;
                if (ic->journal_io) {
                        crypt_comp_1.ic = ic;
-                       crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+                       init_completion(&crypt_comp_1.comp);
                        crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
                        encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
                        if (try_wait_for_completion(&crypt_comp_1.comp)) {
                                rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
-                               crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+                               reinit_completion(&crypt_comp_1.comp);
                                crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
                                encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
                                wait_for_completion_io(&crypt_comp_1.comp);
                        } else {
                                crypt_comp_2.ic = ic;
-                               crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp);
+                               init_completion(&crypt_comp_2.comp);
                                crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
                                encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
                                wait_for_completion_io(&crypt_comp_1.comp);
@@@ -1041,7 -1044,7 +1045,7 @@@ static int dm_integrity_rw_tag(struct d
                        memcpy(tag, dp, to_copy);
                } else if (op == TAG_WRITE) {
                        memcpy(dp, tag, to_copy);
-                       dm_bufio_mark_buffer_dirty(b);
+                       dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
                } else  {
                        /* e.g.: op == TAG_CMP */
                        if (unlikely(memcmp(dp, tag, to_copy))) {
@@@ -1165,8 -1168,7 +1169,8 @@@ static void integrity_end_io(struct bi
        struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
  
        bio->bi_iter = dio->orig_bi_iter;
 -      bio->bi_bdev = dio->orig_bi_bdev;
 +      bio->bi_disk = dio->orig_bi_disk;
 +      bio->bi_partno = dio->orig_bi_partno;
        if (dio->orig_bi_integrity) {
                bio->bi_integrity = dio->orig_bi_integrity;
                bio->bi_opf |= REQ_INTEGRITY;
@@@ -1275,6 -1277,7 +1279,7 @@@ again
                                        DMERR("Checksum failed at sector 0x%llx",
                                              (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
                                        r = -EILSEQ;
+                                       atomic64_inc(&ic->number_of_mismatches);
                                }
                                if (likely(checksums != checksums_onstack))
                                        kfree(checksums);
@@@ -1676,16 -1679,15 +1681,16 @@@ sleep
        dio->in_flight = (atomic_t)ATOMIC_INIT(2);
  
        if (need_sync_io) {
-               read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp);
+               init_completion(&read_comp);
                dio->completion = &read_comp;
        } else
                dio->completion = NULL;
  
        dio->orig_bi_iter = bio->bi_iter;
  
 -      dio->orig_bi_bdev = bio->bi_bdev;
 -      bio->bi_bdev = ic->dev->bdev;
 +      dio->orig_bi_disk = bio->bi_disk;
 +      dio->orig_bi_partno = bio->bi_partno;
 +      bio_set_dev(bio, ic->dev->bdev);
  
        dio->orig_bi_integrity = bio_integrity(bio);
        bio->bi_integrity = NULL;
  
        if (need_sync_io) {
                wait_for_completion_io(&read_comp);
-               integrity_metadata(&dio->work);
+               if (likely(!bio->bi_status))
+                       integrity_metadata(&dio->work);
+               else
+                       dec_in_flight(dio);
        } else {
                INIT_WORK(&dio->work, integrity_metadata);
                queue_work(ic->metadata_wq, &dio->work);
@@@ -1834,7 -1840,7 +1843,7 @@@ static void do_journal_write(struct dm_
  
        comp.ic = ic;
        comp.in_flight = (atomic_t)ATOMIC_INIT(1);
-       comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+       init_completion(&comp.comp);
  
        i = write_start;
        for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
@@@ -2061,7 -2067,7 +2070,7 @@@ static void replay_journal(struct dm_in
                if (ic->journal_io) {
                        struct journal_completion crypt_comp;
                        crypt_comp.ic = ic;
-                       crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp);
+                       init_completion(&crypt_comp.comp);
                        crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
                        encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
                        wait_for_completion(&crypt_comp.comp);
@@@ -2233,7 -2239,7 +2242,7 @@@ static void dm_integrity_status(struct 
  
        switch (type) {
        case STATUSTYPE_INFO:
-               result[0] = '\0';
+               DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches));
                break;
  
        case STATUSTYPE_TABLE: {
@@@ -2634,7 -2640,7 +2643,7 @@@ static int create_journal(struct dm_int
                        memset(iv, 0x00, ivsize);
  
                        skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv);
-                       comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+                       init_completion(&comp.comp);
                        comp.in_flight = (atomic_t)ATOMIC_INIT(1);
                        if (do_crypt(true, req, &comp))
                                wait_for_completion(&comp.comp);
  
                                sg_init_one(&sg, crypt_data, crypt_len);
                                skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv);
-                               comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+                               init_completion(&comp.comp);
                                comp.in_flight = (atomic_t)ATOMIC_INIT(1);
                                if (do_crypt(true, req, &comp))
                                        wait_for_completion(&comp.comp);
@@@ -2778,7 -2784,7 +2787,7 @@@ static int dm_integrity_ctr(struct dm_t
        int r;
        unsigned extra_args;
        struct dm_arg_set as;
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 9, "Invalid number of feature args"},
        };
        unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
        bio_list_init(&ic->flush_bio_list);
        init_waitqueue_head(&ic->copy_to_journal_wait);
        init_completion(&ic->crypto_backoff);
+       atomic64_set(&ic->number_of_mismatches, 0);
  
        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
        if (r) {
@@@ -3202,7 -3209,7 +3212,7 @@@ static void dm_integrity_dtr(struct dm_
  
  static struct target_type integrity_target = {
        .name                   = "integrity",
-       .version                = {1, 0, 0},
+       .version                = {1, 1, 0},
        .module                 = THIS_MODULE,
        .features               = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
        .ctr                    = dm_integrity_ctr,
diff --combined drivers/md/dm-linear.c
index 405eca206d67c3e8b877f61140bd89a5b7f33320,208800610af83b2c656dfc651b3056a7cd48631b..d5f8eff7c11d88a066d1dd83fe4707b33264cfae
@@@ -88,7 -88,7 +88,7 @@@ static void linear_map_bio(struct dm_ta
  {
        struct linear_c *lc = ti->private;
  
 -      bio->bi_bdev = lc->dev->bdev;
 +      bio_set_dev(bio, lc->dev->bdev);
        if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
                bio->bi_iter.bi_sector =
                        linear_map_sector(ti, bio->bi_iter.bi_sector);
@@@ -184,20 -184,6 +184,6 @@@ static size_t linear_dax_copy_from_iter
        return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
  }
  
- static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
-               size_t size)
- {
-       struct linear_c *lc = ti->private;
-       struct block_device *bdev = lc->dev->bdev;
-       struct dax_device *dax_dev = lc->dev->dax_dev;
-       sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
-       dev_sector = linear_map_sector(ti, sector);
-       if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
-               return;
-       dax_flush(dax_dev, pgoff, addr, size);
- }
  static struct target_type linear_target = {
        .name   = "linear",
        .version = {1, 4, 0},
        .iterate_devices = linear_iterate_devices,
        .direct_access = linear_dax_direct_access,
        .dax_copy_from_iter = linear_dax_copy_from_iter,
-       .dax_flush = linear_dax_flush,
  };
  
  int __init dm_linear_init(void)
diff --combined drivers/md/dm-log-writes.c
index 534a254eb977381cd6589921e33953f29cfa91fb,09979bdb6fe3c356f662b195330d82019c649b29..8b80a9ce9ea9c17ad9b6797aa51e6763f7f9fbc3
@@@ -100,6 -100,7 +100,7 @@@ struct log_writes_c 
        struct dm_dev *logdev;
        u64 logged_entries;
        u32 sectorsize;
+       u32 sectorshift;
        atomic_t io_blocks;
        atomic_t pending_blocks;
        sector_t next_sector;
@@@ -128,6 -129,18 +129,18 @@@ struct per_bio_data 
        struct pending_block *block;
  };
  
+ static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
+                                         sector_t sectors)
+ {
+       return sectors >> (lc->sectorshift - SECTOR_SHIFT);
+ }
+ static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
+                                         sector_t sectors)
+ {
+       return sectors << (lc->sectorshift - SECTOR_SHIFT);
+ }
  static void put_pending_block(struct log_writes_c *lc)
  {
        if (atomic_dec_and_test(&lc->pending_blocks)) {
@@@ -198,7 -211,7 +211,7 @@@ static int write_metadata(struct log_wr
        }
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_sector = sector;
 -      bio->bi_bdev = lc->logdev->bdev;
 +      bio_set_dev(bio, lc->logdev->bdev);
        bio->bi_end_io = log_end_io;
        bio->bi_private = lc;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@@ -253,7 -266,7 +266,7 @@@ static int log_one_block(struct log_wri
  
        if (!block->vec_cnt)
                goto out;
-       sector++;
+       sector += dev_to_bio_sectors(lc, 1);
  
        atomic_inc(&lc->io_blocks);
        bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
        }
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_sector = sector;
 -      bio->bi_bdev = lc->logdev->bdev;
 +      bio_set_dev(bio, lc->logdev->bdev);
        bio->bi_end_io = log_end_io;
        bio->bi_private = lc;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
                        }
                        bio->bi_iter.bi_size = 0;
                        bio->bi_iter.bi_sector = sector;
 -                      bio->bi_bdev = lc->logdev->bdev;
 +                      bio_set_dev(bio, lc->logdev->bdev);
                        bio->bi_end_io = log_end_io;
                        bio->bi_private = lc;
                        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@@ -354,10 -367,9 +367,9 @@@ static int log_writes_kthread(void *arg
                                goto next;
  
                        sector = lc->next_sector;
-                       if (block->flags & LOG_DISCARD_FLAG)
-                               lc->next_sector++;
-                       else
-                               lc->next_sector += block->nr_sectors + 1;
+                       if (!(block->flags & LOG_DISCARD_FLAG))
+                               lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
+                       lc->next_sector += dev_to_bio_sectors(lc, 1);
  
                        /*
                         * Apparently the size of the device may not be known
@@@ -399,7 -411,7 +411,7 @@@ next
                if (!try_to_freeze()) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (!kthread_should_stop() &&
-                           !atomic_read(&lc->pending_blocks))
+                           list_empty(&lc->logging_blocks))
                                schedule();
                        __set_current_state(TASK_RUNNING);
                }
@@@ -435,7 -447,6 +447,6 @@@ static int log_writes_ctr(struct dm_tar
        INIT_LIST_HEAD(&lc->unflushed_blocks);
        INIT_LIST_HEAD(&lc->logging_blocks);
        init_waitqueue_head(&lc->wait);
-       lc->sectorsize = 1 << SECTOR_SHIFT;
        atomic_set(&lc->io_blocks, 0);
        atomic_set(&lc->pending_blocks, 0);
  
                goto bad;
        }
  
+       lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
+       lc->sectorshift = ilog2(lc->sectorsize);
        lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
        if (IS_ERR(lc->log_kthread)) {
                ret = PTR_ERR(lc->log_kthread);
                goto bad;
        }
  
-       /* We put the super at sector 0, start logging at sector 1 */
-       lc->next_sector = 1;
+       /*
+        * next_sector is in 512b sectors to correspond to what bi_sector expects.
+        * The super starts at sector 0, and the next_sector is the next logical
+        * one based on the sectorsize of the device.
+        */
+       lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
        lc->logging_enabled = true;
        lc->end_sector = logdev_last_sector(lc);
        lc->device_supports_discard = true;
@@@ -539,7 -556,7 +556,7 @@@ static void normal_map_bio(struct dm_ta
  {
        struct log_writes_c *lc = ti->private;
  
 -      bio->bi_bdev = lc->dev->bdev;
 +      bio_set_dev(bio, lc->dev->bdev);
  }
  
  static int log_writes_map(struct dm_target *ti, struct bio *bio)
        if (discard_bio)
                block->flags |= LOG_DISCARD_FLAG;
  
-       block->sector = bio->bi_iter.bi_sector;
-       block->nr_sectors = bio_sectors(bio);
+       block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
+       block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));
  
        /* We don't need the data, just submit */
        if (discard_bio) {
@@@ -767,9 -784,12 +784,12 @@@ static void log_writes_io_hints(struct 
  
        if (!q || !blk_queue_discard(q)) {
                lc->device_supports_discard = false;
-               limits->discard_granularity = 1 << SECTOR_SHIFT;
+               limits->discard_granularity = lc->sectorsize;
                limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
        }
+       limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
+       limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
+       limits->io_min = limits->physical_block_size;
  }
  
  static struct target_type log_writes_target = {
diff --combined drivers/md/dm-mpath.c
index 96aedaac2c644df49ccb7dbe590446763738ac26,bf280a99fa81f63b58d9c6f8bf828297df39745c..11f273d2f018e722b1e9480c5a5eb0e59b0f0048
@@@ -565,7 -565,7 +565,7 @@@ static int __multipath_map_bio(struct m
        mpio->nr_bytes = nr_bytes;
  
        bio->bi_status = 0;
 -      bio->bi_bdev = pgpath->path.dev->bdev;
 +      bio_set_dev(bio, pgpath->path.dev->bdev);
        bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
  
        if (pgpath->pg->ps.type->start_io)
@@@ -632,6 -632,10 +632,10 @@@ static void process_queued_bios(struct 
                case DM_MAPIO_REMAPPED:
                        generic_make_request(bio);
                        break;
+               case 0:
+                       break;
+               default:
+                       WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
                }
        }
        blk_finish_plug(&plug);
@@@ -698,7 -702,7 +702,7 @@@ static int parse_path_selector(struct d
        struct path_selector_type *pst;
        unsigned ps_argc;
  
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 1024, "invalid number of path selector args"},
        };
  
@@@ -822,7 -826,7 +826,7 @@@ retain
  static struct priority_group *parse_priority_group(struct dm_arg_set *as,
                                                   struct multipath *m)
  {
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {1, 1024, "invalid number of paths"},
                {0, 1024, "invalid number of selector args"}
        };
@@@ -898,7 -902,7 +902,7 @@@ static int parse_hw_handler(struct dm_a
        int ret;
        struct dm_target *ti = m->ti;
  
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 1024, "invalid number of hardware handler args"},
        };
  
@@@ -950,7 -954,7 +954,7 @@@ static int parse_features(struct dm_arg
        struct dm_target *ti = m->ti;
        const char *arg_name;
  
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 8, "invalid number of feature args"},
                {1, 50, "pg_init_retries must be between 1 and 50"},
                {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
  static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
  {
        /* target arguments */
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 1024, "invalid number of priority groups"},
                {0, 1024, "invalid initial priority group number"},
        };
@@@ -1379,6 -1383,7 +1383,7 @@@ static void pg_init_done(void *data, in
        case SCSI_DH_RETRY:
                /* Wait before retrying. */
                delay_retry = 1;
+               /* fall through */
        case SCSI_DH_IMM_RETRY:
        case SCSI_DH_RES_TEMP_UNAVAIL:
                if (pg_init_limit_reached(m, pgpath))
diff --combined drivers/md/dm-stripe.c
index ab50d7c4377f8fd95fe86f875e23c70d22919fb6,1690bb299b3f77881444137edfc9f9f1b1356db4..b5e892149c542ba76f0cad8d819cb30c442075e9
@@@ -270,7 -270,7 +270,7 @@@ static int stripe_map_range(struct stri
        stripe_map_range_sector(sc, bio_end_sector(bio),
                                target_stripe, &end);
        if (begin < end) {
 -              bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
 +              bio_set_dev(bio, sc->stripe[target_stripe].dev->bdev);
                bio->bi_iter.bi_sector = begin +
                        sc->stripe[target_stripe].physical_start;
                bio->bi_iter.bi_size = to_bytes(end - begin);
@@@ -291,7 -291,7 +291,7 @@@ static int stripe_map(struct dm_target 
        if (bio->bi_opf & REQ_PREFLUSH) {
                target_bio_nr = dm_bio_get_target_bio_nr(bio);
                BUG_ON(target_bio_nr >= sc->stripes);
 -              bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
 +              bio_set_dev(bio, sc->stripe[target_bio_nr].dev->bdev);
                return DM_MAPIO_REMAPPED;
        }
        if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
                          &stripe, &bio->bi_iter.bi_sector);
  
        bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start;
 -      bio->bi_bdev = sc->stripe[stripe].dev->bdev;
 +      bio_set_dev(bio, sc->stripe[stripe].dev->bdev);
  
        return DM_MAPIO_REMAPPED;
  }
@@@ -351,25 -351,6 +351,6 @@@ static size_t stripe_dax_copy_from_iter
        return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
  }
  
- static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
-               size_t size)
- {
-       sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
-       struct stripe_c *sc = ti->private;
-       struct dax_device *dax_dev;
-       struct block_device *bdev;
-       uint32_t stripe;
-       stripe_map_sector(sc, sector, &stripe, &dev_sector);
-       dev_sector += sc->stripe[stripe].physical_start;
-       dax_dev = sc->stripe[stripe].dev->dax_dev;
-       bdev = sc->stripe[stripe].dev->bdev;
-       if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
-               return;
-       dax_flush(dax_dev, pgoff, addr, size);
- }
  /*
   * Stripe status:
   *
@@@ -430,7 -411,9 +411,7 @@@ static int stripe_end_io(struct dm_targ
                return DM_ENDIO_DONE;
  
        memset(major_minor, 0, sizeof(major_minor));
 -      sprintf(major_minor, "%d:%d",
 -              MAJOR(disk_devt(bio->bi_bdev->bd_disk)),
 -              MINOR(disk_devt(bio->bi_bdev->bd_disk)));
 +      sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)));
  
        /*
         * Test to see which stripe drive triggered the event
@@@ -489,7 -472,6 +470,6 @@@ static struct target_type stripe_targe
        .io_hints = stripe_io_hints,
        .direct_access = stripe_dax_direct_access,
        .dax_copy_from_iter = stripe_dax_copy_from_iter,
-       .dax_flush = stripe_dax_flush,
  };
  
  int __init dm_stripe_init(void)
diff --combined drivers/md/dm-switch.c
index 2dcea4c56f37f7cf0c0fd9c5b8eea8f68b229707,83a371d544124fbf21a3762bbd5201485bdc76c9..4c8de1ff78cac8ad8a575fcbd42c81a93d6bd79e
@@@ -251,7 -251,7 +251,7 @@@ static void switch_dtr(struct dm_targe
   */
  static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
  {
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
                {1, UINT_MAX, "Invalid region size"},
                {0, 0, "Invalid number of optional args"},
@@@ -322,7 -322,7 +322,7 @@@ static int switch_map(struct dm_target 
        sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
        unsigned path_nr = switch_get_path_nr(sctx, offset);
  
 -      bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
 +      bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
        bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
  
        return DM_MAPIO_REMAPPED;
diff --combined drivers/md/dm-thin.c
index 69d88aee30554d017a6f7164d343bb17ee13f716,9736621c2963e281d7c471cb6da6d746587d16ae..1e25705209c27fbb8e62f5d096c9dbe157d82c77
@@@ -679,7 -679,7 +679,7 @@@ static void remap(struct thin_c *tc, st
        struct pool *pool = tc->pool;
        sector_t bi_sector = bio->bi_iter.bi_sector;
  
 -      bio->bi_bdev = tc->pool_dev->bdev;
 +      bio_set_dev(bio, tc->pool_dev->bdev);
        if (block_size_is_power_of_two(pool))
                bio->bi_iter.bi_sector =
                        (block << pool->sectors_per_block_shift) |
  
  static void remap_to_origin(struct thin_c *tc, struct bio *bio)
  {
 -      bio->bi_bdev = tc->origin_dev->bdev;
 +      bio_set_dev(bio, tc->origin_dev->bdev);
  }
  
  static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
@@@ -3041,7 -3041,7 +3041,7 @@@ static int parse_pool_features(struct d
        unsigned argc;
        const char *arg_name;
  
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, 4, "Invalid number of pool feature arguments"},
        };
  
@@@ -3313,7 -3313,7 +3313,7 @@@ static int pool_map(struct dm_target *t
         * As this is a singleton target, ti->begin is always zero.
         */
        spin_lock_irqsave(&pool->lock, flags);
 -      bio->bi_bdev = pt->data_dev->bdev;
 +      bio_set_dev(bio, pt->data_dev->bdev);
        r = DM_MAPIO_REMAPPED;
        spin_unlock_irqrestore(&pool->lock, flags);
  
diff --combined drivers/md/dm-verity-target.c
index 1c5b6185c79d049e6621a495ba27e76bbafc5916,79f18d4d7f021190e523f5a36127a9b95d2adf85..bda3caca23ca69af2fe97592aa817a29b87851d6
@@@ -637,7 -637,7 +637,7 @@@ static int verity_map(struct dm_target 
        struct dm_verity *v = ti->private;
        struct dm_verity_io *io;
  
 -      bio->bi_bdev = v->data_dev->bdev;
 +      bio_set_dev(bio, v->data_dev->bdev);
        bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector);
  
        if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
@@@ -839,7 -839,7 +839,7 @@@ static int verity_parse_opt_args(struc
        struct dm_target *ti = v->ti;
        const char *arg_name;
  
-       static struct dm_arg _args[] = {
+       static const struct dm_arg _args[] = {
                {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"},
        };
  
diff --combined drivers/md/dm.c
index 04ae795e8a5f4d5843260772d9f624d2734f3d31,825eaffc24da9706152e092cf074678e743fc743..6e54145969c5ce30184cf162283db0f01796f1ca
@@@ -510,7 -510,7 +510,7 @@@ static void start_io_acct(struct dm_io 
        io->start_time = jiffies;
  
        cpu = part_stat_lock();
 -      part_round_stats(cpu, &dm_disk(md)->part0);
 +      part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
        part_stat_unlock();
        atomic_set(&dm_disk(md)->part0.in_flight[rw],
                atomic_inc_return(&md->pending[rw]));
@@@ -529,7 -529,7 +529,7 @@@ static void end_io_acct(struct dm_io *i
        int pending;
        int rw = bio_data_dir(bio);
  
 -      generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
 +      generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
  
        if (unlikely(dm_stats_used(&md->stats)))
                dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@@ -841,10 -841,10 +841,10 @@@ static void clone_endio(struct bio *bio
  
        if (unlikely(error == BLK_STS_TARGET)) {
                if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 -                  !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 +                  !bio->bi_disk->queue->limits.max_write_same_sectors)
                        disable_write_same(md);
                if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 -                  !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
 +                  !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
                        disable_write_zeroes(md);
        }
  
@@@ -987,24 -987,6 +987,6 @@@ static size_t dm_dax_copy_from_iter(str
        return ret;
  }
  
- static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
-               size_t size)
- {
-       struct mapped_device *md = dax_get_private(dax_dev);
-       sector_t sector = pgoff * PAGE_SECTORS;
-       struct dm_target *ti;
-       int srcu_idx;
-       ti = dm_dax_get_live_target(md, sector, &srcu_idx);
-       if (!ti)
-               goto out;
-       if (ti->type->dax_flush)
-               ti->type->dax_flush(ti, pgoff, addr, size);
-  out:
-       dm_put_live_table(md, srcu_idx);
- }
  /*
   * A target may call dm_accept_partial_bio only from the map routine.  It is
   * allowed for all bio types except REQ_PREFLUSH.
@@@ -1205,8 -1187,8 +1187,8 @@@ static void __map_bio(struct dm_target_
                break;
        case DM_MAPIO_REMAPPED:
                /* the bio has been remapped so dispatch it */
 -              trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
 -                                    tio->io->bio->bi_bdev->bd_dev, sector);
 +              trace_block_bio_remap(clone->bi_disk->queue, clone,
 +                                    bio_dev(tio->io->bio), sector);
                generic_make_request(clone);
                break;
        case DM_MAPIO_KILL:
@@@ -1532,7 -1514,7 +1514,7 @@@ static blk_qc_t dm_make_request(struct 
  
        map = dm_get_live_table(md, &srcu_idx);
  
 -      generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
 +      generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
  
        /* if we're suspended, we have to queue this io for later */
        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
@@@ -1786,7 -1768,7 +1768,7 @@@ static struct mapped_device *alloc_dev(
                goto bad;
  
        bio_init(&md->flush_bio, NULL, 0);
 -      md->flush_bio.bi_bdev = md->bdev;
 +      bio_set_dev(&md->flush_bio, md->bdev);
        md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
  
        dm_stats_init(&md->stats);
@@@ -2992,7 -2974,6 +2974,6 @@@ static const struct block_device_operat
  static const struct dax_operations dm_dax_ops = {
        .direct_access = dm_dax_direct_access,
        .copy_from_iter = dm_dax_copy_from_iter,
-       .flush = dm_dax_flush,
  };
  
  /*
diff --combined drivers/nvdimm/pmem.c
index e9aa453da50c5a8c3d4ce9eac8cb7b7345d9ddcc,88c1282587603122995e178e1e77d481b3e30dfa..39dfd7affa319a3aa0599e6a1bfa6ade7ec3c5f1
@@@ -80,40 -80,22 +80,40 @@@ static blk_status_t pmem_clear_poison(s
  static void write_pmem(void *pmem_addr, struct page *page,
                unsigned int off, unsigned int len)
  {
 -      void *mem = kmap_atomic(page);
 -
 -      memcpy_flushcache(pmem_addr, mem + off, len);
 -      kunmap_atomic(mem);
 +      unsigned int chunk;
 +      void *mem;
 +
 +      while (len) {
 +              mem = kmap_atomic(page);
 +              chunk = min_t(unsigned int, len, PAGE_SIZE);
 +              memcpy_flushcache(pmem_addr, mem + off, chunk);
 +              kunmap_atomic(mem);
 +              len -= chunk;
 +              off = 0;
 +              page++;
 +              pmem_addr += PAGE_SIZE;
 +      }
  }
  
  static blk_status_t read_pmem(struct page *page, unsigned int off,
                void *pmem_addr, unsigned int len)
  {
 +      unsigned int chunk;
        int rc;
 -      void *mem = kmap_atomic(page);
 -
 -      rc = memcpy_mcsafe(mem + off, pmem_addr, len);
 -      kunmap_atomic(mem);
 -      if (rc)
 -              return BLK_STS_IOERR;
 +      void *mem;
 +
 +      while (len) {
 +              mem = kmap_atomic(page);
 +              chunk = min_t(unsigned int, len, PAGE_SIZE);
 +              rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
 +              kunmap_atomic(mem);
 +              if (rc)
 +                      return BLK_STS_IOERR;
 +              len -= chunk;
 +              off = 0;
 +              page++;
 +              pmem_addr += PAGE_SIZE;
 +      }
        return BLK_STS_OK;
  }
  
@@@ -206,8 -188,7 +206,8 @@@ static int pmem_rw_page(struct block_de
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        blk_status_t rc;
  
 -      rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);
 +      rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
 +                        0, is_write, sector);
  
        /*
         * The ->rw_page interface is subtle and tricky.  The core
@@@ -262,16 -243,9 +262,9 @@@ static size_t pmem_copy_from_iter(struc
        return copy_from_iter_flushcache(addr, bytes, i);
  }
  
- static void pmem_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff,
-               void *addr, size_t size)
- {
-       arch_wb_cache_pmem(addr, size);
- }
  static const struct dax_operations pmem_dax_ops = {
        .direct_access = pmem_dax_direct_access,
        .copy_from_iter = pmem_copy_from_iter,
-       .flush = pmem_dax_flush,
  };
  
  static const struct attribute_group *pmem_attribute_groups[] = {
diff --combined fs/dax.c
index 6afcacb3a87b9a3d9e692fb802efdcef484edbc4,18d970fb0e09f0b8c45f0468f0e5eb1377304969..f001d8c72a065173bc4f68733872a4c36413aece
+++ b/fs/dax.c
@@@ -42,9 -42,6 +42,9 @@@
  #define DAX_WAIT_TABLE_BITS 12
  #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  
 +/* The 'colour' (ie low bits) within a PMD of a page offset.  */
 +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
 +
  static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  
  static int __init init_dax_wait_table(void)
  }
  fs_initcall(init_dax_wait_table);
  
 +/*
 + * We use lowest available bit in exceptional entry for locking, one bit for
 + * the entry size (PMD) and two more to tell us if the entry is a zero page or
 + * an empty entry that is just used for locking.  In total four special bits.
 + *
 + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 + * block allocation.
 + */
 +#define RADIX_DAX_SHIFT               (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 +#define RADIX_DAX_ENTRY_LOCK  (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 +#define RADIX_DAX_PMD         (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
 +#define RADIX_DAX_ZERO_PAGE   (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 +#define RADIX_DAX_EMPTY               (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 +
 +static unsigned long dax_radix_sector(void *entry)
 +{
 +      return (unsigned long)entry >> RADIX_DAX_SHIFT;
 +}
 +
 +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 +{
 +      return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
 +                      ((unsigned long)sector << RADIX_DAX_SHIFT) |
 +                      RADIX_DAX_ENTRY_LOCK);
 +}
 +
 +static unsigned int dax_radix_order(void *entry)
 +{
 +      if ((unsigned long)entry & RADIX_DAX_PMD)
 +              return PMD_SHIFT - PAGE_SHIFT;
 +      return 0;
 +}
 +
  static int dax_is_pmd_entry(void *entry)
  {
        return (unsigned long)entry & RADIX_DAX_PMD;
@@@ -103,7 -66,7 +103,7 @@@ static int dax_is_pte_entry(void *entry
  
  static int dax_is_zero_entry(void *entry)
  {
 -      return (unsigned long)entry & RADIX_DAX_HZP;
 +      return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
  }
  
  static int dax_is_empty_entry(void *entry)
@@@ -135,7 -98,7 +135,7 @@@ static wait_queue_head_t *dax_entry_wai
         * the range covered by the PMD map to the same bit lock.
         */
        if (dax_is_pmd_entry(entry))
 -              index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
 +              index &= ~PG_PMD_COLOUR;
  
        key->mapping = mapping;
        key->entry_start = index;
@@@ -157,31 -120,6 +157,31 @@@ static int wake_exceptional_entry_func(
        return autoremove_wake_function(wait, mode, sync, NULL);
  }
  
 +/*
 + * We do not necessarily hold the mapping->tree_lock when we call this
 + * function so it is possible that 'entry' is no longer a valid item in the
 + * radix tree.  This is okay because all we really need to do is to find the
 + * correct waitqueue where tasks might be waiting for that old 'entry' and
 + * wake them.
 + */
 +static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 +              pgoff_t index, void *entry, bool wake_all)
 +{
 +      struct exceptional_entry_key key;
 +      wait_queue_head_t *wq;
 +
 +      wq = dax_entry_waitqueue(mapping, index, entry, &key);
 +
 +      /*
 +       * Checking for locked entry and prepare_to_wait_exclusive() happens
 +       * under mapping->tree_lock, ditto for entry handling in our callers.
 +       * So at this point all tasks that could have seen our entry locked
 +       * must be in the waitqueue and the following check will see them.
 +       */
 +      if (waitqueue_active(wq))
 +              __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 +}
 +
  /*
   * Check whether the given slot is locked. The function must be called with
   * mapping->tree_lock held
@@@ -243,8 -181,7 +243,8 @@@ static void *get_unlocked_mapping_entry
        for (;;) {
                entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
                                          &slot);
 -              if (!entry || !radix_tree_exceptional_entry(entry) ||
 +              if (!entry ||
 +                  WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
                    !slot_locked(mapping, slot)) {
                        if (slotp)
                                *slotp = slot;
@@@ -279,9 -216,14 +279,9 @@@ static void dax_unlock_mapping_entry(st
  }
  
  static void put_locked_mapping_entry(struct address_space *mapping,
 -                                   pgoff_t index, void *entry)
 +              pgoff_t index)
  {
 -      if (!radix_tree_exceptional_entry(entry)) {
 -              unlock_page(entry);
 -              put_page(entry);
 -      } else {
 -              dax_unlock_mapping_entry(mapping, index);
 -      }
 +      dax_unlock_mapping_entry(mapping, index);
  }
  
  /*
  static void put_unlocked_mapping_entry(struct address_space *mapping,
                                       pgoff_t index, void *entry)
  {
 -      if (!radix_tree_exceptional_entry(entry))
 +      if (!entry)
                return;
  
        /* We have to wake up next waiter for the radix tree entry lock */
  }
  
  /*
 - * Find radix tree entry at given index. If it points to a page, return with
 - * the page locked. If it points to the exceptional entry, return with the
 - * radix tree entry locked. If the radix tree doesn't contain given index,
 - * create empty exceptional entry for the index and return with it locked.
 + * Find radix tree entry at given index. If it points to an exceptional entry,
 + * return it with the radix tree entry locked. If the radix tree doesn't
 + * contain given index, create an empty exceptional entry for the index and
 + * return with it locked.
   *
   * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
   * either return that locked entry or will return an error.  This error will
 - * happen if there are any 4k entries (either zero pages or DAX entries)
 - * within the 2MiB range that we are requesting.
 + * happen if there are any 4k entries within the 2MiB range that we are
 + * requesting.
   *
   * We always favor 4k entries over 2MiB entries. There isn't a flow where we
   * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
@@@ -334,21 -276,18 +334,21 @@@ restart
        spin_lock_irq(&mapping->tree_lock);
        entry = get_unlocked_mapping_entry(mapping, index, &slot);
  
 +      if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
 +              entry = ERR_PTR(-EIO);
 +              goto out_unlock;
 +      }
 +
        if (entry) {
                if (size_flag & RADIX_DAX_PMD) {
 -                      if (!radix_tree_exceptional_entry(entry) ||
 -                          dax_is_pte_entry(entry)) {
 +                      if (dax_is_pte_entry(entry)) {
                                put_unlocked_mapping_entry(mapping, index,
                                                entry);
                                entry = ERR_PTR(-EEXIST);
                                goto out_unlock;
                        }
                } else { /* trying to grab a PTE entry */
 -                      if (radix_tree_exceptional_entry(entry) &&
 -                          dax_is_pmd_entry(entry) &&
 +                      if (dax_is_pmd_entry(entry) &&
                            (dax_is_zero_entry(entry) ||
                             dax_is_empty_entry(entry))) {
                                pmd_downgrade = true;
                                mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
                if (err) {
                        if (pmd_downgrade)
 -                              put_locked_mapping_entry(mapping, index, entry);
 +                              put_locked_mapping_entry(mapping, index);
                        return ERR_PTR(err);
                }
                spin_lock_irq(&mapping->tree_lock);
                spin_unlock_irq(&mapping->tree_lock);
                return entry;
        }
 -      /* Normal page in radix tree? */
 -      if (!radix_tree_exceptional_entry(entry)) {
 -              struct page *page = entry;
 -
 -              get_page(page);
 -              spin_unlock_irq(&mapping->tree_lock);
 -              lock_page(page);
 -              /* Page got truncated? Retry... */
 -              if (unlikely(page->mapping != mapping)) {
 -                      unlock_page(page);
 -                      put_page(page);
 -                      goto restart;
 -              }
 -              return page;
 -      }
        entry = lock_slot(mapping, slot);
   out_unlock:
        spin_unlock_irq(&mapping->tree_lock);
        return entry;
  }
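The comment before grab_mapping_entry() above describes a small policy: a PMD grab fails with -EEXIST if a 4k entry already occupies the range, while a PTE grab downgrades an existing zero or empty PMD entry rather than reusing it. A hedged user-space mock of that decision logic (struct entry and grab_policy() are invented here purely for illustration; the real code operates on locked radix-tree slots):

#include <errno.h>
#include <stdio.h>

/* Simplified stand-in for the radix-tree entry state. */
struct entry { int present, is_pmd, is_zero_or_empty; };

/* 0 = create/reuse an entry, 1 = downgrade the PMD, -EEXIST = fall back to PTEs. */
static int grab_policy(struct entry e, int want_pmd)
{
        if (!e.present)
                return 0;                       /* no entry yet: create one */
        if (want_pmd && !e.is_pmd)
                return -EEXIST;                 /* a 4k entry blocks the PMD grab */
        if (!want_pmd && e.is_pmd && e.is_zero_or_empty)
                return 1;                       /* downgrade the PMD to make room */
        return 0;                               /* reuse the existing entry */
}

int main(void)
{
        struct entry pte = { 1, 0, 0 }, pmd_zero = { 1, 1, 1 }, pmd_real = { 1, 1, 0 };

        printf("want PMD over PTE entry : %d\n", grab_policy(pte, 1));      /* -EEXIST */
        printf("want PTE over zero PMD  : %d\n", grab_policy(pmd_zero, 0)); /* 1 */
        printf("want PMD over real PMD  : %d\n", grab_policy(pmd_real, 1)); /* 0 */
        return 0;
}

The real function additionally locks the slot it returns and, when downgrading, wakes any waiters on the old PMD entry, which this mock leaves out.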
  
 -/*
 - * We do not necessarily hold the mapping->tree_lock when we call this
 - * function so it is possible that 'entry' is no longer a valid item in the
 - * radix tree.  This is okay because all we really need to do is to find the
 - * correct waitqueue where tasks might be waiting for that old 'entry' and
 - * wake them.
 - */
 -void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 -              pgoff_t index, void *entry, bool wake_all)
 -{
 -      struct exceptional_entry_key key;
 -      wait_queue_head_t *wq;
 -
 -      wq = dax_entry_waitqueue(mapping, index, entry, &key);
 -
 -      /*
 -       * Checking for locked entry and prepare_to_wait_exclusive() happens
 -       * under mapping->tree_lock, ditto for entry handling in our callers.
 -       * So at this point all tasks that could have seen our entry locked
 -       * must be in the waitqueue and the following check will see them.
 -       */
 -      if (waitqueue_active(wq))
 -              __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 -}
 -
  static int __dax_invalidate_mapping_entry(struct address_space *mapping,
                                          pgoff_t index, bool trunc)
  {
  
        spin_lock_irq(&mapping->tree_lock);
        entry = get_unlocked_mapping_entry(mapping, index, NULL);
 -      if (!entry || !radix_tree_exceptional_entry(entry))
 +      if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
                goto out;
        if (!trunc &&
            (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@@ -489,6 -468,50 +489,6 @@@ int dax_invalidate_mapping_entry_sync(s
        return __dax_invalidate_mapping_entry(mapping, index, false);
  }
  
 -/*
 - * The user has performed a load from a hole in the file.  Allocating
 - * a new page in the file would cause excessive storage usage for
 - * workloads with sparse files.  We allocate a page cache page instead.
 - * We'll kick it out of the page cache if it's ever written to,
 - * otherwise it will simply fall out of the page cache under memory
 - * pressure without ever having been dirtied.
 - */
 -static int dax_load_hole(struct address_space *mapping, void **entry,
 -                       struct vm_fault *vmf)
 -{
 -      struct inode *inode = mapping->host;
 -      struct page *page;
 -      int ret;
 -
 -      /* Hole page already exists? Return it...  */
 -      if (!radix_tree_exceptional_entry(*entry)) {
 -              page = *entry;
 -              goto finish_fault;
 -      }
 -
 -      /* This will replace locked radix tree entry with a hole page */
 -      page = find_or_create_page(mapping, vmf->pgoff,
 -                                 vmf->gfp_mask | __GFP_ZERO);
 -      if (!page) {
 -              ret = VM_FAULT_OOM;
 -              goto out;
 -      }
 -
 -finish_fault:
 -      vmf->page = page;
 -      ret = finish_fault(vmf);
 -      vmf->page = NULL;
 -      *entry = page;
 -      if (!ret) {
 -              /* Grab reference for PTE that is now referencing the page */
 -              get_page(page);
 -              ret = VM_FAULT_NOPAGE;
 -      }
 -out:
 -      trace_dax_load_hole(inode, vmf, ret);
 -      return ret;
 -}
 -
  static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
                sector_t sector, size_t size, struct page *to,
                unsigned long vaddr)
@@@ -529,27 -552,47 +529,27 @@@ static void *dax_insert_mapping_entry(s
                                      unsigned long flags)
  {
        struct radix_tree_root *page_tree = &mapping->page_tree;
 -      int error = 0;
 -      bool hole_fill = false;
        void *new_entry;
        pgoff_t index = vmf->pgoff;
  
        if (vmf->flags & FAULT_FLAG_WRITE)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  
 -      /* Replacing hole page with block mapping? */
 -      if (!radix_tree_exceptional_entry(entry)) {
 -              hole_fill = true;
 -              /*
 -               * Unmap the page now before we remove it from page cache below.
 -               * The page is locked so it cannot be faulted in again.
 -               */
 -              unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
 -                                  PAGE_SIZE, 0);
 -              error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
 -              if (error)
 -                      return ERR_PTR(error);
 -      } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
 -              /* replacing huge zero page with PMD block mapping */
 -              unmap_mapping_range(mapping,
 -                      (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
 +      if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
 +              /* we are replacing a zero page with block mapping */
 +              if (dax_is_pmd_entry(entry))
 +                      unmap_mapping_range(mapping,
 +                                      (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
 +                                      PMD_SIZE, 0);
 +              else /* pte entry */
 +                      unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
 +                                      PAGE_SIZE, 0);
        }
  
        spin_lock_irq(&mapping->tree_lock);
        new_entry = dax_radix_locked_entry(sector, flags);
  
 -      if (hole_fill) {
 -              __delete_from_page_cache(entry, NULL);
 -              /* Drop pagecache reference */
 -              put_page(entry);
 -              error = __radix_tree_insert(page_tree, index,
 -                              dax_radix_order(new_entry), new_entry);
 -              if (error) {
 -                      new_entry = ERR_PTR(error);
 -                      goto unlock;
 -              }
 -              mapping->nrexceptional++;
 -      } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 +      if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
                /*
                 * Only swap our new entry into the radix tree if the current
                 * entry is a zero page or an empty entry.  If a normal PTE or
                WARN_ON_ONCE(ret != entry);
                __radix_tree_replace(page_tree, node, slot,
                                     new_entry, NULL, NULL);
 +              entry = new_entry;
        }
 +
        if (vmf->flags & FAULT_FLAG_WRITE)
                radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 - unlock:
 +
        spin_unlock_irq(&mapping->tree_lock);
 -      if (hole_fill) {
 -              radix_tree_preload_end();
 -              /*
 -               * We don't need hole page anymore, it has been replaced with
 -               * locked radix tree entry now.
 -               */
 -              if (mapping->a_ops->freepage)
 -                      mapping->a_ops->freepage(entry);
 -              unlock_page(entry);
 -              put_page(entry);
 -      }
 -      return new_entry;
 +      return entry;
  }
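When dax_insert_mapping_entry() above replaces a zero-page entry with a real block mapping, the range it unmaps depends on the entry size: one page for a PTE entry, or the whole 2 MiB window starting at (vmf->pgoff << PAGE_SHIFT) & PMD_MASK for a PMD entry. A quick sketch of that address arithmetic, again with assumed 4 KiB/2 MiB geometry and a hypothetical faulting offset:

#include <stdio.h>

/* Assumed geometry; local stand-ins for the kernel constants. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SIZE   (1UL << 21)
#define PMD_MASK   (~(PMD_SIZE - 1))

int main(void)
{
        unsigned long pgoff = 700;              /* hypothetical faulting page offset */
        unsigned long byte_off = pgoff << PAGE_SHIFT;

        /* PTE case: unmap exactly the faulting page. */
        printf("pte unmap: start=%#lx len=%#lx\n", byte_off, PAGE_SIZE);      /* 0x2bc000, 0x1000 */

        /* PMD case: round the start down to the 2 MiB boundary. */
        printf("pmd unmap: start=%#lx len=%#lx\n", byte_off & PMD_MASK, PMD_SIZE); /* 0x200000, 0x200000 */
        return 0;
}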
  
  static inline unsigned long
@@@ -594,10 -646,11 +594,10 @@@ static void dax_mapping_entry_mkclean(s
        pte_t pte, *ptep = NULL;
        pmd_t *pmdp = NULL;
        spinlock_t *ptl;
 -      bool changed;
  
        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
 -              unsigned long address;
 +              unsigned long address, start, end;
  
                cond_resched();
  
                        continue;
  
                address = pgoff_address(index, vma);
 -              changed = false;
 -              if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
 +
 +              /*
 +               * Note because we provide start/end to follow_pte_pmd it will
 +               * call mmu_notifier_invalidate_range_start() on our behalf
 +               * before taking any lock.
 +               */
 +              if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
                        continue;
  
                if (pmdp) {
                        pmd = pmd_wrprotect(pmd);
                        pmd = pmd_mkclean(pmd);
                        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
 -                      changed = true;
 +                      mmu_notifier_invalidate_range(vma->vm_mm, start, end);
  unlock_pmd:
                        spin_unlock(ptl);
  #endif
                        pte = pte_wrprotect(pte);
                        pte = pte_mkclean(pte);
                        set_pte_at(vma->vm_mm, address, ptep, pte);
 -                      changed = true;
 +                      mmu_notifier_invalidate_range(vma->vm_mm, start, end);
  unlock_pte:
                        pte_unmap_unlock(ptep, ptl);
                }
  
 -              if (changed)
 -                      mmu_notifier_invalidate_page(vma->vm_mm, address);
 +              mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
        }
        i_mmap_unlock_read(mapping);
  }
@@@ -675,7 -724,7 +675,7 @@@ static int dax_writeback_one(struct blo
        spin_lock_irq(&mapping->tree_lock);
        entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
        /* Entry got punched out / reallocated? */
 -      if (!entry2 || !radix_tree_exceptional_entry(entry2))
 +      if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
                goto put_unlocked;
        /*
         * Entry got reallocated elsewhere? No need to writeback. We have to
        }
  
        dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
-       dax_flush(dax_dev, pgoff, kaddr, size);
+       dax_flush(dax_dev, kaddr, size);
        /*
         * After we have flushed the cache, we can clear the dirty tag. There
         * cannot be new dirty data in the pfn after the flush has completed as
        trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
   dax_unlock:
        dax_read_unlock(id);
 -      put_locked_mapping_entry(mapping, index, entry);
 +      put_locked_mapping_entry(mapping, index);
        return ret;
  
   put_unlocked:
@@@ -822,10 -871,11 +822,10 @@@ EXPORT_SYMBOL_GPL(dax_writeback_mapping
  
  static int dax_insert_mapping(struct address_space *mapping,
                struct block_device *bdev, struct dax_device *dax_dev,
 -              sector_t sector, size_t size, void **entryp,
 +              sector_t sector, size_t size, void *entry,
                struct vm_area_struct *vma, struct vm_fault *vmf)
  {
        unsigned long vaddr = vmf->address;
 -      void *entry = *entryp;
        void *ret, *kaddr;
        pgoff_t pgoff;
        int id, rc;
        ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
        if (IS_ERR(ret))
                return PTR_ERR(ret);
 -      *entryp = ret;
  
        trace_dax_insert_mapping(mapping->host, vmf, ret);
 -      return vm_insert_mixed(vma, vaddr, pfn);
 +      if (vmf->flags & FAULT_FLAG_WRITE)
 +              return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
 +      else
 +              return vm_insert_mixed(vma, vaddr, pfn);
  }
  
 -/**
 - * dax_pfn_mkwrite - handle first write to DAX page
 - * @vmf: The description of the fault
 +/*
 + * The user has performed a load from a hole in the file.  Allocating a new
 + * page in the file would cause excessive storage usage for workloads with
 + * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 + * If this page is ever written to we will re-fault and change the mapping to
 + * point to real DAX storage instead.
   */
 -int dax_pfn_mkwrite(struct vm_fault *vmf)
 +static int dax_load_hole(struct address_space *mapping, void *entry,
 +                       struct vm_fault *vmf)
  {
 -      struct file *file = vmf->vma->vm_file;
 -      struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
 -      void *entry, **slot;
 -      pgoff_t index = vmf->pgoff;
 +      unsigned long vaddr = vmf->address;
 +      int ret = VM_FAULT_NOPAGE;
 +      struct page *zero_page;
 +      void *entry2;
  
 -      spin_lock_irq(&mapping->tree_lock);
 -      entry = get_unlocked_mapping_entry(mapping, index, &slot);
 -      if (!entry || !radix_tree_exceptional_entry(entry)) {
 -              if (entry)
 -                      put_unlocked_mapping_entry(mapping, index, entry);
 -              spin_unlock_irq(&mapping->tree_lock);
 -              trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
 -              return VM_FAULT_NOPAGE;
 +      zero_page = ZERO_PAGE(0);
 +      if (unlikely(!zero_page)) {
 +              ret = VM_FAULT_OOM;
 +              goto out;
        }
 -      radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
 -      entry = lock_slot(mapping, slot);
 -      spin_unlock_irq(&mapping->tree_lock);
 -      /*
 -       * If we race with somebody updating the PTE and finish_mkwrite_fault()
 -       * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
 -       * the fault in either case.
 -       */
 -      finish_mkwrite_fault(vmf);
 -      put_locked_mapping_entry(mapping, index, entry);
 -      trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
 -      return VM_FAULT_NOPAGE;
 +
 +      entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
 +                      RADIX_DAX_ZERO_PAGE);
 +      if (IS_ERR(entry2)) {
 +              ret = VM_FAULT_SIGBUS;
 +              goto out;
 +      }
 +
 +      vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
 +out:
 +      trace_dax_load_hole(inode, vmf, ret);
 +      return ret;
  }
 -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
  
  static bool dax_range_is_aligned(struct block_device *bdev,
                                 unsigned int offset, unsigned int length)
@@@ -929,7 -978,7 +929,7 @@@ int __dax_zero_page_range(struct block_
                        return rc;
                }
                memset(kaddr + offset, 0, size);
-               dax_flush(dax_dev, pgoff, kaddr + offset, size);
+               dax_flush(dax_dev, kaddr + offset, size);
                dax_read_unlock(id);
        }
        return 0;
@@@ -1007,11 -1056,6 +1007,11 @@@ dax_iomap_actor(struct inode *inode, lo
                if (map_len > end - pos)
                        map_len = end - pos;
  
 +              /*
 +               * The userspace address for the memory copy has already been
 +               * validated via access_ok() in either vfs_read() or
 +               * vfs_write(), depending on which operation we are doing.
 +               */
                if (iov_iter_rw(iter) == WRITE)
                        map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                        map_len, iter);
@@@ -1176,7 -1220,7 +1176,7 @@@ static int dax_iomap_pte_fault(struct v
                        major = VM_FAULT_MAJOR;
                }
                error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
 -                              sector, PAGE_SIZE, &entry, vmf->vma, vmf);
 +                              sector, PAGE_SIZE, entry, vmf->vma, vmf);
                /* -EBUSY is fine, somebody else faulted on the same PTE */
                if (error == -EBUSY)
                        error = 0;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 -                      vmf_ret = dax_load_hole(mapping, &entry, vmf);
 +                      vmf_ret = dax_load_hole(mapping, entry, vmf);
                        goto finish_iomap;
                }
                /*FALLTHRU*/
                ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
        }
   unlock_entry:
 -      put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 +      put_locked_mapping_entry(mapping, vmf->pgoff);
   out:
        trace_dax_pte_fault_done(inode, vmf, vmf_ret);
        return vmf_ret;
  }
  
  #ifdef CONFIG_FS_DAX_PMD
 -/*
 - * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 - * more often than one might expect in the below functions.
 - */
 -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
 -
  static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 -              loff_t pos, void **entryp)
 +              loff_t pos, void *entry)
  {
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        const sector_t sector = dax_iomap_sector(iomap, pos);
        void *ret = NULL, *kaddr;
        long length = 0;
        pgoff_t pgoff;
 -      pfn_t pfn;
 +      pfn_t pfn = {};
        int id;
  
        if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
                goto unlock_fallback;
        dax_read_unlock(id);
  
 -      ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
 +      ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
                        RADIX_DAX_PMD);
        if (IS_ERR(ret))
                goto fallback;
 -      *entryp = ret;
  
        trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
        return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@@ -1267,7 -1318,7 +1267,7 @@@ fallback
  }
  
  static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 -              void **entryp)
 +              void *entry)
  {
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        if (unlikely(!zero_page))
                goto fallback;
  
 -      ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
 -                      RADIX_DAX_PMD | RADIX_DAX_HZP);
 +      ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
 +                      RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
        if (IS_ERR(ret))
                goto fallback;
 -      *entryp = ret;
  
        ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (!pmd_none(*(vmf->pmd))) {
@@@ -1361,10 -1413,10 +1361,10 @@@ static int dax_iomap_pmd_fault(struct v
                goto fallback;
  
        /*
 -       * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
 -       * PMD or a HZP entry.  If it can't (because a 4k page is already in
 -       * the tree, for instance), it will return -EEXIST and we just fall
 -       * back to 4k entries.
 +       * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
 +       * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
 +       * is already in the tree, for instance), it will return -EEXIST and
 +       * we just fall back to 4k entries.
         */
        entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
        if (IS_ERR(entry))
  
        switch (iomap.type) {
        case IOMAP_MAPPED:
 -              result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
 +              result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
                if (WARN_ON_ONCE(write))
                        break;
 -              result = dax_pmd_load_hole(vmf, &iomap, &entry);
 +              result = dax_pmd_load_hole(vmf, &iomap, entry);
                break;
        default:
                WARN_ON_ONCE(1);
                                &iomap);
        }
   unlock_entry:
 -      put_locked_mapping_entry(mapping, pgoff, entry);
 +      put_locked_mapping_entry(mapping, pgoff);
   fallback:
        if (result == VM_FAULT_FALLBACK) {
                split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --combined include/linux/dax.h
index 46cad1d0f12970e4c764374992c9432af431b7b3,0d8f35f6c53dce846863b25eed27660f96d090c9..122197124b9def0e148482c40df7def28ec6ce99
@@@ -19,8 -19,6 +19,6 @@@ struct dax_operations 
        /* copy_from_iter: required operation for fs-dax direct-i/o */
        size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
                        struct iov_iter *);
-       /* flush: optional driver-specific cache management after writes */
-       void (*flush)(struct dax_device *, pgoff_t, void *, size_t);
  };
  
  extern struct attribute_group dax_attribute_group;
@@@ -57,7 -55,6 +55,7 @@@ static inline void fs_put_dax(struct da
        put_dax(dax_dev);
  }
  
 +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
  #else
  static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
  {
@@@ -72,11 -69,6 +70,11 @@@ static inline struct dax_device *fs_dax
  static inline void fs_put_dax(struct dax_device *dax_dev)
  {
  }
 +
 +static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 +{
 +      return NULL;
 +}
  #endif
  
  int dax_read_lock(void);
@@@ -90,11 -82,38 +88,10 @@@ long dax_direct_access(struct dax_devic
                void **kaddr, pfn_t *pfn);
  size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
- void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
-               size_t size);
+ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
  void dax_write_cache(struct dax_device *dax_dev, bool wc);
  bool dax_write_cache_enabled(struct dax_device *dax_dev);
  
 -/*
 - * We use lowest available bit in exceptional entry for locking, one bit for
 - * the entry size (PMD) and two more to tell us if the entry is a huge zero
 - * page (HZP) or an empty entry that is just used for locking.  In total four
 - * special bits.
 - *
 - * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
 - * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 - * block allocation.
 - */
 -#define RADIX_DAX_SHIFT       (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
 -#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 -
 -static inline unsigned long dax_radix_sector(void *entry)
 -{
 -      return (unsigned long)entry >> RADIX_DAX_SHIFT;
 -}
 -
 -static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 -{
 -      return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
 -                      ((unsigned long)sector << RADIX_DAX_SHIFT) |
 -                      RADIX_DAX_ENTRY_LOCK);
 -}
 -
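The definitions removed from the header above pack a DAX radix-tree entry into an unsigned long: the sector sits above RADIX_DAX_SHIFT, and four flag bits (entry lock, PMD, zero page, empty) sit just above the radix tree's exceptional-entry bits. A user-space sketch of that encoding, assuming RADIX_TREE_EXCEPTIONAL_ENTRY is 2 and RADIX_TREE_EXCEPTIONAL_SHIFT is 2, as in kernels of this era; the flag called RADIX_DAX_HZP in the removed lines appears here as RADIX_DAX_ZERO_PAGE to match the name used elsewhere in this patch:

#include <stdio.h>

/* Assumed values, mirroring the removed header definitions. */
#define RADIX_TREE_EXCEPTIONAL_ENTRY  2
#define RADIX_TREE_EXCEPTIONAL_SHIFT  2

#define RADIX_DAX_SHIFT      (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK (1UL << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD        (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_ZERO_PAGE  (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))  /* was RADIX_DAX_HZP */
#define RADIX_DAX_EMPTY      (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

static unsigned long dax_radix_locked_entry(unsigned long sector, unsigned long flags)
{
        return RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
               (sector << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK;
}

static unsigned long dax_radix_sector(unsigned long entry)
{
        return entry >> RADIX_DAX_SHIFT;
}

int main(void)
{
        unsigned long e = dax_radix_locked_entry(0x1234, RADIX_DAX_PMD);

        printf("entry  = %#lx\n", e);
        printf("sector = %#lx\n", dax_radix_sector(e));        /* 0x1234 */
        printf("is pmd = %d\n", !!(e & RADIX_DAX_PMD));         /* 1 */
        printf("locked = %d\n", !!(e & RADIX_DAX_ENTRY_LOCK));  /* 1 */
        return 0;
}

Keeping the sector and the lock bit inside the entry itself is what lets helpers such as lock_slot() and get_unlocked_mapping_entry() synchronize purely through the radix tree, with no per-entry bookkeeping structure.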
  ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops);
  int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index);
 -void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 -              pgoff_t index, void *entry, bool wake_all);
  
  #ifdef CONFIG_FS_DAX
  int __dax_zero_page_range(struct block_device *bdev,
@@@ -116,6 -137,21 +113,6 @@@ static inline int __dax_zero_page_range
  }
  #endif
  
 -#ifdef CONFIG_FS_DAX_PMD
 -static inline unsigned int dax_radix_order(void *entry)
 -{
 -      if ((unsigned long)entry & RADIX_DAX_PMD)
 -              return PMD_SHIFT - PAGE_SHIFT;
 -      return 0;
 -}
 -#else
 -static inline unsigned int dax_radix_order(void *entry)
 -{
 -      return 0;
 -}
 -#endif
 -int dax_pfn_mkwrite(struct vm_fault *vmf);
 -
  static inline bool dax_mapping(struct address_space *mapping)
  {
        return mapping->host && IS_DAX(mapping->host);