EXPORT_SYMBOL_GPL(dax_read_unlock);
#ifdef CONFIG_BLOCK
+#include <linux/blkdev.h>
+
int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
pgoff_t *pgoff)
{
}
EXPORT_SYMBOL(bdev_dax_pgoff);
+#if IS_ENABLED(CONFIG_FS_DAX)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+{
+ if (!blk_queue_dax(bdev->bd_queue))
+ return NULL;
+ return fs_dax_get_by_host(bdev->bd_disk->disk_name);
+}
+EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+#endif
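+/*
+ * Illustrative usage (not taken from a specific caller): a filesystem can
+ * resolve its DAX device once at mount time and drop it again on unmount:
+ *
+ *	dax_dev = fs_dax_get_by_bdev(bdev);
+ *	...
+ *	fs_put_dax(dax_dev);
+ */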
+
/**
* __bdev_dax_supported() - Check if the device supports dax for filesystem
* @sb: The superblock of the device
if (!dax_dev)
return 0;
- if (a == &dev_attr_write_cache.attr && !dax_dev->ops->flush)
+ #ifndef CONFIG_ARCH_HAS_PMEM_API
+ if (a == &dev_attr_write_cache.attr)
return 0;
+ #endif
return a->mode;
}
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);
- void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
- size_t size)
+ #ifdef CONFIG_ARCH_HAS_PMEM_API
+ void arch_wb_cache_pmem(void *addr, size_t size);
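+ /*
+  * Write back the CPU cache for the addr/size range so the data reaches
+  * persistent media; skipped when the device advertises no write cache.
+  */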
+ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
- if (!dax_alive(dax_dev))
+ if (unlikely(!dax_alive(dax_dev)))
return;
- if (!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))
+ if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
return;
- if (dax_dev->ops->flush)
- dax_dev->ops->flush(dax_dev, pgoff, addr, size);
+ arch_wb_cache_pmem(addr, size);
}
+ #else
+ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
+ {
+ }
+ #endif
EXPORT_SYMBOL_GPL(dax_flush);
void dax_write_cache(struct dax_device *dax_dev, bool wc)
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1))
+ /*
+ * Align buffer writes to this boundary.
+ * Tests show that SSDs have the highest IOPS when using 4k writes.
+ */
+ #define DM_BUFIO_WRITE_ALIGN 4096
+
/*
* dm_buffer->list_mode
*/
blk_status_t write_error;
unsigned long state;
unsigned long last_accessed;
+ unsigned dirty_start;
+ unsigned dirty_end;
+ unsigned write_start;
+ unsigned write_end;
struct dm_bufio_client *c;
struct list_head write_list;
struct bio bio;
}
static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
- unsigned n_sectors, bio_end_io_t *end_io)
+ unsigned n_sectors, unsigned offset, bio_end_io_t *end_io)
{
int r;
struct dm_io_request io_req = {
if (b->data_mode != DATA_MODE_VMALLOC) {
io_req.mem.type = DM_IO_KMEM;
- io_req.mem.ptr.addr = b->data;
+ io_req.mem.ptr.addr = (char *)b->data + offset;
} else {
io_req.mem.type = DM_IO_VMA;
- io_req.mem.ptr.vma = b->data;
+ io_req.mem.ptr.vma = (char *)b->data + offset;
}
b->bio.bi_end_io = end_io;
}
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector,
- unsigned n_sectors, bio_end_io_t *end_io)
+ unsigned n_sectors, unsigned offset, bio_end_io_t *end_io)
{
char *ptr;
- int len;
+ unsigned len;
bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
b->bio.bi_iter.bi_sector = sector;
- b->bio.bi_bdev = b->c->bdev;
+ bio_set_dev(&b->bio, b->c->bdev);
b->bio.bi_end_io = inline_endio;
/*
* Use of .bi_private isn't a problem here because
b->bio.bi_private = end_io;
bio_set_op_attrs(&b->bio, rw, 0);
- /*
- * We assume that if len >= PAGE_SIZE ptr is page-aligned.
- * If len < PAGE_SIZE the buffer doesn't cross page boundary.
- */
- ptr = b->data;
+ ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
- if (len >= PAGE_SIZE)
- BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
- else
- BUG_ON((unsigned long)ptr & (len - 1));
-
do {
- if (!bio_add_page(&b->bio, virt_to_page(ptr),
- len < PAGE_SIZE ? len : PAGE_SIZE,
+ unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
+ if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step,
offset_in_page(ptr))) {
BUG_ON(b->c->block_size <= PAGE_SIZE);
- use_dmio(b, rw, sector, n_sectors, end_io);
+ use_dmio(b, rw, sector, n_sectors, offset, end_io);
return;
}
- len -= PAGE_SIZE;
- ptr += PAGE_SIZE;
+ len -= this_step;
+ ptr += this_step;
} while (len > 0);
submit_bio(&b->bio);
{
unsigned n_sectors;
sector_t sector;
-
- if (rw == WRITE && b->c->write_callback)
- b->c->write_callback(b);
+ unsigned offset, end;
sector = (b->block << b->c->sectors_per_block_bits) + b->c->start;
- n_sectors = 1 << b->c->sectors_per_block_bits;
+
+ if (rw != WRITE) {
+ n_sectors = 1 << b->c->sectors_per_block_bits;
+ offset = 0;
+ } else {
+ if (b->c->write_callback)
+ b->c->write_callback(b);
+ offset = b->write_start;
+ end = b->write_end;
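+ /*
+  * Round the written range out to DM_BUFIO_WRITE_ALIGN boundaries;
+  * e.g. a dirty byte range of [100, 5000) is widened to [0, 8192),
+  * then clamped to the block size.
+  */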
+ offset &= -DM_BUFIO_WRITE_ALIGN;
+ end += DM_BUFIO_WRITE_ALIGN - 1;
+ end &= -DM_BUFIO_WRITE_ALIGN;
+ if (unlikely(end > b->c->block_size))
+ end = b->c->block_size;
+
+ sector += offset >> SECTOR_SHIFT;
+ n_sectors = (end - offset) >> SECTOR_SHIFT;
+ }
if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) &&
b->data_mode != DATA_MODE_VMALLOC)
- use_inline_bio(b, rw, sector, n_sectors, end_io);
+ use_inline_bio(b, rw, sector, n_sectors, offset, end_io);
else
- use_dmio(b, rw, sector, n_sectors, end_io);
+ use_dmio(b, rw, sector, n_sectors, offset, end_io);
}
/*----------------------------------------------------------------
clear_bit(B_DIRTY, &b->state);
wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
+ b->write_start = b->dirty_start;
+ b->write_end = b->dirty_end;
+
if (!write_list)
submit_io(b, WRITE, write_endio);
else
}
EXPORT_SYMBOL_GPL(dm_bufio_release);
- void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+ void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
+ unsigned start, unsigned end)
{
struct dm_bufio_client *c = b->c;
+ BUG_ON(start >= end);
+ BUG_ON(end > b->c->block_size);
+
dm_bufio_lock(c);
BUG_ON(test_bit(B_READING, &b->state));
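+ /*
+  * Record the dirty byte range: the first dirtying stores [start, end),
+  * later calls widen the stored range to the union with the new one.
+  */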
- if (!test_and_set_bit(B_DIRTY, &b->state))
+ if (!test_and_set_bit(B_DIRTY, &b->state)) {
+ b->dirty_start = start;
+ b->dirty_end = end;
__relink_lru(b, LIST_DIRTY);
+ } else {
+ if (start < b->dirty_start)
+ b->dirty_start = start;
+ if (end > b->dirty_end)
+ b->dirty_end = end;
+ }
dm_bufio_unlock(c);
}
+ EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
+
+ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+ {
+ dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
+ }
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
wait_on_bit_io(&b->state, B_WRITING,
TASK_UNINTERRUPTIBLE);
set_bit(B_DIRTY, &b->state);
+ b->dirty_start = 0;
+ b->dirty_end = c->block_size;
__unlink_buffer(b);
__link_buffer(b, new_block, LIST_DIRTY);
} else {
*--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
- bio->bi_bdev = cache->origin_dev->bdev;
+ bio_set_dev(bio, cache->origin_dev->bdev);
}
static void remap_to_cache(struct cache *cache, struct bio *bio,
sector_t bi_sector = bio->bi_iter.bi_sector;
sector_t block = from_cblock(cblock);
- bio->bi_bdev = cache->cache_dev->bdev;
+ bio_set_dev(bio, cache->cache_dev->bdev);
if (!block_size_is_power_of_two(cache))
bio->bi_iter.bi_sector =
(block * cache->sectors_per_block) +
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 2, "Invalid number of cache feature arguments"},
};
static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 1024, "Invalid number of policy arguments"},
};
int i, r;
/* xor whitening with sector number */
- memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
- crypto_xor(buf, (u8 *)&sector, 8);
- crypto_xor(&buf[8], (u8 *)&sector, 8);
+ crypto_xor_cpy(buf, tcw->whitening, (u8 *)&sector, 8);
+ crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)&sector, 8);
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
}
/* Calculate IV */
- memcpy(iv, tcw->iv_seed, cc->iv_size);
- crypto_xor(iv, (u8 *)&sector, 8);
+ crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)&sector, 8);
if (cc->iv_size > 8)
- crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+ crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)&sector,
+ cc->iv_size - 8);
return r;
}
bip->bip_iter.bi_size = tag_len;
bip->bip_iter.bi_sector = io->cc->start + io->sector;
- /* We own the metadata, do not let bio_free to release it */
- bip->bip_flags &= ~BIP_BLOCK_INTEGRITY;
-
ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),
tag_len, offset_in_page(io->integrity_metadata));
if (unlikely(ret != tag_len))
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
- clone->bi_bdev = cc->dev->bdev;
+ bio_set_dev(clone, cc->dev->bdev);
clone->bi_opf = io->base_bio->bi_opf;
}
{
struct crypt_config *cc = ti->private;
struct dm_arg_set as;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 6, "Invalid number of feature args"},
};
unsigned int opt_params, val;
*/
if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
bio_op(bio) == REQ_OP_DISCARD)) {
- bio->bi_bdev = cc->dev->bdev;
+ bio_set_dev(bio, cc->dev->bdev);
if (bio_sectors(bio))
bio->bi_iter.bi_sector = cc->start +
dm_target_offset(ti, bio->bi_iter.bi_sector);
unsigned argc;
const char *arg_name;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 6, "Invalid number of feature args"},
{1, UINT_MAX, "Invalid corrupt bio byte"},
{0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
*/
static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, UINT_MAX, "Invalid up interval"},
{0, UINT_MAX, "Invalid down interval"},
};
{
struct flakey_c *fc = ti->private;
- bio->bi_bdev = fc->dev->bdev;
+ bio_set_dev(bio, fc->dev->bdev);
if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
bio->bi_iter.bi_sector =
flakey_map_sector(ti, bio->bi_iter.bi_sector);
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
struct alg_spec journal_mac_alg;
+
+ atomic64_t number_of_mismatches;
};
struct dm_integrity_range {
struct completion *completion;
- struct block_device *orig_bi_bdev;
+ struct gendisk *orig_bi_disk;
+ u8 orig_bi_partno;
bio_end_io_t *orig_bi_end_io;
struct bio_integrity_payload *orig_bi_integrity;
struct bvec_iter orig_bi_iter;
/*
* DM Integrity profile, protection is performed layer above (dm-crypt)
*/
- static struct blk_integrity_profile dm_integrity_profile = {
+ static const struct blk_integrity_profile dm_integrity_profile = {
.name = "DM-DIF-EXT-TAG",
.generate_fn = NULL,
.verify_fn = NULL,
static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
{
+ if (err == -EILSEQ)
+ atomic64_inc(&ic->number_of_mismatches);
if (!cmpxchg(&ic->failed, 0, err))
DMERR("Error on %s: %d", msg, err);
}
unsigned i;
io_comp.ic = ic;
- io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
+ init_completion(&io_comp.comp);
if (commit_start + commit_sections <= ic->journal_sections) {
io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (ic->journal_io) {
crypt_comp_1.ic = ic;
- crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ init_completion(&crypt_comp_1.comp);
crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
wait_for_completion_io(&crypt_comp_1.comp);
to_end = ic->journal_sections - commit_start;
if (ic->journal_io) {
crypt_comp_1.ic = ic;
- crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ init_completion(&crypt_comp_1.comp);
crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
if (try_wait_for_completion(&crypt_comp_1.comp)) {
rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
- crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ reinit_completion(&crypt_comp_1.comp);
crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
wait_for_completion_io(&crypt_comp_1.comp);
} else {
crypt_comp_2.ic = ic;
- crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp);
+ init_completion(&crypt_comp_2.comp);
crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
wait_for_completion_io(&crypt_comp_1.comp);
memcpy(tag, dp, to_copy);
} else if (op == TAG_WRITE) {
memcpy(dp, tag, to_copy);
- dm_bufio_mark_buffer_dirty(b);
+ dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
} else {
/* e.g.: op == TAG_CMP */
if (unlikely(memcmp(dp, tag, to_copy))) {
struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
bio->bi_iter = dio->orig_bi_iter;
- bio->bi_bdev = dio->orig_bi_bdev;
+ bio->bi_disk = dio->orig_bi_disk;
+ bio->bi_partno = dio->orig_bi_partno;
if (dio->orig_bi_integrity) {
bio->bi_integrity = dio->orig_bi_integrity;
bio->bi_opf |= REQ_INTEGRITY;
DMERR("Checksum failed at sector 0x%llx",
(unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
r = -EILSEQ;
+ atomic64_inc(&ic->number_of_mismatches);
}
if (likely(checksums != checksums_onstack))
kfree(checksums);
dio->in_flight = (atomic_t)ATOMIC_INIT(2);
if (need_sync_io) {
- read_comp = COMPLETION_INITIALIZER_ONSTACK(read_comp);
+ init_completion(&read_comp);
dio->completion = &read_comp;
} else
dio->completion = NULL;
dio->orig_bi_iter = bio->bi_iter;
- dio->orig_bi_bdev = bio->bi_bdev;
- bio->bi_bdev = ic->dev->bdev;
+ dio->orig_bi_disk = bio->bi_disk;
+ dio->orig_bi_partno = bio->bi_partno;
+ bio_set_dev(bio, ic->dev->bdev);
dio->orig_bi_integrity = bio_integrity(bio);
bio->bi_integrity = NULL;
if (need_sync_io) {
wait_for_completion_io(&read_comp);
- integrity_metadata(&dio->work);
+ if (likely(!bio->bi_status))
+ integrity_metadata(&dio->work);
+ else
+ dec_in_flight(dio);
+
} else {
INIT_WORK(&dio->work, integrity_metadata);
queue_work(ic->metadata_wq, &dio->work);
comp.ic = ic;
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
- comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+ init_completion(&comp.comp);
i = write_start;
for (n = 0; n < write_sections; n++, i++, wraparound_section(ic, &i)) {
if (ic->journal_io) {
struct journal_completion crypt_comp;
crypt_comp.ic = ic;
- crypt_comp.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp.comp);
+ init_completion(&crypt_comp.comp);
crypt_comp.in_flight = (atomic_t)ATOMIC_INIT(0);
encrypt_journal(ic, false, 0, ic->journal_sections, &crypt_comp);
wait_for_completion(&crypt_comp.comp);
switch (type) {
case STATUSTYPE_INFO:
- result[0] = '\0';
+ DMEMIT("%llu", (unsigned long long)atomic64_read(&ic->number_of_mismatches));
break;
case STATUSTYPE_TABLE: {
memset(iv, 0x00, ivsize);
skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv);
- comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+ init_completion(&comp.comp);
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (do_crypt(true, req, &comp))
wait_for_completion(&comp.comp);
sg_init_one(&sg, crypt_data, crypt_len);
skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv);
- comp.comp = COMPLETION_INITIALIZER_ONSTACK(comp.comp);
+ init_completion(&comp.comp);
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (do_crypt(true, req, &comp))
wait_for_completion(&comp.comp);
int r;
unsigned extra_args;
struct dm_arg_set as;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 9, "Invalid number of feature args"},
};
unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
bio_list_init(&ic->flush_bio_list);
init_waitqueue_head(&ic->copy_to_journal_wait);
init_completion(&ic->crypto_backoff);
+ atomic64_set(&ic->number_of_mismatches, 0);
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
if (r) {
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
{
struct linear_c *lc = ti->private;
- bio->bi_bdev = lc->dev->bdev;
+ bio_set_dev(bio, lc->dev->bdev);
if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
bio->bi_iter.bi_sector =
linear_map_sector(ti, bio->bi_iter.bi_sector);
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
- static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
- size_t size)
- {
- struct linear_c *lc = ti->private;
- struct block_device *bdev = lc->dev->bdev;
- struct dax_device *dax_dev = lc->dev->dax_dev;
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
-
- dev_sector = linear_map_sector(ti, sector);
- if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
- return;
- dax_flush(dax_dev, pgoff, addr, size);
- }
-
static struct target_type linear_target = {
.name = "linear",
.version = {1, 4, 0},
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
- .dax_flush = linear_dax_flush,
};
int __init dm_linear_init(void)
struct dm_dev *logdev;
u64 logged_entries;
u32 sectorsize;
+ u32 sectorshift;
atomic_t io_blocks;
atomic_t pending_blocks;
sector_t next_sector;
struct pending_block *block;
};
+ static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
+ sector_t sectors)
+ {
+ return sectors >> (lc->sectorshift - SECTOR_SHIFT);
+ }
+
+ static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
+ sector_t sectors)
+ {
+ return sectors << (lc->sectorshift - SECTOR_SHIFT);
+ }
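+ /*
+  * e.g. with a 4096-byte logical block size sectorshift is 12, so one
+  * device sector corresponds to 8 bio (512-byte) sectors and vice versa.
+  */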
+
static void put_pending_block(struct log_writes_c *lc)
{
if (atomic_dec_and_test(&lc->pending_blocks)) {
}
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = lc->logdev->bdev;
+ bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
if (!block->vec_cnt)
goto out;
- sector++;
+ sector += dev_to_bio_sectors(lc, 1);
atomic_inc(&lc->io_blocks);
bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
}
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = lc->logdev->bdev;
+ bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
}
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
- bio->bi_bdev = lc->logdev->bdev;
+ bio_set_dev(bio, lc->logdev->bdev);
bio->bi_end_io = log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
goto next;
sector = lc->next_sector;
- if (block->flags & LOG_DISCARD_FLAG)
- lc->next_sector++;
- else
- lc->next_sector += block->nr_sectors + 1;
+ if (!(block->flags & LOG_DISCARD_FLAG))
+ lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
+ lc->next_sector += dev_to_bio_sectors(lc, 1);
/*
* Apparently the size of the device may not be known
if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
- !atomic_read(&lc->pending_blocks))
+ list_empty(&lc->logging_blocks))
schedule();
__set_current_state(TASK_RUNNING);
}
INIT_LIST_HEAD(&lc->unflushed_blocks);
INIT_LIST_HEAD(&lc->logging_blocks);
init_waitqueue_head(&lc->wait);
- lc->sectorsize = 1 << SECTOR_SHIFT;
atomic_set(&lc->io_blocks, 0);
atomic_set(&lc->pending_blocks, 0);
goto bad;
}
+ lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
+ lc->sectorshift = ilog2(lc->sectorsize);
lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
if (IS_ERR(lc->log_kthread)) {
ret = PTR_ERR(lc->log_kthread);
goto bad;
}
- /* We put the super at sector 0, start logging at sector 1 */
- lc->next_sector = 1;
+ /*
+ * next_sector is in 512b sectors to correspond to what bi_sector expects.
+ * The super starts at sector 0, and the next_sector is the next logical
+ * one based on the sectorsize of the device.
+ */
+ lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
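+ /*
+  * e.g. for 4096-byte logical blocks the super block occupies bio
+  * sectors 0-7 and logging starts at bio sector 8.
+  */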
lc->logging_enabled = true;
lc->end_sector = logdev_last_sector(lc);
lc->device_supports_discard = true;
{
struct log_writes_c *lc = ti->private;
- bio->bi_bdev = lc->dev->bdev;
+ bio_set_dev(bio, lc->dev->bdev);
}
static int log_writes_map(struct dm_target *ti, struct bio *bio)
if (discard_bio)
block->flags |= LOG_DISCARD_FLAG;
- block->sector = bio->bi_iter.bi_sector;
- block->nr_sectors = bio_sectors(bio);
+ block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
+ block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));
/* We don't need the data, just submit */
if (discard_bio) {
if (!q || !blk_queue_discard(q)) {
lc->device_supports_discard = false;
- limits->discard_granularity = 1 << SECTOR_SHIFT;
+ limits->discard_granularity = lc->sectorsize;
limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
}
+ limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
+ limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
+ limits->io_min = limits->physical_block_size;
}
static struct target_type log_writes_target = {
mpio->nr_bytes = nr_bytes;
bio->bi_status = 0;
- bio->bi_bdev = pgpath->path.dev->bdev;
+ bio_set_dev(bio, pgpath->path.dev->bdev);
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
if (pgpath->pg->ps.type->start_io)
case DM_MAPIO_REMAPPED:
generic_make_request(bio);
break;
+ case 0:
+ break;
+ default:
+ WARN_ONCE(true, "__multipath_map_bio() returned %d\n", r);
}
}
blk_finish_plug(&plug);
struct path_selector_type *pst;
unsigned ps_argc;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 1024, "invalid number of path selector args"},
};
static struct priority_group *parse_priority_group(struct dm_arg_set *as,
struct multipath *m)
{
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{1, 1024, "invalid number of paths"},
{0, 1024, "invalid number of selector args"}
};
int ret;
struct dm_target *ti = m->ti;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 1024, "invalid number of hardware handler args"},
};
struct dm_target *ti = m->ti;
const char *arg_name;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 8, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
/* target arguments */
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 1024, "invalid number of priority groups"},
{0, 1024, "invalid initial priority group number"},
};
case SCSI_DH_RETRY:
/* Wait before retrying. */
delay_retry = 1;
+ /* fall through */
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
if (pg_init_limit_reached(m, pgpath))
stripe_map_range_sector(sc, bio_end_sector(bio),
target_stripe, &end);
if (begin < end) {
- bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
+ bio_set_dev(bio, sc->stripe[target_stripe].dev->bdev);
bio->bi_iter.bi_sector = begin +
sc->stripe[target_stripe].physical_start;
bio->bi_iter.bi_size = to_bytes(end - begin);
if (bio->bi_opf & REQ_PREFLUSH) {
target_bio_nr = dm_bio_get_target_bio_nr(bio);
BUG_ON(target_bio_nr >= sc->stripes);
- bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev;
+ bio_set_dev(bio, sc->stripe[target_bio_nr].dev->bdev);
return DM_MAPIO_REMAPPED;
}
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
&stripe, &bio->bi_iter.bi_sector);
bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start;
- bio->bi_bdev = sc->stripe[stripe].dev->bdev;
+ bio_set_dev(bio, sc->stripe[stripe].dev->bdev);
return DM_MAPIO_REMAPPED;
}
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
- static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr,
- size_t size)
- {
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
- struct stripe_c *sc = ti->private;
- struct dax_device *dax_dev;
- struct block_device *bdev;
- uint32_t stripe;
-
- stripe_map_sector(sc, sector, &stripe, &dev_sector);
- dev_sector += sc->stripe[stripe].physical_start;
- dax_dev = sc->stripe[stripe].dev->dax_dev;
- bdev = sc->stripe[stripe].dev->bdev;
-
- if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff))
- return;
- dax_flush(dax_dev, pgoff, addr, size);
- }
-
/*
* Stripe status:
*
return DM_ENDIO_DONE;
memset(major_minor, 0, sizeof(major_minor));
- sprintf(major_minor, "%d:%d",
- MAJOR(disk_devt(bio->bi_bdev->bd_disk)),
- MINOR(disk_devt(bio->bi_bdev->bd_disk)));
+ sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)));
/*
* Test to see which stripe drive triggered the event
.io_hints = stripe_io_hints,
.direct_access = stripe_dax_direct_access,
.dax_copy_from_iter = stripe_dax_copy_from_iter,
- .dax_flush = stripe_dax_flush,
};
int __init dm_stripe_init(void)
*/
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
{1, UINT_MAX, "Invalid region size"},
{0, 0, "Invalid number of optional args"},
sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
unsigned path_nr = switch_get_path_nr(sctx, offset);
- bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev;
+ bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
return DM_MAPIO_REMAPPED;
struct pool *pool = tc->pool;
sector_t bi_sector = bio->bi_iter.bi_sector;
- bio->bi_bdev = tc->pool_dev->bdev;
+ bio_set_dev(bio, tc->pool_dev->bdev);
if (block_size_is_power_of_two(pool))
bio->bi_iter.bi_sector =
(block << pool->sectors_per_block_shift) |
static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
- bio->bi_bdev = tc->origin_dev->bdev;
+ bio_set_dev(bio, tc->origin_dev->bdev);
}
static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
unsigned argc;
const char *arg_name;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, 4, "Invalid number of pool feature arguments"},
};
* As this is a singleton target, ti->begin is always zero.
*/
spin_lock_irqsave(&pool->lock, flags);
- bio->bi_bdev = pt->data_dev->bdev;
+ bio_set_dev(bio, pt->data_dev->bdev);
r = DM_MAPIO_REMAPPED;
spin_unlock_irqrestore(&pool->lock, flags);
struct dm_verity *v = ti->private;
struct dm_verity_io *io;
- bio->bi_bdev = v->data_dev->bdev;
+ bio_set_dev(bio, v->data_dev->bdev);
bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector);
if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
struct dm_target *ti = v->ti;
const char *arg_name;
- static struct dm_arg _args[] = {
+ static const struct dm_arg _args[] = {
{0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"},
};
io->start_time = jiffies;
cpu = part_stat_lock();
- part_round_stats(cpu, &dm_disk(md)->part0);
+ part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
part_stat_unlock();
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
int pending;
int rw = bio_data_dir(bio);
- generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
+ generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
if (unlikely(error == BLK_STS_TARGET)) {
if (bio_op(bio) == REQ_OP_WRITE_SAME &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ !bio->bi_disk->queue->limits.max_write_same_sectors)
disable_write_same(md);
if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
disable_write_zeroes(md);
}
return ret;
}
- static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
- size_t size)
- {
- struct mapped_device *md = dax_get_private(dax_dev);
- sector_t sector = pgoff * PAGE_SECTORS;
- struct dm_target *ti;
- int srcu_idx;
-
- ti = dm_dax_get_live_target(md, sector, &srcu_idx);
-
- if (!ti)
- goto out;
- if (ti->type->dax_flush)
- ti->type->dax_flush(ti, pgoff, addr, size);
- out:
- dm_put_live_table(md, srcu_idx);
- }
-
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH.
break;
case DM_MAPIO_REMAPPED:
/* the bio has been remapped so dispatch it */
- trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
- tio->io->bio->bi_bdev->bd_dev, sector);
+ trace_block_bio_remap(clone->bi_disk->queue, clone,
+ bio_dev(tio->io->bio), sector);
generic_make_request(clone);
break;
case DM_MAPIO_KILL:
map = dm_get_live_table(md, &srcu_idx);
- generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
+ generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
goto bad;
bio_init(&md->flush_bio, NULL, 0);
- md->flush_bio.bi_bdev = md->bdev;
+ bio_set_dev(&md->flush_bio, md->bdev);
md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
dm_stats_init(&md->stats);
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
.copy_from_iter = dm_dax_copy_from_iter,
- .flush = dm_dax_flush,
};
/*
static void write_pmem(void *pmem_addr, struct page *page,
unsigned int off, unsigned int len)
{
- void *mem = kmap_atomic(page);
-
- memcpy_flushcache(pmem_addr, mem + off, len);
- kunmap_atomic(mem);
+ unsigned int chunk;
+ void *mem;
+
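+ /*
+  * Copy in chunks of at most one page, mapping each source page with
+  * kmap_atomic() in turn, so that requests spanning several pages (such
+  * as huge-page rw_page requests) need no single large mapping.
+  */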
+ while (len) {
+ mem = kmap_atomic(page);
+ chunk = min_t(unsigned int, len, PAGE_SIZE);
+ memcpy_flushcache(pmem_addr, mem + off, chunk);
+ kunmap_atomic(mem);
+ len -= chunk;
+ off = 0;
+ page++;
+ pmem_addr += PAGE_SIZE;
+ }
}
static blk_status_t read_pmem(struct page *page, unsigned int off,
void *pmem_addr, unsigned int len)
{
+ unsigned int chunk;
int rc;
- void *mem = kmap_atomic(page);
-
- rc = memcpy_mcsafe(mem + off, pmem_addr, len);
- kunmap_atomic(mem);
- if (rc)
- return BLK_STS_IOERR;
+ void *mem;
+
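+ /*
+  * memcpy_mcsafe() tolerates machine checks from poisoned persistent
+  * memory and returns non-zero on failure, which is reported to the
+  * caller as an I/O error.
+  */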
+ while (len) {
+ mem = kmap_atomic(page);
+ chunk = min_t(unsigned int, len, PAGE_SIZE);
+ rc = memcpy_mcsafe(mem + off, pmem_addr, chunk);
+ kunmap_atomic(mem);
+ if (rc)
+ return BLK_STS_IOERR;
+ len -= chunk;
+ off = 0;
+ page++;
+ pmem_addr += PAGE_SIZE;
+ }
return BLK_STS_OK;
}
struct pmem_device *pmem = bdev->bd_queue->queuedata;
blk_status_t rc;
- rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);
+ rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
+ 0, is_write, sector);
/*
* The ->rw_page interface is subtle and tricky. The core
return copy_from_iter_flushcache(addr, bytes, i);
}
- static void pmem_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff,
- void *addr, size_t size)
- {
- arch_wb_cache_pmem(addr, size);
- }
-
static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
.copy_from_iter = pmem_copy_from_iter,
- .flush = pmem_dax_flush,
};
static const struct attribute_group *pmem_attribute_groups[] = {
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+/* The 'colour' (ie low bits) within a PMD of a page offset. */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
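+/*
+ * e.g. with 2MiB PMDs and 4k pages PG_PMD_COLOUR is 511, so
+ * "index & ~PG_PMD_COLOUR" rounds an index down to a PMD boundary.
+ */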
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
+/*
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking. In total four special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
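+/*
+ * Illustrative layout, assuming RADIX_TREE_EXCEPTIONAL_SHIFT is 2 (so
+ * RADIX_DAX_SHIFT is 6): bit 1 marks the exceptional entry, bit 2 is the
+ * lock bit, bits 3-5 are the PMD/ZERO_PAGE/EMPTY flags, and the sector is
+ * stored from bit 6 upwards.
+ */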
+
+static unsigned long dax_radix_sector(void *entry)
+{
+ return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+ return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+ ((unsigned long)sector << RADIX_DAX_SHIFT) |
+ RADIX_DAX_ENTRY_LOCK);
+}
+
+static unsigned int dax_radix_order(void *entry)
+{
+ if ((unsigned long)entry & RADIX_DAX_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ return 0;
+}
+
static int dax_is_pmd_entry(void *entry)
{
return (unsigned long)entry & RADIX_DAX_PMD;
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_HZP;
+ return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
* the range covered by the PMD map to the same bit lock.
*/
if (dax_is_pmd_entry(entry))
- index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+ index &= ~PG_PMD_COLOUR;
key->mapping = mapping;
key->entry_start = index;
return autoremove_wake_function(wait, mode, sync, NULL);
}
+/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
+static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ pgoff_t index, void *entry, bool wake_all)
+{
+ struct exceptional_entry_key key;
+ wait_queue_head_t *wq;
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &key);
+
+ /*
+ * Checking for locked entry and prepare_to_wait_exclusive() happens
+ * under mapping->tree_lock, ditto for entry handling in our callers.
+ * So at this point all tasks that could have seen our entry locked
+ * must be in the waitqueue and the following check will see them.
+ */
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+}
+
/*
* Check whether the given slot is locked. The function must be called with
* mapping->tree_lock held
for (;;) {
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
+ if (!entry ||
+ WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
}
static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+ pgoff_t index)
{
- if (!radix_tree_exceptional_entry(entry)) {
- unlock_page(entry);
- put_page(entry);
- } else {
- dax_unlock_mapping_entry(mapping, index);
- }
+ dax_unlock_mapping_entry(mapping, index);
}
/*
static void put_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- if (!radix_tree_exceptional_entry(entry))
+ if (!entry)
return;
/* We have to wake up next waiter for the radix tree entry lock */
}
/*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
*
* When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
*
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+ entry = ERR_PTR(-EIO);
+ goto out_unlock;
+ }
+
if (entry) {
if (size_flag & RADIX_DAX_PMD) {
- if (!radix_tree_exceptional_entry(entry) ||
- dax_is_pte_entry(entry)) {
+ if (dax_is_pte_entry(entry)) {
put_unlocked_mapping_entry(mapping, index,
entry);
entry = ERR_PTR(-EEXIST);
goto out_unlock;
}
} else { /* trying to grab a PTE entry */
- if (radix_tree_exceptional_entry(entry) &&
- dax_is_pmd_entry(entry) &&
+ if (dax_is_pmd_entry(entry) &&
(dax_is_zero_entry(entry) ||
dax_is_empty_entry(entry))) {
pmd_downgrade = true;
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
if (err) {
if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
spin_lock_irq(&mapping->tree_lock);
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
- /* Normal page in radix tree? */
- if (!radix_tree_exceptional_entry(entry)) {
- struct page *page = entry;
-
- get_page(page);
- spin_unlock_irq(&mapping->tree_lock);
- lock_page(page);
- /* Page got truncated? Retry... */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto restart;
- }
- return page;
- }
entry = lock_slot(mapping, slot);
out_unlock:
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
-/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
- */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
-{
- struct exceptional_entry_key key;
- wait_queue_head_t *wq;
-
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
-
- /*
- * Checking for locked entry and prepare_to_wait_exclusive() happens
- * under mapping->tree_lock, ditto for entry handling in our callers.
- * So at this point all tasks that could have seen our entry locked
- * must be in the waitqueue and the following check will see them.
- */
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-}
-
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
+ if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
return __dax_invalidate_mapping_entry(mapping, index, false);
}
-/*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, void **entry,
- struct vm_fault *vmf)
-{
- struct inode *inode = mapping->host;
- struct page *page;
- int ret;
-
- /* Hole page already exists? Return it... */
- if (!radix_tree_exceptional_entry(*entry)) {
- page = *entry;
- goto finish_fault;
- }
-
- /* This will replace locked radix tree entry with a hole page */
- page = find_or_create_page(mapping, vmf->pgoff,
- vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
-
-finish_fault:
- vmf->page = page;
- ret = finish_fault(vmf);
- vmf->page = NULL;
- *entry = page;
- if (!ret) {
- /* Grab reference for PTE that is now referencing the page */
- get_page(page);
- ret = VM_FAULT_NOPAGE;
- }
-out:
- trace_dax_load_hole(inode, vmf, ret);
- return ret;
-}
-
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
sector_t sector, size_t size, struct page *to,
unsigned long vaddr)
unsigned long flags)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- int error = 0;
- bool hole_fill = false;
void *new_entry;
pgoff_t index = vmf->pgoff;
if (vmf->flags & FAULT_FLAG_WRITE)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- /* Replacing hole page with block mapping? */
- if (!radix_tree_exceptional_entry(entry)) {
- hole_fill = true;
- /*
- * Unmap the page now before we remove it from page cache below.
- * The page is locked so it cannot be faulted in again.
- */
- unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_SIZE, 0);
- error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
- if (error)
- return ERR_PTR(error);
- } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
- /* replacing huge zero page with PMD block mapping */
- unmap_mapping_range(mapping,
- (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+ if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ /* we are replacing a zero page with block mapping */
+ if (dax_is_pmd_entry(entry))
+ unmap_mapping_range(mapping,
+ (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+ PMD_SIZE, 0);
+ else /* pte entry */
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
spin_lock_irq(&mapping->tree_lock);
new_entry = dax_radix_locked_entry(sector, flags);
- if (hole_fill) {
- __delete_from_page_cache(entry, NULL);
- /* Drop pagecache reference */
- put_page(entry);
- error = __radix_tree_insert(page_tree, index,
- dax_radix_order(new_entry), new_entry);
- if (error) {
- new_entry = ERR_PTR(error);
- goto unlock;
- }
- mapping->nrexceptional++;
- } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Only swap our new entry into the radix tree if the current
* entry is a zero page or an empty entry. If a normal PTE or
WARN_ON_ONCE(ret != entry);
__radix_tree_replace(page_tree, node, slot,
new_entry, NULL, NULL);
+ entry = new_entry;
}
+
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
spin_unlock_irq(&mapping->tree_lock);
- if (hole_fill) {
- radix_tree_preload_end();
- /*
- * We don't need hole page anymore, it has been replaced with
- * locked radix tree entry now.
- */
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(entry);
- unlock_page(entry);
- put_page(entry);
- }
- return new_entry;
+ return entry;
}
static inline unsigned long
pte_t pte, *ptep = NULL;
pmd_t *pmdp = NULL;
spinlock_t *ptl;
- bool changed;
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- unsigned long address;
+ unsigned long address, start, end;
cond_resched();
continue;
address = pgoff_address(index, vma);
- changed = false;
- if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
+
+ /*
+ * Note because we provide start/end to follow_pte_pmd it will
+ * call mmu_notifier_invalidate_range_start() on our behalf
+ * before taking any lock.
+ */
+ if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
continue;
if (pmdp) {
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- changed = true;
+ mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pmd:
spin_unlock(ptl);
#endif
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
- changed = true;
+ mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
- if (changed)
- mmu_notifier_invalidate_page(vma->vm_mm, address);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
}
i_mmap_unlock_read(mapping);
}
spin_lock_irq(&mapping->tree_lock);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
- if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
}
dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
- dax_flush(dax_dev, pgoff, kaddr, size);
+ dax_flush(dax_dev, kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
dax_unlock:
dax_read_unlock(id);
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, void **entryp,
+ sector_t sector, size_t size, void *entry,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- void *entry = *entryp;
void *ret, *kaddr;
pgoff_t pgoff;
int id, rc;
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
- *entryp = ret;
trace_dax_insert_mapping(mapping->host, vmf, ret);
- return vm_insert_mixed(vma, vaddr, pfn);
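+ /*
+  * For write faults install the PFN already writable, so no separate
+  * mkwrite fault is needed afterwards.
+  */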
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ else
+ return vm_insert_mixed(vma, vaddr, pfn);
}
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
+/*
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
*/
-int dax_pfn_mkwrite(struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+ struct vm_fault *vmf)
{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ unsigned long vaddr = vmf->address;
+ int ret = VM_FAULT_NOPAGE;
+ struct page *zero_page;
+ void *entry2;
- spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry)) {
- if (entry)
- put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
- trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+ zero_page = ZERO_PAGE(0);
+ if (unlikely(!zero_page)) {
+ ret = VM_FAULT_OOM;
+ goto out;
}
- radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
- /*
- * If we race with somebody updating the PTE and finish_mkwrite_fault()
- * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
- * the fault in either case.
- */
- finish_mkwrite_fault(vmf);
- put_locked_mapping_entry(mapping, index, entry);
- trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+
+ entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_ZERO_PAGE);
+ if (IS_ERR(entry2)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+out:
+ trace_dax_load_hole(inode, vmf, ret);
+ return ret;
}
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
static bool dax_range_is_aligned(struct block_device *bdev,
unsigned int offset, unsigned int length)
return rc;
}
memset(kaddr + offset, 0, size);
- dax_flush(dax_dev, pgoff, kaddr + offset, size);
+ dax_flush(dax_dev, kaddr + offset, size);
dax_read_unlock(id);
}
return 0;
if (map_len > end - pos)
map_len = end - pos;
+ /*
+ * The userspace address for the memory copy has already been
+ * validated via access_ok() in either vfs_read() or
+ * vfs_write(), depending on which operation we are doing.
+ */
if (iov_iter_rw(iter) == WRITE)
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+ sector, PAGE_SIZE, entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
- vmf_ret = dax_load_hole(mapping, &entry, vmf);
+ vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ put_locked_mapping_entry(mapping, vmf->pgoff);
out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
#ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
- loff_t pos, void **entryp)
+ loff_t pos, void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const sector_t sector = dax_iomap_sector(iomap, pos);
void *ret = NULL, *kaddr;
long length = 0;
pgoff_t pgoff;
- pfn_t pfn;
+ pfn_t pfn = {};
int id;
if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
goto unlock_fallback;
dax_read_unlock(id);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
}
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void **entryp)
+ void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
if (unlikely(!zero_page))
goto fallback;
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
- RADIX_DAX_PMD | RADIX_DAX_HZP);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
+ * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+ * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
+ * is already in the tree, for instance), it will return -EEXIST and
+ * we just fall back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+ result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, &entry);
+ result = dax_pmd_load_hole(vmf, &iomap, entry);
break;
default:
WARN_ON_ONCE(1);
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
+ put_locked_mapping_entry(mapping, pgoff);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
/* copy_from_iter: required operation for fs-dax direct-i/o */
size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
struct iov_iter *);
- /* flush: optional driver-specific cache management after writes */
- void (*flush)(struct dax_device *, pgoff_t, void *, size_t);
};
extern struct attribute_group dax_attribute_group;
put_dax(dax_dev);
}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
#else
static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
{
static inline void fs_put_dax(struct dax_device *dax_dev)
{
}
+
+static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+{
+ return NULL;
+}
#endif
int dax_read_lock(void);
void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
- void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
- size_t size);
+ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
-/*
- * We use lowest available bit in exceptional entry for locking, one bit for
- * the entry size (PMD) and two more to tell us if the entry is a huge zero
- * page (HZP) or an empty entry that is just used for locking. In total four
- * special bits.
- *
- * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
- * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
- * block allocation.
- */
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
-#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
-
-static inline unsigned long dax_radix_sector(void *entry)
-{
- return (unsigned long)entry >> RADIX_DAX_SHIFT;
-}
-
-static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
-{
- return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
- ((unsigned long)sector << RADIX_DAX_SHIFT) |
- RADIX_DAX_ENTRY_LOCK);
-}
-
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all);
#ifdef CONFIG_FS_DAX
int __dax_zero_page_range(struct block_device *bdev,
}
#endif
-#ifdef CONFIG_FS_DAX_PMD
-static inline unsigned int dax_radix_order(void *entry)
-{
- if ((unsigned long)entry & RADIX_DAX_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
- return 0;
-}
-#else
-static inline unsigned int dax_radix_order(void *entry)
-{
- return 0;
-}
-#endif
-int dax_pfn_mkwrite(struct vm_fault *vmf);
-
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);