]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - drivers/md/dm-thin.c
dm cache: fix bugs when a GFP_NOWAIT allocation fails
[mirror_ubuntu-bionic-kernel.git] / drivers / md / dm-thin.c
index f91d771fff4b6e9d9a488a7a67916326a1e85897..14e87a80493601f84036c0241b78cccf93c7bbad 100644 (file)
@@ -195,12 +195,18 @@ static void throttle_unlock(struct throttle *t)
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in various modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
        PM_WRITE,               /* metadata may be changed */
        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
+
+       /*
+        * Like PM_READ_ONLY, except the pool may switch back to PM_WRITE when the metadata device is resized.  Reported as read-only.
+        */
+       PM_OUT_OF_METADATA_SPACE,       /* position matters: modes are compared with >= and used as an array index */
        PM_READ_ONLY,           /* metadata may not be changed */
+
        PM_FAIL,                /* all I/O fails */
 };
 
@@ -251,6 +257,7 @@ struct pool {
 
        spinlock_t lock;
        struct bio_list deferred_flush_bios;
+       struct bio_list deferred_flush_completions;
        struct list_head prepared_mappings;
        struct list_head prepared_discards;
        struct list_head prepared_discards_pt2;
@@ -275,9 +282,38 @@ struct pool {
        struct dm_bio_prison_cell **cell_sort_array;
 };
 
-static enum pool_mode get_pool_mode(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
+static enum pool_mode get_pool_mode(struct pool *pool) /* Returns the pool's current mode, as stored in pool->pf.mode. */
+{
+       return pool->pf.mode;
+}
+
+static void notify_of_pool_mode_change(struct pool *pool) /* Raise a table event and log the pool's current mode. */
+{
+       const char *descs[] = { /* indexed by enum pool_mode, so the order must match the enum */
+               "write", /* PM_WRITE */
+               "out-of-data-space", /* PM_OUT_OF_DATA_SPACE */
+               "read-only", /* PM_OUT_OF_METADATA_SPACE: logged with the same text as PM_READ_ONLY */
+               "read-only", /* PM_READ_ONLY */
+               "fail" /* PM_FAIL */
+       };
+       const char *extra_desc = NULL; /* optional suffix; stays NULL for every mode except out-of-data-space */
+       enum pool_mode mode = get_pool_mode(pool);
+
+       if (mode == PM_OUT_OF_DATA_SPACE) { /* say whether IO is being queued or errored in this mode */
+               if (!pool->pf.error_if_no_space)
+                       extra_desc = " (queue IO)";
+               else
+                       extra_desc = " (error IO)";
+       }
+
+       dm_table_event(pool->ti->table);
+       DMINFO("%s: switching pool to %s%s mode",
+              dm_device_name(pool->pool_md),
+              descs[(int)mode], extra_desc ? : ""); /* GNU "?:" shorthand: empty suffix when extra_desc is NULL */
+}
+
 /*
  * Target context for a pool.
  */
@@ -915,6 +951,39 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
        mempool_free(m, m->tc->pool->mapping_pool);
 }
 
+static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) /* End the bio now, fail it, or defer its completion until the next commit. */
+{
+       struct pool *pool = tc->pool;
+       unsigned long flags;
+
+       /*
+        * If the bio has the REQ_FUA flag set we must commit the metadata
+        * before signaling its completion.
+        */
+       if (!bio_triggers_commit(tc, bio)) { /* no commit needed: complete immediately */
+               bio_endio(bio);
+               return;
+       }
+
+       /*
+        * Complete bio with an error if earlier I/O caused changes to the
+        * metadata that can't be committed, e.g., due to I/O errors on the
+        * metadata device.
+        */
+       if (dm_thin_aborted_changes(tc->td)) {
+               bio_io_error(bio);
+               return;
+       }
+
+       /*
+        * Batch together any bios that trigger commits and then issue a
+        * single commit for them in process_deferred_bios().
+        */
+       spin_lock_irqsave(&pool->lock, flags); /* the deferred list is only modified under pool->lock */
+       bio_list_add(&pool->deferred_flush_completions, bio);
+       spin_unlock_irqrestore(&pool->lock, flags);
+}
+
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
        struct thin_c *tc = m->tc;
@@ -947,7 +1016,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
         */
        if (bio) {
                inc_remap_and_issue_cell(tc, m->cell, m->data_block);
-               bio_endio(bio);
+               complete_overwrite_bio(tc, bio);
        } else {
                inc_all_io_entry(tc->pool, m->cell->holder);
                remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -1007,7 +1076,7 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m
         * passdown we have to check that these blocks are now unused.
         */
        int r = 0;
-       bool used = true;
+       bool shared = true;
        struct thin_c *tc = m->tc;
        struct pool *pool = tc->pool;
        dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
@@ -1017,11 +1086,11 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m
        while (b != end) {
                /* find start of unmapped run */
                for (; b < end; b++) {
-                       r = dm_pool_block_is_used(pool->pmd, b, &used);
+                       r = dm_pool_block_is_shared(pool->pmd, b, &shared);
                        if (r)
                                goto out;
 
-                       if (!used)
+                       if (!shared)
                                break;
                }
 
@@ -1030,11 +1099,11 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m
 
                /* find end of run */
                for (e = b + 1; e != end; e++) {
-                       r = dm_pool_block_is_used(pool->pmd, e, &used);
+                       r = dm_pool_block_is_shared(pool->pmd, e, &shared);
                        if (r)
                                goto out;
 
-                       if (used)
+                       if (shared)
                                break;
                }
 
@@ -1380,7 +1449,37 @@ static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
 
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
 
-static void check_for_space(struct pool *pool)
+static void requeue_bios(struct pool *pool);
+
+static bool is_read_only_pool_mode(enum pool_mode mode) /* True when mode is PM_OUT_OF_METADATA_SPACE or PM_READ_ONLY. */
+{
+       return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
+}
+
+static bool is_read_only(struct pool *pool) /* True when the pool's current mode is one of the read-only modes. */
+{
+       return is_read_only_pool_mode(get_pool_mode(pool));
+}
+
+static void check_for_metadata_space(struct pool *pool) /* Move the pool to PM_OUT_OF_METADATA_SPACE if no free metadata blocks remain or the count cannot be read. */
+{
+       int r;
+       const char *ooms_reason = NULL; /* set to the message to log once an out-of-metadata-space condition is found */
+       dm_block_t nr_free;
+
+       r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
+       if (r) /* a failed query is handled the same way as having no free blocks */
+               ooms_reason = "Could not get free metadata blocks";
+       else if (!nr_free)
+               ooms_reason = "No free metadata blocks";
+
+       if (ooms_reason && !is_read_only(pool)) { /* nothing to do if the pool is already in a read-only mode */
+               DMERR("%s", ooms_reason);
+               set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
+       }
+}
+
+static void check_for_data_space(struct pool *pool)
 {
        int r;
        dm_block_t nr_free;
@@ -1392,8 +1491,10 @@ static void check_for_space(struct pool *pool)
        if (r)
                return;
 
-       if (nr_free)
+       if (nr_free) {
                set_pool_mode(pool, PM_WRITE);
+               requeue_bios(pool);
+       }
 }
 
 /*
@@ -1404,14 +1505,16 @@ static int commit(struct pool *pool)
 {
        int r;
 
-       if (get_pool_mode(pool) >= PM_READ_ONLY)
+       if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
                return -EINVAL;
 
        r = dm_pool_commit_metadata(pool->pmd);
        if (r)
                metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
-       else
-               check_for_space(pool);
+       else {
+               check_for_metadata_space(pool);
+               check_for_data_space(pool);
+       }
 
        return r;
 }
@@ -1470,10 +1573,26 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 
        r = dm_pool_alloc_data_block(pool->pmd, result);
        if (r) {
-               metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+               if (r == -ENOSPC)
+                       set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
+               else
+                       metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+               return r;
+       }
+
+       r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
+       if (r) {
+               metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
                return r;
        }
 
+       if (!free_blocks) {
+               /* Let's commit before we use up the metadata reserve. */
+               r = commit(pool);
+               if (r)
+                       return r;
+       }
+
        return 0;
 }
 
@@ -1505,6 +1624,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool)
        case PM_OUT_OF_DATA_SPACE:
                return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
 
+       case PM_OUT_OF_METADATA_SPACE:
        case PM_READ_ONLY:
        case PM_FAIL:
                return BLK_STS_IOERR;
@@ -2242,7 +2362,7 @@ static void process_deferred_bios(struct pool *pool)
 {
        unsigned long flags;
        struct bio *bio;
-       struct bio_list bios;
+       struct bio_list bios, bio_completions;
        struct thin_c *tc;
 
        tc = get_first_thin(pool);
@@ -2253,26 +2373,36 @@ static void process_deferred_bios(struct pool *pool)
        }
 
        /*
-        * If there are any deferred flush bios, we must commit
-        * the metadata before issuing them.
+        * If there are any deferred flush bios, we must commit the metadata
+        * before issuing them or signaling their completion.
         */
        bio_list_init(&bios);
+       bio_list_init(&bio_completions);
+
        spin_lock_irqsave(&pool->lock, flags);
        bio_list_merge(&bios, &pool->deferred_flush_bios);
        bio_list_init(&pool->deferred_flush_bios);
+
+       bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
+       bio_list_init(&pool->deferred_flush_completions);
        spin_unlock_irqrestore(&pool->lock, flags);
 
-       if (bio_list_empty(&bios) &&
+       if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
            !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
                return;
 
        if (commit(pool)) {
+               bio_list_merge(&bios, &bio_completions);
+
                while ((bio = bio_list_pop(&bios)))
                        bio_io_error(bio);
                return;
        }
        pool->last_commit_jiffies = jiffies;
 
+       while ((bio = bio_list_pop(&bio_completions)))
+               bio_endio(bio);
+
        while ((bio = bio_list_pop(&bios)))
                generic_make_request(bio);
 }
@@ -2305,8 +2435,6 @@ static void do_waker(struct work_struct *ws)
        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
 }
 
-static void notify_of_pool_mode_change_to_oods(struct pool *pool);
-
 /*
  * We're holding onto IO to allow userland time to react.  After the
  * timeout either the pool will have been resized (and thus back in
@@ -2319,7 +2447,7 @@ static void do_no_space_timeout(struct work_struct *ws)
 
        if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
                pool->pf.error_if_no_space = true;
-               notify_of_pool_mode_change_to_oods(pool);
+               notify_of_pool_mode_change(pool);
                error_retry_list_with_code(pool, BLK_STS_NOSPC);
        }
 }
@@ -2387,26 +2515,6 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
 
 /*----------------------------------------------------------------*/
 
-static enum pool_mode get_pool_mode(struct pool *pool)
-{
-       return pool->pf.mode;
-}
-
-static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
-{
-       dm_table_event(pool->ti->table);
-       DMINFO("%s: switching pool to %s mode",
-              dm_device_name(pool->pool_md), new_mode);
-}
-
-static void notify_of_pool_mode_change_to_oods(struct pool *pool)
-{
-       if (!pool->pf.error_if_no_space)
-               notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
-       else
-               notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
-}
-
 static bool passdown_enabled(struct pool_c *pt)
 {
        return pt->adjusted_pf.discard_passdown;
@@ -2455,8 +2563,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 
        switch (new_mode) {
        case PM_FAIL:
-               if (old_mode != new_mode)
-                       notify_of_pool_mode_change(pool, "failure");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_fail;
                pool->process_discard = process_bio_fail;
@@ -2468,9 +2574,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                error_retry_list(pool);
                break;
 
+       case PM_OUT_OF_METADATA_SPACE:
        case PM_READ_ONLY:
-               if (old_mode != new_mode)
-                       notify_of_pool_mode_change(pool, "read-only");
                dm_pool_metadata_read_only(pool->pmd);
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_bio_success;
@@ -2491,8 +2596,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                 * alarming rate.  Adjust your low water mark if you're
                 * frequently seeing this mode.
                 */
-               if (old_mode != new_mode)
-                       notify_of_pool_mode_change_to_oods(pool);
                pool->out_of_data_space = true;
                pool->process_bio = process_bio_read_only;
                pool->process_discard = process_discard_bio;
@@ -2505,8 +2608,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                break;
 
        case PM_WRITE:
-               if (old_mode != new_mode)
-                       notify_of_pool_mode_change(pool, "write");
+               if (old_mode == PM_OUT_OF_DATA_SPACE)
+                       cancel_delayed_work_sync(&pool->no_space_timeout);
                pool->out_of_data_space = false;
                pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
                dm_pool_metadata_read_write(pool->pmd);
@@ -2524,6 +2627,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
         * doesn't cause an unexpected mode transition on resume.
         */
        pt->adjusted_pf.mode = new_mode;
+
+       if (old_mode != new_mode)
+               notify_of_pool_mode_change(pool);
 }
 
 static void abort_transaction(struct pool *pool)
@@ -2903,6 +3009,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
        INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
        spin_lock_init(&pool->lock);
        bio_list_init(&pool->deferred_flush_bios);
+       bio_list_init(&pool->deferred_flush_completions);
        INIT_LIST_HEAD(&pool->prepared_mappings);
        INIT_LIST_HEAD(&pool->prepared_discards);
        INIT_LIST_HEAD(&pool->prepared_discards_pt2);
@@ -3185,6 +3292,13 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        as.argc = argc;
        as.argv = argv;
 
+       /* make sure metadata and data are different devices */
+       if (!strcmp(argv[0], argv[1])) {
+               ti->error = "Error setting metadata or data device";
+               r = -EINVAL;
+               goto out_unlock;
+       }
+
        /*
         * Set default pool features.
         */
@@ -3403,6 +3517,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
                DMINFO("%s: growing the metadata device from %llu to %llu blocks",
                       dm_device_name(pool->pool_md),
                       sb_metadata_dev_size, metadata_dev_size);
+
+               if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
+                       set_pool_mode(pool, PM_WRITE);
+
                r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
                if (r) {
                        metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3706,7 +3824,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
 
-       if (get_pool_mode(pool) >= PM_READ_ONLY) {
+       if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
                DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
                      dm_device_name(pool->pool_md));
                return -EOPNOTSUPP;
@@ -3780,6 +3898,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
        dm_block_t nr_blocks_data;
        dm_block_t nr_blocks_metadata;
        dm_block_t held_root;
+       enum pool_mode mode;
        char buf[BDEVNAME_SIZE];
        char buf2[BDEVNAME_SIZE];
        struct pool_c *pt = ti->private;
@@ -3850,9 +3969,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
                else
                        DMEMIT("- ");
 
-               if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+               mode = get_pool_mode(pool);
+               if (mode == PM_OUT_OF_DATA_SPACE)
                        DMEMIT("out_of_data_space ");
-               else if (pool->pf.mode == PM_READ_ONLY)
+               else if (is_read_only_pool_mode(mode))
                        DMEMIT("ro ");
                else
                        DMEMIT("rw ");
@@ -4060,6 +4180,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
        tc->sort_bio_list = RB_ROOT;
 
        if (argc == 3) {
+               if (!strcmp(argv[0], argv[2])) {
+                       ti->error = "Error setting origin device";
+                       r = -EINVAL;
+                       goto bad_origin_dev;
+               }
+
                r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
                if (r) {
                        ti->error = "Error opening origin device";