*/
int barrier_error;
+ /*
+ * Protect barrier_error from concurrent endio processing
+ * in request-based dm.
+ */
+ spinlock_t barrier_error_lock;
+
/*
* Processing queue (flush/barriers)
*/
struct workqueue_struct *wq;
+ struct work_struct barrier_work;
+
+ /* A pointer to the currently processing pre/post flush request */
+ struct request *flush_request;
/*
* The current mapping.
/*
* Block device functions
*/
+int dm_deleting_md(struct mapped_device *md)
+{
+ return test_bit(DMF_DELETING, &md->flags);
+}
+
static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
struct mapped_device *md;
goto out;
if (test_bit(DMF_FREEING, &md->flags) ||
- test_bit(DMF_DELETING, &md->flags)) {
+ dm_deleting_md(md)) {
md = NULL;
goto out;
}
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
- struct dm_table *map = dm_get_table(md);
+ struct dm_table *map = dm_get_live_table(md);
struct dm_target *tgt;
int r = -ENOTTY;
tgt = dm_table_get_target(map, 0);
- if (dm_suspended(md)) {
+ if (dm_suspended_md(md)) {
r = -EAGAIN;
goto out;
}
* function to access the md->map field, and make sure they call
* dm_table_put() when finished.
*/
-struct dm_table *dm_get_table(struct mapped_device *md)
+struct dm_table *dm_get_live_table(struct mapped_device *md)
{
struct dm_table *t;
unsigned long flags;
blk_update_request(tio->orig, 0, nr_bytes);
}
+static void store_barrier_error(struct mapped_device *md, int error)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&md->barrier_error_lock, flags);
+ /*
+ * Basically, the first error is taken, but:
+ * -EOPNOTSUPP supersedes any I/O error.
+ * Requeue request supersedes any I/O error but -EOPNOTSUPP.
+ */
+ if (!md->barrier_error || error == -EOPNOTSUPP ||
+ (md->barrier_error != -EOPNOTSUPP &&
+ error == DM_ENDIO_REQUEUE))
+ md->barrier_error = error;
+ spin_unlock_irqrestore(&md->barrier_error_lock, flags);
+}
+
/*
* Don't touch any member of the md after calling this function because
* the md may be freed in dm_put() at the end of this function.
static void dm_end_request(struct request *clone, int error)
{
int rw = rq_data_dir(clone);
+ int run_queue = 1;
+ bool is_barrier = blk_barrier_rq(clone);
struct dm_rq_target_io *tio = clone->end_io_data;
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
- if (blk_pc_request(rq)) {
+ if (blk_pc_request(rq) && !is_barrier) {
rq->errors = clone->errors;
rq->resid_len = clone->resid_len;
free_rq_clone(clone);
- blk_end_request_all(rq, error);
+ if (unlikely(is_barrier)) {
+ if (unlikely(error))
+ store_barrier_error(md, error);
+ run_queue = 0;
+ } else
+ blk_end_request_all(rq, error);
- rq_completed(md, rw, 1);
+ rq_completed(md, rw, run_queue);
}
static void dm_unprep_request(struct request *rq)
struct request_queue *q = rq->q;
unsigned long flags;
+ if (unlikely(blk_barrier_rq(clone))) {
+ /*
+ * Barrier clones share an original request.
+ * Leave it to dm_end_request(), which handles this special
+ * case.
+ */
+ dm_end_request(clone, DM_ENDIO_REQUEUE);
+ return;
+ }
+
dm_unprep_request(rq);
spin_lock_irqsave(q->queue_lock, flags);
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
+ if (unlikely(blk_barrier_rq(clone))) {
+ /*
+ * Barrier clones share an original request. So can't use
+ * softirq_done with the original.
+ * Pass the clone to dm_done() directly in this special case.
+ * It is safe (even if clone->q->queue_lock is held here)
+ * because there is no I/O dispatching during the completion
+ * of barrier clone.
+ */
+ dm_done(clone, error, true);
+ return;
+ }
+
tio->error = error;
rq->completion_data = clone;
blk_complete_request(rq);
struct dm_rq_target_io *tio = clone->end_io_data;
struct request *rq = tio->orig;
+ if (unlikely(blk_barrier_rq(clone))) {
+ /*
+ * Barrier clones share an original request.
+ * Leave it to dm_end_request(), which handles this special
+ * case.
+ */
+ BUG_ON(error > 0);
+ dm_end_request(clone, error);
+ return;
+ }
+
rq->cmd_flags |= REQ_FAILED;
dm_complete_request(clone, error);
}
struct clone_info ci;
int error = 0;
- ci.map = dm_get_table(md);
+ ci.map = dm_get_live_table(md);
if (unlikely(!ci.map)) {
if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
bio_io_error(bio);
struct bio_vec *biovec)
{
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_table(md);
+ struct dm_table *map = dm_get_live_table(md);
struct dm_target *ti;
sector_t max_sectors;
int max_size = 0;
{
struct mapped_device *md = q->queuedata;
- if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
- bio_endio(bio, -EOPNOTSUPP);
- return 0;
- }
-
return md->saved_make_request_fn(q, bio); /* call __make_request() */
}
return _dm_request(q, bio);
}
+/*
+ * Mark this request as flush request, so that dm_request_fn() can
+ * recognize.
+ */
+static void dm_rq_prepare_flush(struct request_queue *q, struct request *rq)
+{
+ rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
+ rq->cmd[0] = REQ_LB_OP_FLUSH;
+}
+
+static bool dm_rq_is_flush_request(struct request *rq)
+{
+ if (rq->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+ rq->cmd[0] == REQ_LB_OP_FLUSH)
+ return true;
+ else
+ return false;
+}
+
void dm_dispatch_request(struct request *rq)
{
int r;
static int setup_clone(struct request *clone, struct request *rq,
struct dm_rq_target_io *tio)
{
- int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
- dm_rq_bio_constructor, tio);
+ int r;
- if (r)
- return r;
+ if (dm_rq_is_flush_request(rq)) {
+ blk_rq_init(NULL, clone);
+ clone->cmd_type = REQ_TYPE_FS;
+ clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+ } else {
+ r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+ dm_rq_bio_constructor, tio);
+ if (r)
+ return r;
+
+ clone->cmd = rq->cmd;
+ clone->cmd_len = rq->cmd_len;
+ clone->sense = rq->sense;
+ clone->buffer = rq->buffer;
+ }
- clone->cmd = rq->cmd;
- clone->cmd_len = rq->cmd_len;
- clone->sense = rq->sense;
- clone->buffer = rq->buffer;
clone->end_io = end_clone_request;
clone->end_io_data = tio;
struct mapped_device *md = q->queuedata;
struct request *clone;
+ if (unlikely(dm_rq_is_flush_request(rq)))
+ return BLKPREP_OK;
+
if (unlikely(rq->special)) {
DMWARN("Already has something in rq->special.");
return BLKPREP_KILL;
break;
case DM_MAPIO_REMAPPED:
/* The target has remapped the I/O so dispatch it */
+ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
+ blk_rq_pos(tio->orig));
dm_dispatch_request(clone);
break;
case DM_MAPIO_REQUEUE:
static void dm_request_fn(struct request_queue *q)
{
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_table(md);
+ struct dm_table *map = dm_get_live_table(md);
struct dm_target *ti;
struct request *rq, *clone;
if (!rq)
goto plug_and_out;
+ if (unlikely(dm_rq_is_flush_request(rq))) {
+ BUG_ON(md->flush_request);
+ md->flush_request = rq;
+ blk_start_request(rq);
+ queue_work(md->wq, &md->barrier_work);
+ goto out;
+ }
+
ti = dm_table_find_target(map, blk_rq_pos(rq));
if (ti->type->busy && ti->type->busy(ti))
goto plug_and_out;
{
int r;
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_table(md);
+ struct dm_table *map = dm_get_live_table(md);
if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
r = 1;
static void dm_unplug_all(struct request_queue *q)
{
struct mapped_device *md = q->queuedata;
- struct dm_table *map = dm_get_table(md);
+ struct dm_table *map = dm_get_live_table(md);
if (map) {
if (dm_request_based(md))
struct dm_table *map;
if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
- map = dm_get_table(md);
+ map = dm_get_live_table(md);
if (map) {
/*
* Request-based dm cares about only own queue for
static const struct block_device_operations dm_blk_dops;
static void dm_wq_work(struct work_struct *work);
+static void dm_rq_barrier_work(struct work_struct *work);
/*
* Allocate and initialise a blank device with a given minor.
init_rwsem(&md->io_lock);
mutex_init(&md->suspend_lock);
spin_lock_init(&md->deferred_lock);
+ spin_lock_init(&md->barrier_error_lock);
rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1);
atomic_set(&md->open_count, 0);
blk_queue_softirq_done(md->queue, dm_softirq_done);
blk_queue_prep_rq(md->queue, dm_prep_fn);
blk_queue_lld_busy(md->queue, dm_lld_busy);
+ blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH,
+ dm_rq_prepare_flush);
md->disk = alloc_disk(1);
if (!md->disk)
atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
+ INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
init_waitqueue_head(&md->eventq);
md->disk->major = _major;
mutex_unlock(&md->bdev->bd_inode->i_mutex);
}
-static int __bind(struct mapped_device *md, struct dm_table *t,
- struct queue_limits *limits)
+/*
+ * Returns old map, which caller must destroy.
+ */
+static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
+ struct queue_limits *limits)
{
+ struct dm_table *old_map;
struct request_queue *q = md->queue;
sector_t size;
unsigned long flags;
__set_size(md, size);
- if (!size) {
- dm_table_destroy(t);
- return 0;
- }
-
dm_table_event_callback(t, event_callback, md);
/*
__bind_mempools(md, t);
write_lock_irqsave(&md->map_lock, flags);
+ old_map = md->map;
md->map = t;
dm_table_set_restrictions(t, q, limits);
write_unlock_irqrestore(&md->map_lock, flags);
- return 0;
+ return old_map;
}
-static void __unbind(struct mapped_device *md)
+/*
+ * Returns unbound table for the caller to free.
+ */
+static struct dm_table *__unbind(struct mapped_device *md)
{
struct dm_table *map = md->map;
unsigned long flags;
if (!map)
- return;
+ return NULL;
dm_table_event_callback(map, NULL, NULL);
write_lock_irqsave(&md->map_lock, flags);
md->map = NULL;
write_unlock_irqrestore(&md->map_lock, flags);
- dm_table_destroy(map);
+
+ return map;
}
/*
BUG_ON(test_bit(DMF_FREEING, &md->flags));
if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
- map = dm_get_table(md);
+ map = dm_get_live_table(md);
idr_replace(&_minor_idr, MINOR_ALLOCED,
MINOR(disk_devt(dm_disk(md))));
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
- if (!dm_suspended(md)) {
+ if (!dm_suspended_md(md)) {
dm_table_presuspend_targets(map);
dm_table_postsuspend_targets(map);
}
dm_sysfs_exit(md);
dm_table_put(map);
- __unbind(md);
+ dm_table_destroy(__unbind(md));
free_dev(md);
}
}
queue_work(md->wq, &md->work);
}
+static void dm_rq_set_flush_nr(struct request *clone, unsigned flush_nr)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ tio->info.flush_request = flush_nr;
+}
+
+/* Issue barrier requests to targets and wait for their completion. */
+static int dm_rq_barrier(struct mapped_device *md)
+{
+ int i, j;
+ struct dm_table *map = dm_get_live_table(md);
+ unsigned num_targets = dm_table_get_num_targets(map);
+ struct dm_target *ti;
+ struct request *clone;
+
+ md->barrier_error = 0;
+
+ for (i = 0; i < num_targets; i++) {
+ ti = dm_table_get_target(map, i);
+ for (j = 0; j < ti->num_flush_requests; j++) {
+ clone = clone_rq(md->flush_request, md, GFP_NOIO);
+ dm_rq_set_flush_nr(clone, j);
+ atomic_inc(&md->pending[rq_data_dir(clone)]);
+ map_request(ti, clone, md);
+ }
+ }
+
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+ dm_table_put(map);
+
+ return md->barrier_error;
+}
+
+static void dm_rq_barrier_work(struct work_struct *work)
+{
+ int error;
+ struct mapped_device *md = container_of(work, struct mapped_device,
+ barrier_work);
+ struct request_queue *q = md->queue;
+ struct request *rq;
+ unsigned long flags;
+
+ /*
+ * Hold the md reference here and leave it at the last part so that
+ * the md can't be deleted by device opener when the barrier request
+ * completes.
+ */
+ dm_get(md);
+
+ error = dm_rq_barrier(md);
+
+ rq = md->flush_request;
+ md->flush_request = NULL;
+
+ if (error == DM_ENDIO_REQUEUE) {
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_requeue_request(q, rq);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ } else
+ blk_end_request_all(rq, error);
+
+ blk_run_queue(q);
+
+ dm_put(md);
+}
+
/*
- * Swap in a new table (destroying old one).
+ * Swap in a new table, returning the old one for the caller to destroy.
*/
-int dm_swap_table(struct mapped_device *md, struct dm_table *table)
+struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
+ struct dm_table *map = ERR_PTR(-EINVAL);
struct queue_limits limits;
- int r = -EINVAL;
+ int r;
mutex_lock(&md->suspend_lock);
/* device must be suspended */
- if (!dm_suspended(md))
+ if (!dm_suspended_md(md))
goto out;
r = dm_calculate_queue_limits(table, &limits);
- if (r)
+ if (r) {
+ map = ERR_PTR(r);
goto out;
+ }
/* cannot change the device type, once a table is bound */
if (md->map &&
goto out;
}
- __unbind(md);
- r = __bind(md, table, &limits);
+ map = __bind(md, table, &limits);
out:
mutex_unlock(&md->suspend_lock);
- return r;
+ return map;
}
/*
mutex_lock(&md->suspend_lock);
- if (dm_suspended(md)) {
+ if (dm_suspended_md(md)) {
r = -EINVAL;
goto out_unlock;
}
- map = dm_get_table(md);
+ map = dm_get_live_table(md);
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
- flush_workqueue(md->wq);
-
+ /*
+ * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+ * can be kicked until md->queue is stopped. So stop md->queue before
+ * flushing md->wq.
+ */
if (dm_request_based(md))
stop_queue(md->queue);
+ flush_workqueue(md->wq);
+
/*
* At this point no more requests are entering target request routines.
* We call dm_wait_for_completion to wait for all existing requests
* requests are being added to md->deferred list.
*/
- dm_table_postsuspend_targets(map);
-
set_bit(DMF_SUSPENDED, &md->flags);
+ dm_table_postsuspend_targets(map);
+
out:
dm_table_put(map);
struct dm_table *map = NULL;
mutex_lock(&md->suspend_lock);
- if (!dm_suspended(md))
+ if (!dm_suspended_md(md))
goto out;
- map = dm_get_table(md);
+ map = dm_get_live_table(md);
if (!map || !dm_table_get_size(map))
goto out;
return NULL;
if (test_bit(DMF_FREEING, &md->flags) ||
- test_bit(DMF_DELETING, &md->flags))
+ dm_deleting_md(md))
return NULL;
dm_get(md);
return md;
}
-int dm_suspended(struct mapped_device *md)
+int dm_suspended_md(struct mapped_device *md)
{
return test_bit(DMF_SUSPENDED, &md->flags);
}