drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm.h"
   9 #include "dm-uevent.h"
  10
  11 #include <linux/init.h>
  12 #include <linux/module.h>
  13 #include <linux/mutex.h>
  14 #include <linux/moduleparam.h>
  15 #include <linux/blkpg.h>
  16 #include <linux/bio.h>
  17 #include <linux/buffer_head.h>
  18 #include <linux/smp_lock.h>
  19 #include <linux/mempool.h>
  20 #include <linux/slab.h>
  21 #include <linux/idr.h>
  22 #include <linux/hdreg.h>
  23 #include <linux/delay.h>
  24
  25 #include <trace/events/block.h>
  26
/* Message prefix for this file; presumably picked up by the DM* log macros. */
#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

/* Name handed to register_blkdev() and unregister_blkdev(). */
static const char *_name = DM_NAME;

/*
 * 'major' is the requested block major and '_major' the one in use:
 * local_init() copies major into _major and, if that is 0, replaces it
 * with the value returned by register_blkdev().
 */
static unsigned int major = 0;
static unsigned int _major = 0;

/* Held by dm_blk_open() and dm_lock_for_deletion() around their md checks. */
static DEFINE_SPINLOCK(_minor_lock);
/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
        /* Device the io belongs to; used for accounting and pool lookups. */
        struct mapped_device *md;
        /* Status that dec_pending() finally reports for the bio. */
        int error;
        /* Outstanding references; dec_pending() completes the io at zero. */
        atomic_t io_count;
        /* The bio this io tracks; completed with bio_endio() in dec_pending(). */
        struct bio *bio;
        /* jiffies sampled by start_io_acct(). */
        unsigned long start_time;
        /* Taken by dec_pending() while it records an error in 'error'. */
        spinlock_t endio_lock;
};
  54
/*
 * For bio-based dm.
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 *
 * A clone bio carries a pointer to this structure in bi_private (see
 * dm_get_mapinfo() and clone_endio()).
 */
struct dm_target_io {
        /* Per-bio state this clone contributes to. */
        struct dm_io *io;
        /* Target whose end_io hook clone_endio() invokes. */
        struct dm_target *ti;
        /* Target-private data handed to the end_io hook. */
        union map_info info;
};
  65
/*
 * For request-based dm.
 * One of these is allocated per request.
 *
 * A clone request carries a pointer to this structure in end_io_data.
 */
struct dm_rq_target_io {
        struct mapped_device *md;
        /* Target whose rq_end_io hook dm_done() invokes. */
        struct dm_target *ti;
        /* 'orig' points at the original request; 'clone' is embedded by value. */
        struct request *orig, clone;
        /* Error recorded by end_clone_bio() or dm_complete_request(). */
        int error;
        /* Target-private data, exposed through dm_get_rq_mapinfo(). */
        union map_info info;
};
  77
/*
 * For request-based dm.
 * One of these is allocated per bio.
 *
 * A clone bio carries a pointer to this structure in bi_private (see
 * end_clone_bio()).
 */
struct dm_rq_clone_bio_info {
        /* Bio of the original request that corresponds to the clone bio. */
        struct bio *orig;
        /* Per-request state the clone bio belongs to. */
        struct dm_rq_target_io *tio;
};
  86
  87 union map_info *dm_get_mapinfo(struct bio *bio)
  88 {
  89         if (bio && bio->bi_private)
  90                 return &((struct dm_target_io *)bio->bi_private)->info;
  91         return NULL;
  92 }
  93
  94 union map_info *dm_get_rq_mapinfo(struct request *rq)
  95 {
  96         if (rq && rq->end_io_data)
  97                 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
  98         return NULL;
  99 }
 100 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 101
/* NOTE(review): sentinel pointer value; its users are outside this view. */
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
/* While set, dm_blk_open() fails with -ENXIO. */
#define DMF_FREEING 3
/* Set by dm_lock_for_deletion(); while set, dm_blk_open() fails with -ENXIO. */
#define DMF_DELETING 4
/* Tested by __noflush_suspending(). */
#define DMF_NOFLUSH_SUSPENDING 5
/* Set by queue_io() when it schedules md->work. */
#define DMF_QUEUE_IO_TO_THREAD 6
 114
/*
 * State kept for one mapped device.
 */
struct mapped_device {
        struct rw_semaphore io_lock;
        struct mutex suspend_lock;
        /* Read-locked by dm_get_live_table() while it references 'map'. */
        rwlock_t map_lock;
        atomic_t holders;
        /* Raised in dm_blk_open(), dropped in dm_blk_close(). */
        atomic_t open_count;

        /* DMF_* bits. */
        unsigned long flags;

        struct request_queue *queue;
        unsigned type;
        /* Protect queue and type against concurrent access. */
        struct mutex type_lock;

        struct gendisk *disk;
        char name[16];

        void *interface_ptr;

        /*
         * In-flight I/O counts, indexed by data direction (READ/WRITE).
         * 'wait' is woken when the total drops to zero.
         */
        atomic_t pending[2];
        wait_queue_head_t wait;
        /* Queued on 'wq' by queue_io(). */
        struct work_struct work;
        /*
         * A list of ios that arrived while we were suspended.
         * Modified under deferred_lock.
         */
        struct bio_list deferred;
        spinlock_t deferred_lock;

        /*
         * An error from the flush request currently being processed.
         */
        int flush_error;

        /*
         * Protect barrier_error from concurrent endio processing
         * in request-based dm.
         */
        spinlock_t barrier_error_lock;
        int barrier_error;

        /*
         * Processing queue (flush/barriers)
         */
        struct workqueue_struct *wq;
        struct work_struct barrier_work;

        /* A pointer to the currently processing pre/post flush request */
        struct request *flush_request;

        /*
         * The current mapping.
         */
        struct dm_table *map;

        /*
         * io objects are allocated from here.
         */
        mempool_t *io_pool;
        mempool_t *tio_pool;

        struct bio_set *bs;

        /*
         * Event handling.
         */
        atomic_t event_nr;
        wait_queue_head_t eventq;
        atomic_t uevent_seq;
        struct list_head uevent_list;
        spinlock_t uevent_lock; /* Protect access to uevent_list */

        /*
         * freeze/thaw support require holding onto a super block
         */
        struct super_block *frozen_sb;
        struct block_device *bdev;

        /* forced geometry settings */
        struct hd_geometry geometry;

        /* For saving the address of __make_request for request based dm */
        make_request_fn *saved_make_request_fn;

        /* sysfs handle */
        struct kobject kobj;

        /* zero-length flush that will be cloned and submitted to targets */
        struct bio flush_bio;
};
 207
/*
 * For mempools pre-allocation at the table loading time.
 * The members mirror the io_pool, tio_pool and bs fields of
 * struct mapped_device.
 */
struct dm_md_mempools {
        mempool_t *io_pool;
        mempool_t *tio_pool;
        struct bio_set *bs;
};
 216
/* NOTE(review): presumably the minimum mempool reserve; users are not in view. */
#define MIN_IOS 256
/* Slab caches created by local_init() and destroyed by local_exit(). */
static struct kmem_cache *_io_cache;            /* struct dm_io */
static struct kmem_cache *_tio_cache;           /* struct dm_target_io */
static struct kmem_cache *_rq_tio_cache;        /* struct dm_rq_target_io */
static struct kmem_cache *_rq_bio_info_cache;   /* struct dm_rq_clone_bio_info */
 222
 223 static int __init local_init(void)
 224 {
 225         int r = -ENOMEM;
 226
 227         /* allocate a slab for the dm_ios */
 228         _io_cache = KMEM_CACHE(dm_io, 0);
 229         if (!_io_cache)
 230                 return r;
 231
 232         /* allocate a slab for the target ios */
 233         _tio_cache = KMEM_CACHE(dm_target_io, 0);
 234         if (!_tio_cache)
 235                 goto out_free_io_cache;
 236
 237         _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 238         if (!_rq_tio_cache)
 239                 goto out_free_tio_cache;
 240
 241         _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
 242         if (!_rq_bio_info_cache)
 243                 goto out_free_rq_tio_cache;
 244
 245         r = dm_uevent_init();
 246         if (r)
 247                 goto out_free_rq_bio_info_cache;
 248
 249         _major = major;
 250         r = register_blkdev(_major, _name);
 251         if (r < 0)
 252                 goto out_uevent_exit;
 253
 254         if (!_major)
 255                 _major = r;
 256
 257         return 0;
 258
 259 out_uevent_exit:
 260         dm_uevent_exit();
 261 out_free_rq_bio_info_cache:
 262         kmem_cache_destroy(_rq_bio_info_cache);
 263 out_free_rq_tio_cache:
 264         kmem_cache_destroy(_rq_tio_cache);
 265 out_free_tio_cache:
 266         kmem_cache_destroy(_tio_cache);
 267 out_free_io_cache:
 268         kmem_cache_destroy(_io_cache);
 269
 270         return r;
 271 }
 272
/*
 * Release what local_init() set up: the four slab caches, the block major
 * and the uevent code.  _major is reset to 0 afterwards.
 */
static void local_exit(void)
{
        kmem_cache_destroy(_rq_bio_info_cache);
        kmem_cache_destroy(_rq_tio_cache);
        kmem_cache_destroy(_tio_cache);
        kmem_cache_destroy(_io_cache);
        unregister_blkdev(_major, _name);
        dm_uevent_exit();

        _major = 0;

        DMINFO("cleaned up");
}
 286
/*
 * Sub-module init functions, called in array order by dm_init().
 * _exits[] below must stay index-aligned with this table: dm_init() undoes
 * a partial start-up by calling _exits[i] for each _inits[i] that succeeded.
 */
static int (*_inits[])(void) __initdata = {
        local_init,
        dm_target_init,
        dm_linear_init,
        dm_stripe_init,
        dm_io_init,
        dm_kcopyd_init,
        dm_interface_init,
};

/* Matching exit functions; dm_exit() calls them from last to first. */
static void (*_exits[])(void) = {
        local_exit,
        dm_target_exit,
        dm_linear_exit,
        dm_stripe_exit,
        dm_io_exit,
        dm_kcopyd_exit,
        dm_interface_exit,
};
 306
 307 static int __init dm_init(void)
 308 {
 309         const int count = ARRAY_SIZE(_inits);
 310
 311         int r, i;
 312
 313         for (i = 0; i < count; i++) {
 314                 r = _inits[i]();
 315                 if (r)
 316                         goto bad;
 317         }
 318
 319         return 0;
 320
 321       bad:
 322         while (i--)
 323                 _exits[i]();
 324
 325         return r;
 326 }
 327
 328 static void __exit dm_exit(void)
 329 {
 330         int i = ARRAY_SIZE(_exits);
 331
 332         while (i--)
 333                 _exits[i]();
 334 }
 335
 336 /*
 337  * Block device functions
 338  */
/*
 * Return non-zero when DMF_DELETING is set in md->flags
 * (dm_lock_for_deletion() sets that bit).
 */
int dm_deleting_md(struct mapped_device *md)
{
        return test_bit(DMF_DELETING, &md->flags);
}
 343
 344 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 345 {
 346         struct mapped_device *md;
 347
 348         lock_kernel();
 349         spin_lock(&_minor_lock);
 350
 351         md = bdev->bd_disk->private_data;
 352         if (!md)
 353                 goto out;
 354
 355         if (test_bit(DMF_FREEING, &md->flags) ||
 356             dm_deleting_md(md)) {
 357                 md = NULL;
 358                 goto out;
 359         }
 360
 361         dm_get(md);
 362         atomic_inc(&md->open_count);
 363
 364 out:
 365         spin_unlock(&_minor_lock);
 366         unlock_kernel();
 367
 368         return md ? 0 : -ENXIO;
 369 }
 370
/*
 * Counterpart of dm_blk_open(): decrement open_count and drop the
 * reference with dm_put().  Always returns 0.
 */
static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
        struct mapped_device *md = disk->private_data;

        lock_kernel();
        atomic_dec(&md->open_count);
        dm_put(md);
        unlock_kernel();

        return 0;
}
 382
/* Return the current value of md->open_count. */
int dm_open_count(struct mapped_device *md)
{
        return atomic_read(&md->open_count);
}
 387
 388 /*
 389  * Guarantees nothing is using the device before it's deleted.
 390  */
 391 int dm_lock_for_deletion(struct mapped_device *md)
 392 {
 393         int r = 0;
 394
 395         spin_lock(&_minor_lock);
 396
 397         if (dm_open_count(md))
 398                 r = -EBUSY;
 399         else
 400                 set_bit(DMF_DELETING, &md->flags);
 401
 402         spin_unlock(&_minor_lock);
 403
 404         return r;
 405 }
 406
/*
 * Fill @geo with the geometry of the mapped device behind @bdev by
 * delegating to dm_get_geometry(), whose return value is passed through.
 */
static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        struct mapped_device *md = bdev->bd_disk->private_data;

        return dm_get_geometry(md, geo);
}
 413
 414 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 415                         unsigned int cmd, unsigned long arg)
 416 {
 417         struct mapped_device *md = bdev->bd_disk->private_data;
 418         struct dm_table *map = dm_get_live_table(md);
 419         struct dm_target *tgt;
 420         int r = -ENOTTY;
 421
 422         if (!map || !dm_table_get_size(map))
 423                 goto out;
 424
 425         /* We only support devices that have a single target */
 426         if (dm_table_get_num_targets(map) != 1)
 427                 goto out;
 428
 429         tgt = dm_table_get_target(map, 0);
 430
 431         if (dm_suspended_md(md)) {
 432                 r = -EAGAIN;
 433                 goto out;
 434         }
 435
 436         if (tgt->type->ioctl)
 437                 r = tgt->type->ioctl(tgt, cmd, arg);
 438
 439 out:
 440         dm_table_put(map);
 441
 442         return r;
 443 }
 444
/* Allocate a dm_io from md->io_pool using GFP_NOIO. */
static struct dm_io *alloc_io(struct mapped_device *md)
{
        return mempool_alloc(md->io_pool, GFP_NOIO);
}
 449
/* Return @io to md->io_pool. */
static void free_io(struct mapped_device *md, struct dm_io *io)
{
        mempool_free(io, md->io_pool);
}
 454
/* Return @tio to md->tio_pool. */
static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
        mempool_free(tio, md->tio_pool);
}
 459
/*
 * Allocate a dm_rq_target_io from md->tio_pool with the caller's gfp mask.
 * NOTE(review): this is the same pool field free_tio() returns dm_target_io
 * objects to; presumably a device is either bio-based or request-based so
 * the pool holds only one object type -- confirm against the pool setup.
 */
static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
                                            gfp_t gfp_mask)
{
        return mempool_alloc(md->tio_pool, gfp_mask);
}
 465
/* Return @tio to the tio_pool of the device recorded in tio->md. */
static void free_rq_tio(struct dm_rq_target_io *tio)
{
        mempool_free(tio, tio->md->tio_pool);
}
 470
/*
 * Allocate a dm_rq_clone_bio_info from md->io_pool using GFP_ATOMIC.
 * NOTE(review): request-based code reuses the io_pool field that alloc_io()
 * draws dm_io objects from; confirm the pool is created with the right
 * object size for request-based devices.
 */
static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
{
        return mempool_alloc(md->io_pool, GFP_ATOMIC);
}
 475
/* Return @info to the io_pool of the device reached through info->tio->md. */
static void free_bio_info(struct dm_rq_clone_bio_info *info)
{
        mempool_free(info, info->tio->md->io_pool);
}
 480
 481 static int md_in_flight(struct mapped_device *md)
 482 {
 483         return atomic_read(&md->pending[READ]) +
 484                atomic_read(&md->pending[WRITE]);
 485 }
 486
 487 static void start_io_acct(struct dm_io *io)
 488 {
 489         struct mapped_device *md = io->md;
 490         int cpu;
 491         int rw = bio_data_dir(io->bio);
 492
 493         io->start_time = jiffies;
 494
 495         cpu = part_stat_lock();
 496         part_round_stats(cpu, &dm_disk(md)->part0);
 497         part_stat_unlock();
 498         dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
 499 }
 500
 501 static void end_io_acct(struct dm_io *io)
 502 {
 503         struct mapped_device *md = io->md;
 504         struct bio *bio = io->bio;
 505         unsigned long duration = jiffies - io->start_time;
 506         int pending, cpu;
 507         int rw = bio_data_dir(bio);
 508
 509         cpu = part_stat_lock();
 510         part_round_stats(cpu, &dm_disk(md)->part0);
 511         part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 512         part_stat_unlock();
 513
 514         /*
 515          * After this is decremented the bio must not be touched if it is
 516          * a flush.
 517          */
 518         dm_disk(md)->part0.in_flight[rw] = pending =
 519                 atomic_dec_return(&md->pending[rw]);
 520         pending += atomic_read(&md->pending[rw^0x1]);
 521
 522         /* nudge anyone waiting on suspend queue */
 523         if (!pending)
 524                 wake_up(&md->wait);
 525 }
 526
/*
 * Add the bio to the list of deferred io.
 *
 * Runs with md->io_lock held for writing.  The list itself is modified
 * under deferred_lock.  md->work is queued on md->wq only when
 * DMF_QUEUE_IO_TO_THREAD was not already set.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
        down_write(&md->io_lock);

        spin_lock_irq(&md->deferred_lock);
        bio_list_add(&md->deferred, bio);
        spin_unlock_irq(&md->deferred_lock);

        if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
                queue_work(md->wq, &md->work);

        up_write(&md->io_lock);
}
 543
 544 /*
 545  * Everyone (including functions in this file), should use this
 546  * function to access the md->map field, and make sure they call
 547  * dm_table_put() when finished.
 548  */
 549 struct dm_table *dm_get_live_table(struct mapped_device *md)
 550 {
 551         struct dm_table *t;
 552         unsigned long flags;
 553
 554         read_lock_irqsave(&md->map_lock, flags);
 555         t = md->map;
 556         if (t)
 557                 dm_table_get(t);
 558         read_unlock_irqrestore(&md->map_lock, flags);
 559
 560         return t;
 561 }
 562
/*
 * Get the geometry associated with a dm device
 *
 * Copies md->geometry into *geo.  Always returns 0.
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
        *geo = md->geometry;

        return 0;
}
 572
 573 /*
 574  * Set the geometry of a device.
 575  */
 576 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 577 {
 578         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 579
 580         if (geo->start > sz) {
 581                 DMWARN("Start sector is beyond the geometry limits.");
 582                 return -EINVAL;
 583         }
 584
 585         md->geometry = *geo;
 586
 587         return 0;
 588 }
 589
 590 /*-----------------------------------------------------------------
 591  * CRUD START:
 592  *   A more elegant soln is in the works that uses the queue
 593  *   merge fn, unfortunately there are a couple of changes to
 594  *   the block layer that I want to make for this.  So in the
 595  *   interests of getting something for people to use I give
 596  *   you this clearly demarcated crap.
 597  *---------------------------------------------------------------*/
 598
/* Return non-zero when DMF_NOFLUSH_SUSPENDING is set in md->flags. */
static int __noflush_suspending(struct mapped_device *md)
{
        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
 603
 604 /*
 605  * Decrements the number of outstanding ios that a bio has been
 606  * cloned into, completing the original io if necc.
 607  */
 608 static void dec_pending(struct dm_io *io, int error)
 609 {
 610         unsigned long flags;
 611         int io_error;
 612         struct bio *bio;
 613         struct mapped_device *md = io->md;
 614
 615         /* Push-back supersedes any I/O errors */
 616         if (unlikely(error)) {
 617                 spin_lock_irqsave(&io->endio_lock, flags);
 618                 if (!(io->error > 0 && __noflush_suspending(md)))
 619                         io->error = error;
 620                 spin_unlock_irqrestore(&io->endio_lock, flags);
 621         }
 622
 623         if (atomic_dec_and_test(&io->io_count)) {
 624                 if (io->error == DM_ENDIO_REQUEUE) {
 625                         /*
 626                          * Target requested pushing back the I/O.
 627                          */
 628                         spin_lock_irqsave(&md->deferred_lock, flags);
 629                         if (__noflush_suspending(md)) {
 630                                 if (!(io->bio->bi_rw & REQ_FLUSH))
 631                                         bio_list_add_head(&md->deferred,
 632                                                           io->bio);
 633                         } else
 634                                 /* noflush suspend was interrupted. */
 635                                 io->error = -EIO;
 636                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 637                 }
 638
 639                 io_error = io->error;
 640                 bio = io->bio;
 641
 642                 if (bio->bi_rw & REQ_FLUSH) {
 643                         /*
 644                          * There can be just one flush request so we use
 645                          * a per-device variable for error reporting.
 646                          * Note that you can't touch the bio after end_io_acct
 647                          */
 648                         if (!md->flush_error)
 649                                 md->flush_error = io_error;
 650                         end_io_acct(io);
 651                         free_io(md, io);
 652                 } else {
 653                         end_io_acct(io);
 654                         free_io(md, io);
 655
 656                         if (io_error != DM_ENDIO_REQUEUE) {
 657                                 trace_block_bio_complete(md->queue, bio);
 658
 659                                 bio_endio(bio, io_error);
 660                         }
 661                 }
 662         }
 663 }
 664
 665 static void clone_endio(struct bio *bio, int error)
 666 {
 667         int r = 0;
 668         struct dm_target_io *tio = bio->bi_private;
 669         struct dm_io *io = tio->io;
 670         struct mapped_device *md = tio->io->md;
 671         dm_endio_fn endio = tio->ti->type->end_io;
 672
 673         if (!bio_flagged(bio, BIO_UPTODATE) && !error)
 674                 error = -EIO;
 675
 676         if (endio) {
 677                 r = endio(tio->ti, bio, error, &tio->info);
 678                 if (r < 0 || r == DM_ENDIO_REQUEUE)
 679                         /*
 680                          * error and requeue request are handled
 681                          * in dec_pending().
 682                          */
 683                         error = r;
 684                 else if (r == DM_ENDIO_INCOMPLETE)
 685                         /* The target will handle the io */
 686                         return;
 687                 else if (r) {
 688                         DMWARN("unimplemented target endio return value: %d", r);
 689                         BUG();
 690                 }
 691         }
 692
 693         /*
 694          * Store md for cleanup instead of tio which is about to get freed.
 695          */
 696         bio->bi_private = md->bs;
 697
 698         free_tio(md, tio);
 699         bio_put(bio);
 700         dec_pending(io, error);
 701 }
 702
 703 /*
 704  * Partial completion handling for request-based dm
 705  */
 706 static void end_clone_bio(struct bio *clone, int error)
 707 {
 708         struct dm_rq_clone_bio_info *info = clone->bi_private;
 709         struct dm_rq_target_io *tio = info->tio;
 710         struct bio *bio = info->orig;
 711         unsigned int nr_bytes = info->orig->bi_size;
 712
 713         bio_put(clone);
 714
 715         if (tio->error)
 716                 /*
 717                  * An error has already been detected on the request.
 718                  * Once error occurred, just let clone->end_io() handle
 719                  * the remainder.
 720                  */
 721                 return;
 722         else if (error) {
 723                 /*
 724                  * Don't notice the error to the upper layer yet.
 725                  * The error handling decision is made by the target driver,
 726                  * when the request is completed.
 727                  */
 728                 tio->error = error;
 729                 return;
 730         }
 731
 732         /*
 733          * I/O for the bio successfully completed.
 734          * Notice the data completion to the upper layer.
 735          */
 736
 737         /*
 738          * bios are processed from the head of the list.
 739          * So the completing bio should always be rq->bio.
 740          * If it's not, something wrong is happening.
 741          */
 742         if (tio->orig->bio != bio)
 743                 DMERR("bio completion is going in the middle of the request");
 744
 745         /*
 746          * Update the original request.
 747          * Do not use blk_end_request() here, because it may complete
 748          * the original request before the clone, and break the ordering.
 749          */
 750         blk_update_request(tio->orig, 0, nr_bytes);
 751 }
 752
 753 static void store_barrier_error(struct mapped_device *md, int error)
 754 {
 755         unsigned long flags;
 756
 757         spin_lock_irqsave(&md->barrier_error_lock, flags);
 758         /*
 759          * Basically, the first error is taken, but:
 760          *   -EOPNOTSUPP supersedes any I/O error.
 761          *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
 762          */
 763         if (!md->barrier_error || error == -EOPNOTSUPP ||
 764             (md->barrier_error != -EOPNOTSUPP &&
 765              error == DM_ENDIO_REQUEUE))
 766                 md->barrier_error = error;
 767         spin_unlock_irqrestore(&md->barrier_error_lock, flags);
 768 }
 769
/*
 * Account the completion of one request on @md.
 *
 * Decrements pending[@rw], wakes md->wait when nothing is in flight any
 * more, optionally re-runs the queue (@run_queue non-zero) and finally
 * drops a reference on the md.
 *
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Or do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
{
        atomic_dec(&md->pending[rw]);

        /* nudge anyone waiting on suspend queue */
        if (!md_in_flight(md))
                wake_up(&md->wait);

        if (run_queue)
                blk_run_queue(md->queue);

        /*
         * dm_put() must be at the end of this function. See the comment above
         */
        dm_put(md);
}
 791
/*
 * Undo the clone preparation with blk_rq_unprep_clone() and return the
 * dm_rq_target_io found in clone->end_io_data to its pool.
 */
static void free_rq_clone(struct request *clone)
{
        struct dm_rq_target_io *tio = clone->end_io_data;

        blk_rq_unprep_clone(clone);
        free_rq_tio(tio);
}
 799
 800 /*
 801  * Complete the clone and the original request.
 802  * Must be called without queue lock.
 803  */
 804 static void dm_end_request(struct request *clone, int error)
 805 {
 806         int rw = rq_data_dir(clone);
 807         int run_queue = 1;
 808         bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
 809         struct dm_rq_target_io *tio = clone->end_io_data;
 810         struct mapped_device *md = tio->md;
 811         struct request *rq = tio->orig;
 812
 813         if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
 814                 rq->errors = clone->errors;
 815                 rq->resid_len = clone->resid_len;
 816
 817                 if (rq->sense)
 818                         /*
 819                          * We are using the sense buffer of the original
 820                          * request.
 821                          * So setting the length of the sense data is enough.
 822                          */
 823                         rq->sense_len = clone->sense_len;
 824         }
 825
 826         free_rq_clone(clone);
 827
 828         if (unlikely(is_barrier)) {
 829                 if (unlikely(error))
 830                         store_barrier_error(md, error);
 831                 run_queue = 0;
 832         } else
 833                 blk_end_request_all(rq, error);
 834
 835         rq_completed(md, rw, run_queue);
 836 }
 837
/*
 * Detach and free the clone hanging off rq->special, and clear
 * REQ_DONTPREP on the original request.
 */
static void dm_unprep_request(struct request *rq)
{
        struct request *clone = rq->special;

        rq->special = NULL;
        rq->cmd_flags &= ~REQ_DONTPREP;

        free_rq_clone(clone);
}
 847
 848 /*
 849  * Requeue the original request of a clone.
 850  */
 851 void dm_requeue_unmapped_request(struct request *clone)
 852 {
 853         int rw = rq_data_dir(clone);
 854         struct dm_rq_target_io *tio = clone->end_io_data;
 855         struct mapped_device *md = tio->md;
 856         struct request *rq = tio->orig;
 857         struct request_queue *q = rq->q;
 858         unsigned long flags;
 859
 860         if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
 861                 /*
 862                  * Barrier clones share an original request.
 863                  * Leave it to dm_end_request(), which handles this special
 864                  * case.
 865                  */
 866                 dm_end_request(clone, DM_ENDIO_REQUEUE);
 867                 return;
 868         }
 869
 870         dm_unprep_request(rq);
 871
 872         spin_lock_irqsave(q->queue_lock, flags);
 873         if (elv_queue_empty(q))
 874                 blk_plug_device(q);
 875         blk_requeue_request(q, rq);
 876         spin_unlock_irqrestore(q->queue_lock, flags);
 877
 878         rq_completed(md, rw, 0);
 879 }
 880 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
 881
/* Stop @q; stop_queue() calls this with q->queue_lock held. */
static void __stop_queue(struct request_queue *q)
{
        blk_stop_queue(q);
}
 886
/* Stop @q, taking q->queue_lock (irq-safe) around __stop_queue(). */
static void stop_queue(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        __stop_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
}
 895
/*
 * Restart @q if it is currently stopped; start_queue() calls this with
 * q->queue_lock held.
 */
static void __start_queue(struct request_queue *q)
{
        if (blk_queue_stopped(q))
                blk_start_queue(q);
}
 901
/* Restart @q, taking q->queue_lock (irq-safe) around __start_queue(). */
static void start_queue(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(q->queue_lock, flags);
        __start_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
}
 910
 911 static void dm_done(struct request *clone, int error, bool mapped)
 912 {
 913         int r = error;
 914         struct dm_rq_target_io *tio = clone->end_io_data;
 915         dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
 916
 917         if (mapped && rq_end_io)
 918                 r = rq_end_io(tio->ti, clone, error, &tio->info);
 919
 920         if (r <= 0)
 921                 /* The target wants to complete the I/O */
 922                 dm_end_request(clone, r);
 923         else if (r == DM_ENDIO_INCOMPLETE)
 924                 /* The target will handle the I/O */
 925                 return;
 926         else if (r == DM_ENDIO_REQUEUE)
 927                 /* The target wants to requeue the I/O */
 928                 dm_requeue_unmapped_request(clone);
 929         else {
 930                 DMWARN("unimplemented target endio return value: %d", r);
 931                 BUG();
 932         }
 933 }
 934
 935 /*
 936  * Request completion handler for request-based dm
 937  */
 938 static void dm_softirq_done(struct request *rq)
 939 {
 940         bool mapped = true;
 941         struct request *clone = rq->completion_data;
 942         struct dm_rq_target_io *tio = clone->end_io_data;
 943
 944         if (rq->cmd_flags & REQ_FAILED)
 945                 mapped = false;
 946
 947         dm_done(clone, tio->error, mapped);
 948 }
 949
 950 /*
 951  * Complete the clone and the original request with the error status
 952  * through softirq context.
 953  */
 954 static void dm_complete_request(struct request *clone, int error)
 955 {
 956         struct dm_rq_target_io *tio = clone->end_io_data;
 957         struct request *rq = tio->orig;
 958
 959         if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
 960                 /*
 961                  * Barrier clones share an original request.  So can't use
 962                  * softirq_done with the original.
 963                  * Pass the clone to dm_done() directly in this special case.
 964                  * It is safe (even if clone->q->queue_lock is held here)
 965                  * because there is no I/O dispatching during the completion
 966                  * of barrier clone.
 967                  */
 968                 dm_done(clone, error, true);
 969                 return;
 970         }
 971
 972         tio->error = error;
 973         rq->completion_data = clone;
 974         blk_complete_request(rq);
 975 }
 976
 977 /*
 978  * Complete the not-mapped clone and the original request with the error status
 979  * through softirq context.
 980  * Target's rq_end_io() function isn't called.
 981  * This may be used when the target's map_rq() function fails.
 982  */
 983 void dm_kill_unmapped_request(struct request *clone, int error)
 984 {
 985         struct dm_rq_target_io *tio = clone->end_io_data;
 986         struct request *rq = tio->orig;
 987
 988         if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
 989                 /*
 990                  * Barrier clones share an original request.
 991                  * Leave it to dm_end_request(), which handles this special
 992                  * case.
 993                  */
 994                 BUG_ON(error > 0);
 995                 dm_end_request(clone, error);
 996                 return;
 997         }
 998
 999         rq->cmd_flags |= REQ_FAILED;
1000         dm_complete_request(clone, error);
1001 }
1002 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
1003
/*
 * Completion entry point for a clone request: release the clone's queue
 * bookkeeping, then hand the completion to dm_complete_request().
 *
 * Called with the queue lock held
 */
static void end_clone_request(struct request *clone, int error)
{
        /*
         * For just cleaning up the information of the queue in which
         * the clone was dispatched.
         * The clone is *NOT* freed actually here because it is alloced from
         * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
         */
        __blk_put_request(clone->q, clone);

        /*
         * Actual request completion is done in a softirq context which doesn't
         * hold the queue lock.  Otherwise, deadlock could occur because:
         *     - another request may be submitted by the upper level driver
         *       of the stacking during the completion
         *     - the submission which requires queue lock may be done
         *       against this queue
         */
        dm_complete_request(clone, error);
}
1027
1028 /*
1029  * Return maximum size of I/O possible at the supplied sector up to the current
1030  * target boundary.
1031  */
1032 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1033 {
1034         sector_t target_offset = dm_target_offset(ti, sector);
1035
1036         return ti->len - target_offset;
1037 }
1038
1039 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1040 {
1041         sector_t len = max_io_len_target_boundary(sector, ti);
1042
1043         /*
1044          * Does the target need to split even further ?
1045          */
1046         if (ti->split_io) {
1047                 sector_t boundary;
1048                 sector_t offset = dm_target_offset(ti, sector);
1049                 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
1050                            - offset;
1051                 if (len > boundary)
1052                         len = boundary;
1053         }
1054
1055         return len;
1056 }
1057
/*
 * Hand one clone bio to the target's map function and act on the result.
 *
 * @ti:    target the clone is mapped by
 * @clone: clone bio to map; its bi_end_io/bi_private are set up here
 * @tio:   per-clone bookkeeping, linking the clone to its dm_io and target
 *
 * Return values of ->map() handled here:
 *   0                   - nothing more is done here
 *   DM_MAPIO_REMAPPED   - the clone is submitted with generic_make_request()
 *   < 0 or DM_MAPIO_REQUEUE - the reference taken below is dropped with that
 *                         value and the clone and tio are released
 *   anything else       - treated as a bug
 */
static void __map_bio(struct dm_target *ti, struct bio *clone,
                      struct dm_target_io *tio)
{
        int r;
        sector_t sector;
        struct mapped_device *md;

        clone->bi_end_io = clone_endio;
        clone->bi_private = tio;

        /*
         * Map the clone.  If r == 0 we don't need to do
         * anything, the target has assumed ownership of
         * this io.
         */
        /* One io_count reference per clone handed to a target. */
        atomic_inc(&tio->io->io_count);
        /* Remember the pre-map sector; it is reported to the remap tracepoint. */
        sector = clone->bi_sector;
        r = ti->type->map(ti, clone, &tio->info);
        if (r == DM_MAPIO_REMAPPED) {
                /* the bio has been remapped so dispatch it */

                trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
                                    tio->io->bio->bi_bdev->bd_dev, sector);

                generic_make_request(clone);
        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
                /* error the io and bail out, or requeue it if needed */
                /*
                 * md is fetched before dec_pending() so that tio->io is not
                 * touched afterwards.
                 * NOTE(review): presumably dec_pending() may release the
                 * dm_io when the count drops to zero - confirm.
                 */
                md = tio->io->md;
                dec_pending(tio->io, r);
                /*
                 * Store bio_set for cleanup.
                 */
                clone->bi_private = md->bs;
                bio_put(clone);
                free_tio(md, tio);
        } else if (r) {
                DMWARN("unimplemented target map return value: %d", r);
                BUG();
        }
}
1098
/*
 * Progress state for splitting one bio into clones.
 * Filled in by __split_and_process_bio() and advanced by the
 * __clone_and_map*() helpers.
 */
struct clone_info {
        struct mapped_device *md;       /* device the bio was submitted to */
        struct dm_table *map;           /* live table used to look up targets */
        struct bio *bio;                /* the bio being split */
        struct dm_io *io;               /* per-bio dm_io the clones refer back to */
        sector_t sector;                /* next sector still to be mapped */
        sector_t sector_count;          /* sectors still to be mapped; 0 ends the loop */
        unsigned short idx;             /* index of the next bvec in bio->bi_io_vec */
};
1108
1109 static void dm_bio_destructor(struct bio *bio)
1110 {
1111         struct bio_set *bs = bio->bi_private;
1112
1113         bio_free(bio, bs);
1114 }
1115
/*
 * Creates a little bio that just does part of a bvec.
 *
 * @bio:    bio that owns the bvec being split
 * @sector: start sector for the new clone
 * @idx:    index of the bvec in bio->bi_io_vec
 * @offset: value stored as bv_offset of the clone's single bvec
 * @len:    length of the clone in sectors
 * @bs:     bio_set to allocate the clone from
 *
 * Returns the new single-bvec clone.
 * NOTE(review): the result of bio_alloc_bioset() is used unchecked -
 * presumably it cannot fail for this bio_set with GFP_NOIO; confirm.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
                              unsigned short idx, unsigned int offset,
                              unsigned int len, struct bio_set *bs)
{
        struct bio *clone;
        struct bio_vec *bv = bio->bi_io_vec + idx;

        clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
        clone->bi_destructor = dm_bio_destructor;
        /* Start from a copy of the bvec; offset and length are narrowed below. */
        *clone->bi_io_vec = *bv;

        clone->bi_sector = sector;
        clone->bi_bdev = bio->bi_bdev;
        clone->bi_rw = bio->bi_rw;
        clone->bi_vcnt = 1;
        clone->bi_size = to_bytes(len);
        clone->bi_io_vec->bv_offset = offset;
        clone->bi_io_vec->bv_len = clone->bi_size;
        clone->bi_flags |= 1 << BIO_CLONED;

        if (bio_integrity(bio)) {
                /* Copy the integrity data and trim it to the piece covered here. */
                bio_integrity_clone(clone, bio, GFP_NOIO, bs);
                bio_integrity_trim(clone,
                                   bio_sector_offset(bio, idx, offset), len);
        }

        return clone;
}
1147
/*
 * Creates a bio that consists of range of complete bvecs.
 *
 * @bio:      bio to clone from
 * @sector:   start sector for the clone
 * @idx:      index of the first bvec covered by the clone
 * @bv_count: number of bvecs covered, starting at @idx
 * @len:      length of the clone in sectors
 * @bs:       bio_set to allocate the clone from
 *
 * Returns the new clone.
 * NOTE(review): the result of bio_alloc_bioset() is used unchecked -
 * presumably it cannot fail for this bio_set with GFP_NOIO; confirm.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
                             unsigned short idx, unsigned short bv_count,
                             unsigned int len, struct bio_set *bs)
{
        struct bio *clone;

        clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
        __bio_clone(clone, bio);
        clone->bi_destructor = dm_bio_destructor;
        /* Narrow the clone to bvecs [idx, idx + bv_count) and len sectors. */
        clone->bi_sector = sector;
        clone->bi_idx = idx;
        clone->bi_vcnt = idx + bv_count;
        clone->bi_size = to_bytes(len);
        clone->bi_flags &= ~(1 << BIO_SEG_VALID);

        if (bio_integrity(bio)) {
                bio_integrity_clone(clone, bio, GFP_NOIO, bs);

                /* Trim only when the clone covers less than the whole bio. */
                if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
                        bio_integrity_trim(clone,
                                           bio_sector_offset(bio, idx, 0), len);
        }

        return clone;
}
1176
1177 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1178                                       struct dm_target *ti)
1179 {
1180         struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1181
1182         tio->io = ci->io;
1183         tio->ti = ti;
1184         memset(&tio->info, 0, sizeof(tio->info));
1185
1186         return tio;
1187 }
1188
/*
 * Clone ci->bio once for target @ti and map the clone.
 *
 * @ci:         split state; supplies the bio, bio_set and start sector
 * @ti:         target that receives the clone
 * @request_nr: stored in tio->info.target_request_nr for the target
 * @len:        if non-zero, the clone is set to start at ci->sector and
 *              span @len sectors; if zero, bi_sector and bi_size are left
 *              as __bio_clone() set them
 */
static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
                                   unsigned request_nr, sector_t len)
{
        struct dm_target_io *tio = alloc_tio(ci, ti);
        struct bio *clone;

        tio->info.target_request_nr = request_nr;

        /*
         * Discard requests require the bio's inline iovecs be initialized.
         * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
         * and discard, so no need for concern about wasted bvec allocations.
         */
        clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
        __bio_clone(clone, ci->bio);
        clone->bi_destructor = dm_bio_destructor;
        if (len) {
                clone->bi_sector = ci->sector;
                clone->bi_size = to_bytes(len);
        }

        __map_bio(ti, clone, tio);
}
1212
1213 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1214                                     unsigned num_requests, sector_t len)
1215 {
1216         unsigned request_nr;
1217
1218         for (request_nr = 0; request_nr < num_requests; request_nr++)
1219                 __issue_target_request(ci, ti, request_nr, len);
1220 }
1221
1222 static int __clone_and_map_flush(struct clone_info *ci)
1223 {
1224         unsigned target_nr = 0;
1225         struct dm_target *ti;
1226
1227         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1228                 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1229
1230         ci->sector_count = 0;
1231
1232         return 0;
1233 }
1234
1235 /*
1236  * Perform all io with a single clone.
1237  */
1238 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1239 {
1240         struct bio *clone, *bio = ci->bio;
1241         struct dm_target_io *tio;
1242
1243         tio = alloc_tio(ci, ti);
1244         clone = clone_bio(bio, ci->sector, ci->idx,
1245                           bio->bi_vcnt - ci->idx, ci->sector_count,
1246                           ci->md->bs);
1247         __map_bio(ti, clone, tio);
1248         ci->sector_count = 0;
1249 }
1250
1251 static int __clone_and_map_discard(struct clone_info *ci)
1252 {
1253         struct dm_target *ti;
1254         sector_t len;
1255
1256         do {
1257                 ti = dm_table_find_target(ci->map, ci->sector);
1258                 if (!dm_target_is_valid(ti))
1259                         return -EIO;
1260
1261                 /*
1262                  * Even though the device advertised discard support,
1263                  * reconfiguration might have changed that since the
1264                  * check was performed.
1265                  */
1266                 if (!ti->num_discard_requests)
1267                         return -EOPNOTSUPP;
1268
1269                 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1270
1271                 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1272
1273                 ci->sector += len;
1274         } while (ci->sector_count -= len);
1275
1276         return 0;
1277 }
1278
/*
 * Map the next piece of ci->bio and advance ci->sector, ci->sector_count
 * and ci->idx accordingly.  The caller loops until sector_count reaches 0.
 *
 * Three cases, tried in order:
 *   1. everything left fits within the target's limit: one clone for it all
 *   2. at least the current bvec fits: one clone for as many whole bvecs
 *      as fit
 *   3. the current bvec itself is too large: split that single bvec into
 *      several clones, looking up the target again for each later piece
 *
 * Discard bios are handed to __clone_and_map_discard() instead.
 *
 * Returns 0 on success or -EIO when no valid target covers a sector
 * (or whatever __clone_and_map_discard() returns for discards).
 */
static int __clone_and_map(struct clone_info *ci)
{
        struct bio *clone, *bio = ci->bio;
        struct dm_target *ti;
        sector_t len = 0, max;
        struct dm_target_io *tio;

        if (unlikely(bio->bi_rw & REQ_DISCARD))
                return __clone_and_map_discard(ci);

        ti = dm_table_find_target(ci->map, ci->sector);
        if (!dm_target_is_valid(ti))
                return -EIO;

        /* Largest I/O the target accepts starting at ci->sector. */
        max = max_io_len(ci->sector, ti);

        if (ci->sector_count <= max) {
                /*
                 * Optimise for the simple case where we can do all of
                 * the remaining io with a single clone.
                 */
                __clone_and_map_simple(ci, ti);

        } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
                /*
                 * There are some bvecs that don't span targets.
                 * Do as many of these as possible.
                 */
                int i;
                sector_t remaining = max;
                sector_t bv_len;

                /* Count whole bvecs, starting at ci->idx, that fit in max. */
                for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
                        bv_len = to_sector(bio->bi_io_vec[i].bv_len);

                        if (bv_len > remaining)
                                break;

                        remaining -= bv_len;
                        len += bv_len;
                }

                tio = alloc_tio(ci, ti);
                clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
                                  ci->md->bs);
                __map_bio(ti, clone, tio);

                ci->sector += len;
                ci->sector_count -= len;
                ci->idx = i;

        } else {
                /*
                 * Handle a bvec that must be split between two or more targets.
                 */
                struct bio_vec *bv = bio->bi_io_vec + ci->idx;
                sector_t remaining = to_sector(bv->bv_len);
                unsigned int offset = 0;

                do {
                        /*
                         * The first piece uses ti/max computed above; later
                         * pieces start at a new sector and need a fresh
                         * lookup.
                         */
                        if (offset) {
                                ti = dm_table_find_target(ci->map, ci->sector);
                                if (!dm_target_is_valid(ti))
                                        return -EIO;

                                max = max_io_len(ci->sector, ti);
                        }

                        len = min(remaining, max);

                        tio = alloc_tio(ci, ti);
                        clone = split_bvec(bio, ci->sector, ci->idx,
                                           bv->bv_offset + offset, len,
                                           ci->md->bs);

                        __map_bio(ti, clone, tio);

                        ci->sector += len;
                        ci->sector_count -= len;
                        offset += to_bytes(len);
                } while (remaining -= len);

                /* This bvec is fully consumed; continue with the next one. */
                ci->idx++;
        }

        return 0;
}
1366
/*
 * Split the bio into several clones and submit it to targets.
 *
 * Sets up a dm_io and a clone_info for @bio, then repeatedly calls
 * __clone_and_map() (or __clone_and_map_flush() for REQ_FLUSH bios) until
 * nothing is left to map or an error is returned.
 *
 * If the device has no live table, a non-flush bio is failed with
 * bio_io_error(); for a flush bio -EIO is recorded in md->flush_error
 * unless an error is already recorded there.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
        struct clone_info ci;
        int error = 0;

        ci.map = dm_get_live_table(md);
        if (unlikely(!ci.map)) {
                if (!(bio->bi_rw & REQ_FLUSH))
                        bio_io_error(bio);
                else
                        if (!md->flush_error)
                                md->flush_error = -EIO;
                return;
        }

        ci.md = md;
        ci.bio = bio;
        ci.io = alloc_io(md);
        ci.io->error = 0;
        /* Start at 1: an extra reference held until all clones are issued. */
        atomic_set(&ci.io->io_count, 1);
        ci.io->bio = bio;
        ci.io->md = md;
        spin_lock_init(&ci.io->endio_lock);
        ci.sector = bio->bi_sector;
        if (!(bio->bi_rw & REQ_FLUSH))
                ci.sector_count = bio_sectors(bio);
        else {
                /* all FLUSH bio's reaching here should be empty */
                WARN_ON_ONCE(bio_has_data(bio));
                /* Non-zero so the loop below runs; the flush helper zeroes it. */
                ci.sector_count = 1;
        }
        ci.idx = bio->bi_idx;

        start_io_acct(ci.io);
        while (ci.sector_count && !error) {
                if (!(bio->bi_rw & REQ_FLUSH))
                        error = __clone_and_map(&ci);
                else
                        error = __clone_and_map_flush(&ci);
        }

        /* drop the extra reference count */
        dec_pending(ci.io, error);
        dm_table_put(ci.map);
}
1415 /*-----------------------------------------------------------------
1416  * CRUD END
1417  *---------------------------------------------------------------*/
1418
/*
 * merge_bvec callback of the dm queue (installed by dm_init_md_queue()).
 *
 * @q:      the dm device's request queue
 * @bvm:    describes the bio being built (start sector, current size)
 * @biovec: the bvec that is about to be added
 *
 * Returns the number of bytes that may be added at this position without
 * the bio needing to be split.  When the bio being built is still shorter
 * than one sector, at least biovec->bv_len is returned.
 */
static int dm_merge_bvec(struct request_queue *q,
                         struct bvec_merge_data *bvm,
                         struct bio_vec *biovec)
{
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
        sector_t max_sectors;
        int max_size = 0;

        /* No live table: fall through to the first-page rule with max_size 0. */
        if (unlikely(!map))
                goto out;

        ti = dm_table_find_target(map, bvm->bi_sector);
        if (!dm_target_is_valid(ti))
                goto out_table;

        /*
         * Find maximum amount of I/O that won't need splitting
         */
        max_sectors = min(max_io_len(bvm->bi_sector, ti),
                          (sector_t) BIO_MAX_SECTORS);
        /* Bytes still available after what the bio already holds. */
        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
        if (max_size < 0)
                max_size = 0;

        /*
         * merge_bvec_fn() returns number of bytes
         * it can accept at this offset
         * max is precomputed maximal io size
         */
        if (max_size && ti->type->merge)
                max_size = ti->type->merge(ti, bvm, biovec, max_size);
        /*
         * If the target doesn't support merge method and some of the devices
         * provided their merge_bvec method (we know this by looking at
         * queue_max_hw_sectors), then we can't allow bios with multiple vector
         * entries.  So always set max_size to 0, and the code below allows
         * just one page.
         */
        /*
         * Note: this else branch is also taken when max_size is already 0,
         * in which case the assignment changes nothing.
         */
        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)

                max_size = 0;

out_table:
        dm_table_put(map);

out:
        /*
         * Always allow an entire first page
         */
        if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
                max_size = biovec->bv_len;

        return max_size;
}
1475
/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 *
 * Bio-based entry point: accounts the bio in the disk statistics, then
 * either queues it for later processing or splits and maps it right away.
 * Always returns 0.
 */
static int _dm_request(struct request_queue *q, struct bio *bio)
{
        int rw = bio_data_dir(bio);
        struct mapped_device *md = q->queuedata;
        int cpu;

        /* Held for reading across the flag test and the immediate mapping. */
        down_read(&md->io_lock);

        cpu = part_stat_lock();
        part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
        part_stat_unlock();

        /*
         * If we're suspended or the thread is processing flushes
         * we have to queue this io for later.
         */
        if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
            (bio->bi_rw & REQ_FLUSH)) {
                up_read(&md->io_lock);

                /* Read-ahead during suspend is failed rather than queued. */
                if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
                    bio_rw(bio) == READA) {
                        bio_io_error(bio);
                        return 0;
                }

                queue_io(md, bio);

                return 0;
        }

        __split_and_process_bio(md, bio);
        up_read(&md->io_lock);
        return 0;
}
1516
1517 static int dm_make_request(struct request_queue *q, struct bio *bio)
1518 {
1519         struct mapped_device *md = q->queuedata;
1520
1521         return md->saved_make_request_fn(q, bio); /* call __make_request() */
1522 }
1523
/*
 * Non-zero if @md is a request-based device, which is decided by whether
 * its queue is marked stackable.
 */
static int dm_request_based(struct mapped_device *md)
{
        return blk_queue_stackable(md->queue);
}
1528
1529 static int dm_request(struct request_queue *q, struct bio *bio)
1530 {
1531         struct mapped_device *md = q->queuedata;
1532
1533         if (dm_request_based(md))
1534                 return dm_make_request(q, bio);
1535
1536         return _dm_request(q, bio);
1537 }
1538
1539 static bool dm_rq_is_flush_request(struct request *rq)
1540 {
1541         if (rq->cmd_flags & REQ_FLUSH)
1542                 return true;
1543         else
1544                 return false;
1545 }
1546
1547 void dm_dispatch_request(struct request *rq)
1548 {
1549         int r;
1550
1551         if (blk_queue_io_stat(rq->q))
1552                 rq->cmd_flags |= REQ_IO_STAT;
1553
1554         rq->start_time = jiffies;
1555         r = blk_insert_cloned_request(rq->q, rq);
1556         if (r)
1557                 dm_complete_request(rq, r);
1558 }
1559 EXPORT_SYMBOL_GPL(dm_dispatch_request);
1560
/*
 * bi_destructor for bios of request-based clones (installed by
 * dm_rq_bio_constructor()): releases the per-bio info stored in
 * bi_private and frees the bio back to the device's bio_set.
 */
static void dm_rq_bio_destructor(struct bio *bio)
{
        struct dm_rq_clone_bio_info *info = bio->bi_private;
        /* Fetched before info is freed; md is still needed afterwards. */
        struct mapped_device *md = info->tio->md;

        free_bio_info(info);
        bio_free(bio, md->bs);
}
1569
1570 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1571                                  void *data)
1572 {
1573         struct dm_rq_target_io *tio = data;
1574         struct mapped_device *md = tio->md;
1575         struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1576
1577         if (!info)
1578                 return -ENOMEM;
1579
1580         info->orig = bio_orig;
1581         info->tio = tio;
1582         bio->bi_end_io = end_clone_bio;
1583         bio->bi_private = info;
1584         bio->bi_destructor = dm_rq_bio_destructor;
1585
1586         return 0;
1587 }
1588
1589 static int setup_clone(struct request *clone, struct request *rq,
1590                        struct dm_rq_target_io *tio)
1591 {
1592         int r;
1593
1594         if (dm_rq_is_flush_request(rq)) {
1595                 blk_rq_init(NULL, clone);
1596                 clone->cmd_type = REQ_TYPE_FS;
1597                 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
1598         } else {
1599                 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1600                                       dm_rq_bio_constructor, tio);
1601                 if (r)
1602                         return r;
1603
1604                 clone->cmd = rq->cmd;
1605                 clone->cmd_len = rq->cmd_len;
1606                 clone->sense = rq->sense;
1607                 clone->buffer = rq->buffer;
1608         }
1609
1610         clone->end_io = end_clone_request;
1611         clone->end_io_data = tio;
1612
1613         return 0;
1614 }
1615
1616 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1617                                 gfp_t gfp_mask)
1618 {
1619         struct request *clone;
1620         struct dm_rq_target_io *tio;
1621
1622         tio = alloc_rq_tio(md, gfp_mask);
1623         if (!tio)
1624                 return NULL;
1625
1626         tio->md = md;
1627         tio->ti = NULL;
1628         tio->orig = rq;
1629         tio->error = 0;
1630         memset(&tio->info, 0, sizeof(tio->info));
1631
1632         clone = &tio->clone;
1633         if (setup_clone(clone, rq, tio)) {
1634                 /* -ENOMEM */
1635                 free_rq_tio(tio);
1636                 return NULL;
1637         }
1638
1639         return clone;
1640 }
1641
1642 /*
1643  * Called with the queue lock held.
1644  */
1645 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1646 {
1647         struct mapped_device *md = q->queuedata;
1648         struct request *clone;
1649
1650         if (unlikely(dm_rq_is_flush_request(rq)))
1651                 return BLKPREP_OK;
1652
1653         if (unlikely(rq->special)) {
1654                 DMWARN("Already has something in rq->special.");
1655                 return BLKPREP_KILL;
1656         }
1657
1658         clone = clone_rq(rq, md, GFP_ATOMIC);
1659         if (!clone)
1660                 return BLKPREP_DEFER;
1661
1662         rq->special = clone;
1663         rq->cmd_flags |= REQ_DONTPREP;
1664
1665         return BLKPREP_OK;
1666 }
1667
1668 /*
1669  * Returns:
1670  * 0  : the request has been processed (not requeued)
1671  * !0 : the request has been requeued
1672  */
1673 static int map_request(struct dm_target *ti, struct request *clone,
1674                        struct mapped_device *md)
1675 {
1676         int r, requeued = 0;
1677         struct dm_rq_target_io *tio = clone->end_io_data;
1678
1679         /*
1680          * Hold the md reference here for the in-flight I/O.
1681          * We can't rely on the reference count by device opener,
1682          * because the device may be closed during the request completion
1683          * when all bios are completed.
1684          * See the comment in rq_completed() too.
1685          */
1686         dm_get(md);
1687
1688         tio->ti = ti;
1689         r = ti->type->map_rq(ti, clone, &tio->info);
1690         switch (r) {
1691         case DM_MAPIO_SUBMITTED:
1692                 /* The target has taken the I/O to submit by itself later */
1693                 break;
1694         case DM_MAPIO_REMAPPED:
1695                 /* The target has remapped the I/O so dispatch it */
1696                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1697                                      blk_rq_pos(tio->orig));
1698                 dm_dispatch_request(clone);
1699                 break;
1700         case DM_MAPIO_REQUEUE:
1701                 /* The target wants to requeue the I/O */
1702                 dm_requeue_unmapped_request(clone);
1703                 requeued = 1;
1704                 break;
1705         default:
1706                 if (r > 0) {
1707                         DMWARN("unimplemented target map return value: %d", r);
1708                         BUG();
1709                 }
1710
1711                 /* The target wants to complete the I/O */
1712                 dm_kill_unmapped_request(clone, r);
1713                 break;
1714         }
1715
1716         return requeued;
1717 }
1718
/*
 * q->request_fn for request-based dm.
 * Called with the queue lock held.
 *
 * Takes requests off the queue one at a time and maps their clones until
 * the queue is plugged, stopped or empty, a target reports busy, a request
 * gets requeued, or a flush request is met (which is handed to the barrier
 * work item instead).
 */
static void dm_request_fn(struct request_queue *q)
{
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_live_table(md);
        struct dm_target *ti;
        struct request *rq, *clone;

        /*
         * For suspend, check blk_queue_stopped() and increment
         * ->pending within a single queue_lock not to increment the
         * number of in-flight I/Os after the queue is stopped in
         * dm_suspend().
         */
        while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
                rq = blk_peek_request(q);
                if (!rq)
                        goto plug_and_out;

                /* Only one flush request may be outstanding at a time. */
                if (unlikely(dm_rq_is_flush_request(rq))) {
                        BUG_ON(md->flush_request);
                        md->flush_request = rq;
                        blk_start_request(rq);
                        queue_work(md->wq, &md->barrier_work);
                        goto out;
                }

                /*
                 * NOTE(review): neither map nor ti is validated here
                 * (other callers in this file check dm_target_is_valid()).
                 * Presumably requests cannot be queued without a table or
                 * beyond the end of the device - confirm.
                 */
                ti = dm_table_find_target(map, blk_rq_pos(rq));
                /* Leave the request on the queue while the target is busy. */
                if (ti->type->busy && ti->type->busy(ti))
                        goto plug_and_out;

                blk_start_request(rq);
                /* The clone was attached to rq->special by dm_prep_fn(). */
                clone = rq->special;
                atomic_inc(&md->pending[rq_data_dir(clone)]);

                /* map_request() is called without the queue lock. */
                spin_unlock(q->queue_lock);
                if (map_request(ti, clone, md))
                        goto requeued;

                spin_lock_irq(q->queue_lock);
        }

        goto out;

requeued:
        /* Retake the lock dropped above before touching the queue again. */
        spin_lock_irq(q->queue_lock);

plug_and_out:
        if (!elv_queue_empty(q))
                /* Some requests still remain, retry later */
                blk_plug_device(q);

out:
        dm_table_put(map);

        return;
}
1779
/*
 * Exported wrapper around blk_lld_busy() for queue @q; returns whatever
 * blk_lld_busy() reports.
 */
int dm_underlying_device_busy(struct request_queue *q)
{
        return blk_lld_busy(q);
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1785
1786 static int dm_lld_busy(struct request_queue *q)
1787 {
1788         int r;
1789         struct mapped_device *md = q->queuedata;
1790         struct dm_table *map = dm_get_live_table(md);
1791
1792         if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1793                 r = 1;
1794         else
1795                 r = dm_table_any_busy_target(map);
1796
1797         dm_table_put(map);
1798
1799         return r;
1800 }
1801
1802 static void dm_unplug_all(struct request_queue *q)
1803 {
1804         struct mapped_device *md = q->queuedata;
1805         struct dm_table *map = dm_get_live_table(md);
1806
1807         if (map) {
1808                 if (dm_request_based(md))
1809                         generic_unplug_device(q);
1810
1811                 dm_table_unplug_all(map);
1812                 dm_table_put(map);
1813         }
1814 }
1815
1816 static int dm_any_congested(void *congested_data, int bdi_bits)
1817 {
1818         int r = bdi_bits;
1819         struct mapped_device *md = congested_data;
1820         struct dm_table *map;
1821
1822         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1823                 map = dm_get_live_table(md);
1824                 if (map) {
1825                         /*
1826                          * Request-based dm cares about only own queue for
1827                          * the query about congestion status of request_queue
1828                          */
1829                         if (dm_request_based(md))
1830                                 r = md->queue->backing_dev_info.state &
1831                                     bdi_bits;
1832                         else
1833                                 r = dm_table_any_congested(map, bdi_bits);
1834
1835                         dm_table_put(map);
1836                 }
1837         }
1838
1839         return r;
1840 }
1841
/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 * Lookups and modifications in the helpers below are done under
 * _minor_lock.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);
1846
/*
 * Remove @minor from the minor IDR, under _minor_lock.
 */
static void free_minor(int minor)
{
        spin_lock(&_minor_lock);
        idr_remove(&_minor_idr, minor);
        spin_unlock(&_minor_lock);
}
1853
1854 /*
1855  * See if the device with a specific minor # is free.
1856  */
1857 static int specific_minor(int minor)
1858 {
1859         int r, m;
1860
1861         if (minor >= (1 << MINORBITS))
1862                 return -EINVAL;
1863
1864         r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1865         if (!r)
1866                 return -ENOMEM;
1867
1868         spin_lock(&_minor_lock);
1869
1870         if (idr_find(&_minor_idr, minor)) {
1871                 r = -EBUSY;
1872                 goto out;
1873         }
1874
1875         r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1876         if (r)
1877                 goto out;
1878
1879         if (m != minor) {
1880                 idr_remove(&_minor_idr, m);
1881                 r = -EBUSY;
1882                 goto out;
1883         }
1884
1885 out:
1886         spin_unlock(&_minor_lock);
1887         return r;
1888 }
1889
1890 static int next_free_minor(int *minor)
1891 {
1892         int r, m;
1893
1894         r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1895         if (!r)
1896                 return -ENOMEM;
1897
1898         spin_lock(&_minor_lock);
1899
1900         r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1901         if (r)
1902                 goto out;
1903
1904         if (m >= (1 << MINORBITS)) {
1905                 idr_remove(&_minor_idr, m);
1906                 r = -ENOSPC;
1907                 goto out;
1908         }
1909
1910         *minor = m;
1911
1912 out:
1913         spin_unlock(&_minor_lock);
1914         return r;
1915 }
1916
/* Forward declarations of items that are defined later in this file. */
static const struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);
static void dm_rq_barrier_work(struct work_struct *work);
1921
/*
 * Set up md->queue for a newly created device: clear the stackable flag
 * and install the dm callbacks (make_request, congestion, unplug,
 * merge_bvec) together with the bounce limit and flush capabilities.
 * NOTE(review): the order of the blk_queue_*() calls is kept as is -
 * presumably some of them reset settings made by others; confirm before
 * reordering.
 */
static void dm_init_md_queue(struct mapped_device *md)
{
        /*
         * Request-based dm devices cannot be stacked on top of bio-based dm
         * devices.  The type of this dm device has not been decided yet.
         * The type is decided at the first table loading time.
         * To prevent problematic device stacking, clear the queue flag
         * for request stacking support until then.
         *
         * This queue is new, so no concurrency on the queue_flags.
         */
        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);

        /* The callbacks installed below find the device via q->queuedata. */
        md->queue->queuedata = md;
        md->queue->backing_dev_info.congested_fn = dm_any_congested;
        md->queue->backing_dev_info.congested_data = md;
        blk_queue_make_request(md->queue, dm_request);
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        md->queue->unplug_fn = dm_unplug_all;
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
        blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
}
1944
1945 /*
1946  * Allocate and initialise a blank device with a given minor.
1947  */
1948 static struct mapped_device *alloc_dev(int minor)
1949 {
1950         int r;
1951         struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1952         void *old_md;
1953
1954         if (!md) {
1955                 DMWARN("unable to allocate device, out of memory.");
1956                 return NULL;
1957         }
1958
1959         if (!try_module_get(THIS_MODULE))
1960                 goto bad_module_get;
1961
1962         /* get a minor number for the dev */
1963         if (minor == DM_ANY_MINOR)
1964                 r = next_free_minor(&minor);
1965         else
1966                 r = specific_minor(minor);
1967         if (r < 0)
1968                 goto bad_minor;
1969
1970         md->type = DM_TYPE_NONE;
1971         init_rwsem(&md->io_lock);
1972         mutex_init(&md->suspend_lock);
1973         mutex_init(&md->type_lock);
1974         spin_lock_init(&md->deferred_lock);
1975         spin_lock_init(&md->barrier_error_lock);
1976         rwlock_init(&md->map_lock);
1977         atomic_set(&md->holders, 1);
1978         atomic_set(&md->open_count, 0);
1979         atomic_set(&md->event_nr, 0);
1980         atomic_set(&md->uevent_seq, 0);
1981         INIT_LIST_HEAD(&md->uevent_list);
1982         spin_lock_init(&md->uevent_lock);
1983
1984         md->queue = blk_alloc_queue(GFP_KERNEL);
1985         if (!md->queue)
1986                 goto bad_queue;
1987
1988         dm_init_md_queue(md);
1989
1990         md->disk = alloc_disk(1);
1991         if (!md->disk)
1992                 goto bad_disk;
1993
1994         atomic_set(&md->pending[0], 0);
1995         atomic_set(&md->pending[1], 0);
1996         init_waitqueue_head(&md->wait);
1997         INIT_WORK(&md->work, dm_wq_work);
1998         INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1999         init_waitqueue_head(&md->eventq);
2000
2001         md->disk->major = _major;
2002         md->disk->first_minor = minor;
2003         md->disk->fops = &dm_blk_dops;
2004         md->disk->queue = md->queue;
2005         md->disk->private_data = md;
2006         sprintf(md->disk->disk_name, "dm-%d", minor);
2007         add_disk(md->disk);
2008         format_dev_t(md->name, MKDEV(_major, minor));
2009
2010         md->wq = create_singlethread_workqueue("kdmflush");
2011         if (!md->wq)
2012                 goto bad_thread;
2013
2014         md->bdev = bdget_disk(md->disk, 0);
2015         if (!md->bdev)
2016                 goto bad_bdev;
2017
2018         /* Populate the mapping, nobody knows we exist yet */
2019         spin_lock(&_minor_lock);
2020         old_md = idr_replace(&_minor_idr, md, minor);
2021         spin_unlock(&_minor_lock);
2022
2023         BUG_ON(old_md != MINOR_ALLOCED);
2024
2025         return md;
2026
2027 bad_bdev:
2028         destroy_workqueue(md->wq);
2029 bad_thread:
2030         del_gendisk(md->disk);
2031         put_disk(md->disk);
2032 bad_disk:
2033         blk_cleanup_queue(md->queue);
2034 bad_queue:
2035         free_minor(minor);
2036 bad_minor:
2037         module_put(THIS_MODULE);
2038 bad_module_get:
2039         kfree(md);
2040         return NULL;
2041 }
2042
2043 static void unlock_fs(struct mapped_device *md);
2044
2045 static void free_dev(struct mapped_device *md)
2046 {
2047         int minor = MINOR(disk_devt(md->disk));
2048
2049         unlock_fs(md);
2050         bdput(md->bdev);
2051         destroy_workqueue(md->wq);
2052         if (md->tio_pool)
2053                 mempool_destroy(md->tio_pool);
2054         if (md->io_pool)
2055                 mempool_destroy(md->io_pool);
2056         if (md->bs)
2057                 bioset_free(md->bs);
2058         blk_integrity_unregister(md->disk);
2059         del_gendisk(md->disk);
2060         free_minor(minor);
2061
2062         spin_lock(&_minor_lock);
2063         md->disk->private_data = NULL;
2064         spin_unlock(&_minor_lock);
2065
2066         put_disk(md->disk);
2067         blk_cleanup_queue(md->queue);
2068         module_put(THIS_MODULE);
2069         kfree(md);
2070 }
2071
2072 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2073 {
2074         struct dm_md_mempools *p;
2075
2076         if (md->io_pool && md->tio_pool && md->bs)
2077                 /* the md already has necessary mempools */
2078                 goto out;
2079
2080         p = dm_table_get_md_mempools(t);
2081         BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
2082
2083         md->io_pool = p->io_pool;
2084         p->io_pool = NULL;
2085         md->tio_pool = p->tio_pool;
2086         p->tio_pool = NULL;
2087         md->bs = p->bs;
2088         p->bs = NULL;
2089
2090 out:
2091         /* mempool bind completed, now no need any mempools in the table */
2092         dm_table_free_md_mempools(t);
2093 }
2094
2095 /*
2096  * Bind a table to the device.
2097  */
2098 static void event_callback(void *context)
2099 {
2100         unsigned long flags;
2101         LIST_HEAD(uevents);
2102         struct mapped_device *md = (struct mapped_device *) context;
2103
2104         spin_lock_irqsave(&md->uevent_lock, flags);
2105         list_splice_init(&md->uevent_list, &uevents);
2106         spin_unlock_irqrestore(&md->uevent_lock, flags);
2107
2108         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2109
2110         atomic_inc(&md->event_nr);
2111         wake_up(&md->eventq);
2112 }
2113
2114 static void __set_size(struct mapped_device *md, sector_t size)
2115 {
2116         set_capacity(md->disk, size);
2117
2118         mutex_lock(&md->bdev->bd_inode->i_mutex);
2119         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2120         mutex_unlock(&md->bdev->bd_inode->i_mutex);
2121 }
2122
2123 /*
2124  * Returns old map, which caller must destroy.
2125  */
2126 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2127                                struct queue_limits *limits)
2128 {
2129         struct dm_table *old_map;
2130         struct request_queue *q = md->queue;
2131         sector_t size;
2132         unsigned long flags;
2133
2134         size = dm_table_get_size(t);
2135
2136         /*
2137          * Wipe any geometry if the size of the table changed.
2138          */
2139         if (size != get_capacity(md->disk))
2140                 memset(&md->geometry, 0, sizeof(md->geometry));
2141
2142         __set_size(md, size);
2143
2144         dm_table_event_callback(t, event_callback, md);
2145
2146         /*
2147          * The queue hasn't been stopped yet, if the old table type wasn't
2148          * for request-based during suspension.  So stop it to prevent
2149          * I/O mapping before resume.
2150          * This must be done before setting the queue restrictions,
2151          * because request-based dm may be run just after the setting.
2152          */
2153         if (dm_table_request_based(t) && !blk_queue_stopped(q))
2154                 stop_queue(q);
2155
2156         __bind_mempools(md, t);
2157
2158         write_lock_irqsave(&md->map_lock, flags);
2159         old_map = md->map;
2160         md->map = t;
2161         dm_table_set_restrictions(t, q, limits);
2162         write_unlock_irqrestore(&md->map_lock, flags);
2163
2164         return old_map;
2165 }
2166
2167 /*
2168  * Returns unbound table for the caller to free.
2169  */
2170 static struct dm_table *__unbind(struct mapped_device *md)
2171 {
2172         struct dm_table *map = md->map;
2173         unsigned long flags;
2174
2175         if (!map)
2176                 return NULL;
2177
2178         dm_table_event_callback(map, NULL, NULL);
2179         write_lock_irqsave(&md->map_lock, flags);
2180         md->map = NULL;
2181         write_unlock_irqrestore(&md->map_lock, flags);
2182
2183         return map;
2184 }
2185
2186 /*
2187  * Constructor for a new device.
2188  */
2189 int dm_create(int minor, struct mapped_device **result)
2190 {
2191         struct mapped_device *md;
2192
2193         md = alloc_dev(minor);
2194         if (!md)
2195                 return -ENXIO;
2196
2197         dm_sysfs_init(md);
2198
2199         *result = md;
2200         return 0;
2201 }
2202
2203 /*
2204  * Functions to manage md->type.
2205  * All are required to hold md->type_lock.
2206  */
2207 void dm_lock_md_type(struct mapped_device *md)
2208 {
2209         mutex_lock(&md->type_lock);
2210 }
2211
2212 void dm_unlock_md_type(struct mapped_device *md)
2213 {
2214         mutex_unlock(&md->type_lock);
2215 }
2216
2217 void dm_set_md_type(struct mapped_device *md, unsigned type)
2218 {
2219         md->type = type;
2220 }
2221
2222 unsigned dm_get_md_type(struct mapped_device *md)
2223 {
2224         return md->type;
2225 }
2226
2227 /*
2228  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2229  */
2230 static int dm_init_request_based_queue(struct mapped_device *md)
2231 {
2232         struct request_queue *q = NULL;
2233
2234         if (md->queue->elevator)
2235                 return 1;
2236
2237         /* Fully initialize the queue */
2238         q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2239         if (!q)
2240                 return 0;
2241
2242         md->queue = q;
2243         md->saved_make_request_fn = md->queue->make_request_fn;
2244         dm_init_md_queue(md);
2245         blk_queue_softirq_done(md->queue, dm_softirq_done);
2246         blk_queue_prep_rq(md->queue, dm_prep_fn);
2247         blk_queue_lld_busy(md->queue, dm_lld_busy);
2248         /* no flush support for request based dm yet */
2249         blk_queue_flush(md->queue, 0);
2250
2251         elv_register_queue(md->queue);
2252
2253         return 1;
2254 }
2255
2256 /*
2257  * Setup the DM device's queue based on md's type
2258  */
2259 int dm_setup_md_queue(struct mapped_device *md)
2260 {
2261         if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2262             !dm_init_request_based_queue(md)) {
2263                 DMWARN("Cannot initialize queue for request-based mapped device");
2264                 return -EINVAL;
2265         }
2266
2267         return 0;
2268 }
2269
2270 static struct mapped_device *dm_find_md(dev_t dev)
2271 {
2272         struct mapped_device *md;
2273         unsigned minor = MINOR(dev);
2274
2275         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2276                 return NULL;
2277
2278         spin_lock(&_minor_lock);
2279
2280         md = idr_find(&_minor_idr, minor);
2281         if (md && (md == MINOR_ALLOCED ||
2282                    (MINOR(disk_devt(dm_disk(md))) != minor) ||
2283                    dm_deleting_md(md) ||
2284                    test_bit(DMF_FREEING, &md->flags))) {
2285                 md = NULL;
2286                 goto out;
2287         }
2288
2289 out:
2290         spin_unlock(&_minor_lock);
2291
2292         return md;
2293 }
2294
2295 struct mapped_device *dm_get_md(dev_t dev)
2296 {
2297         struct mapped_device *md = dm_find_md(dev);
2298
2299         if (md)
2300                 dm_get(md);
2301
2302         return md;
2303 }
2304
2305 void *dm_get_mdptr(struct mapped_device *md)
2306 {
2307         return md->interface_ptr;
2308 }
2309
2310 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2311 {
2312         md->interface_ptr = ptr;
2313 }
2314
2315 void dm_get(struct mapped_device *md)
2316 {
2317         atomic_inc(&md->holders);
2318         BUG_ON(test_bit(DMF_FREEING, &md->flags));
2319 }
2320
2321 const char *dm_device_name(struct mapped_device *md)
2322 {
2323         return md->name;
2324 }
2325 EXPORT_SYMBOL_GPL(dm_device_name);
2326
2327 static void __dm_destroy(struct mapped_device *md, bool wait)
2328 {
2329         struct dm_table *map;
2330
2331         might_sleep();
2332
2333         spin_lock(&_minor_lock);
2334         map = dm_get_live_table(md);
2335         idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2336         set_bit(DMF_FREEING, &md->flags);
2337         spin_unlock(&_minor_lock);
2338
2339         if (!dm_suspended_md(md)) {
2340                 dm_table_presuspend_targets(map);
2341                 dm_table_postsuspend_targets(map);
2342         }
2343
2344         /*
2345          * Rare, but there may be I/O requests still going to complete,
2346          * for example.  Wait for all references to disappear.
2347          * No one should increment the reference count of the mapped_device,
2348          * after the mapped_device state becomes DMF_FREEING.
2349          */
2350         if (wait)
2351                 while (atomic_read(&md->holders))
2352                         msleep(1);
2353         else if (atomic_read(&md->holders))
2354                 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2355                        dm_device_name(md), atomic_read(&md->holders));
2356
2357         dm_sysfs_exit(md);
2358         dm_table_put(map);
2359         dm_table_destroy(__unbind(md));
2360         free_dev(md);
2361 }
2362
2363 void dm_destroy(struct mapped_device *md)
2364 {
2365         __dm_destroy(md, true);
2366 }
2367
2368 void dm_destroy_immediate(struct mapped_device *md)
2369 {
2370         __dm_destroy(md, false);
2371 }
2372
2373 void dm_put(struct mapped_device *md)
2374 {
2375         atomic_dec(&md->holders);
2376 }
2377 EXPORT_SYMBOL_GPL(dm_put);
2378
2379 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2380 {
2381         int r = 0;
2382         DECLARE_WAITQUEUE(wait, current);
2383
2384         dm_unplug_all(md->queue);
2385
2386         add_wait_queue(&md->wait, &wait);
2387
2388         while (1) {
2389                 set_current_state(interruptible);
2390
2391                 smp_mb();
2392                 if (!md_in_flight(md))
2393                         break;
2394
2395                 if (interruptible == TASK_INTERRUPTIBLE &&
2396                     signal_pending(current)) {
2397                         r = -EINTR;
2398                         break;
2399                 }
2400
2401                 io_schedule();
2402         }
2403         set_current_state(TASK_RUNNING);
2404
2405         remove_wait_queue(&md->wait, &wait);
2406
2407         return r;
2408 }
2409
2410 static void process_flush(struct mapped_device *md, struct bio *bio)
2411 {
2412         md->flush_error = 0;
2413
2414         /* handle REQ_FLUSH */
2415         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2416
2417         bio_init(&md->flush_bio);
2418         md->flush_bio.bi_bdev = md->bdev;
2419         md->flush_bio.bi_rw = WRITE_FLUSH;
2420         __split_and_process_bio(md, &md->flush_bio);
2421
2422         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2423
2424         /* if it's an empty flush or the preflush failed, we're done */
2425         if (!bio_has_data(bio) || md->flush_error) {
2426                 if (md->flush_error != DM_ENDIO_REQUEUE)
2427                         bio_endio(bio, md->flush_error);
2428                 else {
2429                         spin_lock_irq(&md->deferred_lock);
2430                         bio_list_add_head(&md->deferred, bio);
2431                         spin_unlock_irq(&md->deferred_lock);
2432                 }
2433                 return;
2434         }
2435
2436         /* issue data + REQ_FUA */
2437         bio->bi_rw &= ~REQ_FLUSH;
2438         __split_and_process_bio(md, bio);
2439 }
2440
2441 /*
2442  * Process the deferred bios
2443  */
2444 static void dm_wq_work(struct work_struct *work)
2445 {
2446         struct mapped_device *md = container_of(work, struct mapped_device,
2447                                                 work);
2448         struct bio *c;
2449
2450         down_write(&md->io_lock);
2451
2452         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2453                 spin_lock_irq(&md->deferred_lock);
2454                 c = bio_list_pop(&md->deferred);
2455                 spin_unlock_irq(&md->deferred_lock);
2456
2457                 if (!c) {
2458                         clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2459                         break;
2460                 }
2461
2462                 up_write(&md->io_lock);
2463
2464                 if (dm_request_based(md))
2465                         generic_make_request(c);
2466                 else {
2467                         if (c->bi_rw & REQ_FLUSH)
2468                                 process_flush(md, c);
2469                         else
2470                                 __split_and_process_bio(md, c);
2471                 }
2472
2473                 down_write(&md->io_lock);
2474         }
2475
2476         up_write(&md->io_lock);
2477 }
2478
2479 static void dm_queue_flush(struct mapped_device *md)
2480 {
2481         clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2482         smp_mb__after_clear_bit();
2483         queue_work(md->wq, &md->work);
2484 }
2485
2486 static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
2487 {
2488         struct dm_rq_target_io *tio = clone->end_io_data;
2489
2490         tio->info.target_request_nr = request_nr;
2491 }
2492
2493 /* Issue barrier requests to targets and wait for their completion. */
2494 static int dm_rq_barrier(struct mapped_device *md)
2495 {
2496         int i, j;
2497         struct dm_table *map = dm_get_live_table(md);
2498         unsigned num_targets = dm_table_get_num_targets(map);
2499         struct dm_target *ti;
2500         struct request *clone;
2501
2502         md->barrier_error = 0;
2503
2504         for (i = 0; i < num_targets; i++) {
2505                 ti = dm_table_get_target(map, i);
2506                 for (j = 0; j < ti->num_flush_requests; j++) {
2507                         clone = clone_rq(md->flush_request, md, GFP_NOIO);
2508                         dm_rq_set_target_request_nr(clone, j);
2509                         atomic_inc(&md->pending[rq_data_dir(clone)]);
2510                         map_request(ti, clone, md);
2511                 }
2512         }
2513
2514         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2515         dm_table_put(map);
2516
2517         return md->barrier_error;
2518 }
2519
2520 static void dm_rq_barrier_work(struct work_struct *work)
2521 {
2522         int error;
2523         struct mapped_device *md = container_of(work, struct mapped_device,
2524                                                 barrier_work);
2525         struct request_queue *q = md->queue;
2526         struct request *rq;
2527         unsigned long flags;
2528
2529         /*
2530          * Hold the md reference here and leave it at the last part so that
2531          * the md can't be deleted by device opener when the barrier request
2532          * completes.
2533          */
2534         dm_get(md);
2535
2536         error = dm_rq_barrier(md);
2537
2538         rq = md->flush_request;
2539         md->flush_request = NULL;
2540
2541         if (error == DM_ENDIO_REQUEUE) {
2542                 spin_lock_irqsave(q->queue_lock, flags);
2543                 blk_requeue_request(q, rq);
2544                 spin_unlock_irqrestore(q->queue_lock, flags);
2545         } else
2546                 blk_end_request_all(rq, error);
2547
2548         blk_run_queue(q);
2549
2550         dm_put(md);
2551 }
2552
2553 /*
2554  * Swap in a new table, returning the old one for the caller to destroy.
2555  */
2556 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2557 {
2558         struct dm_table *map = ERR_PTR(-EINVAL);
2559         struct queue_limits limits;
2560         int r;
2561
2562         mutex_lock(&md->suspend_lock);
2563
2564         /* device must be suspended */
2565         if (!dm_suspended_md(md))
2566                 goto out;
2567
2568         r = dm_calculate_queue_limits(table, &limits);
2569         if (r) {
2570                 map = ERR_PTR(r);
2571                 goto out;
2572         }
2573
2574         map = __bind(md, table, &limits);
2575
2576 out:
2577         mutex_unlock(&md->suspend_lock);
2578         return map;
2579 }
2580
2581 /*
2582  * Functions to lock and unlock any filesystem running on the
2583  * device.
2584  */
2585 static int lock_fs(struct mapped_device *md)
2586 {
2587         int r;
2588
2589         WARN_ON(md->frozen_sb);
2590
2591         md->frozen_sb = freeze_bdev(md->bdev);
2592         if (IS_ERR(md->frozen_sb)) {
2593                 r = PTR_ERR(md->frozen_sb);
2594                 md->frozen_sb = NULL;
2595                 return r;
2596         }
2597
2598         set_bit(DMF_FROZEN, &md->flags);
2599
2600         return 0;
2601 }
2602
2603 static void unlock_fs(struct mapped_device *md)
2604 {
2605         if (!test_bit(DMF_FROZEN, &md->flags))
2606                 return;
2607
2608         thaw_bdev(md->bdev, md->frozen_sb);
2609         md->frozen_sb = NULL;
2610         clear_bit(DMF_FROZEN, &md->flags);
2611 }
2612
2613 /*
2614  * We need to be able to change a mapping table under a mounted
2615  * filesystem.  For example we might want to move some data in
2616  * the background.  Before the table can be swapped with
2617  * dm_bind_table, dm_suspend must be called to flush any in
2618  * flight bios and ensure that any further io gets deferred.
2619  */
2620 /*
2621  * Suspend mechanism in request-based dm.
2622  *
2623  * 1. Flush all I/Os by lock_fs() if needed.
2624  * 2. Stop dispatching any I/O by stopping the request_queue.
2625  * 3. Wait for all in-flight I/Os to be completed or requeued.
2626  *
2627  * To abort suspend, start the request_queue.
2628  */
2629 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2630 {
2631         struct dm_table *map = NULL;
2632         int r = 0;
2633         int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2634         int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2635
2636         mutex_lock(&md->suspend_lock);
2637
2638         if (dm_suspended_md(md)) {
2639                 r = -EINVAL;
2640                 goto out_unlock;
2641         }
2642
2643         map = dm_get_live_table(md);
2644
2645         /*
2646          * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2647          * This flag is cleared before dm_suspend returns.
2648          */
2649         if (noflush)
2650                 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2651
2652         /* This does not get reverted if there's an error later. */
2653         dm_table_presuspend_targets(map);
2654
2655         /*
2656          * Flush I/O to the device.
2657          * Any I/O submitted after lock_fs() may not be flushed.
2658          * noflush takes precedence over do_lockfs.
2659          * (lock_fs() flushes I/Os and waits for them to complete.)
2660          */
2661         if (!noflush && do_lockfs) {
2662                 r = lock_fs(md);
2663                 if (r)
2664                         goto out;
2665         }
2666
2667         /*
2668          * Here we must make sure that no processes are submitting requests
2669          * to target drivers i.e. no one may be executing
2670          * __split_and_process_bio. This is called from dm_request and
2671          * dm_wq_work.
2672          *
2673          * To get all processes out of __split_and_process_bio in dm_request,
2674          * we take the write lock. To prevent any process from reentering
2675          * __split_and_process_bio from dm_request, we set
2676          * DMF_QUEUE_IO_TO_THREAD.
2677          *
2678          * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
2679          * and call flush_workqueue(md->wq). flush_workqueue will wait until
2680          * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
2681          * further calls to __split_and_process_bio from dm_wq_work.
2682          */
2683         down_write(&md->io_lock);
2684         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2685         set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2686         up_write(&md->io_lock);
2687
2688         /*
2689          * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
2690          * can be kicked until md->queue is stopped.  So stop md->queue before
2691          * flushing md->wq.
2692          */
2693         if (dm_request_based(md))
2694                 stop_queue(md->queue);
2695
2696         flush_workqueue(md->wq);
2697
2698         /*
2699          * At this point no more requests are entering target request routines.
2700          * We call dm_wait_for_completion to wait for all existing requests
2701          * to finish.
2702          */
2703         r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2704
2705         down_write(&md->io_lock);
2706         if (noflush)
2707                 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2708         up_write(&md->io_lock);
2709
2710         /* were we interrupted ? */
2711         if (r < 0) {
2712                 dm_queue_flush(md);
2713
2714                 if (dm_request_based(md))
2715                         start_queue(md->queue);
2716
2717                 unlock_fs(md);
2718                 goto out; /* pushback list is already flushed, so skip flush */
2719         }
2720
2721         /*
2722          * If dm_wait_for_completion returned 0, the device is completely
2723          * quiescent now. There is no request-processing activity. All new
2724          * requests are being added to md->deferred list.
2725          */
2726
2727         set_bit(DMF_SUSPENDED, &md->flags);
2728
2729         dm_table_postsuspend_targets(map);
2730
2731 out:
2732         dm_table_put(map);
2733
2734 out_unlock:
2735         mutex_unlock(&md->suspend_lock);
2736         return r;
2737 }
2738
2739 int dm_resume(struct mapped_device *md)
2740 {
2741         int r = -EINVAL;
2742         struct dm_table *map = NULL;
2743
2744         mutex_lock(&md->suspend_lock);
2745         if (!dm_suspended_md(md))
2746                 goto out;
2747
2748         map = dm_get_live_table(md);
2749         if (!map || !dm_table_get_size(map))
2750                 goto out;
2751
2752         r = dm_table_resume_targets(map);
2753         if (r)
2754                 goto out;
2755
2756         dm_queue_flush(md);
2757
2758         /*
2759          * Flushing deferred I/Os must be done after targets are resumed
2760          * so that mapping of targets can work correctly.
2761          * Request-based dm is queueing the deferred I/Os in its request_queue.
2762          */
2763         if (dm_request_based(md))
2764                 start_queue(md->queue);
2765
2766         unlock_fs(md);
2767
2768         clear_bit(DMF_SUSPENDED, &md->flags);
2769
2770         dm_table_unplug_all(map);
2771         r = 0;
2772 out:
2773         dm_table_put(map);
2774         mutex_unlock(&md->suspend_lock);
2775
2776         return r;
2777 }
2778
2779 /*-----------------------------------------------------------------
2780  * Event notification.
2781  *---------------------------------------------------------------*/
2782 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2783                        unsigned cookie)
2784 {
2785         char udev_cookie[DM_COOKIE_LENGTH];
2786         char *envp[] = { udev_cookie, NULL };
2787
2788         if (!cookie)
2789                 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2790         else {
2791                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2792                          DM_COOKIE_ENV_VAR_NAME, cookie);
2793                 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2794                                           action, envp);
2795         }
2796 }
2797
2798 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2799 {
2800         return atomic_add_return(1, &md->uevent_seq);
2801 }
2802
2803 uint32_t dm_get_event_nr(struct mapped_device *md)
2804 {
2805         return atomic_read(&md->event_nr);
2806 }
2807
2808 int dm_wait_event(struct mapped_device *md, int event_nr)
2809 {
2810         return wait_event_interruptible(md->eventq,
2811                         (event_nr != atomic_read(&md->event_nr)));
2812 }
2813
2814 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2815 {
2816         unsigned long flags;
2817
2818         spin_lock_irqsave(&md->uevent_lock, flags);
2819         list_add(elist, &md->uevent_list);
2820         spin_unlock_irqrestore(&md->uevent_lock, flags);
2821 }
2822
2823 /*
2824  * The gendisk is only valid as long as you have a reference
2825  * count on 'md'.
2826  */
2827 struct gendisk *dm_disk(struct mapped_device *md)
2828 {
2829         return md->disk;
2830 }
2831
2832 struct kobject *dm_kobject(struct mapped_device *md)
2833 {
2834         return &md->kobj;
2835 }
2836
2837 /*
2838  * struct mapped_device should not be exported outside of dm.c
2839  * so use this check to verify that kobj is part of md structure
2840  */
2841 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2842 {
2843         struct mapped_device *md;
2844
2845         md = container_of(kobj, struct mapped_device, kobj);
2846         if (&md->kobj != kobj)
2847                 return NULL;
2848
2849         if (test_bit(DMF_FREEING, &md->flags) ||
2850             dm_deleting_md(md))
2851                 return NULL;
2852
2853         dm_get(md);
2854         return md;
2855 }
2856
2857 int dm_suspended_md(struct mapped_device *md)
2858 {
2859         return test_bit(DMF_SUSPENDED, &md->flags);
2860 }
2861
2862 int dm_suspended(struct dm_target *ti)
2863 {
2864         return dm_suspended_md(dm_table_get_md(ti->table));
2865 }
2866 EXPORT_SYMBOL_GPL(dm_suspended);
2867
2868 int dm_noflush_suspending(struct dm_target *ti)
2869 {
2870         return __noflush_suspending(dm_table_get_md(ti->table));
2871 }
2872 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2873
2874 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2875 {
2876         struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2877
2878         if (!pools)
2879                 return NULL;
2880
2881         pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2882                          mempool_create_slab_pool(MIN_IOS, _io_cache) :
2883                          mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2884         if (!pools->io_pool)
2885                 goto free_pools_and_out;
2886
2887         pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2888                           mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2889                           mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2890         if (!pools->tio_pool)
2891                 goto free_io_pool_and_out;
2892
2893         pools->bs = (type == DM_TYPE_BIO_BASED) ?
2894                     bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2895         if (!pools->bs)
2896                 goto free_tio_pool_and_out;
2897
2898         return pools;
2899
2900 free_tio_pool_and_out:
2901         mempool_destroy(pools->tio_pool);
2902
2903 free_io_pool_and_out:
2904         mempool_destroy(pools->io_pool);
2905
2906 free_pools_and_out:
2907         kfree(pools);
2908
2909         return NULL;
2910 }
2911
2912 void dm_free_md_mempools(struct dm_md_mempools *pools)
2913 {
2914         if (!pools)
2915                 return;
2916
2917         if (pools->io_pool)
2918                 mempool_destroy(pools->io_pool);
2919
2920         if (pools->tio_pool)
2921                 mempool_destroy(pools->tio_pool);
2922
2923         if (pools->bs)
2924                 bioset_free(pools->bs);
2925
2926         kfree(pools);
2927 }
2928
2929 static const struct block_device_operations dm_blk_dops = {
2930         .open = dm_blk_open,
2931         .release = dm_blk_close,
2932         .ioctl = dm_blk_ioctl,
2933         .getgeo = dm_blk_getgeo,
2934         .owner = THIS_MODULE
2935 };
2936
2937 EXPORT_SYMBOL(dm_get_mapinfo);
2938
2939 /*
2940  * module hooks
2941  */
2942 module_init(dm_init);
2943 module_exit(dm_exit);
2944
2945 module_param(major, uint, 0);
2946 MODULE_PARM_DESC(major, "The major number of the device mapper");
2947 MODULE_DESCRIPTION(DM_NAME " driver");
2948 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2949 MODULE_LICENSE("GPL");