drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/kthread.h>
#include <linux/ktime.h>
#include <linux/sched.h> /* for fatal_signal_pending() */
#include <linux/elevator.h> /* for rq_end_sector() */

#include <trace/events/block.h>
  28
     /* Subsystem tag used as prefix for messages logged from this file. */
  29 #define DM_MSG_PREFIX "core"
  30
  31 #ifdef CONFIG_PRINTK
  32 /*
  33  * ratelimit state to be used in DMXXX_LIMIT().
  34  */
  35 DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  36                        DEFAULT_RATELIMIT_INTERVAL,
  37                        DEFAULT_RATELIMIT_BURST);
  38 EXPORT_SYMBOL(dm_ratelimit_state);
  39 #endif
  40
  41 /*
  42  * Cookies are numeric values sent with CHANGE and REMOVE
  43  * uevents while resuming, removing or renaming the device.
  44  */
  45 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  46 #define DM_COOKIE_LENGTH 24
  47
     /* Name passed to register_blkdev()/unregister_blkdev(). */
  48 static const char *_name = DM_NAME;
  49
     /*
      * 'major' is the requested block major; local_init() copies it into
      * '_major' and, when it is zero, replaces '_major' with the value
      * returned by register_blkdev().
      */
  50 static unsigned int major = 0;
  51 static unsigned int _major = 0;
  52
  53 static DEFINE_IDR(_minor_idr);
  54
     /*
      * Taken around the open/close paths and around the DMF_DELETING /
      * DMF_DEFERRED_REMOVE flag updates further down in this file.
      */
  55 static DEFINE_SPINLOCK(_minor_lock);
  56
  57 static void do_deferred_remove(struct work_struct *w);
  58
     /* Queued by dm_blk_close() on deferred_remove_workqueue. */
  59 static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  60
     /* Allocated in local_init(), destroyed in local_exit(). */
  61 static struct workqueue_struct *deferred_remove_workqueue;
  62
  63 /*
  64  * For bio-based dm.
  65  * One of these is allocated per bio.
  66  */
  67 struct dm_io {
  68         struct mapped_device *md;       /* device this io was issued to */
  69         int error;                      /* result handed to bio_endio(); updated under endio_lock */
  70         atomic_t io_count;              /* outstanding references; dec_pending() finishes the io at zero */
  71         struct bio *bio;                /* the original bio */
  72         unsigned long start_time;       /* jiffies recorded by start_io_acct() */
  73         spinlock_t endio_lock;          /* serialises updates of 'error' in dec_pending() */
  74         struct dm_stats_aux stats_aux;  /* passed through to dm_stats_account_io() */
  75 };
  76
  77 /*
  78  * For request-based dm.
  79  * One of these is allocated per request.
  80  */
  81 struct dm_rq_target_io {
  82         struct mapped_device *md;       /* owning device; free_rq_tio() returns the object to md->io_pool */
  83         struct dm_target *ti;           /* presumably the target the request was mapped to - confirm at the users */
  84         struct request *orig, *clone;   /* original request (updated by end_clone_bio()) and its clone */
  85         struct kthread_work work;
  86         int error;                      /* first error recorded by end_clone_bio() */
  87         union map_info info;            /* exposed through dm_get_rq_mapinfo() */
  88 };
  89
  90 /*
  91  * For request-based dm - the bio clones we allocate are embedded in these
  92  * structs.
  93  *
  94  * We allocate these with bio_alloc_bioset, using the front_pad parameter when
  95  * the bioset is created - this means the bio has to come at the end of the
  96  * struct.
  97  */
  98 struct dm_rq_clone_bio_info {
  99         struct bio *orig;               /* bio of the original request this clone stands for */
 100         struct dm_rq_target_io *tio;    /* per-request state the clone reports into */
 101         struct bio clone;               /* must stay the last member, see the comment above */
 102 };
 103
 104 union map_info *dm_get_rq_mapinfo(struct request *rq)
 105 {
 106         if (rq && rq->end_io_data)
 107                 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
 108         return NULL;
 109 }
 110 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 111
 112 #define MINOR_ALLOCED ((void *)-1)
 113
 114 /*
 115  * Bits for the md->flags field.
 116  */
 117 #define DMF_BLOCK_IO_FOR_SUSPEND 0
 118 #define DMF_SUSPENDED 1
 119 #define DMF_FROZEN 2
 120 #define DMF_FREEING 3                   /* dm_blk_open() refuses the device while set */
 121 #define DMF_DELETING 4                  /* set by dm_lock_for_deletion(); blocks new opens */
 122 #define DMF_NOFLUSH_SUSPENDING 5        /* tested by __noflush_suspending() */
 123 #define DMF_MERGE_IS_OPTIONAL 6
 124 #define DMF_DEFERRED_REMOVE 7           /* last close queues deferred_remove_work */
 125 #define DMF_SUSPENDED_INTERNALLY 8
 126
 127 /*
 128  * A dummy definition to make RCU happy: it gives 'struct dm_table' a
 129  * complete type in this file.  The table must never be dereferenced here.
 130  */
 131 struct dm_table {
 132         int undefined__;        /* placeholder member, never accessed */
 133 };
 134
 135 /*
 136  * Per-device state of a mapped device.
 137  */
 138 struct mapped_device {
 139         struct srcu_struct io_barrier;  /* SRCU domain used by dm_get_live_table()/dm_sync_table() */
 140         struct mutex suspend_lock;
 141         atomic_t holders;
 142         atomic_t open_count;            /* incremented in dm_blk_open(), dropped in dm_blk_close() */
 143
 144         /*
 145          * The current mapping.
 146          * Use dm_get_live_table{_fast} or take suspend_lock for
 147          * dereference.
 148          */
 149         struct dm_table __rcu *map;
 150
 151         struct list_head table_devices; /* list of struct table_device */
 152         struct mutex table_devices_lock;        /* held while table_devices is searched or changed */
 153
 154         unsigned long flags;            /* DMF_* bits */
 155
 156         struct request_queue *queue;
 157         unsigned type;
 158         /* Protect queue and type against concurrent access. */
 159         struct mutex type_lock;
 160
 161         struct target_type *immutable_target_type;
 162
 163         struct gendisk *disk;
 164         char name[16];
 165
 166         void *interface_ptr;
 167
 168         /*
 169          * In-flight accounting and the list of ios that arrived while we were suspended.
 170          */
 171         atomic_t pending[2];            /* in-flight count, indexed by READ/WRITE */
 172         wait_queue_head_t wait;         /* woken when the in-flight count reaches zero */
 173         struct work_struct work;        /* queued on 'wq' by queue_io() */
 174         struct bio_list deferred;       /* deferred bios, protected by deferred_lock */
 175         spinlock_t deferred_lock;
 176
 177         /*
 178          * Processing queue (flush)
 179          */
 180         struct workqueue_struct *wq;
 181
 182         /*
 183          * io objects are allocated from here.
 184          */
 185         mempool_t *io_pool;             /* backs alloc_io() and alloc_rq_tio() */
 186         mempool_t *rq_pool;             /* backs alloc_clone_request() */
 187
 188         struct bio_set *bs;
 189
 190         /*
 191          * Event handling.
 192          */
 193         atomic_t event_nr;
 194         wait_queue_head_t eventq;
 195         atomic_t uevent_seq;
 196         struct list_head uevent_list;
 197         spinlock_t uevent_lock; /* Protect access to uevent_list */
 198
 199         /*
 200          * freeze/thaw support require holding onto a super block
 201          */
 202         struct super_block *frozen_sb;
 203         struct block_device *bdev;
 204
 205         /* forced geometry settings */
 206         struct hd_geometry geometry;
 207
 208         /* kobject and completion */
 209         struct dm_kobject_holder kobj_holder;
 210
 211         /* zero-length flush that will be cloned and submitted to targets */
 212         struct bio flush_bio;
 213
 214         /* the number of internal suspends */
 215         unsigned internal_suspend_count;
 216
 217         struct dm_stats stats;          /* returned by dm_get_stats() */
 218
 219         struct kthread_worker kworker;
 220         struct task_struct *kworker_task;
 221
 222         /* for request-based merge heuristic in dm_request_fn() */
 223         unsigned seq_rq_merge_deadline_usecs;
 224         int last_rq_rw;
 225         sector_t last_rq_pos;
 226         ktime_t last_rq_start_time;
 227 };
 228
 229 /*
 230  * For mempools pre-allocation at the table loading time.
 231  */
 232 struct dm_md_mempools {
 233         mempool_t *io_pool;     /* same role as mapped_device.io_pool */
 234         mempool_t *rq_pool;     /* same role as mapped_device.rq_pool */
 235         struct bio_set *bs;     /* same role as mapped_device.bs */
 236 };
 237
     /*
      * One underlying block device opened on behalf of a mapped device.
      * Entries live on mapped_device.table_devices.
      */
 238 struct table_device {
 239         struct list_head list;  /* link in mapped_device.table_devices */
 240         atomic_t count;         /* users; dm_put_table_device() closes and frees at zero */
 241         struct dm_dev dm_dev;   /* handle returned by dm_get_table_device() */
 242 };
 243
     /*
      * Defaults for the two reserve sizes and the upper bound both are
      * clamped to by __dm_get_module_param().
      */
 244 #define RESERVED_BIO_BASED_IOS          16
 245 #define RESERVED_REQUEST_BASED_IOS      256
 246 #define RESERVED_MAX_IOS                1024
     /* Slab caches created in local_init() and destroyed in local_exit(). */
 247 static struct kmem_cache *_io_cache;
 248 static struct kmem_cache *_rq_tio_cache;
 249 static struct kmem_cache *_rq_cache;
 250
 251 /*
 252  * Bio-based DM's mempools' reserved IOs set by the user.
 253  */
 254 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 255
 256 /*
 257  * Request-based DM's mempools' reserved IOs set by the user.
 258  */
 259 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 260
 261 static unsigned __dm_get_module_param(unsigned *module_param,
 262                                       unsigned def, unsigned max)
 263 {
 264         unsigned param = ACCESS_ONCE(*module_param);
 265         unsigned modified_param = 0;
 266
 267         if (!param)
 268                 modified_param = def;
 269         else if (param > max)
 270                 modified_param = max;
 271
 272         if (modified_param) {
 273                 (void)cmpxchg(module_param, param, modified_param);
 274                 param = modified_param;
 275         }
 276
 277         return param;
 278 }
 279
 280 unsigned dm_get_reserved_bio_based_ios(void)
 281 {
 282         return __dm_get_module_param(&reserved_bio_based_ios,
 283                                      RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 284 }
 285 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 286
 287 unsigned dm_get_reserved_rq_based_ios(void)
 288 {
 289         return __dm_get_module_param(&reserved_rq_based_ios,
 290                                      RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 291 }
 292 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 293
 294 static int __init local_init(void)
 295 {
 296         int r = -ENOMEM;
 297
 298         /* allocate a slab for the dm_ios */
 299         _io_cache = KMEM_CACHE(dm_io, 0);
 300         if (!_io_cache)
 301                 return r;
 302
 303         _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 304         if (!_rq_tio_cache)
 305                 goto out_free_io_cache;
 306
 307         _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
 308                                       __alignof__(struct request), 0, NULL);
 309         if (!_rq_cache)
 310                 goto out_free_rq_tio_cache;
 311
 312         r = dm_uevent_init();
 313         if (r)
 314                 goto out_free_rq_cache;
 315
 316         deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 317         if (!deferred_remove_workqueue) {
 318                 r = -ENOMEM;
 319                 goto out_uevent_exit;
 320         }
 321
 322         _major = major;
 323         r = register_blkdev(_major, _name);
 324         if (r < 0)
 325                 goto out_free_workqueue;
 326
 327         if (!_major)
 328                 _major = r;
 329
 330         return 0;
 331
 332 out_free_workqueue:
 333         destroy_workqueue(deferred_remove_workqueue);
 334 out_uevent_exit:
 335         dm_uevent_exit();
 336 out_free_rq_cache:
 337         kmem_cache_destroy(_rq_cache);
 338 out_free_rq_tio_cache:
 339         kmem_cache_destroy(_rq_tio_cache);
 340 out_free_io_cache:
 341         kmem_cache_destroy(_io_cache);
 342
 343         return r;
 344 }
 345
     /*
      * Release everything local_init() set up: the deferred-remove
      * workqueue, the three slab caches, the block major and the uevent
      * support.  Listed as the counterpart of local_init() in _exits[].
      */
 346 static void local_exit(void)
 347 {
             /* let outstanding work finish before its workqueue goes away */
 348         flush_scheduled_work();
 349         destroy_workqueue(deferred_remove_workqueue);
 350
 351         kmem_cache_destroy(_rq_cache);
 352         kmem_cache_destroy(_rq_tio_cache);
 353         kmem_cache_destroy(_io_cache);
 354         unregister_blkdev(_major, _name);
 355         dm_uevent_exit();
 356
             /* forget the major that was registered */
 357         _major = 0;
 358
 359         DMINFO("cleaned up");
 360 }
 361
     /*
      * Initialisers run by dm_init(), in this order.  Each returns 0 on
      * success; the first non-zero return aborts module initialisation.
      */
 362 static int (*_inits[])(void) __initdata = {
 363         local_init,
 364         dm_target_init,
 365         dm_linear_init,
 366         dm_stripe_init,
 367         dm_io_init,
 368         dm_kcopyd_init,
 369         dm_interface_init,
 370         dm_statistics_init,
 371 };
 372
     /*
      * Teardown counterparts of _inits[].  dm_init() and dm_exit() index
      * both tables with the same position, so _exits[i] must undo
      * _inits[i]; keep the two tables in matching order.
      */
 373 static void (*_exits[])(void) = {
 374         local_exit,
 375         dm_target_exit,
 376         dm_linear_exit,
 377         dm_stripe_exit,
 378         dm_io_exit,
 379         dm_kcopyd_exit,
 380         dm_interface_exit,
 381         dm_statistics_exit,
 382 };
 383
 384 static int __init dm_init(void)
 385 {
 386         const int count = ARRAY_SIZE(_inits);
 387
 388         int r, i;
 389
 390         for (i = 0; i < count; i++) {
 391                 r = _inits[i]();
 392                 if (r)
 393                         goto bad;
 394         }
 395
 396         return 0;
 397
 398       bad:
 399         while (i--)
 400                 _exits[i]();
 401
 402         return r;
 403 }
 404
 405 static void __exit dm_exit(void)
 406 {
 407         int i = ARRAY_SIZE(_exits);
 408
 409         while (i--)
 410                 _exits[i]();
 411
 412         /*
 413          * Should be empty by this point.
 414          */
 415         idr_destroy(&_minor_idr);
 416 }
 417
 418 /*
 419  * Block device functions
 420  */
 421 int dm_deleting_md(struct mapped_device *md)
 422 {
 423         return test_bit(DMF_DELETING, &md->flags);
 424 }
 425
 426 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 427 {
 428         struct mapped_device *md;
 429
 430         spin_lock(&_minor_lock);
 431
 432         md = bdev->bd_disk->private_data;
 433         if (!md)
 434                 goto out;
 435
 436         if (test_bit(DMF_FREEING, &md->flags) ||
 437             dm_deleting_md(md)) {
 438                 md = NULL;
 439                 goto out;
 440         }
 441
 442         dm_get(md);
 443         atomic_inc(&md->open_count);
 444 out:
 445         spin_unlock(&_minor_lock);
 446
 447         return md ? 0 : -ENXIO;
 448 }
 449
 450 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 451 {
 452         struct mapped_device *md;
 453
 454         spin_lock(&_minor_lock);
 455
 456         md = disk->private_data;
 457         if (WARN_ON(!md))
 458                 goto out;
 459
 460         if (atomic_dec_and_test(&md->open_count) &&
 461             (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 462                 queue_work(deferred_remove_workqueue, &deferred_remove_work);
 463
 464         dm_put(md);
 465 out:
 466         spin_unlock(&_minor_lock);
 467 }
 468
 469 int dm_open_count(struct mapped_device *md)
 470 {
 471         return atomic_read(&md->open_count);
 472 }
 473
 474 /*
 475  * Guarantees nothing is using the device before it's deleted.
 476  */
 477 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 478 {
 479         int r = 0;
 480
 481         spin_lock(&_minor_lock);
 482
 483         if (dm_open_count(md)) {
 484                 r = -EBUSY;
 485                 if (mark_deferred)
 486                         set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 487         } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 488                 r = -EEXIST;
 489         else
 490                 set_bit(DMF_DELETING, &md->flags);
 491
 492         spin_unlock(&_minor_lock);
 493
 494         return r;
 495 }
 496
 497 int dm_cancel_deferred_remove(struct mapped_device *md)
 498 {
 499         int r = 0;
 500
 501         spin_lock(&_minor_lock);
 502
 503         if (test_bit(DMF_DELETING, &md->flags))
 504                 r = -EBUSY;
 505         else
 506                 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 507
 508         spin_unlock(&_minor_lock);
 509
 510         return r;
 511 }
 512
 513 static void do_deferred_remove(struct work_struct *w)
 514 {
 515         dm_deferred_remove();
 516 }
 517
 518 sector_t dm_get_size(struct mapped_device *md)
 519 {
 520         return get_capacity(md->disk);
 521 }
 522
 523 struct request_queue *dm_get_md_queue(struct mapped_device *md)
 524 {
 525         return md->queue;
 526 }
 527
 528 struct dm_stats *dm_get_stats(struct mapped_device *md)
 529 {
 530         return &md->stats;
 531 }
 532
 533 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 534 {
 535         struct mapped_device *md = bdev->bd_disk->private_data;
 536
 537         return dm_get_geometry(md, geo);
 538 }
 539
 540 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 541                         unsigned int cmd, unsigned long arg)
 542 {
 543         struct mapped_device *md = bdev->bd_disk->private_data;
 544         int srcu_idx;
 545         struct dm_table *map;
 546         struct dm_target *tgt;
 547         int r = -ENOTTY;
 548
 549 retry:
 550         map = dm_get_live_table(md, &srcu_idx);
 551
 552         if (!map || !dm_table_get_size(map))
 553                 goto out;
 554
 555         /* We only support devices that have a single target */
 556         if (dm_table_get_num_targets(map) != 1)
 557                 goto out;
 558
 559         tgt = dm_table_get_target(map, 0);
 560         if (!tgt->type->ioctl)
 561                 goto out;
 562
 563         if (dm_suspended_md(md)) {
 564                 r = -EAGAIN;
 565                 goto out;
 566         }
 567
 568         r = tgt->type->ioctl(tgt, cmd, arg);
 569
 570 out:
 571         dm_put_live_table(md, srcu_idx);
 572
 573         if (r == -ENOTCONN) {
 574                 msleep(10);
 575                 goto retry;
 576         }
 577
 578         return r;
 579 }
 580
 581 static struct dm_io *alloc_io(struct mapped_device *md)
 582 {
 583         return mempool_alloc(md->io_pool, GFP_NOIO);
 584 }
 585
 586 static void free_io(struct mapped_device *md, struct dm_io *io)
 587 {
 588         mempool_free(io, md->io_pool);
 589 }
 590
 591 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 592 {
 593         bio_put(&tio->clone);
 594 }
 595
 596 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
 597                                             gfp_t gfp_mask)
 598 {
 599         return mempool_alloc(md->io_pool, gfp_mask);
 600 }
 601
 602 static void free_rq_tio(struct dm_rq_target_io *tio)
 603 {
 604         mempool_free(tio, tio->md->io_pool);
 605 }
 606
 607 static struct request *alloc_clone_request(struct mapped_device *md,
 608                                            gfp_t gfp_mask)
 609 {
 610         return mempool_alloc(md->rq_pool, gfp_mask);
 611 }
 612
 613 static void free_clone_request(struct mapped_device *md, struct request *rq)
 614 {
 615         mempool_free(rq, md->rq_pool);
 616 }
 617
 618 static int md_in_flight(struct mapped_device *md)
 619 {
 620         return atomic_read(&md->pending[READ]) +
 621                atomic_read(&md->pending[WRITE]);
 622 }
 623
 624 static void start_io_acct(struct dm_io *io)
 625 {
 626         struct mapped_device *md = io->md;
 627         struct bio *bio = io->bio;
 628         int cpu;
 629         int rw = bio_data_dir(bio);
 630
 631         io->start_time = jiffies;
 632
 633         cpu = part_stat_lock();
 634         part_round_stats(cpu, &dm_disk(md)->part0);
 635         part_stat_unlock();
 636         atomic_set(&dm_disk(md)->part0.in_flight[rw],
 637                 atomic_inc_return(&md->pending[rw]));
 638
 639         if (unlikely(dm_stats_used(&md->stats)))
 640                 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 641                                     bio_sectors(bio), false, 0, &io->stats_aux);
 642 }
 643
 644 static void end_io_acct(struct dm_io *io)
 645 {
 646         struct mapped_device *md = io->md;
 647         struct bio *bio = io->bio;
 648         unsigned long duration = jiffies - io->start_time;
 649         int pending;
 650         int rw = bio_data_dir(bio);
 651
 652         generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
 653
 654         if (unlikely(dm_stats_used(&md->stats)))
 655                 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 656                                     bio_sectors(bio), true, duration, &io->stats_aux);
 657
 658         /*
 659          * After this is decremented the bio must not be touched if it is
 660          * a flush.
 661          */
 662         pending = atomic_dec_return(&md->pending[rw]);
 663         atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 664         pending += atomic_read(&md->pending[rw^0x1]);
 665
 666         /* nudge anyone waiting on suspend queue */
 667         if (!pending)
 668                 wake_up(&md->wait);
 669 }
 670
 671 /*
 672  * Add the bio to the list of deferred io.
 673  */
 674 static void queue_io(struct mapped_device *md, struct bio *bio)
 675 {
 676         unsigned long flags;
 677
 678         spin_lock_irqsave(&md->deferred_lock, flags);
 679         bio_list_add(&md->deferred, bio);
 680         spin_unlock_irqrestore(&md->deferred_lock, flags);
 681         queue_work(md->wq, &md->work);
 682 }
 683
 684 /*
 685  * Everyone (including functions in this file), should use this
 686  * function to access the md->map field, and make sure they call
 687  * dm_put_live_table() when finished.
 688  */
 689 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 690 {
 691         *srcu_idx = srcu_read_lock(&md->io_barrier);
 692
 693         return srcu_dereference(md->map, &md->io_barrier);
 694 }
 695
 696 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 697 {
 698         srcu_read_unlock(&md->io_barrier, srcu_idx);
 699 }
 700
 701 void dm_sync_table(struct mapped_device *md)
 702 {
 703         synchronize_srcu(&md->io_barrier);
 704         synchronize_rcu_expedited();
 705 }
 706
 707 /*
 708  * A fast alternative to dm_get_live_table/dm_put_live_table.
 709  * The caller must not block between these two functions.
 710  */
 711 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 712 {
 713         rcu_read_lock();
 714         return rcu_dereference(md->map);
 715 }
 716
 717 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 718 {
 719         rcu_read_unlock();
 720 }
 721
 722 /*
 723  * Open a table device so we can use it as a map destination.
 724  */
 725 static int open_table_device(struct table_device *td, dev_t dev,
 726                              struct mapped_device *md)
 727 {
 728         static char *_claim_ptr = "I belong to device-mapper";
 729         struct block_device *bdev;
 730
 731         int r;
 732
 733         BUG_ON(td->dm_dev.bdev);
 734
 735         bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 736         if (IS_ERR(bdev))
 737                 return PTR_ERR(bdev);
 738
 739         r = bd_link_disk_holder(bdev, dm_disk(md));
 740         if (r) {
 741                 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 742                 return r;
 743         }
 744
 745         td->dm_dev.bdev = bdev;
 746         return 0;
 747 }
 748
 749 /*
 750  * Close a table device that we've been using.
 751  */
 752 static void close_table_device(struct table_device *td, struct mapped_device *md)
 753 {
 754         if (!td->dm_dev.bdev)
 755                 return;
 756
 757         bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 758         blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 759         td->dm_dev.bdev = NULL;
 760 }
 761
 762 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 763                                               fmode_t mode) {
 764         struct table_device *td;
 765
 766         list_for_each_entry(td, l, list)
 767                 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 768                         return td;
 769
 770         return NULL;
 771 }
 772
 773 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 774                         struct dm_dev **result) {
 775         int r;
 776         struct table_device *td;
 777
 778         mutex_lock(&md->table_devices_lock);
 779         td = find_table_device(&md->table_devices, dev, mode);
 780         if (!td) {
 781                 td = kmalloc(sizeof(*td), GFP_KERNEL);
 782                 if (!td) {
 783                         mutex_unlock(&md->table_devices_lock);
 784                         return -ENOMEM;
 785                 }
 786
 787                 td->dm_dev.mode = mode;
 788                 td->dm_dev.bdev = NULL;
 789
 790                 if ((r = open_table_device(td, dev, md))) {
 791                         mutex_unlock(&md->table_devices_lock);
 792                         kfree(td);
 793                         return r;
 794                 }
 795
 796                 format_dev_t(td->dm_dev.name, dev);
 797
 798                 atomic_set(&td->count, 0);
 799                 list_add(&td->list, &md->table_devices);
 800         }
 801         atomic_inc(&td->count);
 802         mutex_unlock(&md->table_devices_lock);
 803
 804         *result = &td->dm_dev;
 805         return 0;
 806 }
 807 EXPORT_SYMBOL_GPL(dm_get_table_device);
 808
 809 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 810 {
 811         struct table_device *td = container_of(d, struct table_device, dm_dev);
 812
 813         mutex_lock(&md->table_devices_lock);
 814         if (atomic_dec_and_test(&td->count)) {
 815                 close_table_device(td, md);
 816                 list_del(&td->list);
 817                 kfree(td);
 818         }
 819         mutex_unlock(&md->table_devices_lock);
 820 }
 821 EXPORT_SYMBOL(dm_put_table_device);
 822
 823 static void free_table_devices(struct list_head *devices)
 824 {
 825         struct list_head *tmp, *next;
 826
 827         list_for_each_safe(tmp, next, devices) {
 828                 struct table_device *td = list_entry(tmp, struct table_device, list);
 829
 830                 DMWARN("dm_destroy: %s still exists with %d references",
 831                        td->dm_dev.name, atomic_read(&td->count));
 832                 kfree(td);
 833         }
 834 }
 835
 836 /*
 837  * Get the geometry associated with a dm device
 838  */
 839 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 840 {
 841         *geo = md->geometry;
 842
 843         return 0;
 844 }
 845
 846 /*
 847  * Set the geometry of a device.
 848  */
 849 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 850 {
 851         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 852
 853         if (geo->start > sz) {
 854                 DMWARN("Start sector is beyond the geometry limits.");
 855                 return -EINVAL;
 856         }
 857
 858         md->geometry = *geo;
 859
 860         return 0;
 861 }
 862
 863 /*-----------------------------------------------------------------
 864  * CRUD START:
 865  *   A more elegant soln is in the works that uses the queue
 866  *   merge fn, unfortunately there are a couple of changes to
 867  *   the block layer that I want to make for this.  So in the
 868  *   interests of getting something for people to use I give
 869  *   you this clearly demarcated crap.
 870  *---------------------------------------------------------------*/
 871
 872 static int __noflush_suspending(struct mapped_device *md)
 873 {
 874         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 875 }
 876
 877 /*
 878  * Decrements the number of outstanding ios that a bio has been
 879  * cloned into, completing the original io if necc.
 880  */
 881 static void dec_pending(struct dm_io *io, int error)
 882 {
 883         unsigned long flags;
 884         int io_error;
 885         struct bio *bio;
 886         struct mapped_device *md = io->md;
 887
 888         /* Push-back supersedes any I/O errors */
 889         if (unlikely(error)) {
 890                 spin_lock_irqsave(&io->endio_lock, flags);
 891                 if (!(io->error > 0 && __noflush_suspending(md)))
 892                         io->error = error;
 893                 spin_unlock_irqrestore(&io->endio_lock, flags);
 894         }
 895
 896         if (atomic_dec_and_test(&io->io_count)) {
 897                 if (io->error == DM_ENDIO_REQUEUE) {
 898                         /*
 899                          * Target requested pushing back the I/O.
 900                          */
 901                         spin_lock_irqsave(&md->deferred_lock, flags);
 902                         if (__noflush_suspending(md))
 903                                 bio_list_add_head(&md->deferred, io->bio);
 904                         else
 905                                 /* noflush suspend was interrupted. */
 906                                 io->error = -EIO;
 907                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 908                 }
 909
 910                 io_error = io->error;
 911                 bio = io->bio;
 912                 end_io_acct(io);
 913                 free_io(md, io);
 914
 915                 if (io_error == DM_ENDIO_REQUEUE)
 916                         return;
 917
 918                 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
 919                         /*
 920                          * Preflush done for flush with data, reissue
 921                          * without REQ_FLUSH.
 922                          */
 923                         bio->bi_rw &= ~REQ_FLUSH;
 924                         queue_io(md, bio);
 925                 } else {
 926                         /* done with normal IO or empty flush */
 927                         trace_block_bio_complete(md->queue, bio, io_error);
 928                         bio_endio(bio, io_error);
 929                 }
 930         }
 931 }
 932
 933 static void disable_write_same(struct mapped_device *md)
 934 {
 935         struct queue_limits *limits = dm_get_queue_limits(md);
 936
 937         /* device doesn't really support WRITE SAME, disable it */
 938         limits->max_write_same_sectors = 0;
 939 }
 940
 941 static void clone_endio(struct bio *bio, int error)
 942 {
 943         int r = error;
 944         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 945         struct dm_io *io = tio->io;
 946         struct mapped_device *md = tio->io->md;
 947         dm_endio_fn endio = tio->ti->type->end_io;
 948
 949         if (!bio_flagged(bio, BIO_UPTODATE) && !error)
 950                 error = -EIO;
 951
 952         if (endio) {
 953                 r = endio(tio->ti, bio, error);
 954                 if (r < 0 || r == DM_ENDIO_REQUEUE)
 955                         /*
 956                          * error and requeue request are handled
 957                          * in dec_pending().
 958                          */
 959                         error = r;
 960                 else if (r == DM_ENDIO_INCOMPLETE)
 961                         /* The target will handle the io */
 962                         return;
 963                 else if (r) {
 964                         DMWARN("unimplemented target endio return value: %d", r);
 965                         BUG();
 966                 }
 967         }
 968
 969         if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
 970                      !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
 971                 disable_write_same(md);
 972
 973         free_tio(md, tio);
 974         dec_pending(io, error);
 975 }
 976
 977 /*
 978  * Partial completion handling for request-based dm
 979  */
 980 static void end_clone_bio(struct bio *clone, int error)
 981 {
 982         struct dm_rq_clone_bio_info *info =
 983                 container_of(clone, struct dm_rq_clone_bio_info, clone);
 984         struct dm_rq_target_io *tio = info->tio;
 985         struct bio *bio = info->orig;
 986         unsigned int nr_bytes = info->orig->bi_iter.bi_size;
 987
 988         bio_put(clone);
 989
 990         if (tio->error)
 991                 /*
 992                  * An error has already been detected on the request.
 993                  * Once error occurred, just let clone->end_io() handle
 994                  * the remainder.
 995                  */
 996                 return;
 997         else if (error) {
 998                 /*
 999                  * Don't notice the error to the upper layer yet.
1000                  * The error handling decision is made by the target driver,
1001                  * when the request is completed.
1002                  */
1003                 tio->error = error;
1004                 return;
1005         }
1006
1007         /*
1008          * I/O for the bio successfully completed.
1009          * Notice the data completion to the upper layer.
1010          */
1011
1012         /*
1013          * bios are processed from the head of the list.
1014          * So the completing bio should always be rq->bio.
1015          * If it's not, something wrong is happening.
1016          */
1017         if (tio->orig->bio != bio)
1018                 DMERR("bio completion is going in the middle of the request");
1019
1020         /*
1021          * Update the original request.
1022          * Do not use blk_end_request() here, because it may complete
1023          * the original request before the clone, and break the ordering.
1024          */
1025         blk_update_request(tio->orig, 0, nr_bytes);
1026 }
1027
1028 /*
1029  * Don't touch any member of the md after calling this function because
1030  * the md may be freed in dm_put() at the end of this function.
1031  * Or do dm_get() before calling this function and dm_put() later.
1032  */
1033 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1034 {
1035         int nr_requests_pending;
1036
1037         atomic_dec(&md->pending[rw]);
1038
1039         /* nudge anyone waiting on suspend queue */
1040         nr_requests_pending = md_in_flight(md);
1041         if (!nr_requests_pending)
1042                 wake_up(&md->wait);
1043
1044         /*
1045          * Run this off this callpath, as drivers could invoke end_io while
1046          * inside their request_fn (and holding the queue lock). Calling
1047          * back into ->request_fn() could deadlock attempting to grab the
1048          * queue lock again.
1049          */
1050         if (run_queue) {
1051                 if (!nr_requests_pending ||
1052                     (nr_requests_pending >= md->queue->nr_congestion_on))
1053                         blk_run_queue_async(md->queue);
1054         }
1055
1056         /*
1057          * dm_put() must be at the end of this function. See the comment above
1058          */
1059         dm_put(md);
1060 }
1061
1062 static void free_rq_clone(struct request *clone)
1063 {
1064         struct dm_rq_target_io *tio = clone->end_io_data;
1065
1066         blk_rq_unprep_clone(clone);
1067         if (clone->q && clone->q->mq_ops)
1068                 tio->ti->type->release_clone_rq(clone);
1069         else
1070                 free_clone_request(tio->md, clone);
1071         free_rq_tio(tio);
1072 }
1073
1074 /*
1075  * Complete the clone and the original request.
1076  * Must be called without clone's queue lock held,
1077  * see end_clone_request() for more details.
1078  */
1079 static void dm_end_request(struct request *clone, int error)
1080 {
1081         int rw = rq_data_dir(clone);
1082         struct dm_rq_target_io *tio = clone->end_io_data;
1083         struct mapped_device *md = tio->md;
1084         struct request *rq = tio->orig;
1085
1086         if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
1087                 rq->errors = clone->errors;
1088                 rq->resid_len = clone->resid_len;
1089
1090                 if (rq->sense)
1091                         /*
1092                          * We are using the sense buffer of the original
1093                          * request.
1094                          * So setting the length of the sense data is enough.
1095                          */
1096                         rq->sense_len = clone->sense_len;
1097         }
1098
1099         free_rq_clone(clone);
1100         blk_end_request_all(rq, error);
1101         rq_completed(md, rw, true);
1102 }
1103
1104 static void dm_unprep_request(struct request *rq)
1105 {
1106         struct dm_rq_target_io *tio = rq->special;
1107         struct request *clone = tio->clone;
1108
1109         rq->special = NULL;
1110         rq->cmd_flags &= ~REQ_DONTPREP;
1111
1112         if (clone)
1113                 free_rq_clone(clone);
1114 }
1115
1116 /*
1117  * Requeue the original request of a clone.
1118  */
1119 static void dm_requeue_unmapped_original_request(struct mapped_device *md,
1120                                                  struct request *rq)
1121 {
1122         int rw = rq_data_dir(rq);
1123         struct request_queue *q = rq->q;
1124         unsigned long flags;
1125
1126         dm_unprep_request(rq);
1127
1128         spin_lock_irqsave(q->queue_lock, flags);
1129         blk_requeue_request(q, rq);
1130         spin_unlock_irqrestore(q->queue_lock, flags);
1131
1132         rq_completed(md, rw, false);
1133 }
1134
1135 static void dm_requeue_unmapped_request(struct request *clone)
1136 {
1137         struct dm_rq_target_io *tio = clone->end_io_data;
1138
1139         dm_requeue_unmapped_original_request(tio->md, tio->orig);
1140 }
1141
/*
 * Stop @q.  stop_queue() is the variant that takes q->queue_lock around
 * this call.
 */
static void __stop_queue(struct request_queue *q)
{
        blk_stop_queue(q);
}
1146
1147 static void stop_queue(struct request_queue *q)
1148 {
1149         unsigned long flags;
1150
1151         spin_lock_irqsave(q->queue_lock, flags);
1152         __stop_queue(q);
1153         spin_unlock_irqrestore(q->queue_lock, flags);
1154 }
1155
/*
 * Restart @q, but only if it is currently stopped.  start_queue() is the
 * variant that takes q->queue_lock around this call.
 */
static void __start_queue(struct request_queue *q)
{
        if (!blk_queue_stopped(q))
                return;

        blk_start_queue(q);
}
1161
1162 static void start_queue(struct request_queue *q)
1163 {
1164         unsigned long flags;
1165
1166         spin_lock_irqsave(q->queue_lock, flags);
1167         __start_queue(q);
1168         spin_unlock_irqrestore(q->queue_lock, flags);
1169 }
1170
1171 static void dm_done(struct request *clone, int error, bool mapped)
1172 {
1173         int r = error;
1174         struct dm_rq_target_io *tio = clone->end_io_data;
1175         dm_request_endio_fn rq_end_io = NULL;
1176
1177         if (tio->ti) {
1178                 rq_end_io = tio->ti->type->rq_end_io;
1179
1180                 if (mapped && rq_end_io)
1181                         r = rq_end_io(tio->ti, clone, error, &tio->info);
1182         }
1183
1184         if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
1185                      !clone->q->limits.max_write_same_sectors))
1186                 disable_write_same(tio->md);
1187
1188         if (r <= 0)
1189                 /* The target wants to complete the I/O */
1190                 dm_end_request(clone, r);
1191         else if (r == DM_ENDIO_INCOMPLETE)
1192                 /* The target will handle the I/O */
1193                 return;
1194         else if (r == DM_ENDIO_REQUEUE)
1195                 /* The target wants to requeue the I/O */
1196                 dm_requeue_unmapped_request(clone);
1197         else {
1198                 DMWARN("unimplemented target endio return value: %d", r);
1199                 BUG();
1200         }
1201 }
1202
1203 /*
1204  * Request completion handler for request-based dm
1205  */
1206 static void dm_softirq_done(struct request *rq)
1207 {
1208         bool mapped = true;
1209         struct dm_rq_target_io *tio = rq->special;
1210         struct request *clone = tio->clone;
1211
1212         if (!clone) {
1213                 blk_end_request_all(rq, tio->error);
1214                 rq_completed(tio->md, rq_data_dir(rq), false);
1215                 free_rq_tio(tio);
1216                 return;
1217         }
1218
1219         if (rq->cmd_flags & REQ_FAILED)
1220                 mapped = false;
1221
1222         dm_done(clone, tio->error, mapped);
1223 }
1224
1225 /*
1226  * Complete the clone and the original request with the error status
1227  * through softirq context.
1228  */
1229 static void dm_complete_request(struct request *rq, int error)
1230 {
1231         struct dm_rq_target_io *tio = rq->special;
1232
1233         tio->error = error;
1234         blk_complete_request(rq);
1235 }
1236
1237 /*
1238  * Complete the not-mapped clone and the original request with the error status
1239  * through softirq context.
1240  * Target's rq_end_io() function isn't called.
1241  * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
1242  */
1243 static void dm_kill_unmapped_request(struct request *rq, int error)
1244 {
1245         rq->cmd_flags |= REQ_FAILED;
1246         dm_complete_request(rq, error);
1247 }
1248
1249 /*
1250  * Called with the clone's queue lock held
1251  */
1252 static void end_clone_request(struct request *clone, int error)
1253 {
1254         struct dm_rq_target_io *tio = clone->end_io_data;
1255
1256         if (!clone->q->mq_ops) {
1257                 /*
1258                  * For just cleaning up the information of the queue in which
1259                  * the clone was dispatched.
1260                  * The clone is *NOT* freed actually here because it is alloced
1261                  * from dm own mempool (REQ_ALLOCED isn't set).
1262                  */
1263                 __blk_put_request(clone->q, clone);
1264         }
1265
1266         /*
1267          * Actual request completion is done in a softirq context which doesn't
1268          * hold the clone's queue lock.  Otherwise, deadlock could occur because:
1269          *     - another request may be submitted by the upper level driver
1270          *       of the stacking during the completion
1271          *     - the submission which requires queue lock may be done
1272          *       against this clone's queue
1273          */
1274         dm_complete_request(tio->orig, error);
1275 }
1276
1277 /*
1278  * Return maximum size of I/O possible at the supplied sector up to the current
1279  * target boundary.
1280  */
1281 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1282 {
1283         sector_t target_offset = dm_target_offset(ti, sector);
1284
1285         return ti->len - target_offset;
1286 }
1287
1288 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1289 {
1290         sector_t len = max_io_len_target_boundary(sector, ti);
1291         sector_t offset, max_len;
1292
1293         /*
1294          * Does the target need to split even further?
1295          */
1296         if (ti->max_io_len) {
1297                 offset = dm_target_offset(ti, sector);
1298                 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1299                         max_len = sector_div(offset, ti->max_io_len);
1300                 else
1301                         max_len = offset & (ti->max_io_len - 1);
1302                 max_len = ti->max_io_len - max_len;
1303
1304                 if (len > max_len)
1305                         len = max_len;
1306         }
1307
1308         return len;
1309 }
1310
1311 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1312 {
1313         if (len > UINT_MAX) {
1314                 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1315                       (unsigned long long)len, UINT_MAX);
1316                 ti->error = "Maximum size of target IO is too large";
1317                 return -EINVAL;
1318         }
1319
1320         ti->max_io_len = (uint32_t) len;
1321
1322         return 0;
1323 }
1324 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1325
1326 /*
1327  * A target may call dm_accept_partial_bio only from the map routine.  It is
1328  * allowed for all bio types except REQ_FLUSH.
1329  *
1330  * dm_accept_partial_bio informs the dm that the target only wants to process
1331  * additional n_sectors sectors of the bio and the rest of the data should be
1332  * sent in a next bio.
1333  *
1334  * A diagram that explains the arithmetics:
1335  * +--------------------+---------------+-------+
1336  * |         1          |       2       |   3   |
1337  * +--------------------+---------------+-------+
1338  *
1339  * <-------------- *tio->len_ptr --------------->
1340  *                      <------- bi_size ------->
1341  *                      <-- n_sectors -->
1342  *
1343  * Region 1 was already iterated over with bio_advance or similar function.
1344  *      (it may be empty if the target doesn't use bio_advance)
1345  * Region 2 is the remaining bio size that the target wants to process.
1346  *      (it may be empty if region 1 is non-empty, although there is no reason
1347  *       to make it empty)
1348  * The target requires that region 3 is to be sent in the next bio.
1349  *
1350  * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1351  * the partially processed part (the sum of regions 1+2) must be the same for all
1352  * copies of the bio.
1353  */
1354 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1355 {
1356         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1357         unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1358         BUG_ON(bio->bi_rw & REQ_FLUSH);
1359         BUG_ON(bi_size > *tio->len_ptr);
1360         BUG_ON(n_sectors > bi_size);
1361         *tio->len_ptr -= bi_size - n_sectors;
1362         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1363 }
1364 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1365
1366 static void __map_bio(struct dm_target_io *tio)
1367 {
1368         int r;
1369         sector_t sector;
1370         struct mapped_device *md;
1371         struct bio *clone = &tio->clone;
1372         struct dm_target *ti = tio->ti;
1373
1374         clone->bi_end_io = clone_endio;
1375
1376         /*
1377          * Map the clone.  If r == 0 we don't need to do
1378          * anything, the target has assumed ownership of
1379          * this io.
1380          */
1381         atomic_inc(&tio->io->io_count);
1382         sector = clone->bi_iter.bi_sector;
1383         r = ti->type->map(ti, clone);
1384         if (r == DM_MAPIO_REMAPPED) {
1385                 /* the bio has been remapped so dispatch it */
1386
1387                 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1388                                       tio->io->bio->bi_bdev->bd_dev, sector);
1389
1390                 generic_make_request(clone);
1391         } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1392                 /* error the io and bail out, or requeue it if needed */
1393                 md = tio->io->md;
1394                 dec_pending(tio->io, r);
1395                 free_tio(md, tio);
1396         } else if (r) {
1397                 DMWARN("unimplemented target map return value: %d", r);
1398                 BUG();
1399         }
1400 }
1401
/*
 * State carried through the splitting of one incoming bio into clones
 * (set up in __split_and_process_bio()).
 */
struct clone_info {
        struct mapped_device *md;       /* device the bio was submitted to */
        struct dm_table *map;           /* table used to look up targets */
        struct bio *bio;                /* bio being cloned (md->flush_bio for flushes) */
        struct dm_io *io;               /* per-bio dm_io the clones are counted against */
        sector_t sector;                /* next sector still to be processed */
        unsigned sector_count;          /* sectors still to be processed */
};
1410
1411 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1412 {
1413         bio->bi_iter.bi_sector = sector;
1414         bio->bi_iter.bi_size = to_bytes(len);
1415 }
1416
1417 /*
1418  * Creates a bio that consists of range of complete bvecs.
1419  */
1420 static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1421                       sector_t sector, unsigned len)
1422 {
1423         struct bio *clone = &tio->clone;
1424
1425         __bio_clone_fast(clone, bio);
1426
1427         if (bio_integrity(bio))
1428                 bio_integrity_clone(clone, bio, GFP_NOIO);
1429
1430         bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1431         clone->bi_iter.bi_size = to_bytes(len);
1432
1433         if (bio_integrity(bio))
1434                 bio_integrity_trim(clone, 0, len);
1435 }
1436
1437 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1438                                       struct dm_target *ti,
1439                                       unsigned target_bio_nr)
1440 {
1441         struct dm_target_io *tio;
1442         struct bio *clone;
1443
1444         clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1445         tio = container_of(clone, struct dm_target_io, clone);
1446
1447         tio->io = ci->io;
1448         tio->ti = ti;
1449         tio->target_bio_nr = target_bio_nr;
1450
1451         return tio;
1452 }
1453
1454 static void __clone_and_map_simple_bio(struct clone_info *ci,
1455                                        struct dm_target *ti,
1456                                        unsigned target_bio_nr, unsigned *len)
1457 {
1458         struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1459         struct bio *clone = &tio->clone;
1460
1461         tio->len_ptr = len;
1462
1463         __bio_clone_fast(clone, ci->bio);
1464         if (len)
1465                 bio_setup_sector(clone, ci->sector, *len);
1466
1467         __map_bio(tio);
1468 }
1469
/*
 * Send @num_bios copies of ci->bio to target @ti, numbered from 0.
 */
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
                                  unsigned num_bios, unsigned *len)
{
        unsigned nr = 0;

        while (nr < num_bios) {
                __clone_and_map_simple_bio(ci, ti, nr, len);
                nr++;
        }
}
1478
1479 static int __send_empty_flush(struct clone_info *ci)
1480 {
1481         unsigned target_nr = 0;
1482         struct dm_target *ti;
1483
1484         BUG_ON(bio_has_data(ci->bio));
1485         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1486                 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1487
1488         return 0;
1489 }
1490
1491 static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1492                                      sector_t sector, unsigned *len)
1493 {
1494         struct bio *bio = ci->bio;
1495         struct dm_target_io *tio;
1496         unsigned target_bio_nr;
1497         unsigned num_target_bios = 1;
1498
1499         /*
1500          * Does the target want to receive duplicate copies of the bio?
1501          */
1502         if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1503                 num_target_bios = ti->num_write_bios(ti, bio);
1504
1505         for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1506                 tio = alloc_tio(ci, ti, target_bio_nr);
1507                 tio->len_ptr = len;
1508                 clone_bio(tio, bio, sector, *len);
1509                 __map_bio(tio);
1510         }
1511 }
1512
/* Callback returning how many bios of some request type target @ti wants. */
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1514
1515 static unsigned get_num_discard_bios(struct dm_target *ti)
1516 {
1517         return ti->num_discard_bios;
1518 }
1519
1520 static unsigned get_num_write_same_bios(struct dm_target *ti)
1521 {
1522         return ti->num_write_same_bios;
1523 }
1524
/* Callback telling whether target @ti wants a request type split at its max_io_len. */
typedef bool (*is_split_required_fn)(struct dm_target *ti);
1526
1527 static bool is_split_required_for_discard(struct dm_target *ti)
1528 {
1529         return ti->split_discard_bios;
1530 }
1531
1532 static int __send_changing_extent_only(struct clone_info *ci,
1533                                        get_num_bios_fn get_num_bios,
1534                                        is_split_required_fn is_split_required)
1535 {
1536         struct dm_target *ti;
1537         unsigned len;
1538         unsigned num_bios;
1539
1540         do {
1541                 ti = dm_table_find_target(ci->map, ci->sector);
1542                 if (!dm_target_is_valid(ti))
1543                         return -EIO;
1544
1545                 /*
1546                  * Even though the device advertised support for this type of
1547                  * request, that does not mean every target supports it, and
1548                  * reconfiguration might also have changed that since the
1549                  * check was performed.
1550                  */
1551                 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1552                 if (!num_bios)
1553                         return -EOPNOTSUPP;
1554
1555                 if (is_split_required && !is_split_required(ti))
1556                         len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1557                 else
1558                         len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1559
1560                 __send_duplicate_bios(ci, ti, num_bios, &len);
1561
1562                 ci->sector += len;
1563         } while (ci->sector_count -= len);
1564
1565         return 0;
1566 }
1567
1568 static int __send_discard(struct clone_info *ci)
1569 {
1570         return __send_changing_extent_only(ci, get_num_discard_bios,
1571                                            is_split_required_for_discard);
1572 }
1573
1574 static int __send_write_same(struct clone_info *ci)
1575 {
1576         return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1577 }
1578
1579 /*
1580  * Select the correct strategy for processing a non-flush bio.
1581  */
1582 static int __split_and_process_non_flush(struct clone_info *ci)
1583 {
1584         struct bio *bio = ci->bio;
1585         struct dm_target *ti;
1586         unsigned len;
1587
1588         if (unlikely(bio->bi_rw & REQ_DISCARD))
1589                 return __send_discard(ci);
1590         else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1591                 return __send_write_same(ci);
1592
1593         ti = dm_table_find_target(ci->map, ci->sector);
1594         if (!dm_target_is_valid(ti))
1595                 return -EIO;
1596
1597         len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1598
1599         __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1600
1601         ci->sector += len;
1602         ci->sector_count -= len;
1603
1604         return 0;
1605 }
1606
1607 /*
1608  * Entry point to split a bio into clones and submit them to the targets.
1609  */
1610 static void __split_and_process_bio(struct mapped_device *md,
1611                                     struct dm_table *map, struct bio *bio)
1612 {
1613         struct clone_info ci;
1614         int error = 0;
1615
1616         if (unlikely(!map)) {
1617                 bio_io_error(bio);
1618                 return;
1619         }
1620
1621         ci.map = map;
1622         ci.md = md;
1623         ci.io = alloc_io(md);
1624         ci.io->error = 0;
1625         atomic_set(&ci.io->io_count, 1);
1626         ci.io->bio = bio;
1627         ci.io->md = md;
1628         spin_lock_init(&ci.io->endio_lock);
1629         ci.sector = bio->bi_iter.bi_sector;
1630
1631         start_io_acct(ci.io);
1632
1633         if (bio->bi_rw & REQ_FLUSH) {
1634                 ci.bio = &ci.md->flush_bio;
1635                 ci.sector_count = 0;
1636                 error = __send_empty_flush(&ci);
1637                 /* dec_pending submits any data associated with flush */
1638         } else {
1639                 ci.bio = bio;
1640                 ci.sector_count = bio_sectors(bio);
1641                 while (ci.sector_count && !error)
1642                         error = __split_and_process_non_flush(&ci);
1643         }
1644
1645         /* drop the extra reference count */
1646         dec_pending(ci.io, error);
1647 }
1648 /*-----------------------------------------------------------------
1649  * CRUD END
1650  *---------------------------------------------------------------*/
1651
1652 static int dm_merge_bvec(struct request_queue *q,
1653                          struct bvec_merge_data *bvm,
1654                          struct bio_vec *biovec)
1655 {
1656         struct mapped_device *md = q->queuedata;
1657         struct dm_table *map = dm_get_live_table_fast(md);
1658         struct dm_target *ti;
1659         sector_t max_sectors;
1660         int max_size = 0;
1661
1662         if (unlikely(!map))
1663                 goto out;
1664
1665         ti = dm_table_find_target(map, bvm->bi_sector);
1666         if (!dm_target_is_valid(ti))
1667                 goto out;
1668
1669         /*
1670          * Find maximum amount of I/O that won't need splitting
1671          */
1672         max_sectors = min(max_io_len(bvm->bi_sector, ti),
1673                           (sector_t) queue_max_sectors(q));
1674         max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1675         if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
1676                 max_size = 0;
1677
1678         /*
1679          * merge_bvec_fn() returns number of bytes
1680          * it can accept at this offset
1681          * max is precomputed maximal io size
1682          */
1683         if (max_size && ti->type->merge)
1684                 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1685         /*
1686          * If the target doesn't support merge method and some of the devices
1687          * provided their merge_bvec method (we know this by looking for the
1688          * max_hw_sectors that dm_set_device_limits may set), then we can't
1689          * allow bios with multiple vector entries.  So always set max_size
1690          * to 0, and the code below allows just one page.
1691          */
1692         else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1693                 max_size = 0;
1694
1695 out:
1696         dm_put_live_table_fast(md);
1697         /*
1698          * Always allow an entire first page
1699          */
1700         if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1701                 max_size = biovec->bv_len;
1702
1703         return max_size;
1704 }
1705
1706 /*
1707  * The request function that just remaps the bio built up by
1708  * dm_merge_bvec.
1709  */
1710 static void dm_make_request(struct request_queue *q, struct bio *bio)
1711 {
1712         int rw = bio_data_dir(bio);
1713         struct mapped_device *md = q->queuedata;
1714         int srcu_idx;
1715         struct dm_table *map;
1716
1717         map = dm_get_live_table(md, &srcu_idx);
1718
1719         generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1720
1721         /* if we're suspended, we have to queue this io for later */
1722         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1723                 dm_put_live_table(md, srcu_idx);
1724
1725                 if (bio_rw(bio) != READA)
1726                         queue_io(md, bio);
1727                 else
1728                         bio_io_error(bio);
1729                 return;
1730         }
1731
1732         __split_and_process_bio(md, map, bio);
1733         dm_put_live_table(md, srcu_idx);
1734         return;
1735 }
1736
1737 int dm_request_based(struct mapped_device *md)
1738 {
1739         return blk_queue_stackable(md->queue);
1740 }
1741
1742 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
1743 {
1744         int r;
1745
1746         if (blk_queue_io_stat(clone->q))
1747                 clone->cmd_flags |= REQ_IO_STAT;
1748
1749         clone->start_time = jiffies;
1750         r = blk_insert_cloned_request(clone->q, clone);
1751         if (r)
1752                 /* must complete clone in terms of original request */
1753                 dm_complete_request(rq, r);
1754 }
1755
1756 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1757                                  void *data)
1758 {
1759         struct dm_rq_target_io *tio = data;
1760         struct dm_rq_clone_bio_info *info =
1761                 container_of(bio, struct dm_rq_clone_bio_info, clone);
1762
1763         info->orig = bio_orig;
1764         info->tio = tio;
1765         bio->bi_end_io = end_clone_bio;
1766
1767         return 0;
1768 }
1769
1770 static int setup_clone(struct request *clone, struct request *rq,
1771                        struct dm_rq_target_io *tio, gfp_t gfp_mask)
1772 {
1773         int r;
1774
1775         r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
1776                               dm_rq_bio_constructor, tio);
1777         if (r)
1778                 return r;
1779
1780         clone->cmd = rq->cmd;
1781         clone->cmd_len = rq->cmd_len;
1782         clone->sense = rq->sense;
1783         clone->end_io = end_clone_request;
1784         clone->end_io_data = tio;
1785
1786         tio->clone = clone;
1787
1788         return 0;
1789 }
1790
1791 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1792                                 struct dm_rq_target_io *tio, gfp_t gfp_mask)
1793 {
1794         struct request *clone = alloc_clone_request(md, gfp_mask);
1795
1796         if (!clone)
1797                 return NULL;
1798
1799         blk_rq_init(NULL, clone);
1800         if (setup_clone(clone, rq, tio, gfp_mask)) {
1801                 /* -ENOMEM */
1802                 free_clone_request(md, clone);
1803                 return NULL;
1804         }
1805
1806         return clone;
1807 }
1808
1809 static void map_tio_request(struct kthread_work *work);
1810
1811 static struct dm_rq_target_io *prep_tio(struct request *rq,
1812                                         struct mapped_device *md, gfp_t gfp_mask)
1813 {
1814         struct dm_rq_target_io *tio;
1815         int srcu_idx;
1816         struct dm_table *table;
1817
1818         tio = alloc_rq_tio(md, gfp_mask);
1819         if (!tio)
1820                 return NULL;
1821
1822         tio->md = md;
1823         tio->ti = NULL;
1824         tio->clone = NULL;
1825         tio->orig = rq;
1826         tio->error = 0;
1827         memset(&tio->info, 0, sizeof(tio->info));
1828         init_kthread_work(&tio->work, map_tio_request);
1829
1830         table = dm_get_live_table(md, &srcu_idx);
1831         if (!dm_table_mq_request_based(table)) {
1832                 if (!clone_rq(rq, md, tio, gfp_mask)) {
1833                         dm_put_live_table(md, srcu_idx);
1834                         free_rq_tio(tio);
1835                         return NULL;
1836                 }
1837         }
1838         dm_put_live_table(md, srcu_idx);
1839
1840         return tio;
1841 }
1842
1843 /*
1844  * Called with the queue lock held.
1845  */
1846 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1847 {
1848         struct mapped_device *md = q->queuedata;
1849         struct dm_rq_target_io *tio;
1850
1851         if (unlikely(rq->special)) {
1852                 DMWARN("Already has something in rq->special.");
1853                 return BLKPREP_KILL;
1854         }
1855
1856         tio = prep_tio(rq, md, GFP_ATOMIC);
1857         if (!tio)
1858                 return BLKPREP_DEFER;
1859
1860         rq->special = tio;
1861         rq->cmd_flags |= REQ_DONTPREP;
1862
1863         return BLKPREP_OK;
1864 }
1865
1866 /*
1867  * Returns:
1868  * 0                : the request has been processed
1869  * DM_MAPIO_REQUEUE : the original request needs to be requeued
1870  * < 0              : the request was completed due to failure
1871  */
1872 static int map_request(struct dm_target *ti, struct request *rq,
1873                        struct mapped_device *md)
1874 {
1875         int r;
1876         struct dm_rq_target_io *tio = rq->special;
1877         struct request *clone = NULL;
1878
1879         if (tio->clone) {
1880                 clone = tio->clone;
1881                 r = ti->type->map_rq(ti, clone, &tio->info);
1882         } else {
1883                 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
1884                 if (r < 0) {
1885                         /* The target wants to complete the I/O */
1886                         dm_kill_unmapped_request(rq, r);
1887                         return r;
1888                 }
1889                 if (IS_ERR(clone))
1890                         return DM_MAPIO_REQUEUE;
1891                 if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
1892                         /* -ENOMEM */
1893                         ti->type->release_clone_rq(clone);
1894                         return DM_MAPIO_REQUEUE;
1895                 }
1896         }
1897
1898         switch (r) {
1899         case DM_MAPIO_SUBMITTED:
1900                 /* The target has taken the I/O to submit by itself later */
1901                 break;
1902         case DM_MAPIO_REMAPPED:
1903                 /* The target has remapped the I/O so dispatch it */
1904                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1905                                      blk_rq_pos(rq));
1906                 dm_dispatch_clone_request(clone, rq);
1907                 break;
1908         case DM_MAPIO_REQUEUE:
1909                 /* The target wants to requeue the I/O */
1910                 dm_requeue_unmapped_request(clone);
1911                 break;
1912         default:
1913                 if (r > 0) {
1914                         DMWARN("unimplemented target map return value: %d", r);
1915                         BUG();
1916                 }
1917
1918                 /* The target wants to complete the I/O */
1919                 dm_kill_unmapped_request(rq, r);
1920                 return r;
1921         }
1922
1923         return 0;
1924 }
1925
1926 static void map_tio_request(struct kthread_work *work)
1927 {
1928         struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
1929         struct request *rq = tio->orig;
1930         struct mapped_device *md = tio->md;
1931
1932         if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
1933                 dm_requeue_unmapped_original_request(md, rq);
1934 }
1935
1936 static void dm_start_request(struct mapped_device *md, struct request *orig)
1937 {
1938         blk_start_request(orig);
1939         atomic_inc(&md->pending[rq_data_dir(orig)]);
1940
1941         if (md->seq_rq_merge_deadline_usecs) {
1942                 md->last_rq_pos = rq_end_sector(orig);
1943                 md->last_rq_rw = rq_data_dir(orig);
1944                 md->last_rq_start_time = ktime_get();
1945         }
1946
1947         /*
1948          * Hold the md reference here for the in-flight I/O.
1949          * We can't rely on the reference count by device opener,
1950          * because the device may be closed during the request completion
1951          * when all bios are completed.
1952          * See the comment in rq_completed() too.
1953          */
1954         dm_get(md);
1955 }
1956
1957 #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
1958
1959 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
1960 {
1961         return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
1962 }
1963
1964 ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
1965                                                      const char *buf, size_t count)
1966 {
1967         unsigned deadline;
1968
1969         if (!dm_request_based(md))
1970                 return count;
1971
1972         if (kstrtouint(buf, 10, &deadline))
1973                 return -EINVAL;
1974
1975         if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
1976                 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
1977
1978         md->seq_rq_merge_deadline_usecs = deadline;
1979
1980         return count;
1981 }
1982
1983 static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
1984 {
1985         ktime_t kt_deadline;
1986
1987         if (!md->seq_rq_merge_deadline_usecs)
1988                 return false;
1989
1990         kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
1991         kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
1992
1993         return !ktime_after(ktime_get(), kt_deadline);
1994 }
1995
1996 /*
1997  * q->request_fn for request-based dm.
1998  * Called with the queue lock held.
1999  */
2000 static void dm_request_fn(struct request_queue *q)
2001 {
2002         struct mapped_device *md = q->queuedata;
2003         int srcu_idx;
2004         struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2005         struct dm_target *ti;
2006         struct request *rq;
2007         struct dm_rq_target_io *tio;
2008         sector_t pos;
2009
2010         /*
2011          * For suspend, check blk_queue_stopped() and increment
2012          * ->pending within a single queue_lock not to increment the
2013          * number of in-flight I/Os after the queue is stopped in
2014          * dm_suspend().
2015          */
2016         while (!blk_queue_stopped(q)) {
2017                 rq = blk_peek_request(q);
2018                 if (!rq)
2019                         goto out;
2020
2021                 /* always use block 0 to find the target for flushes for now */
2022                 pos = 0;
2023                 if (!(rq->cmd_flags & REQ_FLUSH))
2024                         pos = blk_rq_pos(rq);
2025
2026                 ti = dm_table_find_target(map, pos);
2027                 if (!dm_target_is_valid(ti)) {
2028                         /*
2029                          * Must perform setup, that rq_completed() requires,
2030                          * before calling dm_kill_unmapped_request
2031                          */
2032                         DMERR_LIMIT("request attempted access beyond the end of device");
2033                         dm_start_request(md, rq);
2034                         dm_kill_unmapped_request(rq, -EIO);
2035                         continue;
2036                 }
2037
2038                 if (dm_request_peeked_before_merge_deadline(md) &&
2039                     md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
2040                     md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
2041                         goto delay_and_out;
2042
2043                 if (ti->type->busy && ti->type->busy(ti))
2044                         goto delay_and_out;
2045
2046                 dm_start_request(md, rq);
2047
2048                 tio = rq->special;
2049                 /* Establish tio->ti before queuing work (map_tio_request) */
2050                 tio->ti = ti;
2051                 queue_kthread_work(&md->kworker, &tio->work);
2052                 BUG_ON(!irqs_disabled());
2053         }
2054
2055         goto out;
2056
2057 delay_and_out:
2058         blk_delay_queue(q, HZ / 100);
2059 out:
2060         dm_put_live_table(md, srcu_idx);
2061 }
2062
2063 static int dm_any_congested(void *congested_data, int bdi_bits)
2064 {
2065         int r = bdi_bits;
2066         struct mapped_device *md = congested_data;
2067         struct dm_table *map;
2068
2069         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2070                 map = dm_get_live_table_fast(md);
2071                 if (map) {
2072                         /*
2073                          * Request-based dm cares about only own queue for
2074                          * the query about congestion status of request_queue
2075                          */
2076                         if (dm_request_based(md))
2077                                 r = md->queue->backing_dev_info.state &
2078                                     bdi_bits;
2079                         else
2080                                 r = dm_table_any_congested(map, bdi_bits);
2081                 }
2082                 dm_put_live_table_fast(md);
2083         }
2084
2085         return r;
2086 }
2087
2088 /*-----------------------------------------------------------------
2089  * An IDR is used to keep track of allocated minor numbers.
2090  *---------------------------------------------------------------*/
2091 static void free_minor(int minor)
2092 {
2093         spin_lock(&_minor_lock);
2094         idr_remove(&_minor_idr, minor);
2095         spin_unlock(&_minor_lock);
2096 }
2097
2098 /*
2099  * See if the device with a specific minor # is free.
2100  */
2101 static int specific_minor(int minor)
2102 {
2103         int r;
2104
2105         if (minor >= (1 << MINORBITS))
2106                 return -EINVAL;
2107
2108         idr_preload(GFP_KERNEL);
2109         spin_lock(&_minor_lock);
2110
2111         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
2112
2113         spin_unlock(&_minor_lock);
2114         idr_preload_end();
2115         if (r < 0)
2116                 return r == -ENOSPC ? -EBUSY : r;
2117         return 0;
2118 }
2119
2120 static int next_free_minor(int *minor)
2121 {
2122         int r;
2123
2124         idr_preload(GFP_KERNEL);
2125         spin_lock(&_minor_lock);
2126
2127         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
2128
2129         spin_unlock(&_minor_lock);
2130         idr_preload_end();
2131         if (r < 0)
2132                 return r;
2133         *minor = r;
2134         return 0;
2135 }
2136
2137 static const struct block_device_operations dm_blk_dops;
2138
2139 static void dm_wq_work(struct work_struct *work);
2140
2141 static void dm_init_md_queue(struct mapped_device *md)
2142 {
2143         /*
2144          * Request-based dm devices cannot be stacked on top of bio-based dm
2145          * devices.  The type of this dm device has not been decided yet.
2146          * The type is decided at the first table loading time.
2147          * To prevent problematic device stacking, clear the queue flag
2148          * for request stacking support until then.
2149          *
2150          * This queue is new, so no concurrency on the queue_flags.
2151          */
2152         queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2153
2154         md->queue->queuedata = md;
2155         md->queue->backing_dev_info.congested_fn = dm_any_congested;
2156         md->queue->backing_dev_info.congested_data = md;
2157
2158         blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2159 }
2160
2161 /*
2162  * Allocate and initialise a blank device with a given minor.
2163  */
2164 static struct mapped_device *alloc_dev(int minor)
2165 {
2166         int r;
2167         struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
2168         void *old_md;
2169
2170         if (!md) {
2171                 DMWARN("unable to allocate device, out of memory.");
2172                 return NULL;
2173         }
2174
2175         if (!try_module_get(THIS_MODULE))
2176                 goto bad_module_get;
2177
2178         /* get a minor number for the dev */
2179         if (minor == DM_ANY_MINOR)
2180                 r = next_free_minor(&minor);
2181         else
2182                 r = specific_minor(minor);
2183         if (r < 0)
2184                 goto bad_minor;
2185
2186         r = init_srcu_struct(&md->io_barrier);
2187         if (r < 0)
2188                 goto bad_io_barrier;
2189
2190         md->type = DM_TYPE_NONE;
2191         mutex_init(&md->suspend_lock);
2192         mutex_init(&md->type_lock);
2193         mutex_init(&md->table_devices_lock);
2194         spin_lock_init(&md->deferred_lock);
2195         atomic_set(&md->holders, 1);
2196         atomic_set(&md->open_count, 0);
2197         atomic_set(&md->event_nr, 0);
2198         atomic_set(&md->uevent_seq, 0);
2199         INIT_LIST_HEAD(&md->uevent_list);
2200         INIT_LIST_HEAD(&md->table_devices);
2201         spin_lock_init(&md->uevent_lock);
2202
2203         md->queue = blk_alloc_queue(GFP_KERNEL);
2204         if (!md->queue)
2205                 goto bad_queue;
2206
2207         dm_init_md_queue(md);
2208
2209         md->disk = alloc_disk(1);
2210         if (!md->disk)
2211                 goto bad_disk;
2212
2213         atomic_set(&md->pending[0], 0);
2214         atomic_set(&md->pending[1], 0);
2215         init_waitqueue_head(&md->wait);
2216         INIT_WORK(&md->work, dm_wq_work);
2217         init_waitqueue_head(&md->eventq);
2218         init_completion(&md->kobj_holder.completion);
2219         md->kworker_task = NULL;
2220
2221         md->disk->major = _major;
2222         md->disk->first_minor = minor;
2223         md->disk->fops = &dm_blk_dops;
2224         md->disk->queue = md->queue;
2225         md->disk->private_data = md;
2226         sprintf(md->disk->disk_name, "dm-%d", minor);
2227         add_disk(md->disk);
2228         format_dev_t(md->name, MKDEV(_major, minor));
2229
2230         md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2231         if (!md->wq)
2232                 goto bad_thread;
2233
2234         md->bdev = bdget_disk(md->disk, 0);
2235         if (!md->bdev)
2236                 goto bad_bdev;
2237
2238         bio_init(&md->flush_bio);
2239         md->flush_bio.bi_bdev = md->bdev;
2240         md->flush_bio.bi_rw = WRITE_FLUSH;
2241
2242         dm_stats_init(&md->stats);
2243
2244         /* Populate the mapping, nobody knows we exist yet */
2245         spin_lock(&_minor_lock);
2246         old_md = idr_replace(&_minor_idr, md, minor);
2247         spin_unlock(&_minor_lock);
2248
2249         BUG_ON(old_md != MINOR_ALLOCED);
2250
2251         return md;
2252
2253 bad_bdev:
2254         destroy_workqueue(md->wq);
2255 bad_thread:
2256         del_gendisk(md->disk);
2257         put_disk(md->disk);
2258 bad_disk:
2259         blk_cleanup_queue(md->queue);
2260 bad_queue:
2261         cleanup_srcu_struct(&md->io_barrier);
2262 bad_io_barrier:
2263         free_minor(minor);
2264 bad_minor:
2265         module_put(THIS_MODULE);
2266 bad_module_get:
2267         kfree(md);
2268         return NULL;
2269 }
2270
2271 static void unlock_fs(struct mapped_device *md);
2272
2273 static void free_dev(struct mapped_device *md)
2274 {
2275         int minor = MINOR(disk_devt(md->disk));
2276
2277         unlock_fs(md);
2278         destroy_workqueue(md->wq);
2279
2280         if (md->kworker_task)
2281                 kthread_stop(md->kworker_task);
2282         if (md->io_pool)
2283                 mempool_destroy(md->io_pool);
2284         if (md->rq_pool)
2285                 mempool_destroy(md->rq_pool);
2286         if (md->bs)
2287                 bioset_free(md->bs);
2288
2289         cleanup_srcu_struct(&md->io_barrier);
2290         free_table_devices(&md->table_devices);
2291         dm_stats_cleanup(&md->stats);
2292
2293         spin_lock(&_minor_lock);
2294         md->disk->private_data = NULL;
2295         spin_unlock(&_minor_lock);
2296         if (blk_get_integrity(md->disk))
2297                 blk_integrity_unregister(md->disk);
2298         del_gendisk(md->disk);
2299         put_disk(md->disk);
2300         blk_cleanup_queue(md->queue);
2301         bdput(md->bdev);
2302         free_minor(minor);
2303
2304         module_put(THIS_MODULE);
2305         kfree(md);
2306 }
2307
2308 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2309 {
2310         struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2311
2312         if (md->io_pool && md->bs) {
2313                 /* The md already has necessary mempools. */
2314                 if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2315                         /*
2316                          * Reload bioset because front_pad may have changed
2317                          * because a different table was loaded.
2318                          */
2319                         bioset_free(md->bs);
2320                         md->bs = p->bs;
2321                         p->bs = NULL;
2322                 }
2323                 /*
2324                  * There's no need to reload with request-based dm
2325                  * because the size of front_pad doesn't change.
2326                  * Note for future: If you are to reload bioset,
2327                  * prep-ed requests in the queue may refer
2328                  * to bio from the old bioset, so you must walk
2329                  * through the queue to unprep.
2330                  */
2331                 goto out;
2332         }
2333
2334         BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
2335
2336         md->io_pool = p->io_pool;
2337         p->io_pool = NULL;
2338         md->rq_pool = p->rq_pool;
2339         p->rq_pool = NULL;
2340         md->bs = p->bs;
2341         p->bs = NULL;
2342
2343 out:
2344         /* mempool bind completed, now no need any mempools in the table */
2345         dm_table_free_md_mempools(t);
2346 }
2347
2348 /*
2349  * Bind a table to the device.
2350  */
2351 static void event_callback(void *context)
2352 {
2353         unsigned long flags;
2354         LIST_HEAD(uevents);
2355         struct mapped_device *md = (struct mapped_device *) context;
2356
2357         spin_lock_irqsave(&md->uevent_lock, flags);
2358         list_splice_init(&md->uevent_list, &uevents);
2359         spin_unlock_irqrestore(&md->uevent_lock, flags);
2360
2361         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2362
2363         atomic_inc(&md->event_nr);
2364         wake_up(&md->eventq);
2365 }
2366
2367 /*
2368  * Protected by md->suspend_lock obtained by dm_swap_table().
2369  */
2370 static void __set_size(struct mapped_device *md, sector_t size)
2371 {
2372         set_capacity(md->disk, size);
2373
2374         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2375 }
2376
2377 /*
2378  * Return 1 if the queue has a compulsory merge_bvec_fn function.
2379  *
2380  * If this function returns 0, then the device is either a non-dm
2381  * device without a merge_bvec_fn, or it is a dm device that is
2382  * able to split any bios it receives that are too big.
2383  */
2384 int dm_queue_merge_is_compulsory(struct request_queue *q)
2385 {
2386         struct mapped_device *dev_md;
2387
2388         if (!q->merge_bvec_fn)
2389                 return 0;
2390
2391         if (q->make_request_fn == dm_make_request) {
2392                 dev_md = q->queuedata;
2393                 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2394                         return 0;
2395         }
2396
2397         return 1;
2398 }
2399
2400 static int dm_device_merge_is_compulsory(struct dm_target *ti,
2401                                          struct dm_dev *dev, sector_t start,
2402                                          sector_t len, void *data)
2403 {
2404         struct block_device *bdev = dev->bdev;
2405         struct request_queue *q = bdev_get_queue(bdev);
2406
2407         return dm_queue_merge_is_compulsory(q);
2408 }
2409
2410 /*
2411  * Return 1 if it is acceptable to ignore merge_bvec_fn based
2412  * on the properties of the underlying devices.
2413  */
2414 static int dm_table_merge_is_optional(struct dm_table *table)
2415 {
2416         unsigned i = 0;
2417         struct dm_target *ti;
2418
2419         while (i < dm_table_get_num_targets(table)) {
2420                 ti = dm_table_get_target(table, i++);
2421
2422                 if (ti->type->iterate_devices &&
2423                     ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2424                         return 0;
2425         }
2426
2427         return 1;
2428 }
2429
2430 /*
2431  * Returns old map, which caller must destroy.
2432  */
2433 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2434                                struct queue_limits *limits)
2435 {
2436         struct dm_table *old_map;
2437         struct request_queue *q = md->queue;
2438         sector_t size;
2439         int merge_is_optional;
2440
2441         size = dm_table_get_size(t);
2442
2443         /*
2444          * Wipe any geometry if the size of the table changed.
2445          */
2446         if (size != dm_get_size(md))
2447                 memset(&md->geometry, 0, sizeof(md->geometry));
2448
2449         __set_size(md, size);
2450
2451         dm_table_event_callback(t, event_callback, md);
2452
2453         /*
2454          * The queue hasn't been stopped yet, if the old table type wasn't
2455          * for request-based during suspension.  So stop it to prevent
2456          * I/O mapping before resume.
2457          * This must be done before setting the queue restrictions,
2458          * because request-based dm may be run just after the setting.
2459          */
2460         if (dm_table_request_based(t) && !blk_queue_stopped(q))
2461                 stop_queue(q);
2462
2463         __bind_mempools(md, t);
2464
2465         merge_is_optional = dm_table_merge_is_optional(t);
2466
2467         old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2468         rcu_assign_pointer(md->map, t);
2469         md->immutable_target_type = dm_table_get_immutable_target_type(t);
2470
2471         dm_table_set_restrictions(t, q, limits);
2472         if (merge_is_optional)
2473                 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2474         else
2475                 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2476         if (old_map)
2477                 dm_sync_table(md);
2478
2479         return old_map;
2480 }
2481
2482 /*
2483  * Returns unbound table for the caller to free.
2484  */
2485 static struct dm_table *__unbind(struct mapped_device *md)
2486 {
2487         struct dm_table *map = rcu_dereference_protected(md->map, 1);
2488
2489         if (!map)
2490                 return NULL;
2491
2492         dm_table_event_callback(map, NULL, NULL);
2493         RCU_INIT_POINTER(md->map, NULL);
2494         dm_sync_table(md);
2495
2496         return map;
2497 }
2498
2499 /*
2500  * Constructor for a new device.
2501  */
2502 int dm_create(int minor, struct mapped_device **result)
2503 {
2504         struct mapped_device *md;
2505
2506         md = alloc_dev(minor);
2507         if (!md)
2508                 return -ENXIO;
2509
2510         dm_sysfs_init(md);
2511
2512         *result = md;
2513         return 0;
2514 }
2515
2516 /*
2517  * Functions to manage md->type.
2518  * All are required to hold md->type_lock.
2519  */
2520 void dm_lock_md_type(struct mapped_device *md)
2521 {
2522         mutex_lock(&md->type_lock);
2523 }
2524
2525 void dm_unlock_md_type(struct mapped_device *md)
2526 {
2527         mutex_unlock(&md->type_lock);
2528 }
2529
2530 void dm_set_md_type(struct mapped_device *md, unsigned type)
2531 {
2532         BUG_ON(!mutex_is_locked(&md->type_lock));
2533         md->type = type;
2534 }
2535
2536 unsigned dm_get_md_type(struct mapped_device *md)
2537 {
2538         BUG_ON(!mutex_is_locked(&md->type_lock));
2539         return md->type;
2540 }
2541
2542 static bool dm_md_type_request_based(struct mapped_device *md)
2543 {
2544         unsigned table_type = dm_get_md_type(md);
2545
2546         return (table_type == DM_TYPE_REQUEST_BASED ||
2547                 table_type == DM_TYPE_MQ_REQUEST_BASED);
2548 }
2549
2550 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2551 {
2552         return md->immutable_target_type;
2553 }
2554
2555 /*
2556  * The queue_limits are only valid as long as you have a reference
2557  * count on 'md'.
2558  */
2559 struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2560 {
2561         BUG_ON(!atomic_read(&md->holders));
2562         return &md->queue->limits;
2563 }
2564 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2565
2566 /*
2567  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2568  */
2569 static int dm_init_request_based_queue(struct mapped_device *md)
2570 {
2571         struct request_queue *q = NULL;
2572
2573         if (md->queue->elevator)
2574                 return 1;
2575
2576         /* Fully initialize the queue */
2577         q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2578         if (!q)
2579                 return 0;
2580
2581         /* disable dm_request_fn's merge heuristic by default */
2582         md->seq_rq_merge_deadline_usecs = 0;
2583
2584         md->queue = q;
2585         dm_init_md_queue(md);
2586         blk_queue_softirq_done(md->queue, dm_softirq_done);
2587         blk_queue_prep_rq(md->queue, dm_prep_fn);
2588
2589         /* Also initialize the request-based DM worker thread */
2590         init_kthread_worker(&md->kworker);
2591         md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2592                                        "kdmwork-%s", dm_device_name(md));
2593
2594         elv_register_queue(md->queue);
2595
2596         return 1;
2597 }
2598
2599 /*
2600  * Setup the DM device's queue based on md's type
2601  */
2602 int dm_setup_md_queue(struct mapped_device *md)
2603 {
2604         if (dm_md_type_request_based(md)) {
2605                 if (!dm_init_request_based_queue(md)) {
2606                         DMWARN("Cannot initialize queue for request-based mapped device");
2607                         return -EINVAL;
2608                 }
2609         } else {
2610                 /* bio-based specific initialization */
2611                 blk_queue_make_request(md->queue, dm_make_request);
2612                 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2613         }
2614
2615         return 0;
2616 }
2617
2618 struct mapped_device *dm_get_md(dev_t dev)
2619 {
2620         struct mapped_device *md;
2621         unsigned minor = MINOR(dev);
2622
2623         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2624                 return NULL;
2625
2626         spin_lock(&_minor_lock);
2627
2628         md = idr_find(&_minor_idr, minor);
2629         if (md) {
2630                 if ((md == MINOR_ALLOCED ||
2631                      (MINOR(disk_devt(dm_disk(md))) != minor) ||
2632                      dm_deleting_md(md) ||
2633                      test_bit(DMF_FREEING, &md->flags))) {
2634                         md = NULL;
2635                         goto out;
2636                 }
2637                 dm_get(md);
2638         }
2639
2640 out:
2641         spin_unlock(&_minor_lock);
2642
2643         return md;
2644 }
2645 EXPORT_SYMBOL_GPL(dm_get_md);
2646
2647 void *dm_get_mdptr(struct mapped_device *md)
2648 {
2649         return md->interface_ptr;
2650 }
2651
2652 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2653 {
2654         md->interface_ptr = ptr;
2655 }
2656
2657 void dm_get(struct mapped_device *md)
2658 {
2659         atomic_inc(&md->holders);
2660         BUG_ON(test_bit(DMF_FREEING, &md->flags));
2661 }
2662
2663 int dm_hold(struct mapped_device *md)
2664 {
2665         spin_lock(&_minor_lock);
2666         if (test_bit(DMF_FREEING, &md->flags)) {
2667                 spin_unlock(&_minor_lock);
2668                 return -EBUSY;
2669         }
2670         dm_get(md);
2671         spin_unlock(&_minor_lock);
2672         return 0;
2673 }
2674 EXPORT_SYMBOL_GPL(dm_hold);
2675
2676 const char *dm_device_name(struct mapped_device *md)
2677 {
2678         return md->name;
2679 }
2680 EXPORT_SYMBOL_GPL(dm_device_name);
2681
2682 static void __dm_destroy(struct mapped_device *md, bool wait)
2683 {
2684         struct dm_table *map;
2685         int srcu_idx;
2686
2687         might_sleep();
2688
2689         map = dm_get_live_table(md, &srcu_idx);
2690
2691         spin_lock(&_minor_lock);
2692         idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2693         set_bit(DMF_FREEING, &md->flags);
2694         spin_unlock(&_minor_lock);
2695
2696         if (dm_request_based(md))
2697                 flush_kthread_worker(&md->kworker);
2698
2699         /*
2700          * Take suspend_lock so that presuspend and postsuspend methods
2701          * do not race with internal suspend.
2702          */
2703         mutex_lock(&md->suspend_lock);
2704         if (!dm_suspended_md(md)) {
2705                 dm_table_presuspend_targets(map);
2706                 dm_table_postsuspend_targets(map);
2707         }
2708         mutex_unlock(&md->suspend_lock);
2709
2710         /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2711         dm_put_live_table(md, srcu_idx);
2712
2713         /*
2714          * Rare, but there may be I/O requests still going to complete,
2715          * for example.  Wait for all references to disappear.
2716          * No one should increment the reference count of the mapped_device,
2717          * after the mapped_device state becomes DMF_FREEING.
2718          */
2719         if (wait)
2720                 while (atomic_read(&md->holders))
2721                         msleep(1);
2722         else if (atomic_read(&md->holders))
2723                 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2724                        dm_device_name(md), atomic_read(&md->holders));
2725
2726         dm_sysfs_exit(md);
2727         dm_table_destroy(__unbind(md));
2728         free_dev(md);
2729 }
2730
2731 void dm_destroy(struct mapped_device *md)
2732 {
2733         __dm_destroy(md, true);
2734 }
2735
2736 void dm_destroy_immediate(struct mapped_device *md)
2737 {
2738         __dm_destroy(md, false);
2739 }
2740
2741 void dm_put(struct mapped_device *md)
2742 {
2743         atomic_dec(&md->holders);
2744 }
2745 EXPORT_SYMBOL_GPL(dm_put);
2746
2747 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2748 {
2749         int r = 0;
2750         DECLARE_WAITQUEUE(wait, current);
2751
2752         add_wait_queue(&md->wait, &wait);
2753
2754         while (1) {
2755                 set_current_state(interruptible);
2756
2757                 if (!md_in_flight(md))
2758                         break;
2759
2760                 if (interruptible == TASK_INTERRUPTIBLE &&
2761                     signal_pending(current)) {
2762                         r = -EINTR;
2763                         break;
2764                 }
2765
2766                 io_schedule();
2767         }
2768         set_current_state(TASK_RUNNING);
2769
2770         remove_wait_queue(&md->wait, &wait);
2771
2772         return r;
2773 }
2774
2775 /*
2776  * Process the deferred bios
2777  */
2778 static void dm_wq_work(struct work_struct *work)
2779 {
2780         struct mapped_device *md = container_of(work, struct mapped_device,
2781                                                 work);
2782         struct bio *c;
2783         int srcu_idx;
2784         struct dm_table *map;
2785
2786         map = dm_get_live_table(md, &srcu_idx);
2787
2788         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2789                 spin_lock_irq(&md->deferred_lock);
2790                 c = bio_list_pop(&md->deferred);
2791                 spin_unlock_irq(&md->deferred_lock);
2792
2793                 if (!c)
2794                         break;
2795
2796                 if (dm_request_based(md))
2797                         generic_make_request(c);
2798                 else
2799                         __split_and_process_bio(md, map, c);
2800         }
2801
2802         dm_put_live_table(md, srcu_idx);
2803 }
2804
2805 static void dm_queue_flush(struct mapped_device *md)
2806 {
2807         clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2808         smp_mb__after_atomic();
2809         queue_work(md->wq, &md->work);
2810 }
2811
2812 /*
2813  * Swap in a new table, returning the old one for the caller to destroy.
2814  */
2815 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2816 {
2817         struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2818         struct queue_limits limits;
2819         int r;
2820
2821         mutex_lock(&md->suspend_lock);
2822
2823         /* device must be suspended */
2824         if (!dm_suspended_md(md))
2825                 goto out;
2826
2827         /*
2828          * If the new table has no data devices, retain the existing limits.
2829          * This helps multipath with queue_if_no_path if all paths disappear,
2830          * then new I/O is queued based on these limits, and then some paths
2831          * reappear.
2832          */
2833         if (dm_table_has_no_data_devices(table)) {
2834                 live_map = dm_get_live_table_fast(md);
2835                 if (live_map)
2836                         limits = md->queue->limits;
2837                 dm_put_live_table_fast(md);
2838         }
2839
2840         if (!live_map) {
2841                 r = dm_calculate_queue_limits(table, &limits);
2842                 if (r) {
2843                         map = ERR_PTR(r);
2844                         goto out;
2845                 }
2846         }
2847
2848         map = __bind(md, table, &limits);
2849
2850 out:
2851         mutex_unlock(&md->suspend_lock);
2852         return map;
2853 }
2854
2855 /*
2856  * Functions to lock and unlock any filesystem running on the
2857  * device.
2858  */
2859 static int lock_fs(struct mapped_device *md)
2860 {
2861         int r;
2862
2863         WARN_ON(md->frozen_sb);
2864
2865         md->frozen_sb = freeze_bdev(md->bdev);
2866         if (IS_ERR(md->frozen_sb)) {
2867                 r = PTR_ERR(md->frozen_sb);
2868                 md->frozen_sb = NULL;
2869                 return r;
2870         }
2871
2872         set_bit(DMF_FROZEN, &md->flags);
2873
2874         return 0;
2875 }
2876
2877 static void unlock_fs(struct mapped_device *md)
2878 {
2879         if (!test_bit(DMF_FROZEN, &md->flags))
2880                 return;
2881
2882         thaw_bdev(md->bdev, md->frozen_sb);
2883         md->frozen_sb = NULL;
2884         clear_bit(DMF_FROZEN, &md->flags);
2885 }
2886
2887 /*
2888  * If __dm_suspend returns 0, the device is completely quiescent
2889  * now. There is no request-processing activity. All new requests
2890  * are being added to md->deferred list.
2891  *
2892  * Caller must hold md->suspend_lock
2893  */
2894 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2895                         unsigned suspend_flags, int interruptible)
2896 {
2897         bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2898         bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2899         int r;
2900
2901         /*
2902          * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2903          * This flag is cleared before dm_suspend returns.
2904          */
2905         if (noflush)
2906                 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2907
2908         /*
2909          * This gets reverted if there's an error later and the targets
2910          * provide the .presuspend_undo hook.
2911          */
2912         dm_table_presuspend_targets(map);
2913
2914         /*
2915          * Flush I/O to the device.
2916          * Any I/O submitted after lock_fs() may not be flushed.
2917          * noflush takes precedence over do_lockfs.
2918          * (lock_fs() flushes I/Os and waits for them to complete.)
2919          */
2920         if (!noflush && do_lockfs) {
2921                 r = lock_fs(md);
2922                 if (r) {
2923                         dm_table_presuspend_undo_targets(map);
2924                         return r;
2925                 }
2926         }
2927
2928         /*
2929          * Here we must make sure that no processes are submitting requests
2930          * to target drivers i.e. no one may be executing
2931          * __split_and_process_bio. This is called from dm_request and
2932          * dm_wq_work.
2933          *
2934          * To get all processes out of __split_and_process_bio in dm_request,
2935          * we take the write lock. To prevent any process from reentering
2936          * __split_and_process_bio from dm_request and quiesce the thread
2937          * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
2938          * flush_workqueue(md->wq).
2939          */
2940         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2941         if (map)
2942                 synchronize_srcu(&md->io_barrier);
2943
2944         /*
2945          * Stop md->queue before flushing md->wq in case request-based
2946          * dm defers requests to md->wq from md->queue.
2947          */
2948         if (dm_request_based(md)) {
2949                 stop_queue(md->queue);
2950                 flush_kthread_worker(&md->kworker);
2951         }
2952
2953         flush_workqueue(md->wq);
2954
2955         /*
2956          * At this point no more requests are entering target request routines.
2957          * We call dm_wait_for_completion to wait for all existing requests
2958          * to finish.
2959          */
2960         r = dm_wait_for_completion(md, interruptible);
2961
2962         if (noflush)
2963                 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2964         if (map)
2965                 synchronize_srcu(&md->io_barrier);
2966
2967         /* were we interrupted ? */
2968         if (r < 0) {
2969                 dm_queue_flush(md);
2970
2971                 if (dm_request_based(md))
2972                         start_queue(md->queue);
2973
2974                 unlock_fs(md);
2975                 dm_table_presuspend_undo_targets(map);
2976                 /* pushback list is already flushed, so skip flush */
2977         }
2978
2979         return r;
2980 }
2981
2982 /*
2983  * We need to be able to change a mapping table under a mounted
2984  * filesystem.  For example we might want to move some data in
2985  * the background.  Before the table can be swapped with
2986  * dm_bind_table, dm_suspend must be called to flush any in
2987  * flight bios and ensure that any further io gets deferred.
2988  */
2989 /*
2990  * Suspend mechanism in request-based dm.
2991  *
2992  * 1. Flush all I/Os by lock_fs() if needed.
2993  * 2. Stop dispatching any I/O by stopping the request_queue.
2994  * 3. Wait for all in-flight I/Os to be completed or requeued.
2995  *
2996  * To abort suspend, start the request_queue.
2997  */
2998 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2999 {
3000         struct dm_table *map = NULL;
3001         int r = 0;
3002
3003 retry:
3004         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3005
3006         if (dm_suspended_md(md)) {
3007                 r = -EINVAL;
3008                 goto out_unlock;
3009         }
3010
3011         if (dm_suspended_internally_md(md)) {
3012                 /* already internally suspended, wait for internal resume */
3013                 mutex_unlock(&md->suspend_lock);
3014                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3015                 if (r)
3016                         return r;
3017                 goto retry;
3018         }
3019
3020         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3021
3022         r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
3023         if (r)
3024                 goto out_unlock;
3025
3026         set_bit(DMF_SUSPENDED, &md->flags);
3027
3028         dm_table_postsuspend_targets(map);
3029
3030 out_unlock:
3031         mutex_unlock(&md->suspend_lock);
3032         return r;
3033 }
3034
3035 static int __dm_resume(struct mapped_device *md, struct dm_table *map)
3036 {
3037         if (map) {
3038                 int r = dm_table_resume_targets(map);
3039                 if (r)
3040                         return r;
3041         }
3042
3043         dm_queue_flush(md);
3044
3045         /*
3046          * Flushing deferred I/Os must be done after targets are resumed
3047          * so that mapping of targets can work correctly.
3048          * Request-based dm is queueing the deferred I/Os in its request_queue.
3049          */
3050         if (dm_request_based(md))
3051                 start_queue(md->queue);
3052
3053         unlock_fs(md);
3054
3055         return 0;
3056 }
3057
3058 int dm_resume(struct mapped_device *md)
3059 {
3060         int r = -EINVAL;
3061         struct dm_table *map = NULL;
3062
3063 retry:
3064         mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3065
3066         if (!dm_suspended_md(md))
3067                 goto out;
3068
3069         if (dm_suspended_internally_md(md)) {
3070                 /* already internally suspended, wait for internal resume */
3071                 mutex_unlock(&md->suspend_lock);
3072                 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3073                 if (r)
3074                         return r;
3075                 goto retry;
3076         }
3077
3078         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3079         if (!map || !dm_table_get_size(map))
3080                 goto out;
3081
3082         r = __dm_resume(md, map);
3083         if (r)
3084                 goto out;
3085
3086         clear_bit(DMF_SUSPENDED, &md->flags);
3087
3088         r = 0;
3089 out:
3090         mutex_unlock(&md->suspend_lock);
3091
3092         return r;
3093 }
3094
3095 /*
3096  * Internal suspend/resume works like userspace-driven suspend. It waits
3097  * until all bios finish and prevents issuing new bios to the target drivers.
3098  * It may be used only from the kernel.
3099  */
3100
3101 static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
3102 {
3103         struct dm_table *map = NULL;
3104
3105         if (md->internal_suspend_count++)
3106                 return; /* nested internal suspend */
3107
3108         if (dm_suspended_md(md)) {
3109                 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3110                 return; /* nest suspend */
3111         }
3112
3113         map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3114
3115         /*
3116          * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
3117          * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
3118          * would require changing .presuspend to return an error -- avoid this
3119          * until there is a need for more elaborate variants of internal suspend.
3120          */
3121         (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
3122
3123         set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3124
3125         dm_table_postsuspend_targets(map);
3126 }
3127
3128 static void __dm_internal_resume(struct mapped_device *md)
3129 {
3130         BUG_ON(!md->internal_suspend_count);
3131
3132         if (--md->internal_suspend_count)
3133                 return; /* resume from nested internal suspend */
3134
3135         if (dm_suspended_md(md))
3136                 goto done; /* resume from nested suspend */
3137
3138         /*
3139          * NOTE: existing callers don't need to call dm_table_resume_targets
3140          * (which may fail -- so best to avoid it for now by passing NULL map)
3141          */
3142         (void) __dm_resume(md, NULL);
3143
3144 done:
3145         clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3146         smp_mb__after_atomic();
3147         wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
3148 }
3149
/*
 * Internally suspend @md with DM_SUSPEND_NOFLUSH_FLAG, taking
 * md->suspend_lock around the operation.
 */
void dm_internal_suspend_noflush(struct mapped_device *md)
{
        mutex_lock(&md->suspend_lock);
        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
        mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
3157
/*
 * Undo one internal suspend of @md, taking md->suspend_lock around the
 * operation.
 */
void dm_internal_resume(struct mapped_device *md)
{
        mutex_lock(&md->suspend_lock);
        __dm_internal_resume(md);
        mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
3165
3166 /*
3167  * Fast variants of internal suspend/resume hold md->suspend_lock,
3168  * which prevents interaction with userspace-driven suspend.
3169  */
3170
3171 void dm_internal_suspend_fast(struct mapped_device *md)
3172 {
3173         mutex_lock(&md->suspend_lock);
3174         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3175                 return;
3176
3177         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3178         synchronize_srcu(&md->io_barrier);
3179         flush_workqueue(md->wq);
3180         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
3181 }
3182 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
3183
3184 void dm_internal_resume_fast(struct mapped_device *md)
3185 {
3186         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3187                 goto done;
3188
3189         dm_queue_flush(md);
3190
3191 done:
3192         mutex_unlock(&md->suspend_lock);
3193 }
3194 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
3195
3196 /*-----------------------------------------------------------------
3197  * Event notification.
3198  *---------------------------------------------------------------*/
3199 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
3200                        unsigned cookie)
3201 {
3202         char udev_cookie[DM_COOKIE_LENGTH];
3203         char *envp[] = { udev_cookie, NULL };
3204
3205         if (!cookie)
3206                 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
3207         else {
3208                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
3209                          DM_COOKIE_ENV_VAR_NAME, cookie);
3210                 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
3211                                           action, envp);
3212         }
3213 }
3214
3215 uint32_t dm_next_uevent_seq(struct mapped_device *md)
3216 {
3217         return atomic_add_return(1, &md->uevent_seq);
3218 }
3219
3220 uint32_t dm_get_event_nr(struct mapped_device *md)
3221 {
3222         return atomic_read(&md->event_nr);
3223 }
3224
3225 int dm_wait_event(struct mapped_device *md, int event_nr)
3226 {
3227         return wait_event_interruptible(md->eventq,
3228                         (event_nr != atomic_read(&md->event_nr)));
3229 }
3230
3231 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3232 {
3233         unsigned long flags;
3234
3235         spin_lock_irqsave(&md->uevent_lock, flags);
3236         list_add(elist, &md->uevent_list);
3237         spin_unlock_irqrestore(&md->uevent_lock, flags);
3238 }
3239
3240 /*
3241  * The gendisk is only valid as long as you have a reference
3242  * count on 'md'.
3243  */
3244 struct gendisk *dm_disk(struct mapped_device *md)
3245 {
3246         return md->disk;
3247 }
3248
3249 struct kobject *dm_kobject(struct mapped_device *md)
3250 {
3251         return &md->kobj_holder.kobj;
3252 }
3253
3254 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3255 {
3256         struct mapped_device *md;
3257
3258         md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3259
3260         if (test_bit(DMF_FREEING, &md->flags) ||
3261             dm_deleting_md(md))
3262                 return NULL;
3263
3264         dm_get(md);
3265         return md;
3266 }
3267
3268 int dm_suspended_md(struct mapped_device *md)
3269 {
3270         return test_bit(DMF_SUSPENDED, &md->flags);
3271 }
3272
3273 int dm_suspended_internally_md(struct mapped_device *md)
3274 {
3275         return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3276 }
3277
3278 int dm_test_deferred_remove_flag(struct mapped_device *md)
3279 {
3280         return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3281 }
3282
3283 int dm_suspended(struct dm_target *ti)
3284 {
3285         return dm_suspended_md(dm_table_get_md(ti->table));
3286 }
3287 EXPORT_SYMBOL_GPL(dm_suspended);
3288
3289 int dm_noflush_suspending(struct dm_target *ti)
3290 {
3291         return __noflush_suspending(dm_table_get_md(ti->table));
3292 }
3293 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3294
3295 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
3296 {
3297         struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
3298         struct kmem_cache *cachep;
3299         unsigned int pool_size = 0;
3300         unsigned int front_pad;
3301
3302         if (!pools)
3303                 return NULL;
3304
3305         switch (type) {
3306         case DM_TYPE_BIO_BASED:
3307                 cachep = _io_cache;
3308                 pool_size = dm_get_reserved_bio_based_ios();
3309                 front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3310                 break;
3311         case DM_TYPE_REQUEST_BASED:
3312                 pool_size = dm_get_reserved_rq_based_ios();
3313                 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
3314                 if (!pools->rq_pool)
3315                         goto out;
3316                 /* fall through to setup remaining rq-based pools */
3317         case DM_TYPE_MQ_REQUEST_BASED:
3318                 cachep = _rq_tio_cache;
3319                 if (!pool_size)
3320                         pool_size = dm_get_reserved_rq_based_ios();
3321                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3322                 /* per_bio_data_size is not used. See __bind_mempools(). */
3323                 WARN_ON(per_bio_data_size != 0);
3324                 break;
3325         default:
3326                 goto out;
3327         }
3328
3329         pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
3330         if (!pools->io_pool)
3331                 goto out;
3332
3333         pools->bs = bioset_create_nobvec(pool_size, front_pad);
3334         if (!pools->bs)
3335                 goto out;
3336
3337         if (integrity && bioset_integrity_create(pools->bs, pool_size))
3338                 goto out;
3339
3340         return pools;
3341
3342 out:
3343         dm_free_md_mempools(pools);
3344
3345         return NULL;
3346 }
3347
3348 void dm_free_md_mempools(struct dm_md_mempools *pools)
3349 {
3350         if (!pools)
3351                 return;
3352
3353         if (pools->io_pool)
3354                 mempool_destroy(pools->io_pool);
3355
3356         if (pools->rq_pool)
3357                 mempool_destroy(pools->rq_pool);
3358
3359         if (pools->bs)
3360                 bioset_free(pools->bs);
3361
3362         kfree(pools);
3363 }
3364
3365 static const struct block_device_operations dm_blk_dops = {
3366         .open = dm_blk_open,
3367         .release = dm_blk_close,
3368         .ioctl = dm_blk_ioctl,
3369         .getgeo = dm_blk_getgeo,
3370         .owner = THIS_MODULE
3371 };
3372
/*
 * module hooks
 */
/* dm_init() and dm_exit() are the module's load and unload entry points. */
module_init(dm_init);
module_exit(dm_exit);

/* 'major' is registered with permission bits 0. */
module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

/* Readable by everyone, writable by the owner (S_IRUGO | S_IWUSR). */
module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

/* Readable by everyone, writable by the owner (S_IRUGO | S_IWUSR). */
module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");

/* Module metadata. */
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");