drivers/md/dm-rq.c

   1 /*
   2  * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
   3  *
   4  * This file is released under the GPL.
   5  */
   6
   7 #include "dm-core.h"
   8 #include "dm-rq.h"
   9
  10 #include <linux/elevator.h> /* for rq_end_sector() */
  11 #include <linux/blk-mq.h>
  12
  13 #define DM_MSG_PREFIX "core-rq"
  14
  15 #define DM_MQ_NR_HW_QUEUES 1
  16 #define DM_MQ_QUEUE_DEPTH 2048
  17 static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
  18 static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
  19
  20 /*
  21  * Request-based DM's mempools' reserved IOs set by the user.
  22  */
  23 #define RESERVED_REQUEST_BASED_IOS      256
  24 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
  25
  26 static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT);
  27
  28 bool dm_use_blk_mq_default(void)
  29 {
  30         return use_blk_mq;
  31 }
  32
  33 bool dm_use_blk_mq(struct mapped_device *md)
  34 {
  35         return md->use_blk_mq;
  36 }
  37 EXPORT_SYMBOL_GPL(dm_use_blk_mq);
  38
  39 unsigned dm_get_reserved_rq_based_ios(void)
  40 {
  41         return __dm_get_module_param(&reserved_rq_based_ios,
  42                                      RESERVED_REQUEST_BASED_IOS, DM_RESERVED_MAX_IOS);
  43 }
  44 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
  45
  46 static unsigned dm_get_blk_mq_nr_hw_queues(void)
  47 {
  48         return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
  49 }
  50
  51 static unsigned dm_get_blk_mq_queue_depth(void)
  52 {
  53         return __dm_get_module_param(&dm_mq_queue_depth,
  54                                      DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
  55 }
  56
  57 int dm_request_based(struct mapped_device *md)
  58 {
  59         return blk_queue_stackable(md->queue);
  60 }
  61
  62 static void dm_old_start_queue(struct request_queue *q)
  63 {
  64         unsigned long flags;
  65
  66         spin_lock_irqsave(q->queue_lock, flags);
  67         if (blk_queue_stopped(q))
  68                 blk_start_queue(q);
  69         spin_unlock_irqrestore(q->queue_lock, flags);
  70 }
  71
  72 static void dm_mq_start_queue(struct request_queue *q)
  73 {
  74         blk_mq_start_stopped_hw_queues(q, true);
  75         blk_mq_kick_requeue_list(q);
  76 }
  77
  78 void dm_start_queue(struct request_queue *q)
  79 {
  80         if (!q->mq_ops)
  81                 dm_old_start_queue(q);
  82         else
  83                 dm_mq_start_queue(q);
  84 }
  85
  86 static void dm_old_stop_queue(struct request_queue *q)
  87 {
  88         unsigned long flags;
  89
  90         spin_lock_irqsave(q->queue_lock, flags);
  91         if (!blk_queue_stopped(q))
  92                 blk_stop_queue(q);
  93         spin_unlock_irqrestore(q->queue_lock, flags);
  94 }
  95
  96 static void dm_mq_stop_queue(struct request_queue *q)
  97 {
  98         if (blk_mq_queue_stopped(q))
  99                 return;
 100
 101         blk_mq_quiesce_queue(q);
 102 }
 103
 104 void dm_stop_queue(struct request_queue *q)
 105 {
 106         if (!q->mq_ops)
 107                 dm_old_stop_queue(q);
 108         else
 109                 dm_mq_stop_queue(q);
 110 }
 111
 112 static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
 113                                                 gfp_t gfp_mask)
 114 {
 115         return mempool_alloc(md->io_pool, gfp_mask);
 116 }
 117
 118 static void free_old_rq_tio(struct dm_rq_target_io *tio)
 119 {
 120         mempool_free(tio, tio->md->io_pool);
 121 }
 122
 123 static struct request *alloc_old_clone_request(struct mapped_device *md,
 124                                                gfp_t gfp_mask)
 125 {
 126         return mempool_alloc(md->rq_pool, gfp_mask);
 127 }
 128
 129 static void free_old_clone_request(struct mapped_device *md, struct request *rq)
 130 {
 131         mempool_free(rq, md->rq_pool);
 132 }
 133
 134 /*
 135  * Partial completion handling for request-based dm
 136  */
 137 static void end_clone_bio(struct bio *clone)
 138 {
 139         struct dm_rq_clone_bio_info *info =
 140                 container_of(clone, struct dm_rq_clone_bio_info, clone);
 141         struct dm_rq_target_io *tio = info->tio;
 142         struct bio *bio = info->orig;
 143         unsigned int nr_bytes = info->orig->bi_iter.bi_size;
 144         int error = clone->bi_error;
 145
 146         bio_put(clone);
 147
 148         if (tio->error)
 149                 /*
 150                  * An error has already been detected on the request.
 151                  * Once error occurred, just let clone->end_io() handle
 152                  * the remainder.
 153                  */
 154                 return;
 155         else if (error) {
 156                 /*
 157                  * Don't notice the error to the upper layer yet.
 158                  * The error handling decision is made by the target driver,
 159                  * when the request is completed.
 160                  */
 161                 tio->error = error;
 162                 return;
 163         }
 164
 165         /*
 166          * I/O for the bio successfully completed.
 167          * Notice the data completion to the upper layer.
 168          */
 169
 170         /*
 171          * bios are processed from the head of the list.
 172          * So the completing bio should always be rq->bio.
 173          * If it's not, something wrong is happening.
 174          */
 175         if (tio->orig->bio != bio)
 176                 DMERR("bio completion is going in the middle of the request");
 177
 178         /*
 179          * Update the original request.
 180          * Do not use blk_end_request() here, because it may complete
 181          * the original request before the clone, and break the ordering.
 182          */
 183         blk_update_request(tio->orig, 0, nr_bytes);
 184 }
 185
 186 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 187 {
 188         return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
 189 }
 190
 191 static void rq_end_stats(struct mapped_device *md, struct request *orig)
 192 {
 193         if (unlikely(dm_stats_used(&md->stats))) {
 194                 struct dm_rq_target_io *tio = tio_from_request(orig);
 195                 tio->duration_jiffies = jiffies - tio->duration_jiffies;
 196                 dm_stats_account_io(&md->stats, rq_data_dir(orig),
 197                                     blk_rq_pos(orig), tio->n_sectors, true,
 198                                     tio->duration_jiffies, &tio->stats_aux);
 199         }
 200 }
 201
 202 /*
 203  * Don't touch any member of the md after calling this function because
 204  * the md may be freed in dm_put() at the end of this function.
 205  * Or do dm_get() before calling this function and dm_put() later.
 206  */
 207 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 208 {
 209         struct request_queue *q = md->queue;
 210         unsigned long flags;
 211
 212         atomic_dec(&md->pending[rw]);
 213
 214         /* nudge anyone waiting on suspend queue */
 215         if (!md_in_flight(md))
 216                 wake_up(&md->wait);
 217
 218         /*
 219          * Run this off this callpath, as drivers could invoke end_io while
 220          * inside their request_fn (and holding the queue lock). Calling
 221          * back into ->request_fn() could deadlock attempting to grab the
 222          * queue lock again.
 223          */
 224         if (!q->mq_ops && run_queue) {
 225                 spin_lock_irqsave(q->queue_lock, flags);
 226                 blk_run_queue_async(q);
 227                 spin_unlock_irqrestore(q->queue_lock, flags);
 228         }
 229
 230         /*
 231          * dm_put() must be at the end of this function. See the comment above
 232          */
 233         dm_put(md);
 234 }
 235
 236 static void free_rq_clone(struct request *clone)
 237 {
 238         struct dm_rq_target_io *tio = clone->end_io_data;
 239         struct mapped_device *md = tio->md;
 240
 241         blk_rq_unprep_clone(clone);
 242
 243         /*
 244          * It is possible for a clone_old_rq() allocated clone to
 245          * get passed in -- it may not yet have a request_queue.
 246          * This is known to occur if the error target replaces
 247          * a multipath target that has a request_fn queue stacked
 248          * on blk-mq queue(s).
 249          */
 250         if (clone->q && clone->q->mq_ops)
 251                 /* stacked on blk-mq queue(s) */
 252                 tio->ti->type->release_clone_rq(clone);
 253         else if (!md->queue->mq_ops)
 254                 /* request_fn queue stacked on request_fn queue(s) */
 255                 free_old_clone_request(md, clone);
 256
 257         if (!md->queue->mq_ops)
 258                 free_old_rq_tio(tio);
 259 }
 260
 261 /*
 262  * Complete the clone and the original request.
 263  * Must be called without clone's queue lock held,
 264  * see end_clone_request() for more details.
 265  */
 266 static void dm_end_request(struct request *clone, int error)
 267 {
 268         int rw = rq_data_dir(clone);
 269         struct dm_rq_target_io *tio = clone->end_io_data;
 270         struct mapped_device *md = tio->md;
 271         struct request *rq = tio->orig;
 272
 273         if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 274                 rq->errors = clone->errors;
 275                 rq->resid_len = clone->resid_len;
 276
 277                 if (rq->sense)
 278                         /*
 279                          * We are using the sense buffer of the original
 280                          * request.
 281                          * So setting the length of the sense data is enough.
 282                          */
 283                         rq->sense_len = clone->sense_len;
 284         }
 285
 286         free_rq_clone(clone);
 287         rq_end_stats(md, rq);
 288         if (!rq->q->mq_ops)
 289                 blk_end_request_all(rq, error);
 290         else
 291                 blk_mq_end_request(rq, error);
 292         rq_completed(md, rw, true);
 293 }
 294
 295 static void dm_unprep_request(struct request *rq)
 296 {
 297         struct dm_rq_target_io *tio = tio_from_request(rq);
 298         struct request *clone = tio->clone;
 299
 300         if (!rq->q->mq_ops) {
 301                 rq->special = NULL;
 302                 rq->rq_flags &= ~RQF_DONTPREP;
 303         }
 304
 305         if (clone)
 306                 free_rq_clone(clone);
 307         else if (!tio->md->queue->mq_ops)
 308                 free_old_rq_tio(tio);
 309 }
 310
 311 /*
 312  * Requeue the original request of a clone.
 313  */
 314 static void dm_old_requeue_request(struct request *rq)
 315 {
 316         struct request_queue *q = rq->q;
 317         unsigned long flags;
 318
 319         spin_lock_irqsave(q->queue_lock, flags);
 320         blk_requeue_request(q, rq);
 321         blk_run_queue_async(q);
 322         spin_unlock_irqrestore(q->queue_lock, flags);
 323 }
 324
 325 static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs)
 326 {
 327         blk_mq_delay_kick_requeue_list(q, msecs);
 328 }
 329
 330 void dm_mq_kick_requeue_list(struct mapped_device *md)
 331 {
 332         __dm_mq_kick_requeue_list(dm_get_md_queue(md), 0);
 333 }
 334 EXPORT_SYMBOL(dm_mq_kick_requeue_list);
 335
 336 static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
 337 {
 338         blk_mq_requeue_request(rq, false);
 339         __dm_mq_kick_requeue_list(rq->q, msecs);
 340 }
 341
 342 static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue)
 343 {
 344         struct mapped_device *md = tio->md;
 345         struct request *rq = tio->orig;
 346         int rw = rq_data_dir(rq);
 347
 348         rq_end_stats(md, rq);
 349         dm_unprep_request(rq);
 350
 351         if (!rq->q->mq_ops)
 352                 dm_old_requeue_request(rq);
 353         else
 354                 dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
 355
 356         rq_completed(md, rw, false);
 357 }
 358
 359 static void dm_done(struct request *clone, int error, bool mapped)
 360 {
 361         int r = error;
 362         struct dm_rq_target_io *tio = clone->end_io_data;
 363         dm_request_endio_fn rq_end_io = NULL;
 364
 365         if (tio->ti) {
 366                 rq_end_io = tio->ti->type->rq_end_io;
 367
 368                 if (mapped && rq_end_io)
 369                         r = rq_end_io(tio->ti, clone, error, &tio->info);
 370         }
 371
 372         if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
 373                      !clone->q->limits.max_write_same_sectors))
 374                 disable_write_same(tio->md);
 375
 376         if (r <= 0)
 377                 /* The target wants to complete the I/O */
 378                 dm_end_request(clone, r);
 379         else if (r == DM_ENDIO_INCOMPLETE)
 380                 /* The target will handle the I/O */
 381                 return;
 382         else if (r == DM_ENDIO_REQUEUE)
 383                 /* The target wants to requeue the I/O */
 384                 dm_requeue_original_request(tio, false);
 385         else {
 386                 DMWARN("unimplemented target endio return value: %d", r);
 387                 BUG();
 388         }
 389 }
 390
 391 /*
 392  * Request completion handler for request-based dm
 393  */
 394 static void dm_softirq_done(struct request *rq)
 395 {
 396         bool mapped = true;
 397         struct dm_rq_target_io *tio = tio_from_request(rq);
 398         struct request *clone = tio->clone;
 399         int rw;
 400
 401         if (!clone) {
 402                 rq_end_stats(tio->md, rq);
 403                 rw = rq_data_dir(rq);
 404                 if (!rq->q->mq_ops) {
 405                         blk_end_request_all(rq, tio->error);
 406                         rq_completed(tio->md, rw, false);
 407                         free_old_rq_tio(tio);
 408                 } else {
 409                         blk_mq_end_request(rq, tio->error);
 410                         rq_completed(tio->md, rw, false);
 411                 }
 412                 return;
 413         }
 414
 415         if (rq->rq_flags & RQF_FAILED)
 416                 mapped = false;
 417
 418         dm_done(clone, tio->error, mapped);
 419 }
 420
 421 /*
 422  * Complete the clone and the original request with the error status
 423  * through softirq context.
 424  */
 425 static void dm_complete_request(struct request *rq, int error)
 426 {
 427         struct dm_rq_target_io *tio = tio_from_request(rq);
 428
 429         tio->error = error;
 430         if (!rq->q->mq_ops)
 431                 blk_complete_request(rq);
 432         else
 433                 blk_mq_complete_request(rq, error);
 434 }
 435
 436 /*
 437  * Complete the not-mapped clone and the original request with the error status
 438  * through softirq context.
 439  * Target's rq_end_io() function isn't called.
 440  * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
 441  */
 442 static void dm_kill_unmapped_request(struct request *rq, int error)
 443 {
 444         rq->rq_flags |= RQF_FAILED;
 445         dm_complete_request(rq, error);
 446 }
 447
 448 /*
 449  * Called with the clone's queue lock held (in the case of .request_fn)
 450  */
 451 static void end_clone_request(struct request *clone, int error)
 452 {
 453         struct dm_rq_target_io *tio = clone->end_io_data;
 454
 455         if (!clone->q->mq_ops) {
 456                 /*
 457                  * For just cleaning up the information of the queue in which
 458                  * the clone was dispatched.
 459                  * The clone is *NOT* freed actually here because it is alloced
 460                  * from dm own mempool (RQF_ALLOCED isn't set).
 461                  */
 462                 __blk_put_request(clone->q, clone);
 463         }
 464
 465         /*
 466          * Actual request completion is done in a softirq context which doesn't
 467          * hold the clone's queue lock.  Otherwise, deadlock could occur because:
 468          *     - another request may be submitted by the upper level driver
 469          *       of the stacking during the completion
 470          *     - the submission which requires queue lock may be done
 471          *       against this clone's queue
 472          */
 473         dm_complete_request(tio->orig, error);
 474 }
 475
 476 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 477 {
 478         int r;
 479
 480         if (blk_queue_io_stat(clone->q))
 481                 clone->rq_flags |= RQF_IO_STAT;
 482
 483         clone->start_time = jiffies;
 484         r = blk_insert_cloned_request(clone->q, clone);
 485         if (r)
 486                 /* must complete clone in terms of original request */
 487                 dm_complete_request(rq, r);
 488 }
 489
 490 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
 491                                  void *data)
 492 {
 493         struct dm_rq_target_io *tio = data;
 494         struct dm_rq_clone_bio_info *info =
 495                 container_of(bio, struct dm_rq_clone_bio_info, clone);
 496
 497         info->orig = bio_orig;
 498         info->tio = tio;
 499         bio->bi_end_io = end_clone_bio;
 500
 501         return 0;
 502 }
 503
 504 static int setup_clone(struct request *clone, struct request *rq,
 505                        struct dm_rq_target_io *tio, gfp_t gfp_mask)
 506 {
 507         int r;
 508
 509         r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
 510                               dm_rq_bio_constructor, tio);
 511         if (r)
 512                 return r;
 513
 514         clone->cmd = rq->cmd;
 515         clone->cmd_len = rq->cmd_len;
 516         clone->sense = rq->sense;
 517         clone->end_io = end_clone_request;
 518         clone->end_io_data = tio;
 519
 520         tio->clone = clone;
 521
 522         return 0;
 523 }
 524
 525 static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
 526                                     struct dm_rq_target_io *tio, gfp_t gfp_mask)
 527 {
 528         /*
 529          * Create clone for use with .request_fn request_queue
 530          */
 531         struct request *clone;
 532
 533         clone = alloc_old_clone_request(md, gfp_mask);
 534         if (!clone)
 535                 return NULL;
 536
 537         blk_rq_init(NULL, clone);
 538         if (setup_clone(clone, rq, tio, gfp_mask)) {
 539                 /* -ENOMEM */
 540                 free_old_clone_request(md, clone);
 541                 return NULL;
 542         }
 543
 544         return clone;
 545 }
 546
 547 static void map_tio_request(struct kthread_work *work);
 548
 549 static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
 550                      struct mapped_device *md)
 551 {
 552         tio->md = md;
 553         tio->ti = NULL;
 554         tio->clone = NULL;
 555         tio->orig = rq;
 556         tio->error = 0;
 557         /*
 558          * Avoid initializing info for blk-mq; it passes
 559          * target-specific data through info.ptr
 560          * (see: dm_mq_init_request)
 561          */
 562         if (!md->init_tio_pdu)
 563                 memset(&tio->info, 0, sizeof(tio->info));
 564         if (md->kworker_task)
 565                 kthread_init_work(&tio->work, map_tio_request);
 566 }
 567
 568 static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
 569                                                struct mapped_device *md,
 570                                                gfp_t gfp_mask)
 571 {
 572         struct dm_rq_target_io *tio;
 573         int srcu_idx;
 574         struct dm_table *table;
 575
 576         tio = alloc_old_rq_tio(md, gfp_mask);
 577         if (!tio)
 578                 return NULL;
 579
 580         init_tio(tio, rq, md);
 581
 582         table = dm_get_live_table(md, &srcu_idx);
 583         /*
 584          * Must clone a request if this .request_fn DM device
 585          * is stacked on .request_fn device(s).
 586          */
 587         if (!dm_table_all_blk_mq_devices(table)) {
 588                 if (!clone_old_rq(rq, md, tio, gfp_mask)) {
 589                         dm_put_live_table(md, srcu_idx);
 590                         free_old_rq_tio(tio);
 591                         return NULL;
 592                 }
 593         }
 594         dm_put_live_table(md, srcu_idx);
 595
 596         return tio;
 597 }
 598
 599 /*
 600  * Called with the queue lock held.
 601  */
 602 static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
 603 {
 604         struct mapped_device *md = q->queuedata;
 605         struct dm_rq_target_io *tio;
 606
 607         if (unlikely(rq->special)) {
 608                 DMWARN("Already has something in rq->special.");
 609                 return BLKPREP_KILL;
 610         }
 611
 612         tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
 613         if (!tio)
 614                 return BLKPREP_DEFER;
 615
 616         rq->special = tio;
 617         rq->rq_flags |= RQF_DONTPREP;
 618
 619         return BLKPREP_OK;
 620 }
 621
 622 /*
 623  * Returns:
 624  * DM_MAPIO_*       : the request has been processed as indicated
 625  * DM_MAPIO_REQUEUE : the original request needs to be immediately requeued
 626  * < 0              : the request was completed due to failure
 627  */
 628 static int map_request(struct dm_rq_target_io *tio)
 629 {
 630         int r;
 631         struct dm_target *ti = tio->ti;
 632         struct mapped_device *md = tio->md;
 633         struct request *rq = tio->orig;
 634         struct request *clone = NULL;
 635
 636         if (tio->clone) {
 637                 clone = tio->clone;
 638                 r = ti->type->map_rq(ti, clone, &tio->info);
 639                 if (r == DM_MAPIO_DELAY_REQUEUE)
 640                         return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */
 641         } else {
 642                 r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
 643                 if (r < 0) {
 644                         /* The target wants to complete the I/O */
 645                         dm_kill_unmapped_request(rq, r);
 646                         return r;
 647                 }
 648                 if (r == DM_MAPIO_REMAPPED &&
 649                     setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 650                         /* -ENOMEM */
 651                         ti->type->release_clone_rq(clone);
 652                         return DM_MAPIO_REQUEUE;
 653                 }
 654         }
 655
 656         switch (r) {
 657         case DM_MAPIO_SUBMITTED:
 658                 /* The target has taken the I/O to submit by itself later */
 659                 break;
 660         case DM_MAPIO_REMAPPED:
 661                 /* The target has remapped the I/O so dispatch it */
 662                 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
 663                                      blk_rq_pos(rq));
 664                 dm_dispatch_clone_request(clone, rq);
 665                 break;
 666         case DM_MAPIO_REQUEUE:
 667                 /* The target wants to requeue the I/O */
 668                 break;
 669         case DM_MAPIO_DELAY_REQUEUE:
 670                 /* The target wants to requeue the I/O after a delay */
 671                 dm_requeue_original_request(tio, true);
 672                 break;
 673         default:
 674                 if (r > 0) {
 675                         DMWARN("unimplemented target map return value: %d", r);
 676                         BUG();
 677                 }
 678
 679                 /* The target wants to complete the I/O */
 680                 dm_kill_unmapped_request(rq, r);
 681         }
 682
 683         return r;
 684 }
 685
 686 static void dm_start_request(struct mapped_device *md, struct request *orig)
 687 {
 688         if (!orig->q->mq_ops)
 689                 blk_start_request(orig);
 690         else
 691                 blk_mq_start_request(orig);
 692         atomic_inc(&md->pending[rq_data_dir(orig)]);
 693
 694         if (md->seq_rq_merge_deadline_usecs) {
 695                 md->last_rq_pos = rq_end_sector(orig);
 696                 md->last_rq_rw = rq_data_dir(orig);
 697                 md->last_rq_start_time = ktime_get();
 698         }
 699
 700         if (unlikely(dm_stats_used(&md->stats))) {
 701                 struct dm_rq_target_io *tio = tio_from_request(orig);
 702                 tio->duration_jiffies = jiffies;
 703                 tio->n_sectors = blk_rq_sectors(orig);
 704                 dm_stats_account_io(&md->stats, rq_data_dir(orig),
 705                                     blk_rq_pos(orig), tio->n_sectors, false, 0,
 706                                     &tio->stats_aux);
 707         }
 708
 709         /*
 710          * Hold the md reference here for the in-flight I/O.
 711          * We can't rely on the reference count by device opener,
 712          * because the device may be closed during the request completion
 713          * when all bios are completed.
 714          * See the comment in rq_completed() too.
 715          */
 716         dm_get(md);
 717 }
 718
 719 static void map_tio_request(struct kthread_work *work)
 720 {
 721         struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
 722
 723         if (map_request(tio) == DM_MAPIO_REQUEUE)
 724                 dm_requeue_original_request(tio, false);
 725 }
 726
 727 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
 728 {
 729         return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
 730 }
 731
 732 #define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
 733
 734 ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
 735                                                      const char *buf, size_t count)
 736 {
 737         unsigned deadline;
 738
 739         if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED)
 740                 return count;
 741
 742         if (kstrtouint(buf, 10, &deadline))
 743                 return -EINVAL;
 744
 745         if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
 746                 deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
 747
 748         md->seq_rq_merge_deadline_usecs = deadline;
 749
 750         return count;
 751 }
 752
 753 static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md)
 754 {
 755         ktime_t kt_deadline;
 756
 757         if (!md->seq_rq_merge_deadline_usecs)
 758                 return false;
 759
 760         kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
 761         kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
 762
 763         return !ktime_after(ktime_get(), kt_deadline);
 764 }
 765
 766 /*
 767  * q->request_fn for old request-based dm.
 768  * Called with the queue lock held.
 769  */
 770 static void dm_old_request_fn(struct request_queue *q)
 771 {
 772         struct mapped_device *md = q->queuedata;
 773         struct dm_target *ti = md->immutable_target;
 774         struct request *rq;
 775         struct dm_rq_target_io *tio;
 776         sector_t pos = 0;
 777
 778         if (unlikely(!ti)) {
 779                 int srcu_idx;
 780                 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 781
 782                 ti = dm_table_find_target(map, pos);
 783                 dm_put_live_table(md, srcu_idx);
 784         }
 785
 786         /*
 787          * For suspend, check blk_queue_stopped() and increment
 788          * ->pending within a single queue_lock not to increment the
 789          * number of in-flight I/Os after the queue is stopped in
 790          * dm_suspend().
 791          */
 792         while (!blk_queue_stopped(q)) {
 793                 rq = blk_peek_request(q);
 794                 if (!rq)
 795                         return;
 796
 797                 /* always use block 0 to find the target for flushes for now */
 798                 pos = 0;
 799                 if (req_op(rq) != REQ_OP_FLUSH)
 800                         pos = blk_rq_pos(rq);
 801
 802                 if ((dm_old_request_peeked_before_merge_deadline(md) &&
 803                      md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) &&
 804                      md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
 805                     (ti->type->busy && ti->type->busy(ti))) {
 806                         blk_delay_queue(q, 10);
 807                         return;
 808                 }
 809
 810                 dm_start_request(md, rq);
 811
 812                 tio = tio_from_request(rq);
 813                 /* Establish tio->ti before queuing work (map_tio_request) */
 814                 tio->ti = ti;
 815                 kthread_queue_work(&md->kworker, &tio->work);
 816                 BUG_ON(!irqs_disabled());
 817         }
 818 }
 819
 820 /*
 821  * Fully initialize a .request_fn request-based queue.
 822  */
 823 int dm_old_init_request_queue(struct mapped_device *md)
 824 {
 825         /* Fully initialize the queue */
 826         if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL))
 827                 return -EINVAL;
 828
 829         /* disable dm_old_request_fn's merge heuristic by default */
 830         md->seq_rq_merge_deadline_usecs = 0;
 831
 832         dm_init_normal_md_queue(md);
 833         blk_queue_softirq_done(md->queue, dm_softirq_done);
 834         blk_queue_prep_rq(md->queue, dm_old_prep_fn);
 835
 836         /* Initialize the request-based DM worker thread */
 837         kthread_init_worker(&md->kworker);
 838         md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
 839                                        "kdmwork-%s", dm_device_name(md));
 840         if (IS_ERR(md->kworker_task)) {
 841                 int error = PTR_ERR(md->kworker_task);
 842                 md->kworker_task = NULL;
 843                 return error;
 844         }
 845
 846         elv_register_queue(md->queue);
 847
 848         return 0;
 849 }
 850
 851 static int dm_mq_init_request(void *data, struct request *rq,
 852                        unsigned int hctx_idx, unsigned int request_idx,
 853                        unsigned int numa_node)
 854 {
 855         struct mapped_device *md = data;
 856         struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
 857
 858         /*
 859          * Must initialize md member of tio, otherwise it won't
 860          * be available in dm_mq_queue_rq.
 861          */
 862         tio->md = md;
 863
 864         if (md->init_tio_pdu) {
 865                 /* target-specific per-io data is immediately after the tio */
 866                 tio->info.ptr = tio + 1;
 867         }
 868
 869         return 0;
 870 }
 871
 872 static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 873                           const struct blk_mq_queue_data *bd)
 874 {
 875         struct request *rq = bd->rq;
 876         struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
 877         struct mapped_device *md = tio->md;
 878         struct dm_target *ti = md->immutable_target;
 879
 880         if (unlikely(!ti)) {
 881                 int srcu_idx;
 882                 struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 883
 884                 ti = dm_table_find_target(map, 0);
 885                 dm_put_live_table(md, srcu_idx);
 886         }
 887
 888         if (ti->type->busy && ti->type->busy(ti))
 889                 return BLK_MQ_RQ_QUEUE_BUSY;
 890
 891         dm_start_request(md, rq);
 892
 893         /* Init tio using md established in .init_request */
 894         init_tio(tio, rq, md);
 895
 896         /*
 897          * Establish tio->ti before calling map_request().
 898          */
 899         tio->ti = ti;
 900
 901         /* Direct call is fine since .queue_rq allows allocations */
 902         if (map_request(tio) == DM_MAPIO_REQUEUE) {
 903                 /* Undo dm_start_request() before requeuing */
 904                 rq_end_stats(md, rq);
 905                 rq_completed(md, rq_data_dir(rq), false);
 906                 return BLK_MQ_RQ_QUEUE_BUSY;
 907         }
 908
 909         return BLK_MQ_RQ_QUEUE_OK;
 910 }
 911
 912 static struct blk_mq_ops dm_mq_ops = {
 913         .queue_rq = dm_mq_queue_rq,
 914         .complete = dm_softirq_done,
 915         .init_request = dm_mq_init_request,
 916 };
 917
 918 int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 919 {
 920         struct request_queue *q;
 921         struct dm_target *immutable_tgt;
 922         int err;
 923
 924         if (!dm_table_all_blk_mq_devices(t)) {
 925                 DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
 926                 return -EINVAL;
 927         }
 928
 929         md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
 930         if (!md->tag_set)
 931                 return -ENOMEM;
 932
 933         md->tag_set->ops = &dm_mq_ops;
 934         md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
 935         md->tag_set->numa_node = md->numa_node_id;
 936         md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 937         md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
 938         md->tag_set->driver_data = md;
 939
 940         md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
 941         immutable_tgt = dm_table_get_immutable_target(t);
 942         if (immutable_tgt && immutable_tgt->per_io_data_size) {
 943                 /* any target-specific per-io data is immediately after the tio */
 944                 md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
 945                 md->init_tio_pdu = true;
 946         }
 947
 948         err = blk_mq_alloc_tag_set(md->tag_set);
 949         if (err)
 950                 goto out_kfree_tag_set;
 951
 952         q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
 953         if (IS_ERR(q)) {
 954                 err = PTR_ERR(q);
 955                 goto out_tag_set;
 956         }
 957         dm_init_md_queue(md);
 958
 959         /* backfill 'mq' sysfs registration normally done in blk_register_queue */
 960         blk_mq_register_dev(disk_to_dev(md->disk), q);
 961
 962         return 0;
 963
 964 out_tag_set:
 965         blk_mq_free_tag_set(md->tag_set);
 966 out_kfree_tag_set:
 967         kfree(md->tag_set);
 968
 969         return err;
 970 }
 971
 972 void dm_mq_cleanup_mapped_device(struct mapped_device *md)
 973 {
 974         if (md->tag_set) {
 975                 blk_mq_free_tag_set(md->tag_set);
 976                 kfree(md->tag_set);
 977         }
 978 }
 979
 980 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 981 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 982
 983 module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
 984 MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
 985
 986 module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
 987 MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
 988
 989 module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
 990 MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");