UBUNTU: Start new release

[mirror_ubuntu-zesty-kernel.git] / block / blk-mq.c
diff --git a/block/blk-mq.c b/block/blk-mq.c

index 6d6f8feb48c08ab875e67c496193a743709b0621..d45989bbe03c047d640b56430b0fb479bb8dda38 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -229,8 +229,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
         return NULL;
  }
  
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
-               bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+               unsigned int flags)
  {
         struct blk_mq_ctx *ctx;
         struct blk_mq_hw_ctx *hctx;
@@ -238,24 +238,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
         struct blk_mq_alloc_data alloc_data;
         int ret;
  
-       ret = blk_queue_enter(q, gfp);
+       ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
         if (ret)
                 return ERR_PTR(ret);
  
         ctx = blk_mq_get_ctx(q);
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
-       blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
-                       reserved, ctx, hctx);
+       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
  
         rq = __blk_mq_alloc_request(&alloc_data, rw);
-       if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
+       if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
                 __blk_mq_run_hw_queue(hctx);
                 blk_mq_put_ctx(ctx);
  
                 ctx = blk_mq_get_ctx(q);
                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
-               blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
-                               hctx);
+               blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
                 rq =  __blk_mq_alloc_request(&alloc_data, rw);
                 ctx = alloc_data.ctx;
         }
@@ -268,6 +266,45 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
  }
  EXPORT_SYMBOL(blk_mq_alloc_request);
  
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
+               unsigned int flags, unsigned int hctx_idx)
+{
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_ctx *ctx;
+       struct request *rq;
+       struct blk_mq_alloc_data alloc_data;
+       int ret;
+
+       /*
+        * If the tag allocator sleeps we could get an allocation for a
+        * different hardware context.  No need to complicate the low level
+        * allocator for this for the rare use case of a command tied to
+        * a specific queue.
+        */
+       if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+               return ERR_PTR(-EINVAL);
+
+       if (hctx_idx >= q->nr_hw_queues)
+               return ERR_PTR(-EIO);
+
+       ret = blk_queue_enter(q, true);
+       if (ret)
+               return ERR_PTR(ret);
+
+       hctx = q->queue_hw_ctx[hctx_idx];
+       ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+
+       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
+       if (!rq) {
+               blk_queue_exit(q);
+               return ERR_PTR(-EWOULDBLOCK);
+       }
+
+       return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
+
  static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
                                   struct blk_mq_ctx *ctx, struct request *rq)
  {
@@ -601,8 +638,10 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                  * If a request wasn't started before the queue was
                  * marked dying, kill it here or it'll go unnoticed.
                  */
-               if (unlikely(blk_queue_dying(rq->q)))
-                       blk_mq_complete_request(rq, -EIO);
+               if (unlikely(blk_queue_dying(rq->q))) {
+                       rq->errors = -EIO;
+                       blk_mq_end_request(rq, rq->errors);
+               }
                 return;
         }
         if (rq->cmd_flags & REQ_NO_TIMEOUT)
@@ -617,15 +656,32 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
         }
  }
  
-static void blk_mq_rq_timer(unsigned long priv)
+static void blk_mq_timeout_work(struct work_struct *work)
  {
-       struct request_queue *q = (struct request_queue *)priv;
+       struct request_queue *q =
+               container_of(work, struct request_queue, timeout_work);
         struct blk_mq_timeout_data data = {
                 .next           = 0,
                 .next_set       = 0,
         };
         int i;
  
+       /* A deadlock might occur if a request is stuck requiring a
+        * timeout at the same time a queue freeze is waiting for
+        * completion, since the timeout code would not be able to
+        * acquire the queue reference here.
+        *
+        * That's why we don't use blk_queue_enter here; instead, we use
+        * percpu_ref_tryget directly, because we need to be able to
+        * obtain a reference even in the short window between the queue
+        * starting to freeze, by dropping the first reference in
+        * blk_mq_freeze_queue_start, and the moment the last request is
+        * consumed, marked by the instant q_usage_counter reaches
+        * zero.
+        */
+       if (!percpu_ref_tryget(&q->q_usage_counter))
+               return;
+
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
  
         if (data.next_set) {
@@ -640,6 +696,7 @@ static void blk_mq_rq_timer(unsigned long priv)
                                 blk_mq_tag_idle(hctx);
                 }
         }
+       blk_queue_exit(q);
  }
  
  /*
@@ -730,11 +787,12 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         struct list_head *dptr;
         int queued;
  
-       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
-
         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                 return;
  
+       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+               cpu_online(hctx->next_cpu));
+
         hctx->run++;
  
         /*
@@ -778,7 +836,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                 switch (ret) {
                 case BLK_MQ_RQ_QUEUE_OK:
                         queued++;
-                       continue;
+                       break;
                 case BLK_MQ_RQ_QUEUE_BUSY:
                         list_add(&rq->queuelist, &rq_list);
                         __blk_mq_requeue_request(rq);
@@ -840,7 +898,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
                 return WORK_CPU_UNBOUND;
  
         if (--hctx->next_cpu_batch <= 0) {
-               int cpu = hctx->next_cpu, next_cpu;
+               int next_cpu;
  
                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                 if (next_cpu >= nr_cpu_ids)
@@ -848,8 +906,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  
                 hctx->next_cpu = next_cpu;
                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
-               return cpu;
         }
  
         return hctx->next_cpu;
@@ -973,10 +1029,11 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
  EXPORT_SYMBOL(blk_mq_delay_queue);
  
  static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
-                                           struct blk_mq_ctx *ctx,
                                             struct request *rq,
                                             bool at_head)
  {
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+
         trace_block_rq_insert(hctx->queue, rq);
  
         if (at_head)
@@ -990,20 +1047,16 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
  {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
  
-       __blk_mq_insert_req_list(hctx, ctx, rq, at_head);
+       __blk_mq_insert_req_list(hctx, rq, at_head);
         blk_mq_hctx_mark_pending(hctx, ctx);
  }
  
  void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-               bool async)
+                          bool async)
  {
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
         struct request_queue *q = rq->q;
         struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
-
-       current_ctx = blk_mq_get_ctx(q);
-       if (!cpu_online(ctx->cpu))
-               rq->mq_ctx = ctx = current_ctx;
  
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
  
@@ -1013,8 +1066,6 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
  
         if (run_queue)
                 blk_mq_run_hw_queue(hctx, async);
-
-       blk_mq_put_ctx(current_ctx);
  }
  
  static void blk_mq_insert_requests(struct request_queue *q,
@@ -1025,14 +1076,9 @@ static void blk_mq_insert_requests(struct request_queue *q,
  
  {
         struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *current_ctx;
  
         trace_block_unplug(q, depth, !from_schedule);
  
-       current_ctx = blk_mq_get_ctx(q);
-
-       if (!cpu_online(ctx->cpu))
-               ctx = current_ctx;
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
  
         /*
@@ -1044,15 +1090,14 @@ static void blk_mq_insert_requests(struct request_queue *q,
                 struct request *rq;
  
                 rq = list_first_entry(list, struct request, queuelist);
+               BUG_ON(rq->mq_ctx != ctx);
                 list_del_init(&rq->queuelist);
-               rq->mq_ctx = ctx;
-               __blk_mq_insert_req_list(hctx, ctx, rq, false);
+               __blk_mq_insert_req_list(hctx, rq, false);
         }
         blk_mq_hctx_mark_pending(hctx, ctx);
         spin_unlock(&ctx->lock);
  
         blk_mq_run_hw_queue(hctx, from_schedule);
-       blk_mq_put_ctx(current_ctx);
  }
  
  static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1175,8 +1220,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
                 rw |= REQ_SYNC;
  
         trace_block_getrq(q, bio, rw);
-       blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
-                       hctx);
+       blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
         rq = __blk_mq_alloc_request(&alloc_data, rw);
         if (unlikely(!rq)) {
                 __blk_mq_run_hw_queue(hctx);
@@ -1185,8 +1229,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
  
                 ctx = blk_mq_get_ctx(q);
                 hctx = q->mq_ops->map_queue(q, ctx->cpu);
-               blk_mq_set_alloc_data(&alloc_data, q,
-                               __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
+               blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
                 rq = __blk_mq_alloc_request(&alloc_data, rw);
                 ctx = alloc_data.ctx;
                 hctx = alloc_data.hctx;
@@ -1252,19 +1295,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
  
         blk_queue_bounce(q, &bio);
  
+       blk_queue_split(q, &bio, q->bio_split);
+
         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
                 bio_io_error(bio);
                 return BLK_QC_T_NONE;
         }
  
-       blk_queue_split(q, &bio, q->bio_split);
-
-       if (!is_flush_fua && !blk_queue_nomerges(q)) {
-               if (blk_attempt_plug_merge(q, bio, &request_count,
-                                          &same_queue_rq))
-                       return BLK_QC_T_NONE;
-       } else
-               request_count = blk_plug_queued_count(q);
+       if (!is_flush_fua && !blk_queue_nomerges(q) &&
+           blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+               return BLK_QC_T_NONE;
  
         rq = blk_mq_map_request(q, bio, &data);
         if (unlikely(!rq))
@@ -1311,9 +1351,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 blk_mq_put_ctx(data.ctx);
                 if (!old_rq)
                         goto done;
-               if (!blk_mq_direct_issue_request(old_rq, &cookie))
-                       goto done;
-               blk_mq_insert_request(old_rq, false, true, true);
+               if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) ||
+                   blk_mq_direct_issue_request(old_rq, &cookie) != 0)
+                       blk_mq_insert_request(old_rq, false, true, true);
                 goto done;
         }
  
@@ -1355,9 +1395,11 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
  
         blk_queue_split(q, &bio, q->bio_split);
  
-       if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, &request_count, NULL))
-               return BLK_QC_T_NONE;
+       if (!is_flush_fua && !blk_queue_nomerges(q)) {
+               if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
+                       return BLK_QC_T_NONE;
+       } else
+               request_count = blk_plug_queued_count(q);
  
         rq = blk_mq_map_request(q, bio, &data);
         if (unlikely(!rq))
@@ -1471,7 +1513,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
         INIT_LIST_HEAD(&tags->page_list);
  
         tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
-                                GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                  set->numa_node);
         if (!tags->rqs) {
                 blk_mq_free_tags(tags);
@@ -1497,7 +1539,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
  
                 do {
                         page = alloc_pages_node(set->numa_node,
-                               GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                 this_order);
                         if (page)
                                 break;
@@ -1518,7 +1560,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                  * Allow kmemleak to scan these pages as they contain pointers
                  * to additional allocations like via ops->init_request().
                  */
-               kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
+               kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                 entries_per_page = order_to_size(this_order) / rq_size;
                 to_do = min(entries_per_page, set->queue_depth - i);
                 left -= to_do * rq_size;
@@ -1570,16 +1612,17 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
         return 0;
  }
  
+/*
+ * 'cpu' is going away. Splice any existing rq_list entries from this
+ * software queue to the hw queue dispatch list, and ensure that it
+ * gets run.
+ */
  static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
  {
-       struct request_queue *q = hctx->queue;
         struct blk_mq_ctx *ctx;
         LIST_HEAD(tmp);
  
-       /*
-        * Move ctx entries to new CPU, if this one is going away.
-        */
-       ctx = __blk_mq_get_ctx(q, cpu);
+       ctx = __blk_mq_get_ctx(hctx->queue, cpu);
  
         spin_lock(&ctx->lock);
         if (!list_empty(&ctx->rq_list)) {
@@ -1591,24 +1634,11 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
         if (list_empty(&tmp))
                 return NOTIFY_OK;
  
-       ctx = blk_mq_get_ctx(q);
-       spin_lock(&ctx->lock);
-
-       while (!list_empty(&tmp)) {
-               struct request *rq;
-
-               rq = list_first_entry(&tmp, struct request, queuelist);
-               rq->mq_ctx = ctx;
-               list_move_tail(&rq->queuelist, &ctx->rq_list);
-       }
-
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
-       blk_mq_hctx_mark_pending(hctx, ctx);
-
-       spin_unlock(&ctx->lock);
+       spin_lock(&hctx->lock);
+       list_splice_tail_init(&tmp, &hctx->dispatch);
+       spin_unlock(&hctx->lock);
  
         blk_mq_run_hw_queue(hctx, true);
-       blk_mq_put_ctx(ctx);
         return NOTIFY_OK;
  }
  
@@ -1801,7 +1831,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
  static void blk_mq_map_swqueue(struct request_queue *q,
                                const struct cpumask *online_mask)
  {
-       unsigned int i;
+       unsigned int i, hctx_idx;
         struct blk_mq_hw_ctx *hctx;
         struct blk_mq_ctx *ctx;
         struct blk_mq_tag_set *set = q->tag_set;
@@ -1819,11 +1849,27 @@ static void blk_mq_map_swqueue(struct request_queue *q,
         /*
          * Map software to hardware queues
          */
-       queue_for_each_ctx(q, ctx, i) {
+       for_each_possible_cpu(i) {
                 /* If the cpu isn't online, the cpu is mapped to first hctx */
                 if (!cpumask_test_cpu(i, online_mask))
                         continue;
  
+               hctx_idx = q->mq_map[i];
+               /* unmapped hw queue can be remapped after CPU topo changed */
+               if (!set->tags[hctx_idx]) {
+                       set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
+
+                       /*
+                        * If tags initialization fails for some hctx,
+                        * that hctx won't be brought online.  In this
+                        * case, remap the current ctx to hctx[0] which
+                        * is guaranteed to always have tags allocated
+                        */
+                       if (!set->tags[hctx_idx])
+                               q->mq_map[i] = 0;
+               }
+
+               ctx = per_cpu_ptr(q->queue_ctx, i);
                 hctx = q->mq_ops->map_queue(q, i);
                 cpumask_set_cpu(i, hctx->cpumask);
                 ctx->index_hw = hctx->nr_ctx;
@@ -1840,7 +1886,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                  * disable it and free the request entries.
                  */
                 if (!hctx->nr_ctx) {
-                       if (set->tags[i]) {
+                       /* Never unmap queue 0.  We need it as a
+                        * fallback in case a new remap fails
+                        * allocation.
+                        */
+                       if (i && set->tags[i]) {
                                 blk_mq_free_rq_map(set, set->tags[i], i);
                                 set->tags[i] = NULL;
                         }
@@ -1848,12 +1898,10 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                         continue;
                 }
  
-               /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[i])
-                       set->tags[i] = blk_mq_init_rq_map(set, i);
                 hctx->tags = set->tags[i];
                 WARN_ON(!hctx->tags);
  
+               cpumask_copy(hctx->tags->cpumask, hctx->cpumask);
                 /*
                  * Set the map size to the number of mapped software queues.
                  * This is more accurate and more efficient than looping
@@ -1867,14 +1915,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                 hctx->next_cpu = cpumask_first(hctx->cpumask);
                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
         }
-
-       queue_for_each_ctx(q, ctx, i) {
-               if (!cpumask_test_cpu(i, online_mask))
-                       continue;
-
-               hctx = q->mq_ops->map_queue(q, i);
-               cpumask_set_cpu(i, hctx->tags->cpumask);
-       }
  }
  
  static void queue_set_hctx_shared(struct request_queue *q, bool shared)
@@ -2019,7 +2059,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                 hctxs[i]->queue_num = i;
         }
  
-       setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+       INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
         blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
  
         q->nr_queues = nr_cpu_ids;