UBUNTU: Start new release

[mirror_ubuntu-zesty-kernel.git] / block / blk-mq.c
diff --git a/block/blk-mq.c b/block/blk-mq.c

index 32598d212cf319225d395e183ccd245bc0317494..d45989bbe03c047d640b56430b0fb479bb8dda38 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -266,6 +266,45 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
  }
  EXPORT_SYMBOL(blk_mq_alloc_request);
  
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
+               unsigned int flags, unsigned int hctx_idx)
+{
+       struct blk_mq_hw_ctx *hctx;
+       struct blk_mq_ctx *ctx;
+       struct request *rq;
+       struct blk_mq_alloc_data alloc_data;
+       int ret;
+
+       /*
+        * If the tag allocator sleeps we could get an allocation for a
+        * different hardware context.  No need to complicate the low level
+        * allocator for the rare use case of a command tied to a specific
+        * queue, so only non-blocking allocations are accepted here.
+        */
+       if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+               return ERR_PTR(-EINVAL);
+
+       if (hctx_idx >= q->nr_hw_queues)
+               return ERR_PTR(-EIO);
+
+       ret = blk_queue_enter(q, true);
+       if (ret)
+               return ERR_PTR(ret);
+
+       hctx = q->queue_hw_ctx[hctx_idx];
+       ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+
+       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
+       rq = __blk_mq_alloc_request(&alloc_data, rw);
+       if (!rq) {
+               blk_queue_exit(q);
+               return ERR_PTR(-EWOULDBLOCK);
+       }
+
+       return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
+
  static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
                                   struct blk_mq_ctx *ctx, struct request *rq)
  {
@@ -627,7 +666,20 @@ static void blk_mq_timeout_work(struct work_struct *work)
         };
         int i;
  
-       if (blk_queue_enter(q, true))
+       /* A deadlock might occur if a request is stuck waiting for a
+        * timeout while a queue freeze is waiting for completion,
+        * since the timeout code would not be able to acquire the
+        * queue reference here.
+        *
+        * That's why we don't use blk_queue_enter here; instead, we use
+        * percpu_ref_tryget directly, because we need to be able to
+        * obtain a reference even in the short window between the queue
+        * starting to freeze, by dropping the first reference in
+        * blk_mq_freeze_queue_start, and the moment the last request is
+        * consumed, marked by the instant q_usage_counter reaches
+        * zero.
+        */
+       if (!percpu_ref_tryget(&q->q_usage_counter))
                 return;
  
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
@@ -735,11 +787,12 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
         struct list_head *dptr;
         int queued;
  
-       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
-
         if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
                 return;
  
+       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+               cpu_online(hctx->next_cpu));
+
         hctx->run++;
  
         /*
@@ -783,7 +836,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                 switch (ret) {
                 case BLK_MQ_RQ_QUEUE_OK:
                         queued++;
-                       continue;
+                       break;
                 case BLK_MQ_RQ_QUEUE_BUSY:
                         list_add(&rq->queuelist, &rq_list);
                         __blk_mq_requeue_request(rq);
@@ -845,7 +898,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
                 return WORK_CPU_UNBOUND;
  
         if (--hctx->next_cpu_batch <= 0) {
-               int cpu = hctx->next_cpu, next_cpu;
+               int next_cpu;
  
                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                 if (next_cpu >= nr_cpu_ids)
@@ -853,8 +906,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  
                 hctx->next_cpu = next_cpu;
                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
-               return cpu;
         }
  
         return hctx->next_cpu;
@@ -978,10 +1029,11 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
  EXPORT_SYMBOL(blk_mq_delay_queue);
  
  static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
-                                           struct blk_mq_ctx *ctx,
                                             struct request *rq,
                                             bool at_head)
  {
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+
         trace_block_rq_insert(hctx->queue, rq);
  
         if (at_head)
@@ -995,20 +1047,16 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
  {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
  
-       __blk_mq_insert_req_list(hctx, ctx, rq, at_head);
+       __blk_mq_insert_req_list(hctx, rq, at_head);
         blk_mq_hctx_mark_pending(hctx, ctx);
  }
  
  void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-               bool async)
+                          bool async)
  {
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
         struct request_queue *q = rq->q;
         struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
-
-       current_ctx = blk_mq_get_ctx(q);
-       if (!cpu_online(ctx->cpu))
-               rq->mq_ctx = ctx = current_ctx;
  
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
  
@@ -1018,8 +1066,6 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
  
         if (run_queue)
                 blk_mq_run_hw_queue(hctx, async);
-
-       blk_mq_put_ctx(current_ctx);
  }
  
  static void blk_mq_insert_requests(struct request_queue *q,
@@ -1030,14 +1076,9 @@ static void blk_mq_insert_requests(struct request_queue *q,
  
  {
         struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *current_ctx;
  
         trace_block_unplug(q, depth, !from_schedule);
  
-       current_ctx = blk_mq_get_ctx(q);
-
-       if (!cpu_online(ctx->cpu))
-               ctx = current_ctx;
         hctx = q->mq_ops->map_queue(q, ctx->cpu);
  
         /*
@@ -1049,15 +1090,14 @@ static void blk_mq_insert_requests(struct request_queue *q,
                 struct request *rq;
  
                 rq = list_first_entry(list, struct request, queuelist);
+               BUG_ON(rq->mq_ctx != ctx);
                 list_del_init(&rq->queuelist);
-               rq->mq_ctx = ctx;
-               __blk_mq_insert_req_list(hctx, ctx, rq, false);
+               __blk_mq_insert_req_list(hctx, rq, false);
         }
         blk_mq_hctx_mark_pending(hctx, ctx);
         spin_unlock(&ctx->lock);
  
         blk_mq_run_hw_queue(hctx, from_schedule);
-       blk_mq_put_ctx(current_ctx);
  }
  
  static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1255,19 +1295,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
  
         blk_queue_bounce(q, &bio);
  
+       blk_queue_split(q, &bio, q->bio_split);
+
         if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
                 bio_io_error(bio);
                 return BLK_QC_T_NONE;
         }
  
-       blk_queue_split(q, &bio, q->bio_split);
-
-       if (!is_flush_fua && !blk_queue_nomerges(q)) {
-               if (blk_attempt_plug_merge(q, bio, &request_count,
-                                          &same_queue_rq))
-                       return BLK_QC_T_NONE;
-       } else
-               request_count = blk_plug_queued_count(q);
+       if (!is_flush_fua && !blk_queue_nomerges(q) &&
+           blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+               return BLK_QC_T_NONE;
  
         rq = blk_mq_map_request(q, bio, &data);
         if (unlikely(!rq))
@@ -1314,9 +1351,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 blk_mq_put_ctx(data.ctx);
                 if (!old_rq)
                         goto done;
-               if (!blk_mq_direct_issue_request(old_rq, &cookie))
-                       goto done;
-               blk_mq_insert_request(old_rq, false, true, true);
+               if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) ||
+                   blk_mq_direct_issue_request(old_rq, &cookie) != 0)
+                       blk_mq_insert_request(old_rq, false, true, true);
                 goto done;
         }
  
@@ -1358,9 +1395,11 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
  
         blk_queue_split(q, &bio, q->bio_split);
  
-       if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, &request_count, NULL))
-               return BLK_QC_T_NONE;
+       if (!is_flush_fua && !blk_queue_nomerges(q)) {
+               if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
+                       return BLK_QC_T_NONE;
+       } else
+               request_count = blk_plug_queued_count(q);
  
         rq = blk_mq_map_request(q, bio, &data);
         if (unlikely(!rq))
@@ -1474,7 +1513,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
         INIT_LIST_HEAD(&tags->page_list);
  
         tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
-                                GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                  set->numa_node);
         if (!tags->rqs) {
                 blk_mq_free_tags(tags);
@@ -1500,7 +1539,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
  
                 do {
                         page = alloc_pages_node(set->numa_node,
-                               GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                 this_order);
                         if (page)
                                 break;
@@ -1521,7 +1560,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
                  * Allow kmemleak to scan these pages as they contain pointers
                  * to additional allocations like via ops->init_request().
                  */
-               kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
+               kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                 entries_per_page = order_to_size(this_order) / rq_size;
                 to_do = min(entries_per_page, set->queue_depth - i);
                 left -= to_do * rq_size;
@@ -1573,16 +1612,17 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
         return 0;
  }
  
+/*
+ * 'cpu' is going away. Splice any existing rq_list entries from this
+ * software queue to the hw queue dispatch list, and ensure that the
+ * hw queue gets run.
+ */
  static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
  {
-       struct request_queue *q = hctx->queue;
         struct blk_mq_ctx *ctx;
         LIST_HEAD(tmp);
  
-       /*
-        * Move ctx entries to new CPU, if this one is going away.
-        */
-       ctx = __blk_mq_get_ctx(q, cpu);
+       ctx = __blk_mq_get_ctx(hctx->queue, cpu);
  
         spin_lock(&ctx->lock);
         if (!list_empty(&ctx->rq_list)) {
@@ -1594,24 +1634,11 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
         if (list_empty(&tmp))
                 return NOTIFY_OK;
  
-       ctx = blk_mq_get_ctx(q);
-       spin_lock(&ctx->lock);
-
-       while (!list_empty(&tmp)) {
-               struct request *rq;
-
-               rq = list_first_entry(&tmp, struct request, queuelist);
-               rq->mq_ctx = ctx;
-               list_move_tail(&rq->queuelist, &ctx->rq_list);
-       }
-
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
-       blk_mq_hctx_mark_pending(hctx, ctx);
-
-       spin_unlock(&ctx->lock);
+       spin_lock(&hctx->lock);
+       list_splice_tail_init(&tmp, &hctx->dispatch);
+       spin_unlock(&hctx->lock);
  
         blk_mq_run_hw_queue(hctx, true);
-       blk_mq_put_ctx(ctx);
         return NOTIFY_OK;
  }
  
@@ -1804,7 +1831,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
  static void blk_mq_map_swqueue(struct request_queue *q,
                                const struct cpumask *online_mask)
  {
-       unsigned int i;
+       unsigned int i, hctx_idx;
         struct blk_mq_hw_ctx *hctx;
         struct blk_mq_ctx *ctx;
         struct blk_mq_tag_set *set = q->tag_set;
@@ -1822,11 +1849,27 @@ static void blk_mq_map_swqueue(struct request_queue *q,
         /*
          * Map software to hardware queues
          */
-       queue_for_each_ctx(q, ctx, i) {
+       for_each_possible_cpu(i) {
                 /* If the cpu isn't online, the cpu is mapped to first hctx */
                 if (!cpumask_test_cpu(i, online_mask))
                         continue;
  
+               hctx_idx = q->mq_map[i];
+               /* unmapped hw queue can be remapped after CPU topo changed */
+               if (!set->tags[hctx_idx]) {
+                       set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
+
+                       /*
+                        * If tags initialization fails for some hctx,
+                        * that hctx won't be brought online.  In this
+                        * case, remap the current ctx to hctx[0], which
+                        * is guaranteed to always have tags allocated.
+                        */
+                       if (!set->tags[hctx_idx])
+                               q->mq_map[i] = 0;
+               }
+
+               ctx = per_cpu_ptr(q->queue_ctx, i);
                 hctx = q->mq_ops->map_queue(q, i);
                 cpumask_set_cpu(i, hctx->cpumask);
                 ctx->index_hw = hctx->nr_ctx;
@@ -1843,7 +1886,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                  * disable it and free the request entries.
                  */
                 if (!hctx->nr_ctx) {
-                       if (set->tags[i]) {
+                       /* Never unmap queue 0.  We need it as a
+                        * fallback in case a new remap fails its
+                        * tag allocation.
+                        */
+                       if (i && set->tags[i]) {
                                 blk_mq_free_rq_map(set, set->tags[i], i);
                                 set->tags[i] = NULL;
                         }
@@ -1851,9 +1898,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
                         continue;
                 }
  
-               /* unmapped hw queue can be remapped after CPU topo changed */
-               if (!set->tags[i])
-                       set->tags[i] = blk_mq_init_rq_map(set, i);
                 hctx->tags = set->tags[i];
                 WARN_ON(!hctx->tags);