/*
 * Budget Fair Queueing (BFQ) I/O scheduler.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini@google.com>
 *
 * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BFQ is a proportional-share I/O scheduler, with some extra
 * low-latency capabilities. BFQ also supports full hierarchical
 * scheduling through cgroups. The next paragraphs provide an
 * introduction to BFQ's inner workings. Details on BFQ benefits,
 * usage and limitations can be found in
 * Documentation/block/bfq-iosched.txt.
 *
 * BFQ is a proportional-share storage-I/O scheduling algorithm based
 * on the slice-by-slice service scheme of CFQ. But BFQ assigns
 * budgets, measured in number of sectors, to processes instead of
 * time slices. The device is not granted to the in-service process
 * for a given time slice, but until it has exhausted its assigned
 * budget. This change from the time to the service domain enables BFQ
 * to distribute the device throughput among processes as desired,
 * without any distortion due to throughput fluctuations, or to device
 * internal queueing. BFQ uses an ad hoc internal scheduler, called
 * B-WF2Q+, to schedule processes according to their budgets. More
 * precisely, BFQ schedules queues associated with processes. Each
 * process/queue is assigned a user-configurable weight, and B-WF2Q+
 * guarantees that each queue receives a fraction of the throughput
 * proportional to its weight. Thanks to the accurate policy of
 * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
 * processes issuing sequential requests (to boost the throughput),
 * and yet guarantee a low latency to interactive and soft real-time
 * applications.
 *
 * In particular, to provide these low-latency guarantees, BFQ
 * explicitly privileges the I/O of two classes of time-sensitive
 * applications: interactive and soft real-time. This feature enables
 * BFQ to provide applications in these classes with a very low
 * latency. Finally, BFQ also features additional heuristics for
 * preserving both a low latency and a high throughput on NCQ-capable,
 * rotational or flash-based devices, and to get the job done quickly
 * for applications consisting of many I/O-bound processes.
 *
 * BFQ is described in [1], which also contains a reference to the
 * initial, more theoretical paper on BFQ. The interested reader can
 * find in the latter paper full details on the main algorithm, as
 * well as formulas of the guarantees and formal proofs of all the
 * properties. With respect to the version of BFQ presented in these
 * papers, this implementation adds a few more heuristics, such as the
 * one that guarantees a low latency to soft real-time applications,
 * and a hierarchical extension based on H-WF2Q+.
 *
 * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
 * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
 * with O(log N) complexity derives from the one introduced with EEVDF
 * in [3].
 *
 * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
 *     Scheduler", Proceedings of the First Workshop on Mobile System
 *     Technologies (MST-2015), May 2015.
 *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
 *
 * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
 *     Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
 *     Oct 1997.
 *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
 *
 * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
 *     First: A Flexible and Accurate Mechanism for Proportional Share
 *     Resource Allocation", technical report.
 *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
 */
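
/*
 * Illustrative note (not part of the scheduler): per the description
 * above, B-WF2Q+ timestamps each queue with a virtual finish time
 *
 *	F_i = S_i + budget_i / weight_i
 *
 * and always serves the eligible queue with the smallest F_i. For
 * example, two always-backlogged queues with equal budgets and
 * weights 100 and 200 accumulate finish times at a 2:1 ratio, so the
 * second queue receives roughly two thirds of the device throughput.
 */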
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_WEIGHT_LEGACY_DFL	100
#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

/*
 * Soft real-time applications are far more latency-sensitive than
 * interactive ones. Over-raise the weight of the former to privilege
 * them over the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100
/**
 * struct bfq_service_tree - per ioprio_class service tree.
 *
 * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
 * ioprio_class has its own independent scheduler, and so its own
 * bfq_service_tree.  All the fields are protected by the queue lock
 * of the containing bfqd.
 */
struct bfq_service_tree {
	/* tree for active entities (i.e., those backlogged) */
	struct rb_root active;
	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
	struct rb_root idle;

	/* idle entity with minimum F_i */
	struct bfq_entity *first_idle;
	/* idle entity with maximum F_i */
	struct bfq_entity *last_idle;

	/* scheduler virtual time */
	u64 vtime;
	/* scheduler weight sum; active and idle entities contribute to it */
	unsigned long wsum;
};
/**
 * struct bfq_sched_data - multi-class scheduler.
 *
 * bfq_sched_data is the basic scheduler queue.  It supports three
 * ioprio_classes, and can be used either as a toplevel queue or as an
 * intermediate queue in a hierarchical setup.  @next_in_service
 * points to the active entity of the sched_data service trees that
 * will be scheduled next. It is used to reduce the number of steps
 * needed for each hierarchical-schedule update.
 *
 * The supported ioprio_classes are the same as in CFQ, in descending
 * priority order: IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
 * Requests from higher priority queues are served before all the
 * requests from lower priority queues; among requests of the same
 * queue, requests are served according to B-WF2Q+.
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_sched_data {
	/* entity in service */
	struct bfq_entity *in_service_entity;
	/* head-of-line entity (see comments above) */
	struct bfq_entity *next_in_service;
	/* array of service trees, one per ioprio_class */
	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
	/* last time CLASS_IDLE was served */
	unsigned long bfq_class_idle_last_service;
};
/**
 * struct bfq_weight_counter - counter of the number of all active entities
 *                             with a given weight.
 */
struct bfq_weight_counter {
	unsigned int weight; /* weight of the entities this counter refers to */
	unsigned int num_active; /* nr of active entities with this weight */
	/*
	 * Weights tree member (see bfq_data's @queue_weights_tree and
	 * @group_weights_tree)
	 */
	struct rb_node weights_node;
};
/**
 * struct bfq_entity - schedulable entity.
 *
 * A bfq_entity is used to represent either a bfq_queue (leaf node in the
 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
 * entity belongs to the sched_data of the parent group in the cgroup
 * hierarchy. Non-leaf entities have also their own sched_data, stored
 * in @my_sched_data.
 *
 * Each entity stores independently its priority values; this would
 * allow different weights on different devices, but this
 * functionality is not exported to userspace by now.  Priorities and
 * weights are updated lazily, first storing the new values into the
 * new_* fields, then setting the @prio_changed flag.  As soon as
 * there is a transition in the entity state that allows the priority
 * update to take place, the effective and the requested priority
 * values are synchronized.
 *
 * Unless cgroups are used, the weight value is calculated from the
 * ioprio to export the same interface as CFQ.  When dealing with
 * ``well-behaved'' queues (i.e., queues that do not spend too much
 * time to consume their budget and have true sequential behavior, and
 * when there are no external factors breaking anticipation), the
 * relative weights at each level of the cgroups hierarchy should be
 * guaranteed.  All the fields are protected by the queue lock of the
 * containing bfqd.
 */
struct bfq_entity {
	/* service_tree member */
	struct rb_node rb_node;
	/* pointer to the weight counter associated with this entity */
	struct bfq_weight_counter *weight_counter;

	/*
	 * Flag, true if the entity is on a tree (either the active or
	 * the idle one of its service_tree) or is in service.
	 */
	bool on_st;

	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
	u64 start, finish;

	/* tree the entity is enqueued into; %NULL if not on a tree */
	struct rb_root *tree;

	/*
	 * minimum start time of the (active) subtree rooted at this
	 * entity; used for O(log N) lookups into active trees
	 */
	u64 min_start;

	/* amount of service received during the last service slot */
	int service;

	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
	int budget;

	/* weight of the queue */
	int weight;
	/* next weight if a change is in progress */
	int new_weight;

	/* original weight, used to implement weight boosting */
	int orig_weight;

	/* parent entity, for hierarchical scheduling */
	struct bfq_entity *parent;

	/*
	 * For non-leaf nodes in the hierarchy, the associated
	 * scheduler queue, %NULL on leaf nodes.
	 */
	struct bfq_sched_data *my_sched_data;
	/* the scheduler queue this entity belongs to */
	struct bfq_sched_data *sched_data;

	/* flag, set to request a weight, ioprio or ioprio_class change */
	int prio_changed;
};
/**
 * struct bfq_ttime - per process thinktime stats.
 */
struct bfq_ttime {
	/* completion time of the last request */
	u64 last_end_request;

	/* total process thinktime */
	u64 ttime_total;
	/* number of thinktime samples */
	unsigned long ttime_samples;
	/* average process thinktime */
	u64 ttime_mean;
};
/**
 * struct bfq_queue - leaf schedulable entity.
 *
 * A bfq_queue is a leaf request queue; it can be associated with an
 * io_context or more, if it is async or shared between cooperating
 * processes. @cgroup holds a reference to the cgroup, to be sure that it
 * does not disappear while a bfqq still references it (mostly to avoid
 * races between request issuing and task migration followed by cgroup
 * destruction).
 * All the fields are protected by the queue lock of the containing bfqd.
 */
struct bfq_queue {
	/* reference counter */
	int ref;
	/* parent bfq_data */
	struct bfq_data *bfqd;

	/* current ioprio and ioprio class */
	unsigned short ioprio, ioprio_class;
	/* next ioprio and ioprio class if a change is in progress */
	unsigned short new_ioprio, new_ioprio_class;

	/*
	 * Shared bfq_queue if queue is cooperating with one or more
	 * other queues.
	 */
	struct bfq_queue *new_bfqq;
	/* request-position tree member (see bfq_group's @rq_pos_tree) */
	struct rb_node pos_node;
	/* request-position tree root (see bfq_group's @rq_pos_tree) */
	struct rb_root *pos_root;

	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* number of sync and async requests queued */
	int queued[2];
	/* number of requests currently allocated */
	int allocated;
	/* number of pending metadata requests */
	int meta_pending;
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* entity representing this queue in the scheduler */
	struct bfq_entity entity;

	/* maximum budget allowed from the feedback mechanism */
	int max_budget;
	/* budget expiration (in jiffies) */
	unsigned long budget_timeout;

	/* number of requests on the dispatch list or inside the driver */
	int dispatched;

	/* status flags */
	unsigned long flags;

	/* node for active/idle bfqq list inside parent bfqd */
	struct list_head bfqq_list;

	/* associated @bfq_ttime struct */
	struct bfq_ttime ttime;

	/* bit vector: a 1 for each seeky request in history */
	u32 seek_history;

	/* node for the device's burst list */
	struct hlist_node burst_list_node;

	/* position of the last request enqueued */
	sector_t last_request_pos;

	/* Number of consecutive pairs of request completion and
	 * arrival, such that the queue becomes idle after the
	 * completion, but the next request arrives within an idle
	 * time slice; used only if the queue's IO_bound flag has been
	 * cleared.
	 */
	unsigned int requests_within_timer;

	/* pid of the process owning the queue, used for logging purposes */
	pid_t pid;

	/*
	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
	 * if the queue is shared.
	 */
	struct bfq_io_cq *bic;

	/* current maximum weight-raising time for this queue */
	unsigned long wr_cur_max_time;
	/*
	 * Minimum time instant such that, only if a new request is
	 * enqueued after this time instant in an idle @bfq_queue with
	 * no outstanding requests, then the task associated with the
	 * queue is deemed as soft real-time (see the comments on
	 * the function bfq_bfqq_softrt_next_start())
	 */
	unsigned long soft_rt_next_start;
	/*
	 * Start time of the current weight-raising period if
	 * the @bfq-queue is being weight-raised, otherwise
	 * finish time of the last weight-raising period.
	 */
	unsigned long last_wr_start_finish;
	/* factor by which the weight of this queue is multiplied */
	unsigned int wr_coeff;
	/*
	 * Time of the last transition of the @bfq_queue from idle to
	 * backlogged.
	 */
	unsigned long last_idle_bklogged;
	/*
	 * Cumulative service received from the @bfq_queue since the
	 * last transition from idle to backlogged.
	 */
	unsigned long service_from_backlogged;

	/*
	 * Value of wr start time when switching to soft rt
	 */
	unsigned long wr_start_at_switch_to_srt;

	unsigned long split_time; /* time of last split */
};
/**
 * struct bfq_io_cq - per (request_queue, io_context) structure.
 */
struct bfq_io_cq {
	/* associated io_cq structure */
	struct io_cq icq; /* must be the first member */
	/* array of two process queues, the sync and the async */
	struct bfq_queue *bfqq[2];
	/* per (request_queue, blkcg) ioprio */
	int ioprio;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif

	/*
	 * Snapshot of the idle window before merging; taken to
	 * remember this value while the queue is merged, so as to be
	 * able to restore it in case of split.
	 */
	bool saved_idle_window;
	/*
	 * Same purpose as the previous two fields for the I/O bound
	 * classification of a queue.
	 */
	bool saved_IO_bound;

	/*
	 * Same purpose as the previous fields for the value of the
	 * field keeping the queue's belonging to a large burst
	 */
	bool saved_in_large_burst;
	/*
	 * True if the queue belonged to a burst list before its merge
	 * with another cooperating queue.
	 */
	bool was_in_burst_list;

	/*
	 * Similar to previous fields: save wr information.
	 */
	unsigned long saved_wr_coeff;
	unsigned long saved_last_wr_start_finish;
	unsigned long saved_wr_start_at_switch_to_srt;
	unsigned int saved_wr_cur_max_time;
	struct bfq_ttime saved_ttime;
};
enum bfq_device_speed {
	BFQ_BFQD_FAST,
	BFQ_BFQD_SLOW,
};
/**
 * struct bfq_data - per-device data structure.
 *
 * All the fields are protected by @lock.
 */
struct bfq_data {
	/* device request queue */
	struct request_queue *queue;
	/* dispatch queue */
	struct list_head dispatch;

	/* root bfq_group for the device */
	struct bfq_group *root_group;

	/*
	 * rbtree of weight counters of @bfq_queues, sorted by
	 * weight. Used to keep track of whether all @bfq_queues have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active and not
	 * weight-raised @bfq_queue (see the comments to the functions
	 * bfq_weights_tree_[add|remove] for further details).
	 */
	struct rb_root queue_weights_tree;
	/*
	 * rbtree of non-queue @bfq_entity weight counters, sorted by
	 * weight. Used to keep track of whether all @bfq_groups have
	 * the same weight. The tree contains one counter for each
	 * distinct weight associated to some active @bfq_group (see
	 * the comments to the functions bfq_weights_tree_[add|remove]
	 * for further details).
	 */
	struct rb_root group_weights_tree;

	/*
	 * Number of bfq_queues containing requests (including the
	 * queue in service, even if it is idling).
	 */
	int busy_queues;
	/* number of weight-raised busy @bfq_queues */
	int wr_busy_queues;
	/* number of queued requests */
	int queued;
	/* number of requests dispatched and waiting for completion */
	int rq_in_driver;

	/*
	 * Maximum number of requests in driver in the last
	 * @hw_tag_samples completed requests.
	 */
	int max_rq_in_driver;
	/* number of samples used to calculate hw_tag */
	int hw_tag_samples;
	/* flag set to one if the driver is showing a queueing behavior */
	int hw_tag;

	/* number of budgets assigned */
	int budgets_assigned;

	/*
	 * Timer set when idling (waiting) for the next request from
	 * the queue in service.
	 */
	struct hrtimer idle_slice_timer;

	/* bfq_queue in service */
	struct bfq_queue *in_service_queue;
	/* bfq_io_cq (bic) associated with the @in_service_queue */
	struct bfq_io_cq *in_service_bic;

	/* on-disk position of the last served request */
	sector_t last_position;

	/* time of last request completion (ns) */
	u64 last_completion;

	/* time of first rq dispatch in current observation interval (ns) */
	u64 first_dispatch;
	/* time of last rq dispatch in current observation interval (ns) */
	u64 last_dispatch;

	/* beginning of the last budget */
	ktime_t last_budget_start;
	/* beginning of the last idle slice */
	ktime_t last_idling_start;

	/* number of samples in current observation interval */
	int peak_rate_samples;
	/* num of samples of seq dispatches in current observation interval */
	u32 sequential_samples;
	/* total num of sectors transferred in current observation interval */
	u64 tot_sectors_dispatched;
	/* max rq size seen during current observation interval (sectors) */
	u32 last_rq_max_size;
	/* time elapsed from first dispatch in current observ. interval (us) */
	u64 delta_from_first;
	/*
	 * Current estimate of the device peak rate, measured in
	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
	 * BFQ_RATE_SHIFT is performed to increase precision in
	 * fixed-point calculations.
	 */
	u32 peak_rate;

	/* maximum budget allotted to a bfq_queue before rescheduling */
	int bfq_max_budget;

	/* list of all the bfq_queues active on the device */
	struct list_head active_list;
	/* list of all the bfq_queues idle on the device */
	struct list_head idle_list;

	/*
	 * Timeout for async/sync requests; when it fires, requests
	 * are served in fifo order.
	 */
	u64 bfq_fifo_expire[2];
	/* weight of backward seeks wrt forward ones */
	unsigned int bfq_back_penalty;
	/* maximum allowed backward seek */
	unsigned int bfq_back_max;
	/* maximum idling time */
	u32 bfq_slice_idle;

	/* user-configured max budget value (0 for auto-tuning) */
	int bfq_user_max_budget;
	/*
	 * Timeout for bfq_queues to consume their budget; used to
	 * prevent seeky queues from imposing long latencies to
	 * sequential or quasi-sequential ones (this also implies that
	 * seeky queues cannot receive guarantees in the service
	 * domain; after a timeout they are charged for the time they
	 * have been in service, to preserve fairness among them, but
	 * without service-domain guarantees).
	 */
	unsigned int bfq_timeout;

	/*
	 * Number of consecutive requests that must be issued within
	 * the idle time slice to set again idling to a queue which
	 * was marked as non-I/O-bound (see the definition of the
	 * IO_bound flag for further details).
	 */
	unsigned int bfq_requests_within_timer;

	/*
	 * Force device idling whenever needed to provide accurate
	 * service guarantees, without caring about throughput
	 * issues. CAVEAT: this may even increase latencies, in case
	 * of useless idling for processes that did stop doing I/O.
	 */
	bool strict_guarantees;

	/*
	 * Last time at which a queue entered the current burst of
	 * queues being activated shortly after each other; for more
	 * details about this and the following parameters related to
	 * a burst of activations, see the comments on the function
	 * bfq_handle_burst.
	 */
	unsigned long last_ins_in_burst;
	/*
	 * Reference time interval used to decide whether a queue has
	 * been activated shortly after @last_ins_in_burst.
	 */
	unsigned long bfq_burst_interval;
	/* number of queues in the current burst of queue activations */
	int burst_size;

	/* common parent entity for the queues in the burst */
	struct bfq_entity *burst_parent_entity;
	/* Maximum burst size above which the current queue-activation
	 * burst is deemed as 'large'.
	 */
	unsigned long bfq_large_burst_thresh;
	/* true if a large queue-activation burst is in progress */
	bool large_burst;
	/*
	 * Head of the burst list (as for the above fields, more
	 * details in the comments on the function bfq_handle_burst).
	 */
	struct hlist_head burst_list;

	/* if set to true, low-latency heuristics are enabled */
	bool low_latency;
	/*
	 * Maximum factor by which the weight of a weight-raised queue
	 * is multiplied.
	 */
	unsigned int bfq_wr_coeff;
	/* maximum duration of a weight-raising period (jiffies) */
	unsigned int bfq_wr_max_time;

	/* Maximum weight-raising duration for soft real-time processes */
	unsigned int bfq_wr_rt_max_time;
	/*
	 * Minimum idle period after which weight-raising may be
	 * reactivated for a queue (in jiffies).
	 */
	unsigned int bfq_wr_min_idle_time;
	/*
	 * Minimum period between request arrivals after which
	 * weight-raising may be reactivated for an already busy async
	 * queue (in jiffies).
	 */
	unsigned long bfq_wr_min_inter_arr_async;

	/* Max service-rate for a soft real-time queue, in sectors/sec */
	unsigned int bfq_wr_max_softrt_rate;
	/*
	 * Cached value of the product R*T, used for computing the
	 * maximum duration of weight raising automatically.
	 */
	u64 RT_prod;
	/* device-speed class for the low-latency heuristic */
	enum bfq_device_speed device_speed;

	/* fallback dummy bfqq for extreme OOM conditions */
	struct bfq_queue oom_bfqq;

	spinlock_t lock;

	/*
	 * bic associated with the task issuing current bio for
	 * merging. This and the next field are used as a support to
	 * be able to perform the bic lookup, needed by bio-merge
	 * functions, before the scheduler lock is taken, and thus
	 * avoid taking the request-queue lock while the scheduler
	 * lock is being held.
	 */
	struct bfq_io_cq *bio_bic;
	/* bfqq associated with the task issuing current bio for merging */
	struct bfq_queue *bio_bfqq;

	/*
	 * io context to put right after bfqd->lock is released. This
	 * field is used to perform put_io_context, when needed, only
	 * after the scheduler lock has been released, and thus
	 * prevent an ioc->lock from being possibly taken while the
	 * scheduler lock is being held.
	 */
	struct io_context *ioc_to_put;
};
enum bfqq_state_flags {
	BFQQF_just_created = 0,	/* queue just allocated */
	BFQQF_busy,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_idle_window,	/* slice idling enabled */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_in_large_burst,	/*
				 * bfqq activated in a large burst,
				 * see comments to bfq_handle_burst.
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
	BFQQF_coop,		/* bfqq is shared */
	BFQQF_split_coop	/* shared bfqq will be split */
};
#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
}									\
static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
}

BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS
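
/*
 * Example (illustration only): the expansions above generate, e.g.,
 * bfq_mark_bfqq_sync(), bfq_clear_bfqq_sync() and bfq_bfqq_sync() as
 * non-atomic accessors for BFQQF_sync in bfqq->flags; the logging
 * macros below rely on bfq_bfqq_sync() to tag trace messages with
 * 'S' (sync) or 'A' (async).
 */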
/* Logging facilities. */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  __pbuf, ##args);				\
} while (0)

#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
			  ##args)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,	/*
				 * queue has been idling for
				 * too long
				 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	uint16_t flags;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED

/**
 * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
 *
 * @ps: @blkcg_policy_storage that this structure inherits
 * @weight: weight of the bfq_group
 */
struct bfq_group_data {
	/* must be the first member */
	struct blkcg_policy_data pd;

	unsigned int weight;
};
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon.
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 * @active_entities: number of active entities belonging to the group;
 *                   unused for the root group. Used to know whether there
 *                   are groups with more than one active @bfq_entity
 *                   (see the comments on the function
 *                   bfq_bfqq_may_idle()).
 * @rq_pos_tree: rbtree sorted by next_request position, used when
 *               determining if two or more queues have interleaving
 *               requests (see bfq_find_close_cooperator()).
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	int active_entities;

	struct rb_root rq_pos_tree;

	struct bfqg_stats stats;
};

#else /* CONFIG_BFQ_GROUP_IOSCHED */

struct bfq_group {
	struct bfq_sched_data sched_data;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct rb_root rq_pos_tree;
};
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);

static unsigned int bfq_class_idx(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	return bfqq ? bfqq->ioprio_class - 1 :
		BFQ_DEFAULT_GRP_CLASS - 1;
}

static struct bfq_service_tree *
bfq_entity_service_tree(struct bfq_entity *entity)
{
	struct bfq_sched_data *sched_data = entity->sched_data;
	unsigned int idx = bfq_class_idx(entity);

	return sched_data->service_tree + idx;
}

static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	return bic->bfqq[is_sync];
}

static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
			 bool is_sync)
{
	bic->bfqq[is_sync] = bfqq;
}

static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
{
	return bic->icq.q->elevator->elevator_data;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	if (!group_entity)
		group_entity = &bfqq->bfqd->root_group->entity;

	return container_of(group_entity, struct bfq_group, entity);
}

#else

static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

#endif

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic);
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg);
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns. */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below
 */
static const int bfq_async_charge_factor = 10;

/* Default timeout values, in jiffies, approximating CFQ defaults. */
static const int bfq_timeout = HZ / 8;

static struct kmem_cache *bfq_pool;

/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak-rate fixed-precision calculations. */
#define BFQ_RATE_SHIFT		16
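
/*
 * Example (illustration only): with BFQ_RATE_SHIFT == 16, a measured
 * peak rate of 100 sectors/usec is stored as 100 << 16 == 6553600, so
 * rates well below one sector/usec still retain 16 fractional bits of
 * precision in the integer-only computations that use them.
 */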
/*
 * By default, BFQ computes the duration of the weight raising for
 * interactive applications automatically, using the following formula:
 * duration = (R / r) * T, where r is the peak rate of the device, and
 * R and T are two reference parameters.
 * In particular, R is the peak rate of the reference device (see below),
 * and T is a reference time: given the systems that are likely to be
 * installed on the reference device according to its speed class, T is
 * about the maximum time needed, under BFQ and while reading two files in
 * parallel, to load typical large applications on these systems.
 * In practice, the slower/faster the device at hand is, the more/less it
 * takes to load applications with respect to the reference device.
 * Accordingly, the longer/shorter BFQ grants weight raising to interactive
 * applications.
 *
 * BFQ uses four different reference pairs (R, T), depending on:
 * . whether the device is rotational or non-rotational;
 * . whether the device is slow, such as old or portable HDDs, as well as
 *   SD cards, or fast, such as newer HDDs and SSDs.
 *
 * The device's speed class is dynamically (re)detected in
 * bfq_update_peak_rate() every time the estimated peak rate is updated.
 *
 * In the following definitions, R_slow[0]/R_fast[0] and
 * T_slow[0]/T_fast[0] are the reference values for a slow/fast
 * rotational device, whereas R_slow[1]/R_fast[1] and
 * T_slow[1]/T_fast[1] are the reference values for a slow/fast
 * non-rotational device. Finally, device_speed_thresh are the
 * thresholds used to switch between speed classes. The reference
 * rates are not the actual peak rates of the devices used as a
 * reference, but slightly lower values. The reason for using these
 * slightly lower values is that the peak-rate estimator tends to
 * yield slightly lower values than the actual peak rate (it can yield
 * the actual peak rate only if there is only one process doing I/O,
 * and the process does sequential I/O).
 *
 * Both the reference peak rates and the thresholds are measured in
 * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};

/*
 * To improve readability, a conversion function is used to initialize
 * the following arrays, which entails that they can be initialized
 * only in a function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
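
/*
 * Minimal sketch (illustration only; the helper name is hypothetical,
 * the real computation lives in the weight-raising code) of the
 * duration formula above. Assuming r and R are expressed in the same
 * fixed-point unit ([BFQ_RATE_SHIFT * sectors/usec]) and T in jiffies,
 * the shifts cancel out in the division and the result is directly a
 * number of jiffies.
 */
static inline unsigned long bfq_example_wr_duration(u64 r, u64 R, u64 T)
{
	/* duration = (R / r) * T, computed as (R * T) / r for precision */
	return (unsigned long)div64_u64(R * T, r);
}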
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
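
/*
 * Example (illustration only): once a request has been prepared by
 * bfq, the owning bic and queue can be recovered anywhere in the
 * hotpath as RQ_BIC(rq) and RQ_BFQQ(rq), since the two elv.priv slots
 * of the request are expected to hold exactly those two pointers.
 */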
/**
 * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
 * @icq: the iocontext queue.
 */
static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
{
	/* bic->icq is the first member, %NULL will convert to %NULL */
	return container_of(icq, struct bfq_io_cq, icq);
}
/**
 * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
 * @bfqd: the lookup key.
 * @ioc: the io_context of the process doing I/O.
 * @q: the request queue.
 */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		unsigned long flags;
		struct bfq_io_cq *icq;

		spin_lock_irqsave(q->queue_lock, flags);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irqrestore(q->queue_lock, flags);

		return icq;
	}

	return NULL;
}
/*
 * Scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing.
 */
static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
	if (bfqd->queued != 0) {
		bfq_log(bfqd, "schedule dispatch");
		blk_mq_run_hw_queues(bfqd->queue, true);
	}
}
/*
 * Next two functions release bfqd->lock and put the io context
 * pointed to by bfqd->ioc_to_put. This delayed put is used to not
 * risk taking an ioc->lock while the scheduler lock is being held.
 */
static void bfq_unlock_put_ioc(struct bfq_data *bfqd)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irq(&bfqd->lock);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}

static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd,
				       unsigned long flags)
{
	struct io_context *ioc_to_put = bfqd->ioc_to_put;

	bfqd->ioc_to_put = NULL;
	spin_unlock_irqrestore(&bfqd->lock, flags);

	if (ioc_to_put)
		put_io_context(ioc_to_put);
}
/**
 * bfq_gt - compare two timestamps.
 * @a: first ts.
 * @b: second ts.
 *
 * Return @a > @b, dealing with wrapping correctly.
 */
static int bfq_gt(u64 a, u64 b)
{
	return (s64)(a - b) > 0;
}
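
/*
 * Example (illustration only): the signed difference makes the
 * comparison robust to wraparound. With 64-bit timestamps,
 * bfq_gt(2, ULLONG_MAX) returns 1 because (s64)(2 - ULLONG_MAX) == 3,
 * i.e., 2 is correctly treated as "later" than a just-wrapped
 * ULLONG_MAX.
 */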
static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
{
	struct rb_node *node = tree->rb_node;

	return rb_entry(node, struct bfq_entity, rb_node);
}

static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
/*
 * bfq_update_next_in_service - update sd->next_in_service
 * @sd: sched_data for which to perform the update.
 * @new_entity: if not NULL, pointer to the entity whose activation,
 *		requeueing or repositioning triggered the invocation of
 *		this function.
 *
 * This function is called to update sd->next_in_service, which, in
 * its turn, may change as a consequence of the insertion or
 * extraction of an entity into/from one of the active trees of
 * sd. These insertions/extractions occur as a consequence of
 * activations/deactivations of entities, with some activations being
 * 'true' activations, and other activations being requeueings (i.e.,
 * implementing the second, requeueing phase of the mechanism used to
 * reposition an entity in its active tree; see comments on
 * __bfq_activate_entity and __bfq_requeue_entity for details). In
 * both the last two activation sub-cases, new_entity points to the
 * just activated or requeued entity.
 *
 * Returns true if sd->next_in_service changes in such a way that
 * entity->parent may become the next_in_service for its parent
 * entity.
 */
static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				       struct bfq_entity *new_entity)
{
	struct bfq_entity *next_in_service = sd->next_in_service;
	bool parent_sched_may_change = false;

	/*
	 * If this update is triggered by the activation, requeueing
	 * or repositioning of an entity that does not coincide with
	 * sd->next_in_service, then a full lookup in the active tree
	 * can be avoided. In fact, it is enough to check whether the
	 * just-modified entity has a higher priority than
	 * sd->next_in_service, or, even if it has the same priority
	 * as sd->next_in_service, is eligible and has a lower virtual
	 * finish time than sd->next_in_service. If this compound
	 * condition holds, then the new entity becomes the new
	 * next_in_service. Otherwise no change is needed.
	 */
	if (new_entity && new_entity != sd->next_in_service) {
		/*
		 * Flag used to decide whether to replace
		 * sd->next_in_service with new_entity. Tentatively
		 * set to true, and left as true if
		 * sd->next_in_service is NULL.
		 */
		bool replace_next = true;

		/*
		 * If there is already a next_in_service candidate
		 * entity, then compare class priorities or timestamps
		 * to decide whether to replace sd->next_in_service
		 * with new_entity.
		 */
		if (next_in_service) {
			unsigned int new_entity_class_idx =
				bfq_class_idx(new_entity);
			struct bfq_service_tree *st =
				sd->service_tree + new_entity_class_idx;

			/*
			 * For efficiency, evaluate the most likely
			 * sub-condition first.
			 */
			replace_next =
				(new_entity_class_idx ==
				 bfq_class_idx(next_in_service)
				 &&
				 !bfq_gt(new_entity->start, st->vtime)
				 &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish))
				||
				new_entity_class_idx <
				bfq_class_idx(next_in_service);
		}

		if (replace_next)
			next_in_service = new_entity;
	} else /* invoked because of a deactivation: lookup needed */
		next_in_service = bfq_lookup_next_entity(sd);

	if (next_in_service) {
		parent_sched_may_change = !sd->next_in_service ||
			bfq_update_parent_budget(next_in_service);
	}

	sd->next_in_service = next_in_service;

	if (!next_in_service)
		return parent_sched_may_change;

	return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
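
/*
 * Usage sketch (illustration only; the function name is hypothetical):
 * for_each_entity_safe() samples the parent pointer before the loop
 * body runs, so the body may drop the last reference to the entity
 * (e.g., via bfq_put_queue()) without breaking the upward walk.
 */
static void __maybe_unused bfq_example_walk_up(struct bfq_entity *entity)
{
	struct bfq_entity *parent;

	for_each_entity_safe(entity, parent) {
		/* entity may be freed by the time this iteration ends */
	}
}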
/*
 * Returns true if this budget change may let next_in_service->parent
 * become the next_in_service entity for its parent entity.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	struct bfq_entity *bfqg_entity;
	struct bfq_group *bfqg;
	struct bfq_sched_data *group_sd;
	bool ret = false;

	group_sd = next_in_service->sched_data;

	bfqg = container_of(group_sd, struct bfq_group, sched_data);
	/*
	 * bfq_group's my_entity field is not NULL only if the group
	 * is not the root group. We must not touch the root entity
	 * as it must never become an in-service entity.
	 */
	bfqg_entity = bfqg->my_entity;
	if (bfqg_entity) {
		if (bfqg_entity->budget > next_in_service->budget)
			ret = true;
		bfqg_entity->budget = next_in_service->budget;
	}

	return ret;
}
/*
 * This function tells whether entity stops being a candidate for next
 * service, according to the following logic.
 *
 * This function is invoked for an entity that is about to be set in
 * service. If such an entity is a queue, then the entity is no longer
 * a candidate for next service (i.e., a candidate entity to serve
 * after the in-service entity is expired). The function then returns
 * true.
 *
 * In contrast, the entity could still be a candidate for next service
 * if it is not a queue, and has more than one child. In fact, even if
 * one of its children is about to be set in service, other children
 * may still be the next to serve. As a consequence, a non-queue
 * entity is not a candidate for next-service only if it has only one
 * child. And only if this condition holds, then the function returns
 * true for a non-queue entity.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	struct bfq_group *bfqg;

	if (bfq_entity_to_bfqq(entity))
		return true;

	bfqg = container_of(entity, struct bfq_group, entity);

	if (bfqg->active_entities == 1)
		return true;

	return false;
}
#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}

static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Shift for timestamp calculations. This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 */
#define WFQ_SERVICE_SHIFT	22

static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = NULL;

	if (!entity->my_sched_data)
		bfqq = container_of(entity, struct bfq_queue, entity);

	return bfqq;
}
/**
 * bfq_delta - map service into the virtual time domain.
 * @service: amount of service.
 * @weight: scale factor (weight of an entity or weight sum).
 */
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
	u64 d = (u64)service << WFQ_SERVICE_SHIFT;

	do_div(d, weight);
	return d;
}
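
/*
 * Example (illustration only): with WFQ_SERVICE_SHIFT == 22, charging
 * 8 sectors to an entity of weight 40 yields a virtual-time delta of
 * (8 << 22) / 40 ~= 838860 units; doubling the weight halves the
 * delta, which is precisely what makes service proportional to
 * weight.
 */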
/**
 * bfq_calc_finish - assign the finish time to an entity.
 * @entity: the entity to act upon.
 * @service: the service to be charged to the entity.
 */
static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->finish = entity->start +
		bfq_delta(service, entity->weight);

	if (bfqq) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: serv %lu, w %d",
			service, entity->weight);
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			"calc_finish: start %llu, finish %llu, delta %llu",
			entity->start, entity->finish,
			bfq_delta(service, entity->weight));
	}
}
/**
 * bfq_entity_of - get an entity from a node.
 * @node: the node field of the entity.
 *
 * Convert a node pointer to the relative entity. This is used only
 * to simplify the logic of some functions and not as the generic
 * conversion mechanism because, e.g., in the tree walking functions,
 * the check for a %NULL value would be redundant.
 */
static struct bfq_entity *bfq_entity_of(struct rb_node *node)
{
	struct bfq_entity *entity = NULL;

	if (node)
		entity = rb_entry(node, struct bfq_entity, rb_node);

	return entity;
}
/**
 * bfq_extract - remove an entity from a tree.
 * @root: the tree root.
 * @entity: the entity to remove.
 */
static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
{
	entity->tree = NULL;
	rb_erase(&entity->rb_node, root);
}
/**
 * bfq_idle_extract - extract an entity from the idle tree.
 * @st: the service tree of the owning @entity.
 * @entity: the entity being removed.
 */
static void bfq_idle_extract(struct bfq_service_tree *st,
			     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *next;

	if (entity == st->first_idle) {
		next = rb_next(&entity->rb_node);
		st->first_idle = bfq_entity_of(next);
	}

	if (entity == st->last_idle) {
		next = rb_prev(&entity->rb_node);
		st->last_idle = bfq_entity_of(next);
	}

	bfq_extract(&st->idle, entity);

	if (bfqq)
		list_del(&bfqq->bfqq_list);
}
/**
 * bfq_insert - generic tree insertion.
 * @root: tree root.
 * @entity: entity to insert.
 *
 * This is used for the idle and the active tree, since they are both
 * ordered by finish time.
 */
static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
{
	struct bfq_entity *entry;
	struct rb_node **node = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*node) {
		parent = *node;
		entry = rb_entry(parent, struct bfq_entity, rb_node);

		if (bfq_gt(entry->finish, entity->finish))
			node = &parent->rb_left;
		else
			node = &parent->rb_right;
	}

	rb_link_node(&entity->rb_node, parent, node);
	rb_insert_color(&entity->rb_node, root);

	entity->tree = root;
}
/**
 * bfq_update_min - update the min_start field of an entity.
 * @entity: the entity to update.
 * @node: one of its children.
 *
 * This function is called when @entity may store an invalid value for
 * min_start due to updates to the active tree. The function assumes
 * that the subtree rooted at @node (which may be its left or its right
 * child) has a valid min_start value.
 */
static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
{
	struct bfq_entity *child;

	if (node) {
		child = rb_entry(node, struct bfq_entity, rb_node);
		if (bfq_gt(entity->min_start, child->min_start))
			entity->min_start = child->min_start;
	}
}
/**
 * bfq_update_active_node - recalculate min_start.
 * @node: the node to update.
 *
 * @node may have changed position or one of its children may have moved,
 * this function updates its min_start value. The left and right subtrees
 * are assumed to hold a correct min_start value.
 */
static void bfq_update_active_node(struct rb_node *node)
{
	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);

	entity->min_start = entity->start;
	bfq_update_min(entity, node->rb_right);
	bfq_update_min(entity, node->rb_left);
}
/**
 * bfq_update_active_tree - update min_start for the whole active tree.
 * @node: the starting node.
 *
 * @node must be the deepest modified node after an update. This function
 * updates its min_start using the values held by its children, assuming
 * that they did not change, and then updates all the nodes that may have
 * changed in the path to the root. The only nodes that may have changed
 * are the ones in the path or their siblings.
 */
static void bfq_update_active_tree(struct rb_node *node)
{
	struct rb_node *parent;

up:
	bfq_update_active_node(node);

	parent = rb_parent(node);
	if (!parent)
		return;

	if (node == parent->rb_left && parent->rb_right)
		bfq_update_active_node(parent->rb_right);
	else if (parent->rb_left)
		bfq_update_active_node(parent->rb_left);

	node = parent;
	goto up;
}
static void bfq_weights_tree_add(struct bfq_data *bfqd,
				 struct bfq_entity *entity,
				 struct rb_root *root);

static void bfq_weights_tree_remove(struct bfq_data *bfqd,
				    struct bfq_entity *entity,
				    struct rb_root *root);
/**
 * bfq_active_insert - insert an entity in the active tree of its
 *                     service_tree.
 * @st: the service tree of the entity.
 * @entity: the entity being inserted.
 *
 * The active tree is ordered by finish time, but an extra key is kept
 * per each node, containing the minimum value for the start times of
 * its children (and the node itself), so it's possible to search for
 * the eligible node with the lowest finish time in logarithmic time.
 */
static void bfq_active_insert(struct bfq_service_tree *st,
			      struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node = &entity->rb_node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	bfq_insert(&st->active, entity);

	if (node->rb_left)
		node = node->rb_left;
	else if (node->rb_right)
		node = node->rb_right;

	bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	else /* bfq_group */
		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);

	if (bfqg != bfqd->root_group)
		bfqg->active_entities++;
#endif
}
/**
 * bfq_ioprio_to_weight - calc a weight from an ioprio.
 * @ioprio: the ioprio value to convert.
 */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
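
/*
 * Example (illustration only): with IOPRIO_BE_NR == 8 and
 * BFQ_WEIGHT_CONVERSION_COEFF == 10, the default ioprio 4 maps to
 * weight (8 - 4) * 10 == 40, while the highest best-effort priority,
 * ioprio 0, maps to weight 80.
 */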
/**
 * bfq_weight_to_ioprio - calc an ioprio from a weight.
 * @weight: the weight value to convert.
 *
 * To preserve as much as possible the old only-ioprio user interface,
 * 0 is used as an escape ioprio value for weights (numerically) equal or
 * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
 */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return max_t(int, 0,
		     IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
}
static void bfq_get_entity(struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	if (bfqq) {
		bfqq->ref++;
		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
			     bfqq, bfqq->ref);
	}
}
/**
 * bfq_find_deepest - find the deepest node that an extraction can modify.
 * @node: the node being removed.
 *
 * Do the first step of an extraction in an rb tree, looking for the
 * node that will replace @node, and returning the deepest node that
 * the following modifications to the tree can touch. If @node is the
 * last node in the tree return %NULL.
 */
static struct rb_node *bfq_find_deepest(struct rb_node *node)
{
	struct rb_node *deepest;

	if (!node->rb_right && !node->rb_left)
		deepest = rb_parent(node);
	else if (!node->rb_right)
		deepest = node->rb_left;
	else if (!node->rb_left)
		deepest = node->rb_right;
	else {
		deepest = rb_next(node);
		if (deepest->rb_right)
			deepest = deepest->rb_right;
		else if (rb_parent(deepest) != node)
			deepest = rb_parent(deepest);
	}

	return deepest;
}
/**
 * bfq_active_extract - remove an entity from the active tree.
 * @st: the service_tree containing the tree.
 * @entity: the entity being removed.
 */
static void bfq_active_extract(struct bfq_service_tree *st,
			       struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct rb_node *node;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_sched_data *sd = NULL;
	struct bfq_group *bfqg = NULL;
	struct bfq_data *bfqd = NULL;
#endif

	node = bfq_find_deepest(&entity->rb_node);
	bfq_extract(&st->active, entity);

	if (node)
		bfq_update_active_tree(node);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	sd = entity->sched_data;
	bfqg = container_of(sd, struct bfq_group, sched_data);
	bfqd = (struct bfq_data *)bfqg->bfqd;
#endif
	if (bfqq)
		list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	else /* bfq_group */
		bfq_weights_tree_remove(bfqd, entity,
					&bfqd->group_weights_tree);

	if (bfqg != bfqd->root_group)
		bfqg->active_entities--;
#endif
}
/**
 * bfq_idle_insert - insert an entity into the idle tree.
 * @st: the service tree containing the tree.
 * @entity: the entity to insert.
 */
static void bfq_idle_insert(struct bfq_service_tree *st,
			    struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
		st->first_idle = entity;
	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
		st->last_idle = entity;

	bfq_insert(&st->idle, entity);

	if (bfqq)
		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
}
/**
 * bfq_forget_entity - do not consider entity any longer for scheduling
 * @st: the service tree.
 * @entity: the entity being removed.
 * @is_in_service: true if entity is currently the in-service entity.
 *
 * Forget everything about @entity. In addition, if entity represents
 * a queue, and the latter is not in service, then release the service
 * reference to the queue (the one taken through bfq_get_entity). In
 * fact, in this case, there is really no more service reference to
 * the queue, as the latter is also outside any service tree. If,
 * instead, the queue is in service, then __bfq_bfqd_reset_in_service
 * will take care of putting the reference when the queue finally
 * stops being served.
 */
static void bfq_forget_entity(struct bfq_service_tree *st,
			      struct bfq_entity *entity,
			      bool is_in_service)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->on_st = false;
	st->wsum -= entity->weight;
	if (bfqq && !is_in_service)
		bfq_put_queue(bfqq);
}
1820 * bfq_put_idle_entity - release the idle tree ref of an entity.
1821 * @st: service tree for the entity.
1822 * @entity: the entity being released.
1824 static void bfq_put_idle_entity(struct bfq_service_tree
*st
,
1825 struct bfq_entity
*entity
)
1827 bfq_idle_extract(st
, entity
);
1828 bfq_forget_entity(st
, entity
,
1829 entity
== entity
->sched_data
->in_service_entity
);
/**
 * bfq_forget_idle - update the idle tree if necessary.
 * @st: the service tree to act upon.
 *
 * To preserve the global O(log N) complexity we only remove one entry here;
 * as the idle tree will not grow indefinitely this can be done safely.
 */
static void bfq_forget_idle(struct bfq_service_tree *st)
{
	struct bfq_entity *first_idle = st->first_idle;
	struct bfq_entity *last_idle = st->last_idle;

	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
	    !bfq_gt(last_idle->finish, st->vtime)) {
		/*
		 * Forget the whole idle tree, increasing the vtime past
		 * the last finish time of idle entities.
		 */
		st->vtime = last_idle->finish;
	}

	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
		bfq_put_idle_entity(st, first_idle);
}
static struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity)
{
	struct bfq_service_tree *new_st = old_st;

	if (entity->prio_changed) {
		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
		unsigned int prev_weight, new_weight;
		struct bfq_data *bfqd = NULL;
		struct rb_root *root;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		struct bfq_sched_data *sd;
		struct bfq_group *bfqg;
#endif

		if (bfqq)
			bfqd = bfqq->bfqd;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
		else {
			sd = entity->my_sched_data;
			bfqg = container_of(sd, struct bfq_group, sched_data);
			bfqd = (struct bfq_data *)bfqg->bfqd;
		}
#endif

		old_st->wsum -= entity->weight;

		if (entity->new_weight != entity->orig_weight) {
			if (entity->new_weight < BFQ_MIN_WEIGHT ||
			    entity->new_weight > BFQ_MAX_WEIGHT) {
				pr_crit("update_weight_prio: new_weight %d\n",
					entity->new_weight);
				if (entity->new_weight < BFQ_MIN_WEIGHT)
					entity->new_weight = BFQ_MIN_WEIGHT;
				else
					entity->new_weight = BFQ_MAX_WEIGHT;
			}
			entity->orig_weight = entity->new_weight;
			if (bfqq)
				bfqq->ioprio =
				  bfq_weight_to_ioprio(entity->orig_weight);
		}

		if (bfqq)
			bfqq->ioprio_class = bfqq->new_ioprio_class;
		entity->prio_changed = 0;

		/*
		 * NOTE: here we may be changing the weight too early,
		 * this will cause unfairness. The correct approach
		 * would have required additional complexity to defer
		 * weight changes to the proper time instants (i.e.,
		 * when entity->finish <= old_st->vtime).
		 */
		new_st = bfq_entity_service_tree(entity);

		prev_weight = entity->weight;
		new_weight = entity->orig_weight *
			     (bfqq ? bfqq->wr_coeff : 1);
		/*
		 * If the weight of the entity changes, remove the entity
		 * from its old weight counter (if there is a counter
		 * associated with the entity), and add it to the counter
		 * associated with its new weight.
		 */
		if (prev_weight != new_weight) {
			root = bfqq ? &bfqd->queue_weights_tree :
				      &bfqd->group_weights_tree;
			bfq_weights_tree_remove(bfqd, entity, root);
		}
		entity->weight = new_weight;
		/*
		 * Add the entity to its weights tree only if it is
		 * not associated with a weight-raised queue.
		 */
		if (prev_weight != new_weight &&
		    (bfqq ? bfqq->wr_coeff == 1 : 1))
			/* If we get here, root has been initialized. */
			bfq_weights_tree_add(bfqd, entity, root);

		new_st->wsum += entity->weight;

		if (new_st != old_st)
			entity->start = new_st->vtime;
	}

	return new_st;
}
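
/*
 * Illustrative sketch (not part of the scheduler): the effective
 * weight computed above is orig_weight times the queue's wr_coeff,
 * with orig_weight first clamped to [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT].
 * A user-space model; the clamp bounds match BFQ's, while the raising
 * coefficient below is chosen only for the example.
 */
#if 0
#define BFQ_MIN_WEIGHT	1
#define BFQ_MAX_WEIGHT	1000

static int model_effective_weight(int new_weight, int wr_coeff)
{
	if (new_weight < BFQ_MIN_WEIGHT)
		new_weight = BFQ_MIN_WEIGHT;
	else if (new_weight > BFQ_MAX_WEIGHT)
		new_weight = BFQ_MAX_WEIGHT;

	return new_weight * wr_coeff;	/* wr_coeff is 1 if not raised */
}

/*
 * model_effective_weight(100, 1) == 100; while weight-raised with,
 * e.g., coefficient 30, the same queue temporarily weighs 3000
 * inside its service tree.
 */
#endif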
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/**
 * bfq_bfqq_served - update the scheduler status after selection for
 *		     service.
 * @bfqq: the queue being served.
 * @served: bytes to transfer.
 *
 * NOTE: this can be optimized, as the timestamps of upper level entities
 * are synchronized every time a new bfqq is selected for service. By now,
 * we keep it to better check consistency.
 */
static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct bfq_service_tree *st;

	for_each_entity(entity) {
		st = bfq_entity_service_tree(entity);

		entity->service += served;

		st->vtime += bfq_delta(served, st->wsum);
		bfq_forget_idle(st);
	}
	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
}
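
/*
 * Illustrative sketch (not part of the scheduler): the virtual time
 * advances by served/wsum (this is what bfq_delta() computes, modulo
 * fixed-point shifting), so equal vtime progress corresponds to
 * service shares proportional to the weights. A rough user-space
 * model using plain integer arithmetic instead of bfq_delta():
 */
#if 0
struct model_st {
	unsigned long long vtime;
	unsigned long wsum;	/* sum of the weights of active queues */
};

static void model_serve(struct model_st *st, unsigned long served)
{
	st->vtime += served / st->wsum;
}

/*
 * Two queues with weights 10 and 30 (wsum = 40): for their finish
 * timestamps to advance in lockstep, the first may consume 10 sectors
 * for every 30 sectors of the second, i.e., a 1:3 throughput split.
 */
#endif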
/**
 * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
 *			  of the time interval during which bfqq has been in
 *			  service.
 * @bfqd: the device
 * @bfqq: the queue that needs a service update.
 * @time_ms: the amount of time during which the queue has received service
 *
 * If a queue does not consume its budget fast enough, then providing
 * the queue with service fairness may impair throughput, more or less
 * severely. For this reason, queues that consume their budget slowly
 * are provided with time fairness instead of service fairness. This
 * goal is achieved through the BFQ scheduling engine, even if such an
 * engine works in the service, and not in the time domain. The trick
 * is charging these queues with an inflated amount of service, equal
 * to the amount of service that they would have received during their
 * service slot if they had been fast, i.e., if their requests had
 * been dispatched at a rate equal to the estimated peak rate.
 *
 * It is worth noting that time fairness can cause important
 * distortions in terms of bandwidth distribution, on devices with
 * internal queueing. The reason is that I/O requests dispatched
 * during the service slot of a queue may be served after that service
 * slot is finished, and may have a total processing time loosely
 * correlated with the duration of the service slot. This is
 * especially true for short service slots.
 */
static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				 unsigned long time_ms)
{
	struct bfq_entity *entity = &bfqq->entity;
	int tot_serv_to_charge = entity->service;
	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);

	if (time_ms > 0 && time_ms < timeout_ms)
		tot_serv_to_charge =
			(bfqd->bfq_max_budget * time_ms) / timeout_ms;

	if (tot_serv_to_charge < entity->service)
		tot_serv_to_charge = entity->service;

	/* Increase budget to avoid inconsistencies */
	if (tot_serv_to_charge > entity->budget)
		entity->budget = tot_serv_to_charge;

	bfq_bfqq_served(bfqq,
			max_t(int, 0, tot_serv_to_charge - entity->service));
}
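
/*
 * Illustrative sketch (not part of the scheduler): the inflated
 * charge computed above is max_budget scaled by the fraction of the
 * budget timeout actually consumed. A user-space model with made-up
 * numbers:
 */
#if 0
static int model_serv_to_charge(int max_budget, unsigned long time_ms,
				unsigned int timeout_ms, int service)
{
	int tot = service;

	if (time_ms > 0 && time_ms < timeout_ms)
		tot = (max_budget * time_ms) / timeout_ms;

	return tot < service ? service : tot;
}

/*
 * E.g., with max_budget = 16384 sectors, timeout = 125 ms and a queue
 * that stayed in service for 25 ms while dispatching only 100
 * sectors: it is charged 16384 * 25 / 125 = 3276 sectors, as if it
 * had been fast for a fifth of its slot.
 */
#endif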
static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
					struct bfq_service_tree *st,
					bool backshifted)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	st = __bfq_entity_update_weight_prio(st, entity);
	bfq_calc_finish(entity, entity->budget);

	/*
	 * If some queues enjoy backshifting for a while, then their
	 * (virtual) finish timestamps may happen to become lower and
	 * lower than the system virtual time. In particular, if
	 * these queues often happen to be idle for short time
	 * periods, and during such time periods other queues with
	 * higher timestamps happen to be busy, then the backshifted
	 * timestamps of the former queues can become much lower than
	 * the system virtual time. In fact, to serve the queues with
	 * higher timestamps while the ones with lower timestamps are
	 * idle, the system virtual time may be pushed-up to much
	 * higher values than the finish timestamps of the idle
	 * queues. As a consequence, the finish timestamps of all new
	 * or newly activated queues may end up being much larger than
	 * those of lucky queues with backshifted timestamps. The
	 * latter queues may then monopolize the device for a lot of
	 * time. This would simply break service guarantees.
	 *
	 * To reduce this problem, push up a little bit the
	 * backshifted timestamps of the queue associated with this
	 * entity (only a queue can happen to have the backshifted
	 * flag set): just enough to let the finish timestamp of the
	 * queue be equal to the current value of the system virtual
	 * time. This may introduce a little unfairness among queues
	 * with backshifted timestamps, but it does not break
	 * worst-case fairness guarantees.
	 *
	 * As a special case, if bfqq is weight-raised, push up
	 * timestamps much less, to keep very low the probability that
	 * this push up causes the backshifted finish timestamps of
	 * weight-raised queues to become higher than the backshifted
	 * finish timestamps of non weight-raised queues.
	 */
	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
		unsigned long delta = st->vtime - entity->finish;

		if (bfqq)
			delta /= bfqq->wr_coeff;

		entity->start += delta;
		entity->finish += delta;
	}

	bfq_active_insert(st, entity);
}
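
/*
 * Illustrative numbers for the push-up above (made up for the
 * example): if st->vtime = 1000 and entity->finish = 400, the
 * timestamps of a non-weight-raised queue (wr_coeff == 1) are moved
 * forward by delta = 600, while a weight-raised queue with
 * wr_coeff == 30 is moved forward only by 600 / 30 = 20, so its
 * privilege over non-raised queues is essentially preserved.
 */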
/**
 * __bfq_activate_entity - handle activation of entity.
 * @entity: the entity being activated.
 * @non_blocking_wait_rq: true if entity was waiting for a request
 *
 * Called for a 'true' activation, i.e., if entity is not active and
 * one of its children receives a new request.
 *
 * Basically, this function updates the timestamps of entity and
 * inserts entity into its active tree, after possibly extracting it
 * from its idle tree.
 */
static void __bfq_activate_entity(struct bfq_entity *entity,
				  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	bool backshifted = false;
	unsigned long long min_vstart;

	/* See comments on bfq_bfqq_update_budg_for_activation */
	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
		backshifted = true;
		min_vstart = entity->finish;
	} else
		min_vstart = st->vtime;

	if (entity->tree == &st->idle) {
		/*
		 * Must be on the idle tree, bfq_idle_extract() will
		 * check for that.
		 */
		bfq_idle_extract(st, entity);
		entity->start = bfq_gt(min_vstart, entity->finish) ?
			min_vstart : entity->finish;
	} else {
		/*
		 * The finish time of the entity may be invalid, and
		 * it is in the past for sure, otherwise the queue
		 * would have been on the idle tree.
		 */
		entity->start = min_vstart;
		st->wsum += entity->weight;
		/*
		 * entity is about to be inserted into a service tree,
		 * and then set in service: get a reference to make
		 * sure entity does not disappear until it is no
		 * longer in service or scheduled for service.
		 */
		bfq_get_entity(entity);

		entity->on_st = true;
	}

	bfq_update_fin_time_enqueue(entity, st, backshifted);
}
/**
 * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
 * @entity: the entity being requeued or repositioned.
 *
 * Requeueing is needed if this entity stops being served, which
 * happens if a leaf descendant entity has expired. On the other hand,
 * repositioning is needed if the next_in_service entity for the child
 * entity has changed. See the comments inside the function for
 * details.
 *
 * Basically, this function: 1) removes entity from its active tree if
 * present there, 2) updates the timestamps of entity and 3) inserts
 * entity back into its active tree (in the new, right position for
 * the new values of the timestamps).
 */
static void __bfq_requeue_entity(struct bfq_entity *entity)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (entity == sd->in_service_entity) {
		/*
		 * We are requeueing the current in-service entity,
		 * which may have to be done for one of the following
		 * reasons:
		 * - entity represents the in-service queue, and the
		 *   in-service queue is being requeued after an
		 *   expiration;
		 * - entity represents a group, and its budget has
		 *   changed because one of its child entities has
		 *   just been either activated or requeued for some
		 *   reason; the timestamps of the entity need then to
		 *   be updated, and the entity needs to be enqueued
		 *   or repositioned accordingly.
		 *
		 * In particular, before requeueing, the start time of
		 * the entity must be moved forward to account for the
		 * service that the entity has received while in
		 * service. This is done by the next instructions. The
		 * finish time will then be updated according to this
		 * new value of the start time, and to the budget of
		 * the entity.
		 */
		bfq_calc_finish(entity, entity->service);
		entity->start = entity->finish;
		/*
		 * In addition, if the entity had more than one child
		 * when set in service, then it was not extracted from
		 * the active tree. This implies that the position of
		 * the entity in the active tree may need to be
		 * changed now, because we have just updated the start
		 * time of the entity, and we will update its finish
		 * time in a moment (the requeueing is then, more
		 * precisely, a repositioning in this case). To
		 * implement this repositioning, we: 1) dequeue the
		 * entity here, 2) update the finish time and
		 * requeue the entity according to the new
		 * timestamps below.
		 */
		if (entity->tree)
			bfq_active_extract(st, entity);
	} else { /* The entity is already active, and not in service */
		/*
		 * In this case, this function gets called only if the
		 * next_in_service entity below this entity has
		 * changed, and this change has caused the budget of
		 * this entity to change, which, finally implies that
		 * the finish time of this entity must be
		 * updated. Such an update may cause the scheduling,
		 * i.e., the position in the active tree, of this
		 * entity to change. We handle this change by: 1)
		 * dequeueing the entity here, 2) updating the finish
		 * time and requeueing the entity according to the new
		 * timestamps below. This is the same approach as the
		 * non-extracted-entity sub-case above.
		 */
		bfq_active_extract(st, entity);
	}

	bfq_update_fin_time_enqueue(entity, st, false);
}

static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
					  struct bfq_sched_data *sd,
					  bool non_blocking_wait_rq)
{
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);

	if (sd->in_service_entity == entity || entity->tree == &st->active)
		/*
		 * in service or already queued on the active tree,
		 * requeue or reposition
		 */
		__bfq_requeue_entity(entity);
	else
		/*
		 * Not in service and not queued on its active tree:
		 * the activity is idle and this is a true activation.
		 */
		__bfq_activate_entity(entity, non_blocking_wait_rq);
}
/**
 * bfq_activate_requeue_entity - activate or requeue an entity representing a
 *				 bfq_queue, and activate, requeue or reposition
 *				 all ancestors for which such an update becomes
 *				 necessary.
 * @entity: the entity to activate.
 * @non_blocking_wait_rq: true if this entity was waiting for a request
 * @requeue: true if this is a requeue, which implies that bfqq is
 *	     being expired; thus ALL its ancestors stop being served and must
 *	     therefore be requeued
 */
static void bfq_activate_requeue_entity(struct bfq_entity *entity,
					bool non_blocking_wait_rq,
					bool requeue)
{
	struct bfq_sched_data *sd;

	for_each_entity(entity) {
		sd = entity->sched_data;
		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);

		if (!bfq_update_next_in_service(sd, entity) && !requeue)
			break;
	}
}

/**
 * __bfq_deactivate_entity - deactivate an entity from its service tree.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: if false, the entity will not be put into the
 *			idle tree.
 *
 * Deactivates an entity, independently of its previous state. Must
 * be invoked only if entity is on a service tree. Extracts the entity
 * from that tree, and if necessary and allowed, puts it on the idle
 * tree.
 */
static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				    bool ins_into_idle_tree)
{
	struct bfq_sched_data *sd = entity->sched_data;
	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
	int is_in_service = entity == sd->in_service_entity;

	if (!entity->on_st) /* entity never activated, or already inactive */
		return false;

	if (is_in_service)
		bfq_calc_finish(entity, entity->service);

	if (entity->tree == &st->active)
		bfq_active_extract(st, entity);
	else if (!is_in_service && entity->tree == &st->idle)
		bfq_idle_extract(st, entity);

	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
		bfq_forget_entity(st, entity, is_in_service);
	else
		bfq_idle_insert(st, entity);

	return true;
}
/**
 * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
 * @entity: the entity to deactivate.
 * @ins_into_idle_tree: true if the entity can be put on the idle tree
 * @expiration: true if this function is being invoked in the expiration path
 *		of the in-service queue
 */
static void bfq_deactivate_entity(struct bfq_entity *entity,
				  bool ins_into_idle_tree,
				  bool expiration)
{
	struct bfq_sched_data *sd;
	struct bfq_entity *parent = NULL;

	for_each_entity_safe(entity, parent) {
		sd = entity->sched_data;

		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
			/*
			 * entity is not in any tree any more, so
			 * this deactivation is a no-op, and there is
			 * nothing to change for upper-level entities
			 * (in case of expiration, this can never
			 * happen).
			 */
			return;
		}

		if (sd->next_in_service == entity)
			/*
			 * entity was the next_in_service entity,
			 * then, since entity has just been
			 * deactivated, a new one must be found.
			 */
			bfq_update_next_in_service(sd, NULL);

		if (sd->next_in_service)
			/*
			 * The parent entity is still backlogged,
			 * because next_in_service is not NULL. So, no
			 * further upwards deactivation must be
			 * performed. Yet, next_in_service has
			 * changed. Then the schedule does need to be
			 * updated upwards.
			 */
			break;

		/*
		 * If we get here, then the parent is no more
		 * backlogged and we need to propagate the
		 * deactivation upwards. Thus let the loop go on.
		 */

		/*
		 * Also let parent be queued into the idle tree on
		 * deactivation, to preserve service guarantees, and
		 * assuming that who invoked this function does not
		 * need parent entities too to be removed completely.
		 */
		ins_into_idle_tree = true;
	}

	/*
	 * If the deactivation loop is fully executed, then there are
	 * no more entities to touch and next loop is not executed at
	 * all. Otherwise, requeue remaining entities if they are
	 * about to stop receiving service, or reposition them if this
	 * is not the case.
	 */
	entity = parent;
	for_each_entity(entity) {
		/*
		 * Invoke __bfq_requeue_entity on entity, even if
		 * already active, to requeue/reposition it in the
		 * active tree (because sd->next_in_service has
		 * changed)
		 */
		__bfq_requeue_entity(entity);

		sd = entity->sched_data;
		if (!bfq_update_next_in_service(sd, entity) &&
		    !expiration)
			/*
			 * next_in_service unchanged or not causing
			 * any change in entity->parent->sd, and no
			 * requeueing needed for expiration: stop
			 * here.
			 */
			break;
	}
}
/**
 * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
 *                       if needed, to have at least one entity eligible.
 * @st: the service tree to act upon.
 *
 * Assumes that st is not empty.
 */
static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
{
	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);

	if (bfq_gt(root_entity->min_start, st->vtime))
		return root_entity->min_start;

	return st->vtime;
}
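
/*
 * Illustrative numbers (made up): if st->vtime = 100 while the
 * smallest start time in the active tree (root_entity->min_start) is
 * 130, then no entity is eligible (start <= vtime holds for none),
 * and the caller lets the virtual time jump to 130 so that at least
 * the earliest-starting entity becomes eligible.
 */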
static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
{
	if (new_value > st->vtime) {
		st->vtime = new_value;
		bfq_forget_idle(st);
	}
}

/**
 * bfq_first_active_entity - find the eligible entity with
 *                           the smallest finish time
 * @st: the service tree to select from.
 * @vtime: the system virtual time to use as a reference for eligibility
 *
 * This function searches the first schedulable entity, starting from the
 * root of the tree and going on the left every time on this side there is
 * a subtree with at least one eligible (start <= vtime) entity. The path on
 * the right is followed only if a) the left subtree contains no eligible
 * entities and b) no eligible entity has been found yet.
 */
static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
						  u64 vtime)
{
	struct bfq_entity *entry, *first = NULL;
	struct rb_node *node = st->active.rb_node;

	while (node) {
		entry = rb_entry(node, struct bfq_entity, rb_node);
left:
		if (!bfq_gt(entry->start, vtime))
			first = entry;

		if (node->rb_left) {
			entry = rb_entry(node->rb_left,
					 struct bfq_entity, rb_node);
			if (!bfq_gt(entry->min_start, vtime)) {
				node = node->rb_left;
				goto left;
			}
		}
		if (first)
			break;
		node = node->rb_right;
	}

	return first;
}
/**
 * __bfq_lookup_next_entity - return the first eligible entity in @st.
 * @st: the service tree.
 *
 * If there is no in-service entity for the sched_data st belongs to,
 * then return the entity that will be set in service if:
 * 1) the parent entity this st belongs to is set in service;
 * 2) no entity belonging to such parent entity undergoes a state change
 * that would influence the timestamps of the entity (e.g., becomes idle,
 * becomes backlogged, changes its budget, ...).
 *
 * In this first case, update the virtual time in @st too (see the
 * comments on this update inside the function).
 *
 * In contrast, if there is an in-service entity, then return the
 * entity that would be set in service if not only the above
 * conditions, but also the next one held true: the currently
 * in-service entity, on expiration,
 * 1) gets a finish time equal to the current one, or
 * 2) is not eligible any more, or
 * 3) is idle.
 */
static struct bfq_entity *
__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
{
	struct bfq_entity *entity;
	u64 new_vtime;

	if (RB_EMPTY_ROOT(&st->active))
		return NULL;

	/*
	 * Get the value of the system virtual time for which at
	 * least one entity is eligible.
	 */
	new_vtime = bfq_calc_vtime_jump(st);

	/*
	 * If there is no in-service entity for the sched_data this
	 * active tree belongs to, then push the system virtual time
	 * up to the value that guarantees that at least one entity is
	 * eligible. If, instead, there is an in-service entity, then
	 * do not make any such update, because there is already an
	 * eligible entity, namely the in-service one (even if the
	 * entity is not on st, because it was extracted when set in
	 * service).
	 */
	if (!in_service)
		bfq_update_vtime(st, new_vtime);

	entity = bfq_first_active_entity(st, new_vtime);

	return entity;
}
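
/*
 * Illustrative sketch (not part of the scheduler): under B-WF2Q+ an
 * entity is eligible when its (virtual) start time does not exceed
 * the tree's virtual time, and among eligible entities the one with
 * the smallest finish time is picked. A minimal model of the
 * eligibility predicate:
 */
#if 0
struct model_entity {
	unsigned long long start, finish;
};

static int model_eligible(const struct model_entity *e,
			  unsigned long long vtime)
{
	return e->start <= vtime;
}

/*
 * With vtime = 50, an entity {start = 40, finish = 90} is eligible
 * while {start = 60, finish = 70} is not, even though the latter has
 * the smaller finish time.
 */
#endif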
/**
 * bfq_lookup_next_entity - return the first eligible entity in @sd.
 * @sd: the sched_data.
 *
 * This function is invoked when there has been a change in the trees
 * for sd, and we need to know what is the new next entity after this
 * change.
 */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
{
	struct bfq_service_tree *st = sd->service_tree;
	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
	struct bfq_entity *entity = NULL;
	int class_idx = 0;

	/*
	 * Choose from idle class, if needed to guarantee a minimum
	 * bandwidth to this class (and if there is some active entity
	 * in idle class). This should also mitigate
	 * priority-inversion problems in case a low priority task is
	 * holding file system resources.
	 */
	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				   BFQ_CL_IDLE_TIMEOUT)) {
		if (!RB_EMPTY_ROOT(&idle_class_st->active))
			class_idx = BFQ_IOPRIO_CLASSES - 1;
		/* About to be served if backlogged, or not yet backlogged */
		sd->bfq_class_idle_last_service = jiffies;
	}

	/*
	 * Find the next entity to serve for the highest-priority
	 * class, unless the idle class needs to be served.
	 */
	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
		entity = __bfq_lookup_next_entity(st + class_idx,
						  sd->in_service_entity);

		if (entity)
			break;
	}

	return entity;
}

static bool next_queue_may_preempt(struct bfq_data *bfqd)
{
	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;

	return sd->next_in_service != sd->in_service_entity;
}
/*
 * Get next queue for service.
 */
static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
{
	struct bfq_entity *entity = NULL;
	struct bfq_sched_data *sd;
	struct bfq_queue *bfqq;

	if (bfqd->busy_queues == 0)
		return NULL;

	/*
	 * Traverse the path from the root to the leaf entity to
	 * serve. Set in service all the entities visited along the
	 * way.
	 */
	sd = &bfqd->root_group->sched_data;
	for (; sd ; sd = entity->my_sched_data) {
		/*
		 * WARNING. We are about to set the in-service entity
		 * to sd->next_in_service, i.e., to the (cached) value
		 * returned by bfq_lookup_next_entity(sd) the last
		 * time it was invoked, i.e., the last time when the
		 * service order in sd changed as a consequence of the
		 * activation or deactivation of an entity. In this
		 * respect, if we execute bfq_lookup_next_entity(sd)
		 * in this very moment, it may, although with low
		 * probability, yield a different entity than that
		 * pointed to by sd->next_in_service. This rare event
		 * happens in case there was no CLASS_IDLE entity to
		 * serve for sd when bfq_lookup_next_entity(sd) was
		 * invoked for the last time, while there is now one
		 * such entity.
		 *
		 * If the above event happens, then the scheduling of
		 * such entity in CLASS_IDLE is postponed until the
		 * service of the sd->next_in_service entity
		 * finishes. In fact, when the latter is expired,
		 * bfq_lookup_next_entity(sd) gets called again,
		 * exactly to update sd->next_in_service.
		 */

		/* Make next_in_service entity become in_service_entity */
		entity = sd->next_in_service;
		sd->in_service_entity = entity;

		/*
		 * Reset the accumulator of the amount of service that
		 * the entity is about to receive.
		 */
		entity->service = 0;

		/*
		 * If entity is no longer a candidate for next
		 * service, then we extract it from its active tree,
		 * for the following reason. To further boost the
		 * throughput in some special case, BFQ needs to know
		 * which is the next candidate entity to serve, while
		 * there is already an entity in service. In this
		 * respect, to make it easy to compute/update the next
		 * candidate entity to serve after the current
		 * candidate has been set in service, there is a case
		 * where it is necessary to extract the current
		 * candidate from its service tree. Such a case is
		 * when the entity just set in service cannot be also
		 * a candidate for next service. Details about when
		 * this condition holds are reported in the comments
		 * on the function bfq_no_longer_next_in_service()
		 * invoked below.
		 */
		if (bfq_no_longer_next_in_service(entity))
			bfq_active_extract(bfq_entity_service_tree(entity),
					   entity);

		/*
		 * For the same reason why we may have just extracted
		 * entity from its active tree, we may need to update
		 * next_in_service for the sched_data of entity too,
		 * regardless of whether entity has been extracted.
		 * In fact, even if entity has not been extracted, a
		 * descendant entity may get extracted. Such an event
		 * would cause a change in next_in_service for the
		 * level of the descendant entity, and thus possibly
		 * back to upper levels.
		 *
		 * We cannot perform the resulting needed update
		 * before the end of this loop, because, to know which
		 * is the correct next-to-serve candidate entity for
		 * each level, we need first to find the leaf entity
		 * to set in service. In fact, only after we know
		 * which is the next-to-serve leaf entity, we can
		 * discover whether the parent entity of the leaf
		 * entity becomes the next-to-serve, and so on.
		 */
	}

	bfqq = bfq_entity_to_bfqq(entity);

	/*
	 * We can finally update all next-to-serve entities along the
	 * path from the leaf entity just set in service to the root.
	 */
	for_each_entity(entity) {
		struct bfq_sched_data *sd = entity->sched_data;

		if (!bfq_update_next_in_service(sd, NULL))
			break;
	}

	return bfqq;
}
static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
{
	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
	struct bfq_entity *entity = in_serv_entity;

	if (bfqd->in_service_bic) {
		/*
		 * Schedule the release of a reference to
		 * bfqd->in_service_bic->icq.ioc to right after the
		 * scheduler lock is released. This ioc is not
		 * released immediately, to not risk to possibly take
		 * an ioc->lock while holding the scheduler lock.
		 */
		bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc;
		bfqd->in_service_bic = NULL;
	}

	bfq_clear_bfqq_wait_request(in_serv_bfqq);
	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
	bfqd->in_service_queue = NULL;

	/*
	 * When this function is called, all in-service entities have
	 * been properly deactivated or requeued, so we can safely
	 * execute the final step: reset in_service_entity along the
	 * path from entity to the root.
	 */
	for_each_entity(entity)
		entity->sched_data->in_service_entity = NULL;

	/*
	 * in_serv_entity is no longer in service, so, if it is in no
	 * service tree either, then release the service reference to
	 * the queue it represents (taken with bfq_get_entity).
	 */
	if (!in_serv_entity->on_st)
		bfq_put_queue(in_serv_bfqq);
}
static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
				bool ins_into_idle_tree, bool expiration)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
}

static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				    false);
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
}

static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	bfq_activate_requeue_entity(entity, false,
				    bfqq == bfqd->in_service_queue);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);

/*
 * Called when the bfqq no longer has requests pending, remove it from
 * the service tree. As a special case, it can be invoked during an
 * expiration.
 */
static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			      bool expiration)
{
	bfq_log_bfqq(bfqd, bfqq, "del from busy");

	bfq_clear_bfqq_busy(bfqq);

	bfqd->busy_queues--;

	if (!bfqq->dispatched)
		bfq_weights_tree_remove(bfqd, &bfqq->entity,
					&bfqd->queue_weights_tree);

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues--;

	bfqg_stats_update_dequeue(bfqq_group(bfqq));

	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
}
/*
 * Called when an inactive queue receives a new request.
 */
static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "add to busy");

	bfq_activate_bfqq(bfqd, bfqq);

	bfq_mark_bfqq_busy(bfqq);
	bfqd->busy_queues++;

	if (!bfqq->dispatched)
		if (bfqq->wr_coeff == 1)
			bfq_weights_tree_add(bfqd, &bfqq->entity,
					     &bfqd->queue_weights_tree);

	if (bfqq->wr_coeff > 1)
		bfqd->wr_busy_queues++;
}

#ifdef CONFIG_BFQ_GROUP_IOSCHED

/* bfqg stats flags */
enum bfqg_stats_flags {
	BFQG_stats_waiting = 0,
	BFQG_stats_idling,
	BFQG_stats_empty,
};

#define BFQG_FLAG_FNS(name)						\
static void bfqg_stats_mark_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags |= (1 << BFQG_stats_##name);			\
}									\
static void bfqg_stats_clear_##name(struct bfqg_stats *stats)		\
{									\
	stats->flags &= ~(1 << BFQG_stats_##name);			\
}									\
static int bfqg_stats_##name(struct bfqg_stats *stats)			\
{									\
	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
}									\

BFQG_FLAG_FNS(waiting)
BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
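
/*
 * For reference, BFQG_FLAG_FNS(waiting) expands to the three helpers
 * bfqg_stats_mark_waiting(), bfqg_stats_clear_waiting() and
 * bfqg_stats_waiting(), which respectively set, clear and test the
 * BFQG_stats_waiting bit in stats->flags; idling and empty get the
 * same treatment.
 */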
/* This should be called with the queue_lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_waiting(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_group_wait_time))
		blkg_stat_add(&stats->group_wait_time,
			      now - stats->start_group_wait_time);
	bfqg_stats_clear_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
						 struct bfq_group *curr_bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_waiting(stats))
		return;
	if (bfqg == curr_bfqg)
		return;
	stats->start_group_wait_time = sched_clock();
	bfqg_stats_mark_waiting(stats);
}

/* This should be called with the queue_lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
	unsigned long long now;

	if (!bfqg_stats_empty(stats))
		return;

	now = sched_clock();
	if (time_after64(now, stats->start_empty_time))
		blkg_stat_add(&stats->empty_time,
			      now - stats->start_empty_time);
	bfqg_stats_clear_empty(stats);
}

static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
	blkg_stat_add(&bfqg->stats.dequeue, 1);
}

static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (blkg_rwstat_total(&stats->queued))
		return;

	/*
	 * group is already marked empty. This can happen if bfqq got new
	 * request in parent group and moved to this group while being added
	 * to service tree. Just ignore the event and move on.
	 */
	if (bfqg_stats_empty(stats))
		return;

	stats->start_empty_time = sched_clock();
	bfqg_stats_mark_empty(stats);
}

static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	if (bfqg_stats_idling(stats)) {
		unsigned long long now = sched_clock();

		if (time_after64(now, stats->start_idle_time))
			blkg_stat_add(&stats->idle_time,
				      now - stats->start_idle_time);
		bfqg_stats_clear_idling(stats);
	}
}

static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	stats->start_idle_time = sched_clock();
	bfqg_stats_mark_idling(stats);
}

static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
	struct bfqg_stats *stats = &bfqg->stats;

	blkg_stat_add(&stats->avg_queue_size_sum,
		      blkg_rwstat_total(&stats->queued));
	blkg_stat_add(&stats->avg_queue_size_samples, 1);
	bfqg_stats_update_group_wait_time(stats);
}
/*
 * blk-cgroup policy-related handlers
 * The following functions help in converting between blk-cgroup
 * internal structures and BFQ-specific structures.
 */

static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
{
	return pd_to_blkg(&bfqg->pd);
}

static struct blkcg_policy blkcg_policy_bfq;

static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
{
	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
}

/*
 * bfq_group handlers
 * The following functions help in navigating the bfq_group hierarchy
 * by allowing to find the parent of a bfq_group or the bfq_group
 * associated to a bfq_queue.
 */

static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
{
	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;

	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	struct bfq_entity *group_entity = bfqq->entity.parent;

	return group_entity ? container_of(group_entity, struct bfq_group,
					   entity) :
			      bfqq->bfqd->root_group;
}

/*
 * The following two functions handle get and put of a bfq_group by
 * wrapping the related blk-cgroup hooks.
 */

static void bfqg_get(struct bfq_group *bfqg)
{
	return blkg_get(bfqg_to_blkg(bfqg));
}

static void bfqg_put(struct bfq_group *bfqg)
{
	return blkg_put(bfqg_to_blkg(bfqg));
}
static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				     struct bfq_queue *bfqq,
				     unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
	bfqg_stats_end_empty_time(&bfqg->stats);
	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}

static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
}

static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
{
	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
}

static void bfqg_stats_update_completion(struct bfq_group *bfqg,
					 uint64_t start_time, uint64_t io_start_time,
					 unsigned int op)
{
	struct bfqg_stats *stats = &bfqg->stats;
	unsigned long long now = sched_clock();

	if (time_after64(now, io_start_time))
		blkg_rwstat_add(&stats->service_time, op,
				now - io_start_time);
	if (time_after64(io_start_time, start_time))
		blkg_rwstat_add(&stats->wait_time, op,
				io_start_time - start_time);
}

static void bfqg_stats_reset(struct bfqg_stats *stats)
{
	/* queued stats shouldn't be cleared */
	blkg_rwstat_reset(&stats->merged);
	blkg_rwstat_reset(&stats->service_time);
	blkg_rwstat_reset(&stats->wait_time);
	blkg_stat_reset(&stats->time);
	blkg_stat_reset(&stats->avg_queue_size_sum);
	blkg_stat_reset(&stats->avg_queue_size_samples);
	blkg_stat_reset(&stats->dequeue);
	blkg_stat_reset(&stats->group_wait_time);
	blkg_stat_reset(&stats->idle_time);
	blkg_stat_reset(&stats->empty_time);
}
/* @to += @from */
static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
{
	if (!to || !from)
		return;

	/* queued stats shouldn't be cleared */
	blkg_rwstat_add_aux(&to->merged, &from->merged);
	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
	blkg_stat_add_aux(&to->time, &from->time);
	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
	blkg_stat_add_aux(&to->avg_queue_size_samples,
			  &from->avg_queue_size_samples);
	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
}
/*
 * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
 * recursive stats can still account for the amount used by this bfqg after
 * it's gone.
 */
static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
{
	struct bfq_group *parent;

	if (!bfqg) /* root_group */
		return;

	parent = bfqg_parent(bfqg);

	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);

	if (unlikely(!parent))
		return;

	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
	bfqg_stats_reset(&bfqg->stats);
}

static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
		bfqg_get(bfqg);
	}
	entity->parent = bfqg->my_entity; /* NULL for root group */
	entity->sched_data = &bfqg->sched_data;
}

static void bfqg_stats_exit(struct bfqg_stats *stats)
{
	blkg_rwstat_exit(&stats->merged);
	blkg_rwstat_exit(&stats->service_time);
	blkg_rwstat_exit(&stats->wait_time);
	blkg_rwstat_exit(&stats->queued);
	blkg_stat_exit(&stats->time);
	blkg_stat_exit(&stats->avg_queue_size_sum);
	blkg_stat_exit(&stats->avg_queue_size_samples);
	blkg_stat_exit(&stats->dequeue);
	blkg_stat_exit(&stats->group_wait_time);
	blkg_stat_exit(&stats->idle_time);
	blkg_stat_exit(&stats->empty_time);
}

static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
	if (blkg_rwstat_init(&stats->merged, gfp) ||
	    blkg_rwstat_init(&stats->service_time, gfp) ||
	    blkg_rwstat_init(&stats->wait_time, gfp) ||
	    blkg_rwstat_init(&stats->queued, gfp) ||
	    blkg_stat_init(&stats->time, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
	    blkg_stat_init(&stats->dequeue, gfp) ||
	    blkg_stat_init(&stats->group_wait_time, gfp) ||
	    blkg_stat_init(&stats->idle_time, gfp) ||
	    blkg_stat_init(&stats->empty_time, gfp)) {
		bfqg_stats_exit(stats);
		return -ENOMEM;
	}

	return 0;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
}

static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
{
	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
}

static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
{
	struct bfq_group_data *bgd;

	bgd = kzalloc(sizeof(*bgd), gfp);
	if (!bgd)
		return NULL;
	return &bgd->pd;
}

static void bfq_cpd_init(struct blkcg_policy_data *cpd)
{
	struct bfq_group_data *d = cpd_to_bfqgd(cpd);

	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
}

static void bfq_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(cpd_to_bfqgd(cpd));
}

static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
{
	struct bfq_group *bfqg;

	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
	if (!bfqg)
		return NULL;

	if (bfqg_stats_init(&bfqg->stats, gfp)) {
		kfree(bfqg);
		return NULL;
	}

	return &bfqg->pd;
}

static void bfq_pd_init(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = pd_to_blkg(pd);
	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
	struct bfq_entity *entity = &bfqg->entity;
	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);

	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->my_entity = entity; /*
				   * the root_group's will be set to NULL
				   * in bfq_init_queue()
				   */
	bfqg->bfqd = bfqd;
	bfqg->active_entities = 0;
	bfqg->rq_pos_tree = RB_ROOT;
}

static void bfq_pd_free(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_exit(&bfqg->stats);
	return kfree(bfqg);
}

static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);

	bfqg_stats_reset(&bfqg->stats);
}
static void bfq_group_set_parent(struct bfq_group *bfqg,
				 struct bfq_group *parent)
{
	struct bfq_entity *entity;

	entity = &bfqg->entity;
	entity->parent = parent->my_entity;
	entity->sched_data = &parent->sched_data;
}

static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
					 struct blkcg *blkcg)
{
	struct blkcg_gq *blkg;

	blkg = blkg_lookup(blkcg, bfqd->queue);
	if (likely(blkg))
		return blkg_to_bfqg(blkg);
	return NULL;
}

static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	struct bfq_group *bfqg, *parent;
	struct bfq_entity *entity;

	bfqg = bfq_lookup_bfqg(bfqd, blkcg);

	if (unlikely(!bfqg))
		return NULL;

	/*
	 * Update chain of bfq_groups as we might be handling a leaf group
	 * which, along with some of its relatives, has not been hooked yet
	 * to the private hierarchy of BFQ.
	 */
	entity = &bfqg->entity;
	for_each_entity(entity) {
		bfqg = container_of(entity, struct bfq_group, entity);
		if (bfqg != bfqd->root_group) {
			parent = bfqg_parent(bfqg);
			if (!parent)
				parent = bfqd->root_group;
			bfq_group_set_parent(bfqg, parent);
		}
	}

	return bfqg;
}

static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
				  struct bfq_queue *bfqq);
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one. Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqq->entity;

	/* If bfqq is empty, then bfq_bfqq_expire also invokes
	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
	 * from data structures related to current group. Otherwise we
	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
	 * we do below.
	 */
	if (bfqq == bfqd->in_service_queue)
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);

	if (bfq_bfqq_busy(bfqq))
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
	bfqg_put(bfqq_group(bfqq));

	/*
	 * Here we use a reference to bfqg. We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;
	bfqg_get(bfqg);

	if (bfq_bfqq_busy(bfqq)) {
		bfq_pos_tree_add_move(bfqd, bfqq);
		bfq_activate_bfqq(bfqd, bfqq);
	}

	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}
/**
 * __bfq_bic_change_cgroup - move @bic to @cgroup.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @blkcg: the blk-cgroup to move to.
 *
 * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
 * has to make sure that the reference to cgroup is valid across the call.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						 struct bfq_io_cq *bic,
						 struct blkcg *blkcg)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_group *bfqg;
	struct bfq_entity *entity;

	bfqg = bfq_find_set_group(bfqd, blkcg);

	if (unlikely(!bfqg))
		bfqg = bfqd->root_group;

	if (async_bfqq) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, async_bfqq->ref);
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
	}

	return bfqg;
}

static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *bfqg = NULL;
	uint64_t serial_nr;

	rcu_read_lock();
	serial_nr = bio_blkcg(bio)->css.serial_nr;

	/*
	 * Check whether blkcg has changed. The condition may trigger
	 * spuriously on a newly created cic but there's no harm.
	 */
	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
		goto out;

	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
	bic->blkcg_serial_nr = serial_nr;
out:
	rcu_read_unlock();
}
/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *entity = st->first_idle;

	for (; entity ; entity = st->first_idle)
		__bfq_deactivate_entity(entity, false);
}

/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				     struct bfq_entity *entity)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}

/**
 * bfq_reparent_active_entities - move to the root group all active
 *                                entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
					 struct bfq_group *bfqg,
					 struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(&st->active))
		entity = bfq_entity_of(rb_first(active));

	for (; entity ; entity = bfq_entity_of(rb_first(active)))
		bfq_reparent_leaf_entity(bfqd, entity);

	if (bfqg->sched_data.in_service_entity)
		bfq_reparent_leaf_entity(bfqd,
					 bfqg->sched_data.in_service_entity);
}
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *                  and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * blkio already grabs the queue_lock for us, so no need to use
 * RCU-based magic
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited task because they never migrated to a different
		 * cgroup from the one being destroyed now. No one else
		 * can access them so it's safe to act without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	bfq_unlock_put_ioc_restore(bfqd, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
	 * that they don't get lost. If IOs complete after this point, the
	 * stats for them will be lost. Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}
static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	struct blkcg_gq *blkg;

	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		bfq_end_wr_async_queues(bfqd, bfqg);
	}
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}

static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	unsigned int val = 0;

	if (bfqgd)
		val = bfqgd->weight;

	seq_printf(sf, "%u\n", val);

	return 0;
}

static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				    struct cftype *cftype,
				    u64 val)
{
	struct blkcg *blkcg = css_to_blkcg(css);
	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
	struct blkcg_gq *blkg;
	int ret = -ERANGE;

	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = 0;
	spin_lock_irq(&blkcg->lock);
	bfqgd->weight = (unsigned short)val;
	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
		struct bfq_group *bfqg = blkg_to_bfqg(blkg);

		if (!bfqg)
			continue;
		/*
		 * Setting the prio_changed flag of the entity
		 * to 1 with new_weight == weight would re-set
		 * the value of the weight to its ioprio mapping.
		 * Set the flag only if necessary.
		 */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			/*
			 * Make sure that the above new value has been
			 * stored in bfqg->entity.new_weight before
			 * setting the prio_changed flag. In fact,
			 * this flag may be read asynchronously (in
			 * critical sections protected by a different
			 * lock than that held here), and finding this
			 * flag set may cause the execution of the code
			 * for updating parameters whose value may
			 * depend also on bfqg->entity.new_weight (in
			 * __bfq_entity_update_weight_prio).
			 * This barrier makes sure that the new value
			 * of bfqg->entity.new_weight is correctly
			 * seen in that code.
			 */
			smp_wmb();
			bfqg->entity.prio_changed = 1;
		}
	}
	spin_unlock_irq(&blkcg->lock);

	return ret;
}
static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				 char *buf, size_t nbytes,
				 loff_t off)
{
	u64 weight;
	/* First unsigned long found in the file is used */
	int ret = kstrtoull(strim(buf), 0, &weight);

	if (ret)
		return ret;

	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
}
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
	return 0;
}

static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
					  &blkcg_policy_bfq, off);
	return __blkg_prfill_u64(sf, pd, sum);
}

static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
					struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
							   &blkcg_policy_bfq,
							   off);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, false);
	return 0;
}

static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
			  seq_cft(sf)->private, true);
	return 0;
}
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
	return 0;
}

static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
					offsetof(struct blkcg_gq, stat_bytes));
	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);

	return __blkg_prfill_u64(sf, pd, sum >> 9);
}

static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
			  false);
	return 0;
}
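
/*
 * Note on the shift above: blkcg tracks stat_bytes in bytes, while
 * the bfq.sectors files export 512-byte sectors, hence the >> 9
 * (e.g., 1 MiB = 1048576 bytes >> 9 = 2048 sectors).
 */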
static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
	u64 v = 0;

	if (samples) {
		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
		v = div64_u64(v, samples);
	}
	__blkg_prfill_u64(sf, pd, v);
	return 0;
}
/* print avg_queue_size */
static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
			  0, false);
	return 0;
}
static struct bfq_group *
bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
	int ret;

	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
	if (ret)
		return NULL;

	return blkg_to_bfqg(bfqd->queue->root_blkg);
}
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ }	/* terminate */
};
static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{ }	/* terminate */
};
#else	/* CONFIG_BFQ_GROUP_IOSCHED */

static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_group *bfqg) {}
static void bfq_init_entity(struct bfq_entity *entity,
			    struct bfq_group *bfqg)
{
	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->new_weight;
	if (bfqq) {
		bfqq->ioprio = bfqq->new_ioprio;
		bfqq->ioprio_class = bfqq->new_ioprio_class;
	}
	entity->sched_data = &bfqg->sched_data;
}
static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}

static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
					    struct blkcg *blkcg)
{
	return bfqd->root_group;
}

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
{
	return bfqq->bfqd->root_group;
}

static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
						    int node)
{
	struct bfq_group *bfqg;
	int i;

	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
	if (!bfqg)
		return NULL;

	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	return bfqg;
}
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */

#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define bfq_sample_valid(samples)	((samples) > 80)
/*
 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
 * We choose the request that is closest to the head right now. Distance
 * behind the head is penalized and only allowed to a certain extent.
 */
static struct request *bfq_choose_req(struct bfq_data *bfqd,
				      struct request *rq1,
				      struct request *rq2,
				      sector_t last)
{
	sector_t s1, s2, d1 = 0, d2 = 0;
	unsigned long back_max;
#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */

	if (!rq1 || rq1 == rq2)
		return rq2;
	if (!rq2)
		return rq1;

	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
		return rq1;
	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
		return rq2;
	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
		return rq1;
	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
		return rq2;

	s1 = blk_rq_pos(rq1);
	s2 = blk_rq_pos(rq2);

	/*
	 * By definition, 1KiB is 2 sectors.
	 */
	back_max = bfqd->bfq_back_max * 2;

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ1_WRAP;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * bfqd->bfq_back_penalty;
	else
		wrap |= BFQ_RQ2_WRAP;

	/* Found required data */

	/*
	 * By doing switch() on the bit mask "wrap" we avoid having to
	 * check two variables for all permutations: --> faster!
	 */
	switch (wrap) {
	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
		if (d1 < d2)
			return rq1;
		else if (d2 < d1)
			return rq2;

		if (s1 >= s2)
			return rq1;
		else
			return rq2;

	case BFQ_RQ2_WRAP:
		return rq1;
	case BFQ_RQ1_WRAP:
		return rq2;
	case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
	default:
		/*
		 * Since both rqs are wrapped,
		 * start with the one that's further behind head
		 * (--> only *one* back seek required),
		 * since back seek takes more time than forward.
		 */
		if (s1 <= s2)
			return rq1;
		else
			return rq2;
	}
}
static struct bfq_queue *
bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
		       sector_t sector, struct rb_node **ret_parent,
		       struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct bfq_queue *bfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		bfqq = rb_entry(parent, struct bfq_queue, pos_node);

		/*
		 * Sort strictly based on sector. Smallest to the left,
		 * largest to the right.
		 */
		if (sector > blk_rq_pos(bfqq->next_rq))
			n = &(*p)->rb_right;
		else if (sector < blk_rq_pos(bfqq->next_rq))
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		bfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;

	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
		(unsigned long long)sector,
		bfqq ? bfqq->pid : 0);

	return bfqq;
}
static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct rb_node **p, *parent;
	struct bfq_queue *__bfqq;

	if (bfqq->pos_root) {
		rb_erase(&bfqq->pos_node, bfqq->pos_root);
		bfqq->pos_root = NULL;
	}

	if (bfq_class_idle(bfqq))
		return;
	if (!bfqq->next_rq)
		return;

	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
			blk_rq_pos(bfqq->next_rq), &parent, &p);
	if (!__bfqq) {
		rb_link_node(&bfqq->pos_node, parent, p);
		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
	} else
		bfqq->pos_root = NULL;
}
/*
 * Tell whether there are active queues or groups with differentiated weights.
 */
static bool bfq_differentiated_weights(struct bfq_data *bfqd)
{
	/*
	 * For weights to differ, at least one of the trees must contain
	 * at least two nodes.
	 */
	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
		(bfqd->queue_weights_tree.rb_node->rb_left ||
		 bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	       ) ||
	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
		(bfqd->group_weights_tree.rb_node->rb_left ||
		 bfqd->group_weights_tree.rb_node->rb_right)
#endif
	       );
}
/*
 * The following function returns true if every queue must receive the
 * same share of the throughput (this condition is used when deciding
 * whether idling may be disabled, see the comments in the function
 * bfq_bfqq_may_idle()).
 *
 * Such a scenario occurs when:
 * 1) all active queues have the same weight,
 * 2) all active groups at the same level in the groups tree have the same
 *    weight,
 * 3) all active groups at the same level in the groups tree have the same
 *    number of children.
 *
 * Unfortunately, keeping the necessary state for evaluating exactly the
 * above symmetry conditions would be quite complex and time-consuming.
 * Therefore this function evaluates, instead, the following stronger
 * sub-conditions, for which it is much easier to maintain the needed
 * state:
 * 1) all active queues have the same weight,
 * 2) all active groups have the same weight,
 * 3) all active groups have at most one active child each.
 * In particular, the last two conditions are always true if hierarchical
 * support and the cgroups interface are not enabled, thus no state needs
 * to be maintained in this case.
 */
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
	return !bfq_differentiated_weights(bfqd);
}
/*
 * If the weight-counter tree passed as input contains no counter for
 * the weight of the input entity, then add that counter; otherwise just
 * increment the existing counter.
 *
 * Note that weight-counter trees contain few nodes in mostly symmetric
 * scenarios. For example, if all queues have the same weight, then the
 * weight-counter tree for the queues may contain at most one node.
 * This holds even if low_latency is on, because weight-raised queues
 * are not inserted in the tree.
 * In most scenarios, the rate at which nodes are created/destroyed
 * should be low too.
 */
static void bfq_weights_tree_add(struct bfq_data *bfqd,
				 struct bfq_entity *entity,
				 struct rb_root *root)
{
	struct rb_node **new = &(root->rb_node), *parent = NULL;

	/*
	 * Do not insert if the entity is already associated with a
	 * counter, which happens if:
	 * 1) the entity is associated with a queue,
	 * 2) a request arrival has caused the queue to become both
	 *    non-weight-raised, and hence change its weight, and
	 *    backlogged; in this respect, each of the two events
	 *    causes an invocation of this function,
	 * 3) this is the invocation of this function caused by the
	 *    second event. This second invocation is actually useless,
	 *    and we handle this fact by exiting immediately. More
	 *    efficient or clearer solutions might possibly be adopted.
	 */
	if (entity->weight_counter)
		return;

	while (*new) {
		struct bfq_weight_counter *__counter = container_of(*new,
						struct bfq_weight_counter,
						weights_node);
		parent = *new;

		if (entity->weight == __counter->weight) {
			entity->weight_counter = __counter;
			goto inc_counter;
		}
		if (entity->weight < __counter->weight)
			new = &((*new)->rb_left);
		else
			new = &((*new)->rb_right);
	}

	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
					 GFP_ATOMIC);

	/*
	 * In the unlucky event of an allocation failure, we just
	 * exit. This will cause the weight of entity to not be
	 * considered in bfq_differentiated_weights, which, in its
	 * turn, causes the scenario to be deemed wrongly symmetric in
	 * case entity's weight would have been the only weight making
	 * the scenario asymmetric. On the bright side, no unbalance
	 * will however occur when entity becomes inactive again (the
	 * invocation of this function is triggered by an activation
	 * of entity). In fact, bfq_weights_tree_remove does nothing
	 * if !entity->weight_counter.
	 */
	if (unlikely(!entity->weight_counter))
		return;

	entity->weight_counter->weight = entity->weight;
	rb_link_node(&entity->weight_counter->weights_node, parent, new);
	rb_insert_color(&entity->weight_counter->weights_node, root);

inc_counter:
	entity->weight_counter->num_active++;
}
/*
 * Decrement the weight counter associated with the entity, and, if the
 * counter reaches 0, remove the counter from the tree.
 * See the comments to the function bfq_weights_tree_add() for considerations
 * about overhead.
 */
static void bfq_weights_tree_remove(struct bfq_data *bfqd,
				    struct bfq_entity *entity,
				    struct rb_root *root)
{
	if (!entity->weight_counter)
		return;

	entity->weight_counter->num_active--;
	if (entity->weight_counter->num_active > 0)
		goto reset_entity_pointer;

	rb_erase(&entity->weight_counter->weights_node, root);
	kfree(entity->weight_counter);

reset_entity_pointer:
	entity->weight_counter = NULL;
}
/*
 * Return expired entry, or NULL to just start from scratch in rbtree.
 */
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
				      struct request *last)
{
	struct request *rq;

	if (bfq_bfqq_fifo_expire(bfqq))
		return NULL;

	bfq_mark_bfqq_fifo_expire(bfqq);

	rq = rq_entry_fifo(bfqq->fifo.next);

	if (rq == last || ktime_get_ns() < rq->fifo_time)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
	return rq;
}
static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
					struct bfq_queue *bfqq,
					struct request *last)
{
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct request *next, *prev = NULL;

	/* Follow expired path, else get first next available. */
	next = bfq_check_fifo(bfqq, last);
	if (next)
		return next;

	if (rbprev)
		prev = rb_entry_rq(rbprev);

	if (rbnext)
		next = rb_entry_rq(rbnext);
	else {
		rbnext = rb_first(&bfqq->sort_list);
		if (rbnext && rbnext != &last->rb_node)
			next = rb_entry_rq(rbnext);
	}

	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
}
/* see the definition of bfq_async_charge_factor for details */
static unsigned long bfq_serv_to_charge(struct request *rq,
					struct bfq_queue *bfqq)
{
	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
		return blk_rq_sectors(rq);

	/*
	 * If there are no weight-raised queues, then amplify service
	 * by just the async charge factor; otherwise amplify service
	 * by twice the async charge factor, to further reduce latency
	 * for weight-raised queues.
	 */
	if (bfqq->bfqd->wr_busy_queues == 0)
		return blk_rq_sectors(rq) * bfq_async_charge_factor;

	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
}
/*
 * bfq_updated_next_req - update the queue after a new next_rq selection.
 * @bfqd: the device data the queue belongs to.
 * @bfqq: the queue to update.
 *
 * If the first request of a queue changes we make sure that the queue
 * has enough budget to serve at least its first request (if the
 * request has grown). We do this because if the queue has not enough
 * budget for its first request, it has to go through two dispatch
 * rounds to actually get it dispatched.
 */
static void bfq_updated_next_req(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;
	struct request *next_rq = bfqq->next_rq;
	unsigned long new_budget;

	if (!next_rq)
		return;

	if (bfqq == bfqd->in_service_queue)
		/*
		 * In order not to break guarantees, budgets cannot be
		 * changed after an entity has been selected.
		 */
		return;

	new_budget = max_t(unsigned long, bfqq->max_budget,
			   bfq_serv_to_charge(next_rq, bfqq));
	if (entity->budget != new_budget) {
		entity->budget = new_budget;
		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
					 new_budget);
		bfq_requeue_bfqq(bfqd, bfqq);
	}
}
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	if (bic->saved_idle_window)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);

	if (bic->saved_IO_bound)
		bfq_mark_bfqq_IO_bound(bfqq);
	else
		bfq_clear_bfqq_IO_bound(bfqq);

	bfqq->ttime = bic->saved_ttime;
	bfqq->wr_coeff = bic->saved_wr_coeff;
	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;

	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
	    time_is_before_jiffies(bfqq->last_wr_start_finish +
				   bfqq->wr_cur_max_time))) {
		bfq_log_bfqq(bfqq->bfqd, bfqq,
			     "resume state: switching off wr");

		bfqq->wr_coeff = 1;
	}

	/* make sure weight will be updated, however we got here */
	bfqq->entity.prio_changed = 1;
}
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_queue *item;
	struct hlist_node *n;

	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
		hlist_del_init(&item->burst_list_node);
	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
	bfqd->burst_size = 1;
	bfqd->burst_parent_entity = bfqq->entity.parent;
}
/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/* Increment burst size to take into account also bfqq */
	bfqd->burst_size++;

	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
		struct bfq_queue *pos, *bfqq_item;
		struct hlist_node *n;

		/*
		 * Enough queues have been activated shortly after each
		 * other to consider this burst as large.
		 */
		bfqd->large_burst = true;

		/*
		 * We can now mark all queues in the burst list as
		 * belonging to a large burst.
		 */
		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
				     burst_list_node)
			bfq_mark_bfqq_in_large_burst(bfqq_item);
		bfq_mark_bfqq_in_large_burst(bfqq);

		/*
		 * From now on, and until the current burst finishes, any
		 * new queue being activated shortly after the last queue
		 * was inserted in the burst can be immediately marked as
		 * belonging to a large burst. So the burst list is not
		 * needed any more. Remove it.
		 */
		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
					  burst_list_node)
			hlist_del_init(&pos->burst_list_node);
	} else /*
		* Burst not yet large: add bfqq to the burst list. Do
		* not increment the ref counter for bfqq, because bfqq
		* is removed from the burst list before freeing bfqq
		* in put_queue.
		*/
		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
}
/*
 * If many queues belonging to the same group happen to be created
 * shortly after each other, then the processes associated with these
 * queues have typically a common goal. In particular, bursts of queue
 * creations are usually caused by services or applications that spawn
 * many parallel threads/processes. Examples are systemd during boot,
 * or git grep. To help these processes get their job done as soon as
 * possible, it is usually better to not grant either weight-raising
 * or device idling to their queues.
 *
 * In this comment we describe, firstly, the reasons why this fact
 * holds, and, secondly, the next function, which implements the main
 * steps needed to properly mark these queues so that they can then be
 * treated in a different way.
 *
 * The above services or applications benefit mostly from a high
 * throughput: the quicker the requests of the activated queues are
 * cumulatively served, the sooner the target job of these queues gets
 * completed. As a consequence, weight-raising any of these queues,
 * which also implies idling the device for it, is almost always
 * counterproductive. In most cases it just lowers throughput.
 *
 * On the other hand, a burst of queue creations may be caused also by
 * the start of an application that does not consist of a lot of
 * parallel I/O-bound threads. In fact, with a complex application,
 * several short processes may need to be executed to start-up the
 * application. In this respect, to start an application as quickly as
 * possible, the best thing to do is in any case to privilege the I/O
 * related to the application with respect to all other
 * I/O. Therefore, the best strategy to start as quickly as possible
 * an application that causes a burst of queue creations is to
 * weight-raise all the queues created during the burst. This is the
 * exact opposite of the best strategy for the other type of bursts.
 *
 * In the end, to take the best action for each of the two cases, the
 * two types of bursts need to be distinguished. Fortunately, this
 * seems relatively easy, by looking at the sizes of the bursts. In
 * particular, we found a threshold such that only bursts with a
 * larger size than that threshold are apparently caused by
 * services or commands such as systemd or git grep. For brevity,
 * hereafter we call just 'large' these bursts. BFQ *does not*
 * weight-raise queues whose creation occurs in a large burst. In
 * addition, for each of these queues BFQ performs or does not perform
 * idling depending on which choice boosts the throughput more. The
 * exact choice depends on the device and request pattern at
 * hand.
 *
 * Unfortunately, false positives may occur while an interactive task
 * is starting (e.g., an application is being started). The
 * consequence is that the queues associated with the task do not
 * enjoy weight raising as expected. Fortunately these false positives
 * are very rare. They typically occur if some service happens to
 * start doing I/O exactly when the interactive task starts.
 *
 * Turning back to the next function, it implements all the steps
 * needed to detect the occurrence of a large burst and to properly
 * mark all the queues belonging to it (so that they can then be
 * treated in a different way). This goal is achieved by maintaining a
 * "burst list" that holds, temporarily, the queues that belong to the
 * burst in progress. The list is then used to mark these queues as
 * belonging to a large burst if the burst does become large. The main
 * steps are the following.
 *
 * . when the very first queue is created, the queue is inserted into the
 *   list (as it could be the first queue in a possible burst)
 *
 * . if the current burst has not yet become large, and a queue Q that does
 *   not yet belong to the burst is activated shortly after the last time
 *   at which a new queue entered the burst list, then the function appends
 *   Q to the burst list
 *
 * . if, as a consequence of the previous step, the burst size reaches
 *   the large-burst threshold, then
 *
 *     . all the queues in the burst list are marked as belonging to a
 *       large burst
 *
 *     . the burst list is deleted; in fact, the burst list already served
 *       its purpose (keeping temporarily track of the queues in a burst,
 *       so as to be able to mark them as belonging to a large burst in the
 *       previous sub-step), and now is not needed any more
 *
 *     . the device enters a large-burst mode
 *
 * . if a queue Q that does not belong to the burst is created while
 *   the device is in large-burst mode and shortly after the last time
 *   at which a queue either entered the burst list or was marked as
 *   belonging to the current large burst, then Q is immediately marked
 *   as belonging to a large burst.
 *
 * . if a queue Q that does not belong to the burst is created a while
 *   later, i.e., not shortly after the last time at which a queue
 *   either entered the burst list or was marked as belonging to the
 *   current large burst, then the current burst is deemed as finished and:
 *
 *     . the large-burst mode is reset if set
 *
 *     . the burst list is emptied
 *
 *     . Q is inserted in the burst list, as Q may be the first queue
 *       in a possible new burst (then the burst list contains just Q
 *       after this step).
 */
static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * If bfqq is already in the burst list or is part of a large
	 * burst, or finally has just been split, then there is
	 * nothing else to do.
	 */
	if (!hlist_unhashed(&bfqq->burst_list_node) ||
	    bfq_bfqq_in_large_burst(bfqq) ||
	    time_is_after_eq_jiffies(bfqq->split_time +
				     msecs_to_jiffies(10)))
		return;

	/*
	 * If bfqq's creation happens late enough, or bfqq belongs to
	 * a different group than the burst group, then the current
	 * burst is finished, and related data structures must be
	 * reset.
	 *
	 * In this respect, consider the special case where bfqq is
	 * the very first queue created after BFQ is selected for this
	 * device. In this case, last_ins_in_burst and
	 * burst_parent_entity are not yet significant when we get
	 * here. But it is easy to verify that, whether or not the
	 * following condition is true, bfqq will end up being
	 * inserted into the burst list. In particular the list will
	 * happen to contain only bfqq. And this is exactly what has
	 * to happen, as bfqq may be the first queue of the first
	 * burst.
	 */
	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
	    bfqd->bfq_burst_interval) ||
	    bfqq->entity.parent != bfqd->burst_parent_entity) {
		bfqd->large_burst = false;
		bfq_reset_burst_list(bfqd, bfqq);
		goto end;
	}

	/*
	 * If we get here, then bfqq is being activated shortly after the
	 * last queue. So, if the current burst is also large, we can mark
	 * bfqq as belonging to this large burst immediately.
	 */
	if (bfqd->large_burst) {
		bfq_mark_bfqq_in_large_burst(bfqq);
		goto end;
	}

	/*
	 * If we get here, then a large-burst state has not yet been
	 * reached, but bfqq is being activated shortly after the last
	 * queue. Then we add bfqq to the burst.
	 */
	bfq_add_to_burst(bfqd, bfqq);
end:
	/*
	 * At this point, bfqq either has been added to the current
	 * burst or has caused the current burst to terminate and a
	 * possible new burst to start. In particular, in the second
	 * case, bfqq has become the first queue in the possible new
	 * burst. In both cases last_ins_in_burst needs to be moved
	 * forward.
	 */
	bfqd->last_ins_in_burst = jiffies;
}
static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	return entity->budget - entity->service;
}
/*
 * If enough samples have been computed, return the current max budget
 * stored in bfqd, which is dynamically updated according to the
 * estimated disk peak rate; otherwise return the default max budget.
 */
static int bfq_max_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget;
	else
		return bfqd->bfq_max_budget;
}

/*
 * Return min budget, which is a fraction of the current or default
 * max budget (trying with 1/32)
 */
static int bfq_min_budget(struct bfq_data *bfqd)
{
	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
		return bfq_default_max_budget / 32;
	else
		return bfqd->bfq_max_budget / 32;
}
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason);
/*
 * The next function, invoked after the input queue bfqq switches from
 * idle to busy, updates the budget of bfqq. The function also tells
 * whether the in-service queue should be expired, by returning
 * true. The purpose of expiring the in-service queue is to give bfqq
 * the chance to possibly preempt the in-service queue, and the reason
 * for preempting the in-service queue is to achieve one of the two
 * goals below.
 *
 * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
 * expired because it has remained idle. In particular, bfqq may have
 * expired for one of the following two reasons:
 *
 * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
 *   and did not make it to issue a new request before its last
 *   request was served;
 *
 * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
 *   a new request before the expiration of the idling-time.
 *
 * Even if bfqq has expired for one of the above reasons, the process
 * associated with the queue may however be issuing requests greedily,
 * and thus be sensitive to the bandwidth it receives (bfqq may have
 * remained idle for other reasons: CPU high load, bfqq not enjoying
 * idling, I/O throttling somewhere in the path from the process to
 * the I/O scheduler, ...). But if, after every expiration for one of
 * the above two reasons, bfqq has to wait for the service of at least
 * one full budget of another queue before being served again, then
 * bfqq is likely to get a much lower bandwidth or resource time than
 * its reserved ones. To address this issue, two countermeasures need
 * to be taken.
 *
 * First, the budget and the timestamps of bfqq need to be updated in
 * a special way on bfqq reactivation: they need to be updated as if
 * bfqq did not remain idle and did not expire. In fact, if they are
 * computed as if bfqq expired and remained idle until reactivation,
 * then the process associated with bfqq is treated as if, instead of
 * being greedy, it stopped issuing requests when bfqq remained idle,
 * and restarts issuing requests only on this reactivation. In other
 * words, the scheduler does not help the process recover the "service
 * hole" between bfqq expiration and reactivation. As a consequence,
 * the process receives a lower bandwidth than its reserved one. In
 * contrast, to recover this hole, the budget must be updated as if
 * bfqq was not expired at all before this reactivation, i.e., it must
 * be set to the value of the remaining budget when bfqq was
 * expired. Along the same line, timestamps need to be assigned the
 * value they had the last time bfqq was selected for service, i.e.,
 * before last expiration. Thus timestamps need to be back-shifted
 * with respect to their normal computation (see [1] for more details
 * on this tricky aspect).
 *
 * Secondly, to allow the process to recover the hole, the in-service
 * queue must be expired too, to give bfqq the chance to preempt it
 * immediately. In fact, if bfqq has to wait for a full budget of the
 * in-service queue to be completed, then it may become impossible to
 * let the process recover the hole, even if the back-shifted
 * timestamps of bfqq are lower than those of the in-service queue. If
 * this happens for most or all of the holes, then the process may not
 * receive its reserved bandwidth. In this respect, it is worth noting
 * that, since the service of outstanding requests is not preemptible,
 * a little fraction of the holes may however be unrecoverable, thereby
 * causing a little loss of bandwidth.
 *
 * The last important point is detecting whether bfqq does need this
 * bandwidth recovery. In this respect, the next function deems the
 * process associated with bfqq greedy, and thus allows it to recover
 * the hole, if: 1) the process is waiting for the arrival of a new
 * request (which implies that bfqq expired for one of the above two
 * reasons), and 2) such a request has arrived soon. The first
 * condition is controlled through the flag non_blocking_wait_rq,
 * while the second through the flag arrived_in_time. If both
 * conditions hold, then the function computes the budget in the
 * above-described special way, and signals that the in-service queue
 * should be expired. Timestamp back-shifting is done later in
 * __bfq_activate_entity.
 *
 * 2. Reduce latency. Even if timestamps are not backshifted to let
 * the process associated with bfqq recover a service hole, bfqq may
 * however happen to have, after being (re)activated, a lower finish
 * timestamp than the in-service queue. That is, the next budget of
 * bfqq may have to be completed before the one of the in-service
 * queue. If this is the case, then preempting the in-service queue
 * allows this goal to be achieved, apart from the unpreemptible,
 * outstanding requests mentioned above.
 *
 * Unfortunately, regardless of which of the above two goals one wants
 * to achieve, service trees need first to be updated to know whether
 * the in-service queue must be preempted. To have service trees
 * correctly updated, the in-service queue must be expired and
 * rescheduled, and bfqq must be scheduled too. This is one of the
 * most costly operations (in future versions, the scheduling
 * mechanism may be re-designed in such a way to make it possible to
 * know whether preemption is needed without needing to update service
 * trees). In addition, queue preemptions almost always cause random
 * I/O, and thus loss of throughput. Because of these facts, the next
 * function adopts the following simple scheme to avoid both costly
 * operations and too frequent preemptions: it requests the expiration
 * of the in-service queue (unconditionally) only for queues that need
 * to recover a hole, or that either are weight-raised or deserve to
 * be weight-raised.
 */
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
						struct bfq_queue *bfqq,
						bool arrived_in_time,
						bool wr_or_deserves_wr)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
		/*
		 * We do not clear the flag non_blocking_wait_rq here, as
		 * the latter is used in bfq_activate_bfqq to signal
		 * that timestamps need to be back-shifted (and is
		 * cleared right after).
		 */

		/*
		 * In the next assignment we rely on the fact that
		 * neither entity->service nor entity->budget are
		 * updated on expiration if bfqq is empty (see
		 * __bfq_bfqq_recalc_budget). Thus both quantities
		 * remain unchanged after such an expiration, and the
		 * following statement therefore assigns to
		 * entity->budget the remaining budget on such an
		 * expiration. For clarity, entity->service is not
		 * updated on expiration in any case, and, in normal
		 * operation, is reset only when bfqq is selected for
		 * service (see bfq_get_next_queue).
		 */
		entity->budget = min_t(unsigned long,
				       bfq_bfqq_budget_left(bfqq),
				       bfqq->max_budget);

		return true;
	}

	entity->budget = max_t(unsigned long, bfqq->max_budget,
			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
	return wr_or_deserves_wr;
}
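/*
 * Worked example (hypothetical numbers): suppose bfqq was expired while
 * idling with 12000 sectors of budget left, and max_budget == 16384. On a
 * timely reactivation, the first branch above sets budget to
 * min(12000, 16384) = 12000, i.e., exactly the service hole to recover;
 * on a late reactivation, the budget is recomputed from scratch as
 * max(16384, serv_to_charge(next_rq)).
 */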
static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
	u64 dur;

	if (bfqd->bfq_wr_max_time > 0)
		return bfqd->bfq_wr_max_time;

	dur = bfqd->RT_prod;
	do_div(dur, bfqd->peak_rate);

	/*
	 * Limit duration between 3 and 13 seconds. Tests show that
	 * higher values than 13 seconds often yield the opposite of
	 * the desired result, i.e., worsen responsiveness by letting
	 * non-interactive and non-soft-real-time applications
	 * preserve weight raising for a too long time interval.
	 *
	 * On the other end, lower values than 3 seconds make it
	 * difficult for most interactive tasks to complete their jobs
	 * before weight-raising finishes.
	 */
	if (dur > msecs_to_jiffies(13000))
		dur = msecs_to_jiffies(13000);
	else if (dur < msecs_to_jiffies(3000))
		dur = msecs_to_jiffies(3000);

	return dur;
}
*bfqd
,
4811 struct bfq_queue
*bfqq
,
4812 unsigned int old_wr_coeff
,
4813 bool wr_or_deserves_wr
,
4818 if (old_wr_coeff
== 1 && wr_or_deserves_wr
) {
4819 /* start a weight-raising period */
4821 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4822 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4824 bfqq
->wr_start_at_switch_to_srt
= jiffies
;
4825 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4826 BFQ_SOFTRT_WEIGHT_FACTOR
;
4827 bfqq
->wr_cur_max_time
=
4828 bfqd
->bfq_wr_rt_max_time
;
4832 * If needed, further reduce budget to make sure it is
4833 * close to bfqq's backlog, so as to reduce the
4834 * scheduling-error component due to a too large
4835 * budget. Do not care about throughput consequences,
4836 * but only about latency. Finally, do not assign a
4837 * too small budget either, to avoid increasing
4838 * latency by causing too frequent expirations.
4840 bfqq
->entity
.budget
= min_t(unsigned long,
4841 bfqq
->entity
.budget
,
4842 2 * bfq_min_budget(bfqd
));
4843 } else if (old_wr_coeff
> 1) {
4844 if (interactive
) { /* update wr coeff and duration */
4845 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
;
4846 bfqq
->wr_cur_max_time
= bfq_wr_duration(bfqd
);
4847 } else if (in_burst
)
4851 * The application is now or still meeting the
4852 * requirements for being deemed soft rt. We
4853 * can then correctly and safely (re)charge
4854 * the weight-raising duration for the
4855 * application with the weight-raising
4856 * duration for soft rt applications.
4858 * In particular, doing this recharge now, i.e.,
4859 * before the weight-raising period for the
4860 * application finishes, reduces the probability
4861 * of the following negative scenario:
4862 * 1) the weight of a soft rt application is
4863 * raised at startup (as for any newly
4864 * created application),
4865 * 2) since the application is not interactive,
4866 * at a certain time weight-raising is
4867 * stopped for the application,
4868 * 3) at that time the application happens to
4869 * still have pending requests, and hence
4870 * is destined to not have a chance to be
4871 * deemed soft rt before these requests are
4872 * completed (see the comments to the
4873 * function bfq_bfqq_softrt_next_start()
4874 * for details on soft rt detection),
4875 * 4) these pending requests experience a high
4876 * latency because the application is not
4877 * weight-raised while they are pending.
4879 if (bfqq
->wr_cur_max_time
!=
4880 bfqd
->bfq_wr_rt_max_time
) {
4881 bfqq
->wr_start_at_switch_to_srt
=
4882 bfqq
->last_wr_start_finish
;
4884 bfqq
->wr_cur_max_time
=
4885 bfqd
->bfq_wr_rt_max_time
;
4886 bfqq
->wr_coeff
= bfqd
->bfq_wr_coeff
*
4887 BFQ_SOFTRT_WEIGHT_FACTOR
;
4889 bfqq
->last_wr_start_finish
= jiffies
;
static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
					struct bfq_queue *bfqq)
{
	return bfqq->dispatched == 0 &&
		time_is_before_jiffies(
			bfqq->budget_timeout +
			bfqd->bfq_wr_min_idle_time);
}
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
					     struct bfq_queue *bfqq,
					     int old_wr_coeff,
					     struct request *rq,
					     bool *interactive)
{
	bool soft_rt, in_burst, wr_or_deserves_wr,
		bfqq_wants_to_preempt,
		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
		/*
		 * See the comments on
		 * bfq_bfqq_update_budg_for_activation for
		 * details on the usage of the next variable.
		 */
		arrived_in_time =  ktime_get_ns() <=
			bfqq->ttime.last_end_request +
			bfqd->bfq_slice_idle * 3;

	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);

	/*
	 * bfqq deserves to be weight-raised if:
	 * - it is sync,
	 * - it does not belong to a large burst,
	 * - it has been idle for enough time or is soft real-time,
	 * - is linked to a bfq_io_cq (it is not shared in any sense).
	 */
	in_burst = bfq_bfqq_in_large_burst(bfqq);
	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
		!in_burst &&
		time_is_before_jiffies(bfqq->soft_rt_next_start);
	*interactive = !in_burst && idle_for_long_time;
	wr_or_deserves_wr = bfqd->low_latency &&
		(bfqq->wr_coeff > 1 ||
		 (bfq_bfqq_sync(bfqq) &&
		  bfqq->bic && (*interactive || soft_rt)));

	/*
	 * Using the last flag, update budget and check whether bfqq
	 * may want to preempt the in-service queue.
	 */
	bfqq_wants_to_preempt =
		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
						    arrived_in_time,
						    wr_or_deserves_wr);

	/*
	 * If bfqq happened to be activated in a burst, but has been
	 * idle for much more than an interactive queue, then we
	 * assume that, in the overall I/O initiated in the burst, the
	 * I/O associated with bfqq is finished. So bfqq does not need
	 * to be treated as a queue belonging to a burst
	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
	 * if set, and remove bfqq from the burst list if it's
	 * there. We do not decrement burst_size, because the fact
	 * that bfqq does not need to belong to the burst list any
	 * more does not invalidate the fact that bfqq was created in
	 * a burst.
	 */
	if (likely(!bfq_bfqq_just_created(bfqq)) &&
	    idle_for_long_time &&
	    time_is_before_jiffies(
		    bfqq->budget_timeout +
		    msecs_to_jiffies(10000))) {
		hlist_del_init(&bfqq->burst_list_node);
		bfq_clear_bfqq_in_large_burst(bfqq);
	}

	bfq_clear_bfqq_just_created(bfqq);

	if (!bfq_bfqq_IO_bound(bfqq)) {
		if (arrived_in_time) {
			bfqq->requests_within_timer++;
			if (bfqq->requests_within_timer >=
			    bfqd->bfq_requests_within_timer)
				bfq_mark_bfqq_IO_bound(bfqq);
		} else
			bfqq->requests_within_timer = 0;
	}

	if (bfqd->low_latency) {
		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
			/* wraparound */
			bfqq->split_time =
				jiffies - bfqd->bfq_wr_min_idle_time - 1;

		if (time_is_before_jiffies(bfqq->split_time +
					   bfqd->bfq_wr_min_idle_time)) {
			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
							 old_wr_coeff,
							 wr_or_deserves_wr,
							 *interactive,
							 in_burst,
							 soft_rt);

			if (old_wr_coeff != bfqq->wr_coeff)
				bfqq->entity.prio_changed = 1;
		}
	}

	bfqq->last_idle_bklogged = jiffies;
	bfqq->service_from_backlogged = 0;
	bfq_clear_bfqq_softrt_update(bfqq);

	bfq_add_bfqq_busy(bfqd, bfqq);

	/*
	 * Expire in-service queue only if preemption may be needed
	 * for guarantees. In this respect, the function
	 * next_queue_may_preempt just checks a simple, necessary
	 * condition, and not a sufficient condition based on
	 * timestamps. In fact, for the latter condition to be
	 * evaluated, timestamps would need first to be updated, and
	 * this operation is quite costly (see the comments on the
	 * function bfq_bfqq_update_budg_for_activation).
	 */
	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
	    next_queue_may_preempt(bfqd))
		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				false, BFQQE_PREEMPTED);
}
static void bfq_add_request(struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	struct request *next_rq, *prev;
	unsigned int old_wr_coeff = bfqq->wr_coeff;
	bool interactive = false;

	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
	bfqq->queued[rq_is_sync(rq)]++;
	bfqd->queued++;

	elv_rb_add(&bfqq->sort_list, rq);

	/*
	 * Check if this request is a better next-serve candidate.
	 */
	prev = bfqq->next_rq;
	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
	bfqq->next_rq = next_rq;

	/*
	 * Adjust priority tree position, if next_rq changes.
	 */
	if (prev != bfqq->next_rq)
		bfq_pos_tree_add_move(bfqd, bfqq);

	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
						 rq, &interactive);
	else {
		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
		    time_is_before_jiffies(
				bfqq->last_wr_start_finish +
				bfqd->bfq_wr_min_inter_arr_async)) {
			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);

			bfqd->wr_busy_queues++;
			bfqq->entity.prio_changed = 1;
		}
		if (prev != bfqq->next_rq)
			bfq_updated_next_req(bfqd, bfqq);
	}

	/*
	 * Assign jiffies to last_wr_start_finish in the following
	 * cases:
	 *
	 * . if bfqq is not going to be weight-raised, because, for
	 *   non weight-raised queues, last_wr_start_finish stores the
	 *   arrival time of the last request; as of now, this piece
	 *   of information is used only for deciding whether to
	 *   weight-raise async queues
	 *
	 * . if bfqq is not weight-raised, because, if bfqq is now
	 *   switching to weight-raised, then last_wr_start_finish
	 *   stores the time when weight-raising starts
	 *
	 * . if bfqq is interactive, because, regardless of whether
	 *   bfqq is currently weight-raised, the weight-raising
	 *   period must start or restart (this case is considered
	 *   separately because it is not detected by the above
	 *   conditions, if bfqq is already weight-raised)
	 *
	 * last_wr_start_finish has to be updated also if bfqq is soft
	 * real-time, because the weight-raising period is constantly
	 * restarted on idle-to-busy transitions for these queues, but
	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
	 * needed.
	 */
	if (bfqd->low_latency &&
		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
		bfqq->last_wr_start_finish = jiffies;
}
static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
					  struct bio *bio,
					  struct request_queue *q)
{
	struct bfq_queue *bfqq = bfqd->bio_bfqq;

	if (bfqq)
		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));

	return NULL;
}

static sector_t get_sdist(sector_t last_pos, struct request *rq)
{
	if (last_pos)
		return abs(blk_rq_pos(rq) - last_pos);

	return 0;
}
#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
}

static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif
static void bfq_remove_request(struct request_queue *q,
			       struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;
	const int sync = rq_is_sync(rq);

	if (bfqq->next_rq == rq) {
		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
		bfq_updated_next_req(bfqd, bfqq);
	}

	if (rq->queuelist.prev != &rq->queuelist)
		list_del_init(&rq->queuelist);
	bfqq->queued[sync]--;
	bfqd->queued--;
	elv_rb_del(&bfqq->sort_list, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		bfqq->next_rq = NULL;

		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
			bfq_del_bfqq_busy(bfqd, bfqq, false);
			/*
			 * bfqq emptied. In normal operation, when
			 * bfqq is empty, bfqq->entity.service and
			 * bfqq->entity.budget must contain,
			 * respectively, the service received and the
			 * budget used last time bfqq emptied. These
			 * facts do not hold in this case, as at least
			 * this last removal occurred while bfqq is
			 * not in service. To avoid inconsistencies,
			 * reset both bfqq->entity.service and
			 * bfqq->entity.budget, if bfqq has still a
			 * process that may issue I/O requests to it.
			 */
			bfqq->entity.budget = bfqq->entity.service = 0;
		}

		/*
		 * Remove queue from request-position tree as it is empty.
		 */
		if (bfqq->pos_root) {
			rb_erase(&bfqq->pos_node, bfqq->pos_root);
			bfqq->pos_root = NULL;
		}
	}

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending--;

	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}
static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *free = NULL;
	/*
	 * bfq_bic_lookup grabs the queue_lock: invoke it now and
	 * store its return value for later use, to avoid nesting
	 * queue_lock inside the bfqd->lock. We assume that the bic
	 * returned by bfq_bic_lookup does not go away before
	 * bfqd->lock is taken.
	 */
	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
	bool ret;

	spin_lock_irq(&bfqd->lock);

	if (bic)
		bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
	else
		bfqd->bio_bfqq = NULL;
	bfqd->bio_bic = bic;

	ret = blk_mq_sched_try_merge(q, bio, &free);

	if (free)
		blk_mq_free_request(free);
	spin_unlock_irq(&bfqd->lock);

	return ret;
}
static int bfq_request_merge(struct request_queue *q, struct request **req,
			     struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}
static void bfq_request_merged(struct request_queue *q, struct request *req,
			       enum elv_merge type)
{
	if (type == ELEVATOR_FRONT_MERGE &&
	    rb_prev(&req->rb_node) &&
	    blk_rq_pos(req) <
	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
				    struct request, rb_node))) {
		struct bfq_queue *bfqq = RQ_BFQQ(req);
		struct bfq_data *bfqd = bfqq->bfqd;
		struct request *prev, *next_rq;

		/* Reposition request in its sort_list */
		elv_rb_del(&bfqq->sort_list, req);
		elv_rb_add(&bfqq->sort_list, req);

		/* Choose next request to be served for bfqq */
		prev = bfqq->next_rq;
		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
					 bfqd->last_position);
		bfqq->next_rq = next_rq;
		/*
		 * If next_rq changes, update both the queue's budget to
		 * fit the new request and the queue's position in its
		 * rq_pos_tree.
		 */
		if (prev != bfqq->next_rq) {
			bfq_updated_next_req(bfqd, bfqq);
			bfq_pos_tree_add_move(bfqd, bfqq);
		}
	}
}
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
				struct request *next)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

	if (!RB_EMPTY_NODE(&rq->rb_node))
		goto end;
	spin_lock_irq(&bfqq->bfqd->lock);

	/*
	 * If next and rq belong to the same bfq_queue and next is older
	 * than rq, then reposition rq in the fifo (by substituting next
	 * with rq). Otherwise, if next and rq belong to different
	 * bfq_queues, never reposition rq: in fact, we would have to
	 * reposition it with respect to next's position in its own fifo,
	 * which would most certainly be too expensive with respect to
	 * the benefits.
	 */
	if (bfqq == next_bfqq &&
	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    next->fifo_time < rq->fifo_time) {
		list_del_init(&rq->queuelist);
		list_replace_init(&next->queuelist, &rq->queuelist);
		rq->fifo_time = next->fifo_time;
	}

	if (bfqq->next_rq == next)
		bfqq->next_rq = rq;

	bfq_remove_request(q, next);

	spin_unlock_irq(&bfqq->bfqd->lock);
end:
	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}
/* Must be called with bfqq != NULL */
static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
	if (bfq_bfqq_busy(bfqq))
		bfqq->bfqd->wr_busy_queues--;
	bfqq->wr_coeff = 1;
	bfqq->wr_cur_max_time = 0;
	bfqq->last_wr_start_finish = jiffies;
	/*
	 * Trigger a weight change on the next invocation of
	 * __bfq_entity_update_weight_prio.
	 */
	bfqq->entity.prio_changed = 1;
}
static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				    struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			if (bfqg->async_bfqq[i][j])
				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
	if (bfqg->async_idle_bfqq)
		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
}
static void bfq_end_wr(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;

	spin_lock_irq(&bfqd->lock);

	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
		bfq_bfqq_end_wr(bfqq);
	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
		bfq_bfqq_end_wr(bfqq);
	bfq_end_wr_async(bfqd);

	spin_unlock_irq(&bfqd->lock);
}
static sector_t bfq_io_struct_pos(void *io_struct, bool request)
{
	if (request)
		return blk_rq_pos(io_struct);
	else
		return ((struct bio *)io_struct)->bi_iter.bi_sector;
}

static int bfq_rq_close_to_sector(void *io_struct, bool request,
				  sector_t sector)
{
	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
	       BFQQ_SEEK_THR;
}
static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
					 struct bfq_queue *bfqq,
					 sector_t sector)
{
	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
	struct rb_node *parent, *node;
	struct bfq_queue *__bfqq;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
	if (__bfqq)
		return __bfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector (rq_pos_tree sorted by
	 * next_request position).
	 */
	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
		return __bfqq;

	if (blk_rq_pos(__bfqq->next_rq) < sector)
		node = rb_next(&__bfqq->pos_node);
	else
		node = rb_prev(&__bfqq->pos_node);
	if (!node)
		return NULL;

	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
		return __bfqq;

	return NULL;
}
static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
						   struct bfq_queue *cur_bfqq,
						   sector_t sector)
{
	struct bfq_queue *bfqq;

	/*
	 * We shall notice if some of the queues are cooperating,
	 * e.g., working closely on the same area of the device. In
	 * that case, we can group them together and: 1) don't waste
	 * time idling, and 2) serve the union of their requests in
	 * the best possible order for throughput.
	 */
	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
	if (!bfqq || bfqq == cur_bfqq)
		return NULL;

	return bfqq;
}
static struct bfq_queue *
bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
	int process_refs, new_process_refs;
	struct bfq_queue *__bfqq;

	/*
	 * If there are no process references on the new_bfqq, then it is
	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
	 * may have dropped their last reference (not just their last process
	 * reference).
	 */
	if (!bfqq_process_refs(new_bfqq))
		return NULL;

	/* Avoid a circular list and skip interim queue merges. */
	while ((__bfqq = new_bfqq->new_bfqq)) {
		if (__bfqq == bfqq)
			return NULL;
		new_bfqq = __bfqq;
	}

	process_refs = bfqq_process_refs(bfqq);
	new_process_refs = bfqq_process_refs(new_bfqq);
	/*
	 * If the process for the bfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return NULL;

	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
		new_bfqq->pid);

	/*
	 * Merging is just a redirection: the requests of the process
	 * owning one of the two queues are redirected to the other queue.
	 * The latter queue, in its turn, is set as shared if this is the
	 * first time that the requests of some process are redirected to
	 * it.
	 *
	 * We redirect bfqq to new_bfqq and not the opposite, because we
	 * are in the context of the process owning bfqq, hence we have
	 * the io_cq of this process. So we can immediately configure this
	 * io_cq to redirect the requests of the process to new_bfqq.
	 *
	 * NOTE, even if new_bfqq coincides with the in-service queue, the
	 * io_cq of new_bfqq is not available, because, if the in-service
	 * queue is shared, bfqd->in_service_bic may not point to the
	 * io_cq of the in-service queue.
	 * Redirecting the requests of the process owning bfqq to the
	 * currently in-service queue is in any case the best option, as
	 * we feed the in-service queue with new requests close to the
	 * last request served and, by doing so, hopefully increase the
	 * throughput.
	 */
	bfqq->new_bfqq = new_bfqq;
	new_bfqq->ref += process_refs;
	return new_bfqq;
}
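/*
 * Illustrative note (not from the original source): if Q1 was
 * already scheduled for merging into Q2 (Q1->new_bfqq == Q2), and a
 * new queue Q0 is now set up to merge with Q1, the loop above walks
 * the ->new_bfqq chain so that Q0 is redirected directly to Q2,
 * i.e., to the final queue of the chain, skipping the interim merge
 * with Q1.
 */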
static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
					struct bfq_queue *new_bfqq)
{
	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
		return false;

	/*
	 * If either of the queues has already been detected as seeky,
	 * then merging it with the other queue is unlikely to lead to
	 * sequential I/O.
	 */
	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
		return false;

	/*
	 * Interleaved I/O is known to be done by (some) applications
	 * only for reads, so it does not make sense to merge async
	 * queues.
	 */
	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
		return false;

	return true;
}
/*
 * If this function returns true, then bfqq cannot be merged. The idea
 * is that true cooperation happens very early after processes start
 * to do I/O. Usually, late cooperations are just accidental false
 * positives. In case bfqq is weight-raised, such false positives
 * would evidently degrade latency guarantees for bfqq.
 */
static bool wr_from_too_long(struct bfq_queue *bfqq)
{
	return bfqq->wr_coeff > 1 &&
		time_is_before_jiffies(bfqq->last_wr_start_finish +
				       msecs_to_jiffies(100));
}
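/*
 * Illustrative note (not from the original source): a queue whose
 * weight-raising period started, say, 50 ms ago may still be merged,
 * while one that has been weight-raised for more than 100 ms is
 * filtered out, on the assumption that genuinely cooperating
 * processes are detected soon after they start doing I/O.
 */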
/*
 * Attempt to schedule a merge of bfqq with the currently in-service
 * queue or with a close queue among the scheduled queues. Return
 * NULL if no merge was scheduled, a pointer to the shared bfq_queue
 * structure otherwise.
 *
 * The OOM queue is not allowed to participate to cooperation: in fact, since
 * the requests temporarily redirected to the OOM queue could be redirected
 * again to dedicated queues at any time, the state needed to correctly
 * handle merging with the OOM queue would be quite complex and expensive
 * to maintain. Besides, in such a critical condition as an out of memory,
 * the benefits of queue merging may be little relevant, or even negligible.
 *
 * Weight-raised queues can be merged only if their weight-raising
 * period has just started. In fact cooperating processes are usually
 * started together. Thus, with this filter we avoid false positives
 * that would jeopardize low-latency guarantees.
 *
 * WARNING: queue merging may impair fairness among non-weight raised
 * queues, for at least two reasons: 1) the original weight of a
 * merged queue may change during the merged state, 2) even being the
 * weight the same, a merged queue may be bloated with many more
 * requests than the ones produced by its originally-associated
 * process.
 */
static struct bfq_queue *
bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     void *io_struct, bool request)
{
	struct bfq_queue *in_service_bfqq, *new_bfqq;

	if (bfqq->new_bfqq)
		return bfqq->new_bfqq;

	if (!io_struct ||
	    wr_from_too_long(bfqq) ||
	    unlikely(bfqq == &bfqd->oom_bfqq))
		return NULL;

	/* If there is only one backlogged queue, don't search. */
	if (bfqd->busy_queues == 1)
		return NULL;

	in_service_bfqq = bfqd->in_service_queue;

	if (!in_service_bfqq || in_service_bfqq == bfqq ||
	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
		goto check_scheduled;

	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
		if (new_bfqq)
			return new_bfqq;
	}
	/*
	 * Check whether there is a cooperator among currently scheduled
	 * queues. The only thing we need is that the bio/request is not
	 * NULL, as we need it to establish whether a cooperator exists.
	 */
check_scheduled:
	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
					     bfq_io_struct_pos(io_struct, request));

	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
	    likely(new_bfqq != &bfqd->oom_bfqq) &&
	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
		return bfq_setup_merge(bfqq, new_bfqq);

	return NULL;
}
static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
	struct bfq_io_cq *bic = bfqq->bic;

	/*
	 * If !bfqq->bic, the queue is already shared or its requests
	 * have already been redirected to a shared queue; both idle window
	 * and weight raising state have already been saved. Do nothing.
	 */
	if (!bic)
		return;

	bic->saved_ttime = bfqq->ttime;
	bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
	bic->saved_wr_coeff = bfqq->wr_coeff;
	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
static void bfq_get_bic_reference(struct bfq_queue *bfqq)
{
	/*
	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
	 * is about to begin using a shared bfq_queue.
	 */
	if (bfqq->bic)
		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
}
static void
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
{
	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
		(unsigned long)new_bfqq->pid);
	/* Save weight raising and idle window of the merged queues */
	bfq_bfqq_save_state(bfqq);
	bfq_bfqq_save_state(new_bfqq);
	if (bfq_bfqq_IO_bound(bfqq))
		bfq_mark_bfqq_IO_bound(new_bfqq);
	bfq_clear_bfqq_IO_bound(bfqq);

	/*
	 * If bfqq is weight-raised, then let new_bfqq inherit
	 * weight-raising. To reduce false positives, neglect the case
	 * where bfqq has just been created, but has not yet made it
	 * to be weight-raised (which may happen because EQM may merge
	 * bfqq even before bfq_add_request is executed for the first
	 * time for bfqq). Handling this case would however be very
	 * easy, thanks to the flag just_created.
	 */
	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
		new_bfqq->wr_coeff = bfqq->wr_coeff;
		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
		new_bfqq->wr_start_at_switch_to_srt =
			bfqq->wr_start_at_switch_to_srt;
		if (bfq_bfqq_busy(new_bfqq))
			bfqd->wr_busy_queues++;
		new_bfqq->entity.prio_changed = 1;
	}

	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
		bfqq->wr_coeff = 1;
		bfqq->entity.prio_changed = 1;
		if (bfq_bfqq_busy(bfqq))
			bfqd->wr_busy_queues--;
	}

	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
		     bfqd->wr_busy_queues);

	/*
	 * Grab a reference to the bic, to prevent it from being destroyed
	 * before being possibly touched by a bfq_split_bfqq().
	 */
	bfq_get_bic_reference(bfqq);
	bfq_get_bic_reference(new_bfqq);
	/*
	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
	 */
	bic_set_bfqq(bic, new_bfqq, 1);
	bfq_mark_bfqq_coop(new_bfqq);
	/*
	 * new_bfqq now belongs to at least two bics (it is a shared queue):
	 * set new_bfqq->bic to NULL. bfqq either:
	 * - does not belong to any bic any more, and hence bfqq->bic must
	 *   be set to NULL, or
	 * - is a queue whose owning bics have already been redirected to a
	 *   different queue, hence the queue is destined to not belong to
	 *   any bic soon and bfqq->bic is already NULL (therefore the next
	 *   assignment causes no harm).
	 */
	new_bfqq->bic = NULL;
	bfqq->bic = NULL;
	/* release process reference to bfqq */
	bfq_put_queue(bfqq);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
				struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	bool is_sync = op_is_sync(bio->bi_opf);
	struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (is_sync && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the bfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	if (!bfqq)
		return false;

	/*
	 * We take advantage of this function to perform an early merge
	 * of the queues of possible cooperating processes.
	 */
	new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
	if (new_bfqq) {
		/*
		 * bic still points to bfqq, then it has not yet been
		 * redirected to some other bfq_queue, and a queue
		 * merge between bfqq and new_bfqq can be safely
		 * fulfilled, i.e., bic can be redirected to new_bfqq
		 * and bfqq can be put.
		 */
		bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
				new_bfqq);
		/*
		 * If we get here, bio will be queued into new_queue,
		 * so use new_bfqq to decide whether bio and rq can be
		 * merged.
		 */
		bfqq = new_bfqq;

		/*
		 * Change also bfqd->bio_bfqq, as
		 * bfqd->bio_bic now points to new_bfqq, and
		 * this function may be invoked again (and then may
		 * use again bfqd->bio_bfqq).
		 */
		bfqd->bio_bfqq = bfqq;
	}

	return bfqq == RQ_BFQQ(rq);
}
/*
 * Set the maximum time for the in-service queue to consume its
 * budget. This prevents seeky processes from lowering the throughput.
 * In practice, a time-slice service scheme is used with seeky
 * processes.
 */
static void bfq_set_budget_timeout(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	unsigned int timeout_coeff;

	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
		timeout_coeff = 1;
	else
		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;

	bfqd->last_budget_start = ktime_get();

	bfqq->budget_timeout = jiffies +
		bfqd->bfq_timeout * timeout_coeff;
}
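/*
 * Illustrative example (not from the original source): for a
 * non-soft-real-time queue whose weight has been raised to twice its
 * original weight, timeout_coeff evaluates to 2, so the queue is
 * granted up to 2 * bfqd->bfq_timeout jiffies to consume its budget
 * before the budget timeout fires.
 */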
static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				       struct bfq_queue *bfqq)
{
	if (bfqq) {
		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
		bfq_clear_bfqq_fifo_expire(bfqq);

		bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;

		if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
		    bfqq->wr_coeff > 1 &&
		    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
		    time_is_before_jiffies(bfqq->budget_timeout)) {
			/*
			 * For soft real-time queues, move the start
			 * of the weight-raising period forward by the
			 * time the queue has not received any
			 * service. Otherwise, a relatively long
			 * service delay is likely to cause the
			 * weight-raising period of the queue to end,
			 * because of the short duration of the
			 * weight-raising period of a soft real-time
			 * queue. It is worth noting that this move
			 * is not so dangerous for the other queues,
			 * because soft real-time queues are not
			 * greedy.
			 *
			 * To not add a further variable, we use the
			 * overloaded field budget_timeout to
			 * determine for how long the queue has not
			 * received service, i.e., how much time has
			 * elapsed since the queue expired. However,
			 * this is a little imprecise, because
			 * budget_timeout is set to jiffies if bfqq
			 * not only expires, but also remains with no
			 * request.
			 */
			if (time_after(bfqq->budget_timeout,
				       bfqq->last_wr_start_finish))
				bfqq->last_wr_start_finish +=
					jiffies - bfqq->budget_timeout;
			else
				bfqq->last_wr_start_finish = jiffies;
		}

		bfq_set_budget_timeout(bfqd, bfqq);
		bfq_log_bfqq(bfqd, bfqq,
			     "set_in_service_queue, cur-budget = %d",
			     bfqq->entity.budget);
	}

	bfqd->in_service_queue = bfqq;
}
/*
 * Get and set a new queue for service.
 */
static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);

	__bfq_set_in_service_queue(bfqd, bfqq);
	return bfqq;
}
static void bfq_arm_slice_timer(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq = bfqd->in_service_queue;
	struct bfq_io_cq *bic;
	u32 sl;

	/* Processes have exited, don't wait. */
	bic = bfqd->in_service_bic;
	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
		return;

	bfq_mark_bfqq_wait_request(bfqq);

	/*
	 * We don't want to idle for seeks, but we do want to allow
	 * fair distribution of slice time for a process doing back-to-back
	 * seeks. So allow a little bit of time for him to submit a new rq.
	 */
	sl = bfqd->bfq_slice_idle;
	/*
	 * Unless the queue is being weight-raised or the scenario is
	 * asymmetric, grant only minimum idle time if the queue
	 * is seeky. A long idling is preserved for a weight-raised
	 * queue, or, more in general, in an asymmetric scenario,
	 * because a long idling is needed for guaranteeing to a queue
	 * its reserved share of the throughput (in particular, it is
	 * needed if the queue has a higher weight than some other
	 * queue).
	 */
	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
	    bfq_symmetric_scenario(bfqd))
		sl = min_t(u64, sl, BFQ_MIN_TT);

	bfqd->last_idling_start = ktime_get();
	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
		      HRTIMER_MODE_REL);
	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
}
/*
 * In autotuning mode, max_budget is dynamically recomputed as the
 * amount of sectors transferred in timeout at the estimated peak
 * rate. This enables BFQ to utilize a full timeslice with a full
 * budget, even if the in-service queue is served at peak rate. And
 * this maximises throughput with sequential workloads.
 */
static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
{
	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
}
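/*
 * Illustrative example (not from the original source, with made-up
 * numbers): peak_rate is kept in sectors/usec, left-shifted by
 * BFQ_RATE_SHIFT. For an estimated rate of 1 sector/usec (~512 MB/s
 * with 512-byte sectors) and a 125-ms bfq_timeout, the formula above
 * yields about 1 * 1000 * 125 = 125000 sectors of budget, i.e., the
 * amount the device can transfer in one full timeout at the
 * estimated peak rate.
 */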
/*
 * Update parameters related to throughput and responsiveness, as a
 * function of the estimated peak rate. See comments on
 * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
 */
static void update_thr_responsiveness_params(struct bfq_data *bfqd)
{
	int dev_type = blk_queue_nonrot(bfqd->queue);

	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget =
			bfq_calc_max_budget(bfqd);

	if (bfqd->device_speed == BFQ_BFQD_FAST &&
	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_SLOW;
		bfqd->RT_prod = R_slow[dev_type] *
			T_slow[dev_type];
	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
		bfqd->device_speed = BFQ_BFQD_FAST;
		bfqd->RT_prod = R_fast[dev_type] *
			T_fast[dev_type];
	}

	bfq_log(bfqd,
		"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
		dev_type == 0 ? "ROT" : "NONROT",
		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
		bfqd->device_speed == BFQ_BFQD_FAST ?
		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
		BFQ_RATE_SHIFT);
}
static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				       struct request *rq)
{
	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
		bfqd->peak_rate_samples = 1;
		bfqd->sequential_samples = 0;
		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
			blk_rq_sectors(rq);
	} else /* no new rq dispatched, just reset the number of samples */
		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */

	bfq_log(bfqd,
		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
		bfqd->peak_rate_samples, bfqd->sequential_samples,
		bfqd->tot_sectors_dispatched);
}
static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
{
	u32 rate, weight, divisor;

	/*
	 * For the convergence property to hold (see comments on
	 * bfq_update_peak_rate()) and for the assessment to be
	 * reliable, a minimum number of samples must be present, and
	 * a minimum amount of time must have elapsed. If not so, do
	 * not compute new rate. Just reset parameters, to get ready
	 * for a new evaluation attempt.
	 */
	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
		goto reset_computation;

	/*
	 * If a new request completion has occurred after last
	 * dispatch, then, to approximate the rate at which requests
	 * have been served by the device, it is more precise to
	 * extend the observation interval to the last completion.
	 */
	bfqd->delta_from_first =
		max_t(u64, bfqd->delta_from_first,
		      bfqd->last_completion - bfqd->first_dispatch);

	/*
	 * Rate computed in sects/usec, and not sects/nsec, for
	 * precision issues.
	 */
	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));

	/*
	 * Peak rate not updated if:
	 * - the percentage of sequential dispatches is below 3/4 of the
	 *   total, and rate is below the current estimated peak rate
	 * - rate is unreasonably high (> 20M sectors/sec)
	 */
	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
	     rate <= bfqd->peak_rate) ||
		rate > 20<<BFQ_RATE_SHIFT)
		goto reset_computation;

	/*
	 * We have to update the peak rate, at last! To this purpose,
	 * we use a low-pass filter. We compute the smoothing constant
	 * of the filter as a function of the 'weight' of the new
	 * measured rate.
	 *
	 * As can be seen in next formulas, we define this weight as a
	 * quantity proportional to how sequential the workload is,
	 * and to how long the observation time interval is.
	 *
	 * The weight runs from 0 to 8. The maximum value of the
	 * weight, 8, yields the minimum value for the smoothing
	 * constant. At this minimum value for the smoothing constant,
	 * the measured rate contributes for half of the next value of
	 * the estimated peak rate.
	 *
	 * So, the first step is to compute the weight as a function
	 * of how sequential the workload is. Note that the weight
	 * cannot reach 9, because bfqd->sequential_samples cannot
	 * become equal to bfqd->peak_rate_samples, which, in its
	 * turn, holds true because bfqd->sequential_samples is not
	 * incremented for the first sample.
	 */
	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;

	/*
	 * Second step: further refine the weight as a function of the
	 * duration of the observation interval.
	 */
	weight = min_t(u32, 8,
		       div_u64(weight * bfqd->delta_from_first,
			       BFQ_RATE_REF_INTERVAL));

	/*
	 * Divisor ranging from 10, for minimum weight, to 2, for
	 * maximum weight.
	 */
	divisor = 10 - weight;

	/*
	 * Finally, update peak rate:
	 *
	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
	 */
	bfqd->peak_rate *= divisor-1;
	bfqd->peak_rate /= divisor;
	rate /= divisor; /* smoothing constant alpha = 1/divisor */

	bfqd->peak_rate += rate;
	update_thr_responsiveness_params(bfqd);

reset_computation:
	bfq_reset_rate_computation(bfqd, rq);
}
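/*
 * Worked example (not from the original source): for a fully
 * sequential workload observed over at least BFQ_RATE_REF_INTERVAL,
 * the weight saturates at 8, so divisor = 2 and the update becomes
 * peak_rate = peak_rate/2 + rate/2, i.e., the new measurement
 * contributes half of the next estimate. For a mostly random
 * workload with weight 0, divisor = 10 and the new measurement
 * contributes only one tenth.
 */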
/*
 * Update the read/write peak rate (the main quantity used for
 * auto-tuning, see update_thr_responsiveness_params()).
 *
 * It is not trivial to estimate the peak rate (correctly): because of
 * the presence of sw and hw queues between the scheduler and the
 * device components that finally serve I/O requests, it is hard to
 * say exactly when a given dispatched request is served inside the
 * device, and for how long. As a consequence, it is hard to know
 * precisely at what rate a given set of requests is actually served
 * by the device.
 *
 * On the opposite end, the dispatch time of any request is trivially
 * available, and, from this piece of information, the "dispatch rate"
 * of requests can be immediately computed. So, the idea in the next
 * function is to use what is known, namely request dispatch times
 * (plus, when useful, request completion times), to estimate what is
 * unknown, namely in-device request service rate.
 *
 * The main issue is that, because of the above facts, the rate at
 * which a certain set of requests is dispatched over a certain time
 * interval can vary greatly with respect to the rate at which the
 * same requests are then served. But, since the size of any
 * intermediate queue is limited, and the service scheme is lossless
 * (no request is silently dropped), the following obvious convergence
 * property holds: the number of requests dispatched MUST become
 * closer and closer to the number of requests completed as the
 * observation interval grows. This is the key property used in
 * the next function to estimate the peak service rate as a function
 * of the observed dispatch rate. The function assumes to be invoked
 * on every request dispatch.
 */
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
	u64 now_ns = ktime_get_ns();

	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
		bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
			bfqd->peak_rate_samples);
		bfq_reset_rate_computation(bfqd, rq);
		goto update_last_values; /* will add one sample */
	}

	/*
	 * Device idle for very long: the observation interval lasting
	 * up to this dispatch cannot be a valid observation interval
	 * for computing a new peak rate (similarly to the late-
	 * completion event in bfq_completed_request()). Go to
	 * update_rate_and_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - start a new observation interval with this dispatch
	 */
	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
	    bfqd->rq_in_driver == 0)
		goto update_rate_and_reset;

	/* Update sampling information */
	bfqd->peak_rate_samples++;

	if ((bfqd->rq_in_driver > 0 ||
		now_ns - bfqd->last_completion < BFQ_MIN_TT)
	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
		bfqd->sequential_samples++;

	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);

	/* Reset max observed rq size every 32 dispatches */
	if (likely(bfqd->peak_rate_samples % 32))
		bfqd->last_rq_max_size =
			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
	else
		bfqd->last_rq_max_size = blk_rq_sectors(rq);

	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;

	/* Target observation interval not yet reached, go on sampling */
	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
		goto update_last_values;

update_rate_and_reset:
	bfq_update_rate_reset(bfqd, rq);
update_last_values:
	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
	bfqd->last_dispatch = now_ns;
}
/*
 * Remove request from internal lists.
 */
static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);

	/*
	 * For consistency, the next instruction should have been
	 * executed after removing the request from the queue and
	 * dispatching it. We execute instead this instruction before
	 * bfq_remove_request() (and hence introduce a temporary
	 * inconsistency), for efficiency. In fact, should this
	 * dispatch occur for a non in-service bfqq, this anticipated
	 * increment prevents two counters related to bfqq->dispatched
	 * from risking to be, first, uselessly decremented, and then
	 * incremented again when the (new) value of bfqq->dispatched
	 * happens to be taken into account.
	 */
	bfqq->dispatched++;
	bfq_update_peak_rate(q->elevator->elevator_data, rq);

	bfq_remove_request(q, rq);
}
static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	/*
	 * If this bfqq is shared between multiple processes, check
	 * to make sure that those processes are still issuing I/Os
	 * within the mean seek distance. If not, it may be time to
	 * break the queues apart again.
	 */
	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
		bfq_mark_bfqq_split_coop(bfqq);

	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
		if (bfqq->dispatched == 0)
			/*
			 * Overloading budget_timeout field to store
			 * the time at which the queue remains with no
			 * backlog and no outstanding request; used by
			 * the weight-raising mechanism.
			 */
			bfqq->budget_timeout = jiffies;

		bfq_del_bfqq_busy(bfqd, bfqq, true);
	} else {
		bfq_requeue_bfqq(bfqd, bfqq);
		/*
		 * Resort priority tree of potential close cooperators.
		 */
		bfq_pos_tree_add_move(bfqd, bfqq);
	}

	/*
	 * All in-service entities must have been properly deactivated
	 * or requeued before executing the next function, which
	 * resets all in-service entities as no more in service.
	 */
	__bfq_bfqd_reset_in_service(bfqd);
}
/**
 * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
 * @bfqd: device data.
 * @bfqq: queue to update.
 * @reason: reason for expiration.
 *
 * Handle the feedback on @bfqq budget at queue expiration.
 * See the body for detailed comments.
 */
static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				     struct bfq_queue *bfqq,
				     enum bfqq_expiration reason)
{
	struct request *next_rq;
	int budget, min_budget;

	min_budget = bfq_min_budget(bfqd);

	if (bfqq->wr_coeff == 1)
		budget = bfqq->max_budget;
	else /*
	      * Use a constant, low budget for weight-raised queues,
	      * to help achieve a low latency. Keep it slightly higher
	      * than the minimum possible budget, to cause a little
	      * bit fewer expirations.
	      */
		budget = 2 * min_budget;

	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
		budget, bfq_min_budget(bfqd));
	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));

	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
		switch (reason) {
			/*
			 * Caveat: in all the following cases we trade latency
			 * for throughput.
			 */
		case BFQQE_TOO_IDLE:
			/*
			 * This is the only case where we may reduce
			 * the budget: if there is no request of the
			 * process still waiting for completion, then
			 * we assume (tentatively) that the timer has
			 * expired because the batch of requests of
			 * the process could have been served with a
			 * smaller budget. Hence, betting that
			 * process will behave in the same way when it
			 * becomes backlogged again, we reduce its
			 * next budget. As long as we guess right,
			 * this budget cut reduces the latency
			 * experienced by the process.
			 *
			 * However, if there are still outstanding
			 * requests, then the process may have not yet
			 * issued its next request just because it is
			 * still waiting for the completion of some of
			 * the still outstanding ones. So in this
			 * subcase we do not reduce its budget, on the
			 * contrary we increase it to possibly boost
			 * the throughput, as discussed in the
			 * comments to the BUDGET_TIMEOUT case.
			 */
			if (bfqq->dispatched > 0) /* still outstanding reqs */
				budget = min(budget * 2, bfqd->bfq_max_budget);
			else {
				if (budget > 5 * min_budget)
					budget -= 4 * min_budget;
				else
					budget = min_budget;
			}
			break;
		case BFQQE_BUDGET_TIMEOUT:
			/*
			 * We double the budget here because it gives
			 * the chance to boost the throughput if this
			 * is not a seeky process (and has bumped into
			 * this timeout because of, e.g., ZBR).
			 */
			budget = min(budget * 2, bfqd->bfq_max_budget);
			break;
		case BFQQE_BUDGET_EXHAUSTED:
			/*
			 * The process still has backlog, and did not
			 * let either the budget timeout or the disk
			 * idling timeout expire. Hence it is not
			 * seeky, has a short thinktime and may be
			 * happy with a higher budget too. So
			 * definitely increase the budget of this good
			 * candidate to boost the disk throughput.
			 */
			budget = min(budget * 4, bfqd->bfq_max_budget);
			break;
		case BFQQE_NO_MORE_REQUESTS:
			/*
			 * For queues that expire for this reason, it
			 * is particularly important to keep the
			 * budget close to the actual service they
			 * need. Doing so reduces the timestamp
			 * misalignment problem described in the
			 * comments in the body of
			 * __bfq_activate_entity. In fact, suppose
			 * that a queue systematically expires for
			 * BFQQE_NO_MORE_REQUESTS and presents a
			 * new request in time to enjoy timestamp
			 * back-shifting. The larger the budget of the
			 * queue is with respect to the service the
			 * queue actually requests in each service
			 * slot, the more times the queue can be
			 * reactivated with the same virtual finish
			 * time. It follows that, even if this finish
			 * time is pushed to the system virtual time
			 * to reduce the consequent timestamp
			 * misalignment, the queue unjustly enjoys for
			 * many re-activations a lower finish time
			 * than all newly activated queues.
			 *
			 * The service needed by bfqq is measured
			 * quite precisely by bfqq->entity.service.
			 * Since bfqq does not enjoy device idling,
			 * bfqq->entity.service is equal to the number
			 * of sectors that the process associated with
			 * bfqq requested to read/write before waiting
			 * for request completions, or blocking for
			 * other reasons.
			 */
			budget = max_t(int, bfqq->entity.service, min_budget);
			break;
		default:
			return;
		}
	} else if (!bfq_bfqq_sync(bfqq)) {
		/*
		 * Async queues get always the maximum possible
		 * budget, as for them we do not care about latency
		 * (in addition, their ability to dispatch is limited
		 * by the charging factor).
		 */
		budget = bfqd->bfq_max_budget;
	}

	bfqq->max_budget = budget;

	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
	    !bfqd->bfq_user_max_budget)
		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);

	/*
	 * If there is still backlog, then assign a new budget, making
	 * sure that it is large enough for the next request. Since
	 * the finish time of bfqq must be kept in sync with the
	 * budget, be sure to call __bfq_bfqq_expire() *after* this
	 * update.
	 *
	 * If there is no backlog, then no need to update the budget;
	 * it will be updated on the arrival of a new request.
	 */
	next_rq = bfqq->next_rq;
	if (next_rq)
		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
					    bfq_serv_to_charge(next_rq, bfqq));

	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
		next_rq ? blk_rq_sectors(next_rq) : 0,
		bfqq->entity.budget);
}
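/*
 * Illustrative example (not from the original source): a sync,
 * non-weight-raised queue that keeps expiring for BFQQE_TOO_IDLE
 * with no requests in flight sees its budget cut by 4 * min_budget
 * at each expiration (down to min_budget), whereas one that keeps
 * exhausting its budget has it quadrupled at each expiration, up to
 * bfqd->bfq_max_budget.
 */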
/*
 * Return true if the process associated with bfqq is "slow". The slow
 * flag is used, in addition to the budget timeout, to reduce the
 * amount of service provided to seeky processes, and thus reduce
 * their chances to lower the throughput. More details in the comments
 * on the function bfq_bfqq_expire().
 *
 * An important observation is in order: as discussed in the comments
 * on the function bfq_update_peak_rate(), with devices with internal
 * queues, it is hard if ever possible to know when and for how long
 * an I/O request is processed by the device (apart from the trivial
 * I/O pattern where a new request is dispatched only after the
 * previous one has been completed). This makes it hard to evaluate
 * the real rate at which the I/O requests of each bfq_queue are
 * served. In fact, for an I/O scheduler like BFQ, serving a
 * bfq_queue means just dispatching its requests during its service
 * slot (i.e., until the budget of the queue is exhausted, or the
 * queue remains idle, or, finally, a timeout fires). But, during the
 * service slot of a bfq_queue, around 100 ms at most, the device may
 * be even still processing requests of bfq_queues served in previous
 * service slots. On the opposite end, the requests of the in-service
 * bfq_queue may be completed after the service slot of the queue
 * finishes.
 *
 * Anyway, unless more sophisticated solutions are used
 * (where possible), the sum of the sizes of the requests dispatched
 * during the service slot of a bfq_queue is probably the only
 * approximation available for the service received by the bfq_queue
 * during its service slot. And this sum is the quantity used in this
 * function to evaluate the I/O speed of a process.
 */
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			     bool compensate, enum bfqq_expiration reason,
			     unsigned long *delta_ms)
{
	ktime_t delta_ktime;
	u32 delta_usecs;
	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */

	if (!bfq_bfqq_sync(bfqq))
		return false;

	if (compensate)
		delta_ktime = bfqd->last_idling_start;
	else
		delta_ktime = ktime_get();
	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
	delta_usecs = ktime_to_us(delta_ktime);

	/* don't use too short time intervals */
	if (delta_usecs < 1000) {
		if (blk_queue_nonrot(bfqd->queue))
			/*
			 * give same worst-case guarantees as idling
			 * for seeky
			 */
			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
		else /* charge at least one seek */
			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;

		return slow;
	}

	*delta_ms = delta_usecs / USEC_PER_MSEC;

	/*
	 * Use only long (> 20ms) intervals to filter out excessive
	 * spikes in service rate estimation.
	 */
	if (delta_usecs > 20000) {
		/*
		 * Caveat for rotational devices: processes doing I/O
		 * in the slower disk zones tend to be slow(er) even
		 * if not seeky. In this respect, the estimated peak
		 * rate is likely to be an average over the disk
		 * surface. Accordingly, to not be too harsh with
		 * unlucky processes, a process is deemed slow only if
		 * its rate has been lower than half of the estimated
		 * peak rate.
		 */
		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
	}

	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);

	return slow;
}
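/*
 * Illustrative example (not from the original source, with made-up
 * numbers): if the queue was in service for 25 ms (a long enough
 * interval, > 20 ms) and consumed only 2000 sectors of service while
 * bfqd->bfq_max_budget is 16384, then 2000 < 16384/2 and the process
 * is deemed slow; with, say, 10000 sectors of service it would not
 * be.
 */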
/*
 * To be deemed as soft real-time, an application must meet two
 * requirements. First, the application must not require an average
 * bandwidth higher than the approximate bandwidth required to playback or
 * record a compressed high-definition video.
 * The next function is invoked on the completion of the last request of a
 * batch, to compute the next-start time instant, soft_rt_next_start, such
 * that, if the next request of the application does not arrive before
 * soft_rt_next_start, then the above requirement on the bandwidth is met.
 *
 * The second requirement is that the request pattern of the application is
 * isochronous, i.e., that, after issuing a request or a batch of requests,
 * the application stops issuing new requests until all its pending requests
 * have been completed. After that, the application may issue a new batch,
 * and so on.
 * For this reason the next function is invoked to compute
 * soft_rt_next_start only for applications that meet this requirement,
 * whereas soft_rt_next_start is set to infinity for applications that do
 * not.
 *
 * Unfortunately, even a greedy application may happen to behave in an
 * isochronous way if the CPU load is high. In fact, the application may
 * stop issuing requests while the CPUs are busy serving other processes,
 * then restart, then stop again for a while, and so on. In addition, if
 * the disk achieves a low enough throughput with the request pattern
 * issued by the application (e.g., because the request pattern is random
 * and/or the device is slow), then the application may meet the above
 * bandwidth requirement too. To prevent such a greedy application to be
 * deemed as soft real-time, a further rule is used in the computation of
 * soft_rt_next_start: soft_rt_next_start must be higher than the current
 * time plus the maximum time for which the arrival of a request is waited
 * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
 * This filters out greedy applications, as the latter issue instead their
 * next request as soon as possible after the last one has been completed
 * (in contrast, when a batch of requests is completed, a soft real-time
 * application spends some time processing data).
 *
 * Unfortunately, the last filter may easily generate false positives if
 * only bfqd->bfq_slice_idle is used as a reference time interval and one
 * or both the following cases occur:
 * 1) HZ is so low that the duration of a jiffy is comparable to or higher
 *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
 *    HZ=100.
 * 2) jiffies, instead of increasing at a constant rate, may stop increasing
 *    for a while, then suddenly 'jump' by several units to recover the lost
 *    increments. This seems to happen, e.g., inside virtual machines.
 * To address this issue, we do not use as a reference time interval just
 * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
 * particular we add the minimum number of jiffies for which the filter
 * seems to be quite precise also in embedded systems and KVM/QEMU virtual
 * machines.
 */
static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
						struct bfq_queue *bfqq)
{
	return max(bfqq->last_idle_bklogged +
		   HZ * bfqq->service_from_backlogged /
		   bfqd->bfq_wr_max_softrt_rate,
		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
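/*
 * Worked example (not from the original source, with made-up
 * numbers): if the queue received 7000 sectors of service since it
 * last became idle and backlogged, and bfq_wr_max_softrt_rate is
 * 7000 sectors/sec, the first term evaluates to last_idle_bklogged
 * plus one second worth of jiffies: a truly soft real-time reader
 * must not come back before that instant, or it would exceed the
 * target bandwidth.
 */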
/*
 * Return the farthest future time instant according to jiffies
 * macros.
 */
static unsigned long bfq_greatest_from_now(void)
{
	return jiffies + MAX_JIFFY_OFFSET;
}

/*
 * Return the farthest past time instant according to jiffies
 * macros.
 */
static unsigned long bfq_smallest_from_now(void)
{
	return jiffies - MAX_JIFFY_OFFSET;
}
/**
 * bfq_bfqq_expire - expire a queue.
 * @bfqd: device owning the queue.
 * @bfqq: the queue to expire.
 * @compensate: if true, compensate for the time spent idling.
 * @reason: the reason causing the expiration.
 *
 * If the process associated with bfqq does slow I/O (e.g., because it
 * issues random requests), we charge bfqq with the time it has been
 * in service instead of the service it has received (see
 * bfq_bfqq_charge_time for details on how this goal is achieved). As
 * a consequence, bfqq will typically get higher timestamps upon
 * reactivation, and hence it will be rescheduled as if it had
 * received more service than what it has actually received. In the
 * end, bfqq receives less service in proportion to how slowly its
 * associated process consumes its budgets (and hence how seriously it
 * tends to lower the throughput). In addition, this time-charging
 * strategy guarantees time fairness among slow processes. In
 * contrast, if the process associated with bfqq is not slow, we
 * charge bfqq exactly with the service it has received.
 *
 * Charging time to the first type of queues and the exact service to
 * the other has the effect of using the WF2Q+ policy to schedule the
 * former on a timeslice basis, without violating service domain
 * guarantees among the latter.
 */
static void bfq_bfqq_expire(struct bfq_data *bfqd,
			    struct bfq_queue *bfqq,
			    bool compensate,
			    enum bfqq_expiration reason)
{
	bool slow;
	unsigned long delta = 0;
	struct bfq_entity *entity = &bfqq->entity;
	int ref;

	/*
	 * Check whether the process is slow (see bfq_bfqq_is_slow).
	 */
	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);

	/*
	 * Increase service_from_backlogged before next statement,
	 * because the possible next invocation of
	 * bfq_bfqq_charge_time would likely inflate
	 * entity->service. In contrast, service_from_backlogged must
	 * contain real service, to enable the soft real-time
	 * heuristic to correctly compute the bandwidth consumed by
	 * bfqq.
	 */
	bfqq->service_from_backlogged += entity->service;

	/*
	 * As above explained, charge slow (typically seeky) and
	 * timed-out queues with the time and not the service
	 * received, to favor sequential workloads.
	 *
	 * Processes doing I/O in the slower disk zones will tend to
	 * be slow(er) even if not seeky. Therefore, since the
	 * estimated peak rate is actually an average over the disk
	 * surface, these processes may timeout just for bad luck. To
	 * avoid punishing them, do not charge time to processes that
	 * succeeded in consuming at least 2/3 of their budget. This
	 * allows BFQ to preserve enough elasticity to still perform
	 * bandwidth, and not time, distribution with little unlucky
	 * or quasi-sequential processes.
	 */
	if (bfqq->wr_coeff == 1 &&
	    (slow ||
	     (reason == BFQQE_BUDGET_TIMEOUT &&
	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
		bfq_bfqq_charge_time(bfqd, bfqq, delta);

	if (reason == BFQQE_TOO_IDLE &&
	    entity->service <= 2 * entity->budget / 10)
		bfq_clear_bfqq_IO_bound(bfqq);

	if (bfqd->low_latency && bfqq->wr_coeff == 1)
		bfqq->last_wr_start_finish = jiffies;

	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
		/*
		 * If we get here, and there are no outstanding
		 * requests, then the request pattern is isochronous
		 * (see the comments on the function
		 * bfq_bfqq_softrt_next_start()). Thus we can compute
		 * soft_rt_next_start. If, instead, the queue still
		 * has outstanding requests, then we have to wait for
		 * the completion of all the outstanding requests to
		 * discover whether the request pattern is actually
		 * isochronous.
		 */
		if (bfqq->dispatched == 0)
			bfqq->soft_rt_next_start =
				bfq_bfqq_softrt_next_start(bfqd, bfqq);
		else {
			/*
			 * The application is still waiting for the
			 * completion of one or more requests:
			 * prevent it from possibly being incorrectly
			 * deemed as soft real-time by setting its
			 * soft_rt_next_start to infinity. In fact,
			 * without this assignment, the application
			 * would be incorrectly deemed as soft
			 * real-time if:
			 * 1) it issued a new request before the
			 *    completion of all its in-flight
			 *    requests, and
			 * 2) at that time, its soft_rt_next_start
			 *    happened to be in the past.
			 */
			bfqq->soft_rt_next_start =
				bfq_greatest_from_now();
			/*
			 * Schedule an update of soft_rt_next_start to when
			 * the task may be discovered to be isochronous.
			 */
			bfq_mark_bfqq_softrt_update(bfqq);
		}
	}

	bfq_log_bfqq(bfqd, bfqq,
		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));

	/*
	 * Increase, decrease or leave budget unchanged according to
	 * reason.
	 */
	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
	ref = bfqq->ref;
	__bfq_bfqq_expire(bfqd, bfqq);

	/* mark bfqq as waiting a request only if a bic still points to it */
	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
	    reason != BFQQE_BUDGET_TIMEOUT &&
	    reason != BFQQE_BUDGET_EXHAUSTED)
		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
}
/*
 * Budget timeout is not implemented through a dedicated timer, but
 * just checked on request arrivals and completions, as well as on
 * idle timer expirations.
 */
static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
{
	return time_is_before_eq_jiffies(bfqq->budget_timeout);
}
/*
 * If we expire a queue that is actively waiting (i.e., with the
 * device idled) for the arrival of a new request, then we may incur
 * the timestamp misalignment problem described in the body of the
 * function __bfq_activate_entity. Hence we return true only if this
 * condition does not hold, or if the queue is slow enough to deserve
 * only to be kicked off for preserving a high throughput.
 */
static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq,
		"may_budget_timeout: wait_request %d left %d timeout %d",
		bfq_bfqq_wait_request(bfqq),
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
		bfq_bfqq_budget_timeout(bfqq));

	return (!bfq_bfqq_wait_request(bfqq) ||
		bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
		&&
		bfq_bfqq_budget_timeout(bfqq);
}
/*
 * For a queue that becomes empty, device idling is allowed only if
 * this function returns true for the queue. As a consequence, since
 * device idling plays a critical role in both throughput boosting and
 * service guarantees, the return value of this function plays a
 * critical role in both these aspects as well.
 *
 * In a nutshell, this function returns true only if idling is
 * beneficial for throughput or, even if detrimental for throughput,
 * idling is however necessary to preserve service guarantees (low
 * latency, desired throughput distribution, ...). In particular, on
 * NCQ-capable devices, this function tries to return false, so as to
 * help keep the drives' internal queues full, whenever this helps the
 * device boost the throughput without causing any service-guarantee
 * issue.
 *
 * In more detail, the return value of this function is obtained by,
 * first, computing a number of boolean variables that take into
 * account throughput and service-guarantee issues, and, then,
 * combining these variables in a logical expression. Most of the
 * issues taken into account are not trivial. We discuss these issues
 * individually while introducing the variables.
 */
static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	bool idling_boosts_thr, idling_boosts_thr_without_issues,
		idling_needed_for_service_guarantees,
		asymmetric_scenario;

	if (bfqd->strict_guarantees)
		return true;

	/*
	 * The next variable takes into account the cases where idling
	 * boosts the throughput.
	 *
	 * The value of the variable is computed considering, first, that
	 * idling is virtually always beneficial for the throughput if:
	 * (a) the device is not NCQ-capable, or
	 * (b) regardless of the presence of NCQ, the device is rotational
	 *     and the request pattern for bfqq is I/O-bound and sequential.
	 *
	 * Secondly, and in contrast to the above item (b), idling an
	 * NCQ-capable flash-based device would not boost the
	 * throughput even with sequential I/O; rather it would lower
	 * the throughput in proportion to how fast the device
	 * is. Accordingly, the next variable is true if any of the
	 * above conditions (a) and (b) is true, and, in particular,
	 * happens to be false if bfqd is an NCQ-capable flash-based
	 * device.
	 */
	idling_boosts_thr = !bfqd->hw_tag ||
		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
		 bfq_bfqq_idle_window(bfqq));

	/*
	 * The value of the next variable,
	 * idling_boosts_thr_without_issues, is equal to that of
	 * idling_boosts_thr, unless a special case holds. In this
	 * special case, described below, idling may cause problems to
	 * weight-raised queues.
	 *
	 * When the request pool is saturated (e.g., in the presence
	 * of write hogs), if the processes associated with
	 * non-weight-raised queues ask for requests at a lower rate,
	 * then processes associated with weight-raised queues have a
	 * higher probability to get a request from the pool
	 * immediately (or at least soon) when they need one. Thus
	 * they have a higher probability to actually get a fraction
	 * of the device throughput proportional to their high
	 * weight. This is especially true with NCQ-capable drives,
	 * which enqueue several requests in advance, and further
	 * reorder internally-queued requests.
	 *
	 * For this reason, we force to false the value of
	 * idling_boosts_thr_without_issues if there are weight-raised
	 * busy queues. In this case, and if bfqq is not weight-raised,
	 * this guarantees that the device is not idled for bfqq (if,
	 * instead, bfqq is weight-raised, then idling will be
	 * guaranteed by another variable, see below). Combined with
	 * the timestamping rules of BFQ (see [1] for details), this
	 * behavior causes bfqq, and hence any sync non-weight-raised
	 * queue, to get a lower number of requests served, and thus
	 * to ask for a lower number of requests from the request
	 * pool, before the busy weight-raised queues get served
	 * again. This often mitigates starvation problems in the
	 * presence of heavy write workloads and NCQ, thereby
	 * guaranteeing a higher application and system responsiveness
	 * in these hostile scenarios.
	 */
	idling_boosts_thr_without_issues = idling_boosts_thr &&
		bfqd->wr_busy_queues == 0;

	/*
	 * There is then a case where idling must be performed not
	 * for throughput concerns, but to preserve service
	 * guarantees.
	 *
	 * To introduce this case, we can note that allowing the drive
	 * to enqueue more than one request at a time, and hence
	 * delegating de facto final scheduling decisions to the
	 * drive's internal scheduler, entails loss of control on the
	 * actual request service order. In particular, the critical
	 * situation is when requests from different processes happen
	 * to be present, at the same time, in the internal queue(s)
	 * of the drive. In such a situation, the drive, by deciding
	 * the service order of the internally-queued requests, does
	 * determine also the actual throughput distribution among
	 * these processes. But the drive typically has no notion or
	 * concern about per-process throughput distribution, and
	 * makes its decisions only on a per-request basis. Therefore,
	 * the service distribution enforced by the drive's internal
	 * scheduler is likely to coincide with the desired
	 * device-throughput distribution only in a completely
	 * symmetric scenario where:
	 * (i)  each of these processes must get the same throughput as
	 *      the others;
	 * (ii) all these processes have the same I/O pattern
	 *      (either sequential or random).
	 * In fact, in such a scenario, the drive will tend to treat
	 * the requests of each of these processes in about the same
	 * way as the requests of the others, and thus to provide
	 * each of these processes with about the same throughput
	 * (which is exactly the desired throughput distribution). In
	 * contrast, in any asymmetric scenario, device idling is
	 * certainly needed to guarantee that bfqq receives its
	 * assigned fraction of the device throughput (see [1] for
	 * details).
	 *
	 * We address this issue by controlling, actually, only the
	 * symmetry sub-condition (i), i.e., provided that
	 * sub-condition (i) holds, idling is not performed,
	 * regardless of whether sub-condition (ii) holds. In other
	 * words, only if sub-condition (i) holds, then idling is
	 * allowed, and the device tends to be prevented from queueing
	 * many requests, possibly of several processes. The reason
	 * for not controlling also sub-condition (ii) is that we
	 * exploit preemption to preserve guarantees in case of
	 * symmetric scenarios, even if (ii) does not hold, as
	 * explained in the next two paragraphs.
	 *
	 * Even if a queue, say Q, is expired when it remains idle, Q
	 * can still preempt the new in-service queue if the next
	 * request of Q arrives soon (see the comments on
	 * bfq_bfqq_update_budg_for_activation). If all queues and
	 * groups have the same weight, this form of preemption,
	 * combined with the hole-recovery heuristic described in the
	 * comments on function bfq_bfqq_update_budg_for_activation,
	 * are enough to preserve a correct bandwidth distribution in
	 * the mid term, even without idling. In fact, even if not
	 * idling allows the internal queues of the device to contain
	 * many requests, and thus to reorder requests, we can rather
	 * safely assume that the internal scheduler still preserves a
	 * minimum of mid-term fairness. The motivation for using
	 * preemption instead of idling is that, by not idling,
	 * service guarantees are preserved without minimally
	 * sacrificing throughput. In other words, both a high
	 * throughput and its desired distribution are obtained.
	 *
	 * More precisely, this preemption-based, idleless approach
	 * provides fairness in terms of IOPS, and not sectors per
	 * second. This can be seen with a simple example. Suppose
	 * that there are two queues with the same weight, but that
	 * the first queue receives requests of 8 sectors, while the
	 * second queue receives requests of 1024 sectors. In
	 * addition, suppose that each of the two queues contains at
	 * most one request at a time, which implies that each queue
	 * always remains idle after it is served. Finally, after
	 * remaining idle, each queue receives very quickly a new
	 * request. It follows that the two queues are served
	 * alternatively, preempting each other if needed. This
	 * implies that, although both queues have the same weight,
	 * the queue with large requests receives a service that is
	 * 1024/8 times as high as the service received by the other
	 * queue.
	 *
	 * On the other hand, device idling is performed, and thus
	 * pure sector-domain guarantees are provided, for the
	 * following queues, which are likely to need stronger
	 * throughput guarantees: weight-raised queues, and queues
	 * with a higher weight than other queues. When such queues
	 * are active, sub-condition (i) is false, which triggers
	 * device idling.
	 *
	 * According to the above considerations, the next variable is
	 * true (only) if sub-condition (i) holds. To compute the
	 * value of this variable, we not only use the return value of
	 * the function bfq_symmetric_scenario(), but also check
	 * whether bfqq is being weight-raised, because
	 * bfq_symmetric_scenario() does not take into account also
	 * weight-raised queues (see comments on
	 * bfq_weights_tree_add()).
	 *
	 * As a side note, it is worth considering that the above
	 * device-idling countermeasures may however fail in the
	 * following unlucky scenario: if idling is (correctly)
	 * disabled in a time period during which all symmetry
	 * sub-conditions hold, and hence the device is allowed to
	 * enqueue many requests, but at some later point in time some
	 * sub-condition stops to hold, then it may become impossible
	 * to let requests be served in the desired order until all
	 * the requests already queued in the device have been served.
	 */
	asymmetric_scenario = bfqq->wr_coeff > 1 ||
		!bfq_symmetric_scenario(bfqd);

	/*
	 * Finally, there is a case where maximizing throughput is the
	 * best choice even if it may cause unfairness toward
	 * bfqq. Such a case is when bfqq became active in a burst of
	 * queue activations. Queues that became active during a large
	 * burst benefit only from throughput, as discussed in the
	 * comments on bfq_handle_burst. Thus, if bfqq became active
	 * in a burst and not idling the device maximizes throughput,
	 * then the device must not be idled, because not idling the
	 * device provides bfqq and all other queues in the burst with
	 * maximum benefit. Combining this and the above case, we can
	 * now establish when idling is actually needed to preserve
	 * service guarantees.
	 */
	idling_needed_for_service_guarantees =
		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);

	/*
	 * We have now all the components we need to compute the return
	 * value of the function, which is true only if both the following
	 * conditions hold:
	 * 1) bfqq is sync, because idling makes sense only for sync queues;
	 * 2) idling either boosts the throughput (without issues), or
	 *    is necessary to preserve service guarantees.
	 */
	return bfq_bfqq_sync(bfqq) &&
		(idling_boosts_thr_without_issues ||
		 idling_needed_for_service_guarantees);
}
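/*
 * Illustrative summary (not from the original source): a sync,
 * sequential, I/O-bound queue on a rotational device with no busy
 * weight-raised queues gets idling, because
 * idling_boosts_thr_without_issues is true; a seeky queue on an
 * NCQ-capable flash device in a symmetric scenario gets no idling,
 * letting the drive's internal queue stay full.
 */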
/*
 * If the in-service queue is empty but the function bfq_bfqq_may_idle
 * returns true, then:
 * 1) the queue must remain in service and cannot be expired, and
 * 2) the device must be idled to wait for the possible arrival of a new
 *    request for the queue.
 * See the comments on the function bfq_bfqq_may_idle for the reasons
 * why performing device idling is the best choice to boost the throughput
 * and preserve service guarantees when bfq_bfqq_may_idle itself
 * returns true.
 */
static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;

	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
	       bfq_bfqq_may_idle(bfqq);
}
/*
 * Select a queue for service. If we have a current queue in service,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
	struct bfq_queue *bfqq;
	struct request *next_rq;
	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;

	bfqq = bfqd->in_service_queue;
	if (!bfqq)
		goto new_queue;

	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

	if (bfq_may_expire_for_budg_timeout(bfqq) &&
	    !bfq_bfqq_wait_request(bfqq) &&
	    !bfq_bfqq_must_idle(bfqq))
		goto expire;

check_queue:
	/*
	 * This loop is rarely executed more than once. Even when it
	 * happens, it is much more convenient to re-execute this loop
	 * than to return NULL and trigger a new dispatch to get a
	 * request served.
	 */
	next_rq = bfqq->next_rq;
	/*
	 * If bfqq has requests queued and it has enough budget left to
	 * serve them, keep the queue, otherwise expire it.
	 */
	if (next_rq) {
		if (bfq_serv_to_charge(next_rq, bfqq) >
			bfq_bfqq_budget_left(bfqq)) {
			/*
			 * Expire the queue for budget exhaustion,
			 * which makes sure that the next budget is
			 * enough to serve the next request, even if
			 * it comes from the fifo expired path.
			 */
			reason = BFQQE_BUDGET_EXHAUSTED;
			goto expire;
		} else {
			/*
			 * The idle timer may be pending because we may
			 * not disable disk idling even when a new request
			 * arrives.
			 */
			if (bfq_bfqq_wait_request(bfqq)) {
				/*
				 * If we get here: 1) at least a new request
				 * has arrived but we have not disabled the
				 * timer because the request was too small,
				 * 2) then the block layer has unplugged
				 * the device, causing the dispatch to be
				 * invoked.
				 *
				 * Since the device is unplugged, now the
				 * requests are probably large enough to
				 * provide a reasonable throughput.
				 * So we disable idling.
				 */
				bfq_clear_bfqq_wait_request(bfqq);
				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				bfqg_stats_update_idle_time(bfqq_group(bfqq));
			}
			goto keep_queue;
		}
	}

	/*
	 * No requests pending. However, if the in-service queue is idling
	 * for a new request, or has requests waiting for a completion and
	 * may idle after their completion, then keep it anyway.
	 */
	if (bfq_bfqq_wait_request(bfqq) ||
	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
		bfqq = NULL;
		goto keep_queue;
	}

	reason = BFQQE_NO_MORE_REQUESTS;
expire:
	bfq_bfqq_expire(bfqd, bfqq, false, reason);
new_queue:
	bfqq = bfq_set_in_service_queue(bfqd);
	if (bfqq) {
		bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
		goto check_queue;
	}
keep_queue:
	if (bfqq)
		bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
	else
		bfq_log(bfqd, "select_queue: no queue returned");

	return bfqq;
}
static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	struct bfq_entity *entity = &bfqq->entity;

	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
		bfq_log_bfqq(bfqd, bfqq,
			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
			jiffies_to_msecs(bfqq->wr_cur_max_time),
			bfqq->wr_coeff,
			bfqq->entity.weight, bfqq->entity.orig_weight);

		if (entity->prio_changed)
			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");

		/*
		 * If the queue was activated in a burst, or too much
		 * time has elapsed from the beginning of this
		 * weight-raising period, then end weight raising.
		 */
		if (bfq_bfqq_in_large_burst(bfqq))
			bfq_bfqq_end_wr(bfqq);
		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
						bfqq->wr_cur_max_time)) {
			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
					       bfq_wr_duration(bfqd)))
				bfq_bfqq_end_wr(bfqq);
			else {
				/* switch back to interactive wr */
				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				bfqq->last_wr_start_finish =
					bfqq->wr_start_at_switch_to_srt;
				bfqq->entity.prio_changed = 1;
			}
		}
	}
	/* Update weight both if it must be raised and if it must be lowered */
	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
		__bfq_entity_update_weight_prio(
			bfq_entity_service_tree(entity),
			entity);
}
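/*
 * Illustrative note (not part of the scheduler): the weight-raising
 * expiry test above relies on the jiffies wrap-safe comparison done by
 * time_is_before_jiffies(). The compiled-out, user-space sketch below
 * shows the same test with plain integers; names and values are made
 * up for the example and only loosely mirror the bfqq fields.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool wr_period_expired(unsigned long now, unsigned long start,
			      unsigned long max_time)
{
	/* same idea as time_is_before_jiffies(start + max_time) */
	return (long)(now - (start + max_time)) > 0;
}

int main(void)
{
	/* a raising period of 300 ticks that started 400 ticks ago */
	printf("expired: %d\n", wr_period_expired(1000, 600, 300)); /* 1 */
	return 0;
}
#endif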
/*
 * Dispatch next request from bfqq.
 */
static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
						 struct bfq_queue *bfqq)
{
	struct request *rq = bfqq->next_rq;
	unsigned long service_to_charge;

	service_to_charge = bfq_serv_to_charge(rq, bfqq);

	bfq_bfqq_served(bfqq, service_to_charge);

	bfq_dispatch_remove(bfqd->queue, rq);

	/*
	 * If weight raising has to terminate for bfqq, then next
	 * function causes an immediate update of bfqq's weight,
	 * without waiting for next activation. As a consequence, on
	 * expiration, bfqq will be timestamped as if it had never been
	 * weight-raised during this service slot, even if it has
	 * received part or even most of the service as a
	 * weight-raised queue. This inflates bfqq's timestamps, which
	 * is beneficial, as bfqq is then more willing to leave the
	 * device immediately to possible other weight-raised queues.
	 */
	bfq_update_wr_data(bfqd, bfqq);

	if (!bfqd->in_service_bic) {
		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
		bfqd->in_service_bic = RQ_BIC(rq);
	}

	/*
	 * Expire bfqq, pretending that its budget expired, if bfqq
	 * belongs to CLASS_IDLE and other queues are waiting for
	 * service.
	 */
	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
		goto expire;

	return rq;

expire:
	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
	return rq;
}
static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

	/*
	 * Avoiding lock: a race on bfqd->busy_queues should cause at
	 * most a call to dispatch for nothing.
	 */
	return !list_empty_careful(&bfqd->dispatch) ||
		bfqd->busy_queues > 0;
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq = NULL;
	struct bfq_queue *bfqq = NULL;

	if (!list_empty(&bfqd->dispatch)) {
		rq = list_first_entry(&bfqd->dispatch, struct request,
				      queuelist);
		list_del_init(&rq->queuelist);

		bfqq = RQ_BFQQ(rq);

		if (bfqq) {
			/*
			 * Increment counters here, because this
			 * dispatch does not follow the standard
			 * dispatch flow (where counters are
			 * incremented).
			 */
			bfqq->dispatched++;

			goto inc_in_driver_start_rq;
		}

		/*
		 * We exploit the put_rq_private hook to decrement
		 * rq_in_driver, but put_rq_private will not be
		 * invoked on this request. So, to avoid unbalance,
		 * just start this request, without incrementing
		 * rq_in_driver. As a negative consequence,
		 * rq_in_driver is deceptively lower than it should be
		 * while this request is in service. This may cause
		 * bfq_schedule_dispatch to be invoked uselessly.
		 *
		 * As for implementing an exact solution, the
		 * put_request hook, if defined, is probably invoked
		 * also on this request. So, by exploiting this hook,
		 * we could 1) increment rq_in_driver here, and 2)
		 * decrement it in put_request. Such a solution would
		 * let the value of the counter be always accurate,
		 * but it would entail using an extra interface
		 * function. This cost seems higher than the benefit,
		 * being the frequency of non-elevator-private
		 * requests very low.
		 */
		goto start_rq;
	}

	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

	if (bfqd->busy_queues == 0)
		goto exit;

	/*
	 * Force device to serve one request at a time if
	 * strict_guarantees is true. Forcing this service scheme is
	 * currently the ONLY way to guarantee that the request
	 * service order enforced by the scheduler is respected by a
	 * queueing device. Otherwise the device is free even to make
	 * some unlucky request wait for as long as the device
	 * wishes.
	 *
	 * Of course, serving one request at a time may cause loss of
	 * throughput.
	 */
	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
		goto exit;

	bfqq = bfq_select_queue(bfqd);
	if (!bfqq)
		goto exit;

	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

	if (rq) {
inc_in_driver_start_rq:
		bfqd->rq_in_driver++;
start_rq:
		rq->rq_flags |= RQF_STARTED;
	}
exit:
	return rq;
}
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock_irq(&bfqd->lock);

	rq = __bfq_dispatch_request(hctx);
	bfq_unlock_put_ioc(bfqd);

	return rq;
}
/*
 * Task holds one reference to the queue, dropped when task exits. Each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Scheduler lock must be held here. Recall not to use bfqq after calling
 * this function on it.
 */
static void bfq_put_queue(struct bfq_queue *bfqq)
{
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	struct bfq_group *bfqg = bfqq_group(bfqq);
#endif

	if (bfqq->bfqd)
		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
			     bfqq, bfqq->ref);

	bfqq->ref--;
	if (bfqq->ref)
		return;

	if (bfq_bfqq_sync(bfqq))
		/*
		 * The fact that this queue is being destroyed does not
		 * invalidate the fact that this queue may have been
		 * activated during the current burst. As a consequence,
		 * although the queue does not exist anymore, and hence
		 * needs to be removed from the burst list if there,
		 * the burst size must not be decremented.
		 */
		hlist_del_init(&bfqq->burst_list_node);

	kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	bfqg_put(bfqg);
#endif
}
static void bfq_put_cooperator(struct bfq_queue *bfqq)
{
	struct bfq_queue *__bfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
	 */
	__bfqq = bfqq->new_bfqq;
	while (__bfqq) {
		if (__bfqq == bfqq)
			break;
		next = __bfqq->new_bfqq;
		bfq_put_queue(__bfqq);
		__bfqq = next;
	}
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
	if (bfqq == bfqd->in_service_queue) {
		__bfq_bfqq_expire(bfqd, bfqq);
		bfq_schedule_dispatch(bfqd);
	}

	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq); /* release process reference */
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
	struct bfq_data *bfqd;

	if (bfqq)
		bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

	if (bfqq && bfqd) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);
		/*
		 * If the bic is using a shared queue, put the
		 * reference taken on the io_context when the bic
		 * started using a shared bfq_queue. This put cannot
		 * make ioc->ref_count reach 0, so there is no risk
		 * of taking ioc->lock (which could lead to deadlock
		 * scenarios).
		 */
		if (is_sync && bfq_bfqq_coop(bfqq))
			put_io_context(bic->icq.ioc);

		bfq_exit_bfqq(bfqd, bfqq);
		bic_set_bfqq(bic, NULL, is_sync);
		bfq_unlock_put_ioc_restore(bfqd, flags);
	}
}
static void bfq_exit_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	bfq_exit_icq_bfqq(bic, true);
	bfq_exit_icq_bfqq(bic, false);
}
/*
 * Update the entity prio values; note that the new values will not
 * be used until the next (re)activation.
 */
static void
bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	struct task_struct *tsk = current;
	int ioprio_class;
	struct bfq_data *bfqd = bfqq->bfqd;

	if (!bfqd)
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	switch (ioprio_class) {
	default:
		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
			"bfq: bad prio class %d\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * No prio set, inherit CPU scheduling settings.
		 */
		bfqq->new_ioprio = task_nice_ioprio(tsk);
		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
		bfqq->new_ioprio = 7;
		bfq_clear_bfqq_idle_window(bfqq);
		break;
	}

	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
			bfqq->new_ioprio);
		bfqq->new_ioprio = IOPRIO_BE_NR;
	}

	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
	bfqq->entity.prio_changed = 1;
}
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_queue *bfqq;
	int ioprio = bic->icq.ioc->ioprio;

	/*
	 * This condition may trigger on a newly created bic, be sure to
	 * drop the lock before returning.
	 */
	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
		return;

	bic->ioprio = ioprio;

	bfqq = bic_to_bfqq(bic, false);
	if (bfqq) {
		/* release process reference on this queue */
		bfq_put_queue(bfqq);
		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
		bic_set_bfqq(bic, bfqq, false);
	}

	bfqq = bic_to_bfqq(bic, true);
	if (bfqq)
		bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
{
	RB_CLEAR_NODE(&bfqq->entity.rb_node);
	INIT_LIST_HEAD(&bfqq->fifo);
	INIT_HLIST_NODE(&bfqq->burst_list_node);

	bfqq->ref = 0;
	bfqq->bfqd = bfqd;

	if (bic)
		bfq_set_next_ioprio_data(bfqq, bic);

	if (is_sync) {
		if (!bfq_class_idle(bfqq))
			bfq_mark_bfqq_idle_window(bfqq);
		bfq_mark_bfqq_sync(bfqq);
		bfq_mark_bfqq_just_created(bfqq);
	} else
		bfq_clear_bfqq_sync(bfqq);

	/* set end request to minus infinity from now */
	bfqq->ttime.last_end_request = ktime_get_ns() + 1;

	bfq_mark_bfqq_IO_bound(bfqq);

	bfqq->pid = pid;

	/* Tentative initial value to trade off between thr and lat */
	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
	bfqq->budget_timeout = bfq_smallest_from_now();

	bfqq->wr_coeff = 1;
	bfqq->last_wr_start_finish = jiffies;
	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
	bfqq->split_time = bfq_smallest_from_now();

	/*
	 * Set to the value for which bfqq will not be deemed as
	 * soft rt when it becomes backlogged.
	 */
	bfqq->soft_rt_next_start = bfq_greatest_from_now();

	/* first request is almost certainly seeky */
	bfqq->seek_history = 1;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
					       struct bfq_group *bfqg,
					       int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &bfqg->async_bfqq[0][ioprio];
	case IOPRIO_CLASS_NONE:
		ioprio = IOPRIO_NORM;
		/* fall through */
	case IOPRIO_CLASS_BE:
		return &bfqg->async_bfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &bfqg->async_idle_bfqq;
	default:
		return NULL;
	}
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
				       struct bio *bio, bool is_sync,
				       struct bfq_io_cq *bic)
{
	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
	struct bfq_queue **async_bfqq = NULL;
	struct bfq_queue *bfqq;
	struct bfq_group *bfqg;

	rcu_read_lock();

	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
	if (!bfqg) {
		bfqq = &bfqd->oom_bfqq;
		goto out;
	}

	if (!is_sync) {
		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
						  ioprio);
		bfqq = *async_bfqq;
		if (bfqq)
			goto out;
	}

	bfqq = kmem_cache_alloc_node(bfq_pool,
				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
				     bfqd->queue->node);

	if (bfqq) {
		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
			      is_sync);
		bfq_init_entity(&bfqq->entity, bfqg);
		bfq_log_bfqq(bfqd, bfqq, "allocated");
	} else {
		bfqq = &bfqd->oom_bfqq;
		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
		goto out;
	}

	/*
	 * Pin the queue now that it's allocated, scheduler exit will
	 * prune it.
	 */
	if (async_bfqq) {
		bfqq->ref++; /*
			      * Extra group reference, w.r.t. sync
			      * queue. This extra reference is removed
			      * only if bfqq->bfqg disappears, to
			      * guarantee that this queue is not freed
			      * until its group goes away.
			      */
		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
			     bfqq, bfqq->ref);
		*async_bfqq = bfqq;
	}

out:
	bfqq->ref++; /* get a process reference to this queue */
	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
	rcu_read_unlock();
	return bfqq;
}
static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				    struct bfq_queue *bfqq)
{
	struct bfq_ttime *ttime = &bfqq->ttime;
	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;

	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);

	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				     ttime->ttime_samples);
}
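/*
 * Illustrative note: the update above is a fixed-point EWMA, keeping
 * 7/8 of the old estimate and adding 1/8 of the new sample, with
 * samples scaled by 256 so that ttime_mean = ttime_total /
 * ttime_samples stays in the sample's unit. The compiled-out,
 * user-space sketch below replays the arithmetic on arbitrary values.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long samples = 0, total = 0, mean;
	unsigned long long elapsed[3] = { 1000, 2000, 500 }; /* example ns */
	int i;

	for (i = 0; i < 3; i++) {
		samples = (7 * samples + 256) / 8;
		total = (7 * total + 256 * elapsed[i]) / 8;
		mean = (total + 128) / samples;
		printf("after sample %d: mean %llu ns\n", i, mean);
	}
	return 0;
}
#endif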
static void
bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		       struct request *rq)
{
	bfqq->seek_history <<= 1;
	bfqq->seek_history |=
		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
		(!blk_queue_nonrot(bfqd->queue) ||
		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
}
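/*
 * Illustrative note: seek_history is a shift register with one bit per
 * request (1 = seeky); BFQQ_SEEKY() then classifies the queue by
 * counting the set bits. The compiled-out sketch below uses a made-up
 * threshold, not the kernel's.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_SEEKY_THRESH 4 /* assumption, not BFQ's actual value */

static bool queue_seeky(unsigned int seek_history)
{
	return __builtin_popcount(seek_history) > EXAMPLE_SEEKY_THRESH;
}

int main(void)
{
	unsigned int hist = 0;
	int sample[6] = { 1, 1, 0, 1, 1, 1 }; /* 1 = request was seeky */
	int i;

	for (i = 0; i < 6; i++) {
		hist <<= 1;
		hist |= sample[i];
	}
	printf("seeky: %d\n", queue_seeky(hist)); /* 5 bits set -> 1 */
	return 0;
}
#endif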
/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter.
 */
static void bfq_update_idle_window(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq,
				   struct bfq_io_cq *bic)
{
	int enable_idle;

	/* Don't idle for async or idle io prio class. */
	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
		return;

	/* Idle window just restored, statistics are meaningless. */
	if (time_is_after_eq_jiffies(bfqq->split_time +
				     bfqd->bfq_wr_min_idle_time))
		return;

	enable_idle = bfq_bfqq_idle_window(bfqq);

	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
	    bfqd->bfq_slice_idle == 0 ||
	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
	     bfqq->wr_coeff == 1))
		enable_idle = 0;
	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
		    bfqq->wr_coeff == 1)
			enable_idle = 0;
		else
			enable_idle = 1;
	}
	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
		     enable_idle);

	if (enable_idle)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);
}
/*
 * Called when a new fs request (rq) is added to bfqq. Check if there's
 * something we should do about it.
 */
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			    struct request *rq)
{
	struct bfq_io_cq *bic = RQ_BIC(rq);

	if (rq->cmd_flags & REQ_META)
		bfqq->meta_pending++;

	bfq_update_io_thinktime(bfqd, bfqq);
	bfq_update_io_seektime(bfqd, bfqq, rq);
	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
	    !BFQQ_SEEKY(bfqq))
		bfq_update_idle_window(bfqd, bfqq, bic);

	bfq_log_bfqq(bfqd, bfqq,
		     "rq_enqueued: idle_window=%d (seeky %d)",
		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));

	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				 blk_rq_sectors(rq) < 32;
		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);

		/*
		 * There is just this request queued: if the request
		 * is small and the queue is not to be expired, then
		 * just exit.
		 *
		 * In this way, if the device is being idled to wait
		 * for a new request from the in-service queue, we
		 * avoid unplugging the device and committing the
		 * device to serve just a small request. On the
		 * contrary, we wait for the block layer to decide
		 * when to unplug the device: hopefully, new requests
		 * will be merged to this one quickly, then the device
		 * will be unplugged and larger requests will be
		 * dispatched.
		 */
		if (small_req && !budget_timeout)
			return;

		/*
		 * A large enough request arrived, or the queue is to
		 * be expired: in both cases disk idling is to be
		 * stopped, so clear wait_request flag and reset
		 * timer.
		 */
		bfq_clear_bfqq_wait_request(bfqq);
		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
		bfqg_stats_update_idle_time(bfqq_group(bfqq));

		/*
		 * The queue is not empty, because a new request just
		 * arrived. Hence we can safely expire the queue, in
		 * case of budget timeout, without risking that the
		 * timestamps of the queue are not updated correctly.
		 * See [1] for more details.
		 */
		if (budget_timeout)
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
	}
}
static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq),
		*new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);

	if (new_bfqq) {
		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
		/*
		 * Release the request's reference to the old bfqq
		 * and make sure one is taken to the shared queue.
		 */
		new_bfqq->allocated++;
		bfqq->allocated--;
		new_bfqq->ref++;
		bfq_clear_bfqq_just_created(bfqq);
		/*
		 * If the bic associated with the process
		 * issuing this request still points to bfqq
		 * (and thus has not been already redirected
		 * to new_bfqq or even some other bfq_queue),
		 * then complete the merge and redirect it to
		 * new_bfqq.
		 */
		if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
			bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
					bfqq, new_bfqq);
		/*
		 * rq is about to be enqueued into new_bfqq,
		 * release rq reference on bfqq
		 */
		bfq_put_queue(bfqq);
		rq->elv.priv[1] = new_bfqq;
		bfqq = new_bfqq;
	}

	bfq_add_request(rq);

	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
	list_add_tail(&rq->queuelist, &bfqq->fifo);

	bfq_rq_enqueued(bfqd, bfqq, rq);
}
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct bfq_data *bfqd = q->elevator->elevator_data;

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq)) {
		spin_unlock_irq(&bfqd->lock);
		return;
	}

	spin_unlock_irq(&bfqd->lock);

	blk_mq_sched_request_inserted(rq);

	spin_lock_irq(&bfqd->lock);
	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &bfqd->dispatch);
		else
			list_add_tail(&rq->queuelist, &bfqd->dispatch);
	} else {
		__bfq_insert_request(bfqd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
	}

	bfq_unlock_put_ioc(bfqd);
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		bfq_insert_request(hctx, rq, at_head);
	}
}
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				       bfqd->rq_in_driver);

	if (bfqd->hw_tag == 1)
		return;

	/*
	 * This sample is valid if the number of outstanding requests
	 * is large enough to allow a queueing behavior. Note that the
	 * sum is not exact, as it's not taking into account deactivated
	 * requests.
	 */
	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
		return;

	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
		return;

	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
	bfqd->max_rq_in_driver = 0;
	bfqd->hw_tag_samples = 0;
}
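/*
 * Illustrative note: hw_tag flips to 1 only after a full window of
 * valid samples has seen a sufficiently deep driver queue, which
 * filters out transient bursts. The compiled-out, user-space sketch
 * below mimics that hysteresis; both constants are stand-ins for the
 * kernel's BFQ_HW_QUEUE_THRESHOLD/BFQ_HW_QUEUE_SAMPLES.
 */
#if 0
#include <stdio.h>

#define EX_THRESHOLD 4
#define EX_SAMPLES 32

int main(void)
{
	int hw_tag = 0, samples = 0, max_in_driver = 0, t;

	for (t = 0; t < 200 && !hw_tag; t++) {
		int in_driver = 2 + (t % 7); /* synthetic queue depth */

		if (in_driver > max_in_driver)
			max_in_driver = in_driver;
		if (in_driver < EX_THRESHOLD)
			continue; /* sample not valid */
		if (samples++ < EX_SAMPLES)
			continue; /* window not full yet */
		hw_tag = max_in_driver > EX_THRESHOLD;
	}
	printf("hw_tag: %d\n", hw_tag); /* 1: depth repeatedly exceeded 4 */
	return 0;
}
#endif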
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
	u64 now_ns;
	u32 delta_us;

	bfq_update_hw_tag(bfqd);

	bfqd->rq_in_driver--;
	bfqq->dispatched--;

	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
		/*
		 * Set budget_timeout (which we overload to store the
		 * time at which the queue remains with no backlog and
		 * no outstanding request; used by the weight-raising
		 * mechanism).
		 */
		bfqq->budget_timeout = jiffies;

		bfq_weights_tree_remove(bfqd, &bfqq->entity,
					&bfqd->queue_weights_tree);
	}

	now_ns = ktime_get_ns();

	bfqq->ttime.last_end_request = now_ns;

	/*
	 * Using us instead of ns, to get a reasonable precision in
	 * computing rate in next check.
	 */
	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

	/*
	 * If the request took rather long to complete, and, according
	 * to the maximum request size recorded, this completion latency
	 * implies that the request was certainly served at a very low
	 * rate (less than 1M sectors/sec), then the whole observation
	 * interval that lasts up to this time instant cannot be a
	 * valid time interval for computing a new peak rate. Invoke
	 * bfq_update_rate_reset to have the following three steps
	 * taken:
	 * - close the observation interval at the last (previous)
	 *   request dispatch or completion
	 * - compute rate, if possible, for that observation interval
	 * - reset to zero samples, which will trigger a proper
	 *   re-initialization of the observation interval on next
	 *   dispatch
	 */
	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
			1UL<<(BFQ_RATE_SHIFT - 10))
		bfq_update_rate_reset(bfqd, NULL);
	bfqd->last_completion = now_ns;

	/*
	 * If we are waiting to discover whether the request pattern
	 * of the task associated with the queue is actually
	 * isochronous, and both requisites for this condition to hold
	 * are now satisfied, then compute soft_rt_next_start (see the
	 * comments on the function bfq_bfqq_softrt_next_start()). We
	 * schedule this delayed check when bfqq expires, if it still
	 * has in-flight requests.
	 */
	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
	    RB_EMPTY_ROOT(&bfqq->sort_list))
		bfqq->soft_rt_next_start =
			bfq_bfqq_softrt_next_start(bfqd, bfqq);

	/*
	 * If this is the in-service queue, check if it needs to be expired,
	 * or if we want to idle in case it has no pending requests.
	 */
	if (bfqd->in_service_queue == bfqq) {
		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
			bfq_arm_slice_timer(bfqd);
			return;
		} else if (bfq_may_expire_for_budg_timeout(bfqq))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_BUDGET_TIMEOUT);
		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
			 (bfqq->dispatched == 0 ||
			  !bfq_bfqq_may_idle(bfqq)))
			bfq_bfqq_expire(bfqd, bfqq, false,
					BFQQE_NO_MORE_REQUESTS);
	}
}
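/*
 * Illustrative note: the rate-validity test above compares a
 * fixed-point rate, (size << BFQ_RATE_SHIFT) / delta_us, against
 * 1 << (BFQ_RATE_SHIFT - 10). The compiled-out sketch below works one
 * example through that arithmetic; BFQ_RATE_SHIFT = 16 is an
 * assumption made only to get concrete numbers.
 */
#if 0
#include <stdio.h>

#define EX_RATE_SHIFT 16

int main(void)
{
	unsigned long size = 8;		/* sectors, example */
	unsigned long delta_us = 20000;	/* 20 ms completion gap */
	unsigned long rate = (size << EX_RATE_SHIFT) / delta_us; /* 26 */

	/* 8 sectors / 20 ms = 400 sectors/s: below the cutoff */
	printf("too slow: %d\n", rate < (1UL << (EX_RATE_SHIFT - 10)));
	return 0;
}
#endif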
static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
	bfqq->allocated--;

	bfq_put_queue(bfqq);
}
static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
	struct bfq_queue *bfqq = RQ_BFQQ(rq);
	struct bfq_data *bfqd = bfqq->bfqd;

	if (rq->rq_flags & RQF_STARTED)
		bfqg_stats_update_completion(bfqq_group(bfqq),
					     rq_start_time_ns(rq),
					     rq_io_start_time_ns(rq),
					     rq->cmd_flags);

	if (likely(rq->rq_flags & RQF_STARTED)) {
		unsigned long flags;

		spin_lock_irqsave(&bfqd->lock, flags);

		bfq_completed_request(bfqq, bfqd);
		bfq_put_rq_priv_body(bfqq);

		bfq_unlock_put_ioc_restore(bfqd, flags);
	} else {
		/*
		 * Request rq may be still/already in the scheduler,
		 * in which case we need to remove it. And we cannot
		 * defer such a check and removal, to avoid
		 * inconsistencies in the time interval from the end
		 * of this function to the start of the deferred work.
		 * This situation seems to occur only in process
		 * context, as a consequence of a merge. In the
		 * current version of the code, this implies that the
		 * lock is held.
		 */

		if (!RB_EMPTY_NODE(&rq->rb_node))
			bfq_remove_request(q, rq);
		bfq_put_rq_priv_body(bfqq);
	}

	rq->elv.priv[0] = NULL;
	rq->elv.priv[1] = NULL;
}
/*
 * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
 * was the last process referring to that bfqq.
 */
static struct bfq_queue *
bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");

	if (bfqq_process_refs(bfqq) == 1) {
		bfqq->pid = current->pid;
		bfq_clear_bfqq_coop(bfqq);
		bfq_clear_bfqq_split_coop(bfqq);
		return bfqq;
	}

	bic_set_bfqq(bic, NULL, 1);

	bfq_put_cooperator(bfqq);

	bfq_put_queue(bfqq);
	return NULL;
}
static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
						   struct bfq_io_cq *bic,
						   struct bio *bio,
						   bool split, bool is_sync,
						   bool *new_queue)
{
	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

	if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
		return bfqq;

	if (new_queue)
		*new_queue = true;

	if (bfqq)
		bfq_put_queue(bfqq);
	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);

	bic_set_bfqq(bic, bfqq, is_sync);
	if (split && is_sync) {
		if ((bic->was_in_burst_list && bfqd->large_burst) ||
		    bic->saved_in_large_burst)
			bfq_mark_bfqq_in_large_burst(bfqq);
		else {
			bfq_clear_bfqq_in_large_burst(bfqq);
			if (bic->was_in_burst_list)
				hlist_add_head(&bfqq->burst_list_node,
					       &bfqd->burst_list);
		}
		bfqq->split_time = jiffies;
	}

	return bfqq;
}
/*
 * Allocate bfq data structures associated with this request.
 */
static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
			      struct bio *bio)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;
	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
	const int is_sync = rq_is_sync(rq);
	struct bfq_queue *bfqq;
	bool new_queue = false;

	spin_lock_irq(&bfqd->lock);

	bfq_check_ioprio_change(bic, bio);

	if (!bic)
		goto queue_fail;

	bfq_bic_update_cgroup(bic, bio);

	bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
					 &new_queue);

	if (likely(!new_queue)) {
		/* If the queue was seeky for too long, break it apart. */
		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");

			/* Update bic before losing reference to bfqq */
			if (bfq_bfqq_in_large_burst(bfqq))
				bic->saved_in_large_burst = true;

			bfqq = bfq_split_bfqq(bic, bfqq);
			/*
			 * A reference to bic->icq.ioc needs to be
			 * released after a queue split. Do not do it
			 * immediately, so as not to risk taking an
			 * ioc->lock while holding the scheduler
			 * lock.
			 */
			bfqd->ioc_to_put = bic->icq.ioc;

			if (!bfqq)
				bfqq = bfq_get_bfqq_handle_split(bfqd, bic,
								 bio, true,
								 is_sync, NULL);
		}
	}

	bfqq->allocated++;
	bfqq->ref++;
	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
		     rq, bfqq, bfqq->ref);

	rq->elv.priv[0] = bic;
	rq->elv.priv[1] = bfqq;

	/*
	 * If a bfq_queue has only one process reference, it is owned
	 * by only this bic: we can then set bfqq->bic = bic. In
	 * addition, if the queue has also just been split, we have to
	 * resume its state.
	 */
	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
		bfqq->bic = bic;
		if (bfqd->ioc_to_put) { /* if true, there has been a split */
			/*
			 * The queue has just been split from a shared
			 * queue: restore the idle window and the
			 * possible weight raising period.
			 */
			bfq_bfqq_resume_state(bfqq, bic);
		}
	}

	if (unlikely(bfq_bfqq_just_created(bfqq)))
		bfq_handle_burst(bfqd, bfqq);

	bfq_unlock_put_ioc(bfqd);

	return 0;

queue_fail:
	spin_unlock_irq(&bfqd->lock);

	return 1;
}
static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
	struct bfq_data *bfqd = bfqq->bfqd;
	enum bfqq_expiration reason;
	unsigned long flags;

	spin_lock_irqsave(&bfqd->lock, flags);
	bfq_clear_bfqq_wait_request(bfqq);

	if (bfqq != bfqd->in_service_queue) {
		spin_unlock_irqrestore(&bfqd->lock, flags);
		return;
	}

	if (bfq_bfqq_budget_timeout(bfqq))
		/*
		 * Also here the queue can be safely expired
		 * for budget timeout without wasting
		 * guarantees
		 */
		reason = BFQQE_BUDGET_TIMEOUT;
	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
		/*
		 * The queue may not be empty upon timer expiration,
		 * because we may not disable the timer when the
		 * first request of the in-service queue arrives
		 * during disk idling.
		 */
		reason = BFQQE_TOO_IDLE;
	else
		goto schedule_dispatch;

	bfq_bfqq_expire(bfqd, bfqq, true, reason);

schedule_dispatch:
	bfq_unlock_put_ioc_restore(bfqd, flags);
	bfq_schedule_dispatch(bfqd);
}
/*
 * Handler of the expiration of the timer running if the in-service queue
 * is idling inside its time slice.
 */
static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
					     idle_slice_timer);
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	/*
	 * Theoretical race here: the in-service queue can be NULL or
	 * different from the queue that was idling if a new request
	 * arrives for the current queue and there is a full dispatch
	 * cycle that changes the in-service queue. This can hardly
	 * happen, but in the worst case we just expire a queue too
	 * early.
	 */
	if (bfqq)
		bfq_idle_slice_timer_body(bfqq);

	return HRTIMER_NORESTART;
}
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				 struct bfq_queue **bfqq_ptr)
{
	struct bfq_queue *bfqq = *bfqq_ptr;

	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
	if (bfqq) {
		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);

		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
			     bfqq, bfqq->ref);
		bfq_put_queue(bfqq);
		*bfqq_ptr = NULL;
	}
}
/*
 * Release all the bfqg references to its async queues. If we are
 * deallocating the group these queues may still contain requests, so
 * we reparent them to the root cgroup (i.e., the only one that will
 * exist for sure until all the requests on a device are gone).
 */
static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
	int i, j;

	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_BE_NR; j++)
			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);

	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
}
static void bfq_exit_queue(struct elevator_queue *e)
{
	struct bfq_data *bfqd = e->elevator_data;
	struct bfq_queue *bfqq, *n;

	hrtimer_cancel(&bfqd->idle_slice_timer);

	spin_lock_irq(&bfqd->lock);
	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
	spin_unlock_irq(&bfqd->lock);

	hrtimer_cancel(&bfqd->idle_slice_timer);

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
	spin_lock_irq(&bfqd->lock);
	bfq_put_async_queues(bfqd, bfqd->root_group);
	kfree(bfqd->root_group);
	spin_unlock_irq(&bfqd->lock);
#endif

	kfree(bfqd);
}
static void bfq_init_root_group(struct bfq_group *root_group,
				struct bfq_data *bfqd)
{
	int i;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	root_group->entity.parent = NULL;
	root_group->my_entity = NULL;
	root_group->bfqd = bfqd;
#endif
	root_group->rq_pos_tree = RB_ROOT;
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
	root_group->sched_data.bfq_class_idle_last_service = jiffies;
}
static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct bfq_data *bfqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
	if (!bfqd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = bfqd;

	spin_lock_irq(q->queue_lock);
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);

	/*
	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
	 * Grab a permanent reference to it, so that the normal code flow
	 * will not attempt to free it.
	 */
	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
	bfqd->oom_bfqq.ref++;
	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
	bfqd->oom_bfqq.entity.new_weight =
		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);

	/* oom_bfqq does not participate in bursts */
	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);

	/*
	 * Trigger weight initialization, according to ioprio, at the
	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
	 * class won't be changed any more.
	 */
	bfqd->oom_bfqq.entity.prio_changed = 1;

	bfqd->queue = q;

	INIT_LIST_HEAD(&bfqd->dispatch);

	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL);
	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;

	bfqd->queue_weights_tree = RB_ROOT;
	bfqd->group_weights_tree = RB_ROOT;

	INIT_LIST_HEAD(&bfqd->active_list);
	INIT_LIST_HEAD(&bfqd->idle_list);
	INIT_HLIST_HEAD(&bfqd->burst_list);

	bfqd->hw_tag = -1;

	bfqd->bfq_max_budget = bfq_default_max_budget;

	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
	bfqd->bfq_back_max = bfq_back_max;
	bfqd->bfq_back_penalty = bfq_back_penalty;
	bfqd->bfq_slice_idle = bfq_slice_idle;
	bfqd->bfq_timeout = bfq_timeout;

	bfqd->bfq_requests_within_timer = 120;

	bfqd->bfq_large_burst_thresh = 8;
	bfqd->bfq_burst_interval = msecs_to_jiffies(180);

	bfqd->low_latency = true;

	/*
	 * Trade-off between responsiveness and fairness.
	 */
	bfqd->bfq_wr_coeff = 30;
	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
	bfqd->bfq_wr_max_time = 0;
	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
	bfqd->bfq_wr_max_softrt_rate = 7000; /*
					      * Approximate rate required
					      * to playback or record a
					      * high-definition compressed
					      * video.
					      */
	bfqd->wr_busy_queues = 0;

	/*
	 * Begin by assuming, optimistically, that the device is a
	 * high-speed one, and that its peak rate is equal to 2/3 of
	 * the highest reference rate.
	 */
	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
			T_fast[blk_queue_nonrot(bfqd->queue)];
	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
	bfqd->device_speed = BFQ_BFQD_FAST;

	spin_lock_init(&bfqd->lock);

	/*
	 * The invocation of the next bfq_create_group_hierarchy
	 * function is the head of a chain of function calls
	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
	 * blk_mq_freeze_queue) that may lead to the invocation of the
	 * has_work hook function. For this reason,
	 * bfq_create_group_hierarchy is invoked only after all
	 * scheduler data has been initialized, apart from the fields
	 * that can be initialized only after invoking
	 * bfq_create_group_hierarchy. This, in particular, enables
	 * has_work to correctly return false. Of course, to avoid
	 * other inconsistencies, the blk-mq stack must then refrain
	 * from invoking further scheduler hooks before this init
	 * function is finished.
	 */
	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
	if (!bfqd->root_group)
		goto out_free;
	bfq_init_root_group(bfqd->root_group, bfqd);
	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);

	return 0;

out_free:
	kfree(bfqd);
	kobject_put(&eq->kobj);
	return -ENOMEM;
}
static void bfq_slab_kill(void)
{
	kmem_cache_destroy(bfq_pool);
}

static int __init bfq_slab_setup(void)
{
	bfq_pool = KMEM_CACHE(bfq_queue, 0);
	if (!bfq_pool)
		return -ENOMEM;
	return 0;
}
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

static ssize_t bfq_var_store(unsigned long *var, const char *page,
			     size_t count)
{
	unsigned long new_val;
	int ret = kstrtoul(page, 10, &new_val);

	if (ret == 0)
		*var = new_val;

	return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	if (__CONV == 1)						\
		__data = jiffies_to_msecs(__data);			\
	else if (__CONV == 2)						\
		__data = div_u64(__data, NSEC_PER_MSEC);		\
	return bfq_var_show(__data, (page));				\
}
SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
#undef SHOW_FUNCTION

#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	u64 __data = __VAR;						\
	__data = div_u64(__data, NSEC_PER_USEC);			\
	return bfq_var_show(__data, (page));				\
}
USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
#undef USEC_SHOW_FUNCTION
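/*
 * For reference, a hand expansion of one generated show function,
 * SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1), with
 * __CONV substituted (1 selects the jiffies-to-msecs conversion);
 * compiled out to avoid redefining the function generated above:
 */
#if 0
static ssize_t bfq_timeout_sync_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	u64 __data = bfqd->bfq_timeout;

	if (1 == 1)
		__data = jiffies_to_msecs(__data);
	else if (1 == 2)
		__data = div_u64(__data, NSEC_PER_MSEC);
	return bfq_var_show(__data, (page));
}
#endif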
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
static ssize_t								\
__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	if (__CONV == 1)						\
		*(__PTR) = msecs_to_jiffies(__data);			\
	else if (__CONV == 2)						\
		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
	else								\
		*(__PTR) = __data;					\
	return ret;							\
}
STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
		INT_MAX, 2);
STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
		INT_MAX, 0);
STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
#undef STORE_FUNCTION

#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
{									\
	struct bfq_data *bfqd = e->elevator_data;			\
	unsigned long uninitialized_var(__data);			\
	int ret = bfq_var_store(&__data, (page), count);		\
	if (__data < (MIN))						\
		__data = (MIN);						\
	else if (__data > (MAX))					\
		__data = (MAX);						\
	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
	return ret;							\
}
USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
		    UINT_MAX);
#undef USEC_STORE_FUNCTION
static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
	else {
		if (__data > INT_MAX)
			__data = INT_MAX;
		bfqd->bfq_max_budget = __data;
	}

	bfqd->bfq_user_max_budget = __data;

	return ret;
}
/*
 * Leaving this name to preserve name compatibility with cfq
 * parameters, but this timeout is used for both sync and async.
 */
static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				      const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data < 1)
		__data = 1;
	else if (__data > INT_MAX)
		__data = INT_MAX;

	bfqd->bfq_timeout = msecs_to_jiffies(__data);
	if (bfqd->bfq_user_max_budget == 0)
		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);

	return ret;
}
static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
					   const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (!bfqd->strict_guarantees && __data == 1
	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;

	bfqd->strict_guarantees = __data;

	return ret;
}
static ssize_t bfq_low_latency_store(struct elevator_queue *e,
				     const char *page, size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long uninitialized_var(__data);
	int ret = bfq_var_store(&__data, (page), count);

	if (__data > 1)
		__data = 1;
	if (__data == 0 && bfqd->low_latency != 0)
		bfq_end_wr(bfqd);
	bfqd->low_latency = __data;

	return ret;
}
#define BFQ_ATTR(name) \
	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)

static struct elv_fs_entry bfq_attrs[] = {
	BFQ_ATTR(fifo_expire_sync),
	BFQ_ATTR(fifo_expire_async),
	BFQ_ATTR(back_seek_max),
	BFQ_ATTR(back_seek_penalty),
	BFQ_ATTR(slice_idle),
	BFQ_ATTR(slice_idle_us),
	BFQ_ATTR(max_budget),
	BFQ_ATTR(timeout_sync),
	BFQ_ATTR(strict_guarantees),
	BFQ_ATTR(low_latency),
	__ATTR_NULL
};
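/*
 * Usage note: once bfq is selected for a queue, each attribute above
 * becomes a file under /sys/block/<dev>/queue/iosched/. For instance
 * (device name given for illustration only):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/low_latency
 *   echo 0 > /sys/block/sda/queue/iosched/slice_idle
 */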
static struct elevator_type iosched_bfq_mq = {
	.ops.mq = {
		.get_rq_priv		= bfq_get_rq_private,
		.put_rq_priv		= bfq_put_rq_private,
		.exit_icq		= bfq_exit_icq,
		.insert_requests	= bfq_insert_requests,
		.dispatch_request	= bfq_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.allow_merge		= bfq_allow_bio_merge,
		.bio_merge		= bfq_bio_merge,
		.request_merge		= bfq_request_merge,
		.requests_merged	= bfq_requests_merged,
		.request_merged		= bfq_request_merged,
		.has_work		= bfq_has_work,
		.init_sched		= bfq_init_queue,
		.exit_sched		= bfq_exit_queue,
	},

	.uses_mq		= true,
	.icq_size		= sizeof(struct bfq_io_cq),
	.icq_align		= __alignof__(struct bfq_io_cq),
	.elevator_attrs		= bfq_attrs,
	.elevator_name		= "bfq",
	.elevator_owner		= THIS_MODULE,
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_bfq = {
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,

	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,
};
#endif
static int __init bfq_init(void)
{
	int ret;

#ifdef CONFIG_BFQ_GROUP_IOSCHED
	ret = blkcg_policy_register(&blkcg_policy_bfq);
	if (ret)
		return ret;
#endif

	ret = -ENOMEM;
	if (bfq_slab_setup())
		goto err_pol_unreg;

	/*
	 * Times to load large popular applications for the typical
	 * systems installed on the reference devices (see the
	 * comments before the definitions of the next two
	 * arrays). Actually, we use slightly slower values, as the
	 * estimated peak rate tends to be smaller than the actual
	 * peak rate. The reason for this last fact is that estimates
	 * are computed over much shorter time intervals than the long
	 * intervals typically used for benchmarking. Why? First, to
	 * adapt more quickly to variations. Second, because an I/O
	 * scheduler cannot rely on a peak-rate-evaluation workload to
	 * be run for a long time.
	 */
	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */

	/*
	 * Thresholds that determine the switch between speed classes
	 * (see the comments before the definition of the array
	 * device_speed_thresh). These thresholds are biased towards
	 * transitions to the fast class. This is safer than the
	 * opposite bias. In fact, a wrong transition to the slow
	 * class results in short weight-raising periods, because the
	 * speed of the device then tends to be higher than the
	 * reference peak rate. On the opposite end, a wrong
	 * transition to the fast class tends to increase
	 * weight-raising periods, because of the opposite reason.
	 */
	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
	device_speed_thresh[1] = (4 * R_slow[1]) / 3;

	ret = elv_register(&iosched_bfq_mq);
	if (ret)
		goto err_pol_unreg;

	return 0;

err_pol_unreg:
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	return ret;
}
static void __exit bfq_exit(void)
{
	elv_unregister(&iosched_bfq_mq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
	bfq_slab_kill();
}

module_init(bfq_init);
module_exit(bfq_exit);

MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");