block/blk-iocost.c

   1 /* SPDX-License-Identifier: GPL-2.0
   2  *
   3  * IO cost model based controller.
   4  *
   5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7  * Copyright (C) 2019 Facebook
   8  *
   9  * One challenge of controlling IO resources is the lack of trivially
  10  * observable cost metric.  This is distinguished from CPU and memory where
  11  * wallclock time and the number of bytes can serve as accurate enough
  12  * approximations.
  13  *
  14  * Bandwidth and iops are the most commonly used metrics for IO devices but
  15  * depending on the type and specifics of the device, different IO patterns
  16  * easily lead to multiple orders of magnitude variations rendering them
  17  * useless for the purpose of IO capacity distribution.  While on-device
  18  * time, with a lot of crutches, could serve as a useful approximation for
  19  * non-queued rotational devices, this is no longer viable with modern
  20  * devices, even the rotational ones.
  21  *
  22  * While there is no cost metric we can trivially observe, it isn't a
  23  * complete mystery.  For example, on a rotational device, seek cost
  24  * dominates while a contiguous transfer contributes a smaller amount
  25  * proportional to the size.  If we can characterize at least the relative
  26  * costs of these different types of IOs, it should be possible to
  27  * implement a reasonable work-conserving proportional IO resource
  28  * distribution.
  29  *
  30  * 1. IO Cost Model
  31  *
  32  * IO cost model estimates the cost of an IO given its basic parameters and
  33  * history (e.g. the end sector of the last IO).  The cost is measured in
  34  * device time.  If a given IO is estimated to cost 10ms, the device should
  35  * be able to process ~100 of those IOs in a second.
  36  *
  37  * Currently, there's only one builtin cost model - linear.  Each IO is
  38  * classified as sequential or random and given a base cost accordingly.
  39  * On top of that, a size cost proportional to the length of the IO is
  40  * added.  While simple, this model captures the operational
  41  * characteristics of a wide variety of devices well enough.  Default
  42  * parameters for several different classes of devices are provided and the
  43  * parameters can be configured from userspace via
  44  * /sys/fs/cgroup/io.cost.model.
  45  *
  46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47  * device-specific coefficients.
  48  *
  49  * 2. Control Strategy
  50  *
  51  * The device virtual time (vtime) is used as the primary control metric.
  52  * The control strategy is composed of the following three parts.
  53  *
  54  * 2-1. Vtime Distribution
  55  *
  56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57  * calculated.  Please consider the following hierarchy where the numbers
  58  * inside parentheses denote the configured weights.
  59  *
  60  *           root
  61  *         /       \
  62  *      A (w:100)  B (w:300)
  63  *      /       \
  64  *  A0 (w:100)  A1 (w:100)
  65  *
  66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
  68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69  * 12.5% each.  The distribution mechanism only cares about these flattened
  70  * shares.  They're called hweights (hierarchical weights) and always add
  71  * up to 1 (WEIGHT_ONE).
  72  *
  73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75  * against the device vtime - an IO which takes 10ms on the underlying
  76  * device is considered to take 80ms on A0.
  77  *
  78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80  * the vtime consumed by past IOs and can issue a new IO iff doing so
  81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82  * suspended until the vtime has progressed enough to cover it.
  83  *
  84  * 2-2. Vrate Adjustment
  85  *
  86  * It's unrealistic to expect the cost model to be perfect.  There are too
  87  * many devices and even on the same device the overall performance
  88  * fluctuates depending on numerous factors such as IO mixture and device
  89  * internal garbage collection.  The controller needs to adapt dynamically.
  90  *
  91  * This is achieved by adjusting the overall IO rate according to how busy
  92  * the device is.  If the device becomes overloaded, we're sending down too
  93  * many IOs and should generally slow down.  If there are waiting issuers
  94  * but the device isn't saturated, we're issuing too few and should
  95  * generally speed up.
  96  *
  97  * To slow down, we lower the vrate - the rate at which the device vtime
  98  * passes compared to the wall clock.  For example, if the vtime is running
  99  * at the vrate of 75%, all cgroups added up would only be able to issue
 100  * 750ms worth of IOs per second, and vice-versa for speeding up.
 101  *
 102  * Device business is determined using two criteria - rq wait and
 103  * completion latencies.
 104  *
 105  * When a device gets saturated, the on-device and then the request queues
 106  * fill up and a bio which is ready to be issued has to wait for a request
 107  * to become available.  When this delay becomes noticeable, it's a clear
 108  * indication that the device is saturated and we lower the vrate.  This
 109  * saturation signal is fairly conservative as it only triggers when both
 110  * hardware and software queues are filled up, and is used as the default
 111  * busy signal.
 112  *
 113  * As devices can have deep queues and be unfair in how the queued commands
 114  * are executed, solely depending on rq wait may not result in satisfactory
 115  * control quality.  For a better control quality, completion latency QoS
 116  * parameters can be configured so that the device is considered saturated
 117  * if N'th percentile completion latency rises above the set point.
 118  *
 119  * The completion latency requirements are a function of both the
 120  * underlying device characteristics and the desired IO latency quality of
 121  * service.  There is an inherent trade-off - the tighter the latency QoS,
 122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
 123  * and can be set through /sys/fs/cgroup/io.cost.qos.
 124  *
 125  * 2-3. Work Conservation
 126  *
 127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128  * periodically while B is sending out enough parallel IOs to saturate the
 129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130  * cost per second, i.e., 10% of the device capacity.  The naive
 131  * distribution of half and half would lead to 60% utilization of the
 132  * device, a significant reduction in the total amount of work done
 133  * compared to free-for-all competition.  This is too high a cost to pay
 134  * for IO control.
 135  *
 136  * To conserve the total amount of work done, we keep track of how much
 137  * each active cgroup is actually using and yield part of its weight if
 138  * there are other cgroups which can make use of it.  In the above case,
 139  * A's weight will be lowered so that it hovers above the actual usage and
 140  * B would be able to use the rest.
 141  *
 142  * As we don't want to penalize a cgroup for donating its weight, the
 143  * surplus weight adjustment factors in a margin and has an immediate
 144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145  *
 146  * Note that adjusting down surplus weights has the same effects as
 147  * accelerating vtime for other cgroups and work conservation can also be
 148  * implemented by adjusting vrate dynamically.  However, squaring who can
 149  * donate and should take back how much requires hweight propagations
 150  * anyway making it easier to implement and understand as a separate
 151  * mechanism.
 152  *
 153  * 3. Monitoring
 154  *
 155  * Instead of debugfs or other clumsy monitoring mechanisms, this
 156  * controller uses a drgn based monitoring script -
 157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158  * https://github.com/osandov/drgn.  The output looks like the following.
 159  *
 160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161  *                 active      weight      hweight% inflt% dbt  delay usages%
 162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164  *
 165  * - per        : Timer period
 166  * - cur_per    : Internal wall and device vtime clock
 167  * - vrate      : Device virtual time rate against wall clock
 168  * - weight     : Surplus-adjusted and configured weights
 169  * - hweight    : Surplus-adjusted and configured hierarchical weights
 170  * - inflt      : The percentage of in-flight IO cost at the end of last period
 171  * - delay      : Deferred issuer delay induction level and duration
 172  * - usages     : Usage history
 173  */
 174
 175 #include <linux/kernel.h>
 176 #include <linux/module.h>
 177 #include <linux/timer.h>
 178 #include <linux/time64.h>
 179 #include <linux/parser.h>
 180 #include <linux/sched/signal.h>
 181 #include <linux/blk-cgroup.h>
 182 #include <asm/local.h>
 183 #include <asm/local64.h>
 184 #include "blk-rq-qos.h"
 185 #include "blk-stat.h"
 186 #include "blk-wbt.h"
 187
 188 #ifdef CONFIG_TRACEPOINTS
 189
 190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 191 #define TRACE_IOCG_PATH_LEN 1024
     /* trace_iocg_path is a single shared scratch buffer guarded by this lock */
 192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
 193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 194
     /*
      * TRACE_IOCG_PATH(type, iocg, ...) - emit the iocost_<type> tracepoint
      * for @iocg together with the path of its cgroup.
      *
      * Nothing is done unless trace_iocost_<type>_enabled() is true.  When it
      * is, the cgroup path is rendered into the shared trace_iocg_path buffer
      * and handed to the tracepoint, both while holding trace_iocg_path_lock
      * with IRQs saved/disabled.  Any extra arguments are forwarded to the
      * tracepoint after the path.
      */
 195 #define TRACE_IOCG_PATH(type, iocg, ...)                                        \
 196         do {                                                                    \
 197                 unsigned long flags;                                            \
 198                 if (trace_iocost_##type##_enabled()) {                          \
 199                         spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
 200                         cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
 201                                     trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
 202                         trace_iocost_##type(iocg, trace_iocg_path,              \
 203                                               ##__VA_ARGS__);                   \
 204                         spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
 205                 }                                                               \
 206         } while (0)
 207
 208 #else   /* CONFIG_TRACEPOINTS */
     /* tracepoints compiled out: expand to an empty statement */
 209 #define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
 210 #endif  /* CONFIG_TRACEPOINTS */
 211
 212 enum {
 213         MILLION                 = 1000000,
 214
 215         /* timer period is calculated from latency requirements, bound it */
 216         MIN_PERIOD              = USEC_PER_MSEC,
 217         MAX_PERIOD              = USEC_PER_SEC,
 218
 219         /*
 220          * A cgroup's vtime can run 50% behind the device vtime, which
 221          * serves as its IO credit buffer.  Surplus weight adjustment is
 222          * immediately canceled if the vtime margin runs below 10%.
 223          */
 224         MARGIN_MIN_PCT          = 10,
 225         MARGIN_MAX_PCT          = 50,
 226
 227         /* Have some play in timer operations */
 228         TIMER_SLACK_PCT         = 1,
 229
 230         /*
 231          * vtime can wrap well within a reasonable uptime when vrate is
 232          * consistently raised.  Don't trust recorded cgroup vtime if the
 233          * period counter indicates that it's older than 5mins.
 234          */
 235         VTIME_VALID_DUR         = 300 * USEC_PER_SEC,
 236
 237         /*
 238          * Remember the past three non-zero usages and use the max for
 239          * surplus calculation.  Three slots guarantee that we remember one
 240          * full period usage from the last active stretch even after
 241          * partial deactivation and re-activation periods.  Don't start
 242          * giving away weight before collecting two data points to prevent
 243          * hweight adjustments based on one partial activation period.
 244          */
 245         NR_USAGE_SLOTS          = 3,
 246         MIN_VALID_USAGES        = 2,
 247
 248         /* 1/64k is granular enough and can easily be handled w/ u32 */
 249         WEIGHT_ONE              = 1 << 16,
 250
 251         /*
 252          * As vtime is used to calculate the cost of each IO, it needs to
 253          * be fairly high precision.  For example, it should be able to
 254          * represent the cost of a single page worth of discard with
 255          * suffificient accuracy.  At the same time, it should be able to
 256          * represent reasonably long enough durations to be useful and
 257          * convenient during operation.
 258          *
 259          * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 260          * granularity and days of wrap-around time even at extreme vrates.
 261          */
 262         VTIME_PER_SEC_SHIFT     = 37,
 263         VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
 264         VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
 265         VTIME_PER_NSEC          = VTIME_PER_SEC / NSEC_PER_SEC,
 266
 267         /* bound vrate adjustments within two orders of magnitude */
 268         VRATE_MIN_PPM           = 10000,        /* 1% */
 269         VRATE_MAX_PPM           = 100000000,    /* 10000% */
 270
 271         VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 272         VRATE_CLAMP_ADJ_PCT     = 4,
 273
 274         /* if IOs end up waiting for requests, issue less */
 275         RQ_WAIT_BUSY_PCT        = 5,
 276
 277         /* unbusy hysterisis */
 278         UNBUSY_THR_PCT          = 75,
 279
 280         /* don't let cmds which take a very long time pin lagging for too long */
 281         MAX_LAGGING_PERIODS     = 10,
 282
 283         /*
 284          * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
 285          * donate the surplus.
 286          */
 287         SURPLUS_SCALE_PCT       = 125,                  /* * 125% */
 288         SURPLUS_SCALE_ABS       = WEIGHT_ONE / 50,      /* + 2% */
 289         SURPLUS_MIN_ADJ_DELTA   = WEIGHT_ONE / 33,      /* 3% */
 290
 291         /* switch iff the conditions are met for longer than this */
 292         AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
 293
 294         /*
 295          * Count IO size in 4k pages.  The 12bit shift helps keeping
 296          * size-proportional components of cost calculation in closer
 297          * numbers of digits to per-IO cost components.
 298          */
 299         IOC_PAGE_SHIFT          = 12,
 300         IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
 301         IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
 302
 303         /* if apart further than 16M, consider randio for linear model */
 304         LCOEF_RANDIO_PAGES      = 4096,
 305 };
 306
 /* Run state of a per-device controller; this is the type of ioc->running. */
 307 enum ioc_running {
 308         IOC_IDLE,
 309         IOC_RUNNING,
 310         IOC_STOP,
 311 };
 312
 313 /* io.cost.qos controls including per-dev enable of the whole controller */
 314 enum {
 315         QOS_ENABLE,
 316         QOS_CTRL,
 317         NR_QOS_CTRL_PARAMS,     /* count of the controls above, keep last */
 318 };
 319
 320 /*
      * io.cost.qos params, used as indices into ioc_params.qos[].
      *
      * *PPM/*LAT come in pairs: ioc_refresh_period_us() reads the PPM and LAT
      * of whichever side has the higher LAT.  The builtin autop[] entries give
      * LAT in microseconds (e.g. 250000 for 250ms) and fill MIN/MAX with
      * VRATE_MIN_PPM/VRATE_MAX_PPM.  The R/W prefixes presumably stand for
      * read/write - confirm against the io.cost.qos parser.
      */
 321 enum {
 322         QOS_RPPM,
 323         QOS_RLAT,
 324         QOS_WPPM,
 325         QOS_WLAT,
 326         QOS_MIN,
 327         QOS_MAX,
 328         NR_QOS_PARAMS,          /* size of ioc_params.qos[], keep last */
 329 };
 330
 331 /* io.cost.model controls */
 332 enum {
 333         COST_CTRL,
 334         COST_MODEL,
 335         NR_COST_CTRL_PARAMS,    /* count of the controls above, keep last */
 336 };
 337
 338 /*
      * builtin linear cost model coefficients, used as indices into
      * ioc_params.i_lcoefs[].  These are the values the builtin autop[] table
      * provides; the names suggest bytes-per-second and sequential/random IOs
      * per second for reads (R*) and writes (W*) - confirm units at the parser.
      */
 339 enum {
 340         I_LCOEF_RBPS,
 341         I_LCOEF_RSEQIOPS,
 342         I_LCOEF_RRANDIOPS,
 343         I_LCOEF_WBPS,
 344         I_LCOEF_WSEQIOPS,
 345         I_LCOEF_WRANDIOPS,
 346         NR_I_LCOEFS,            /* size of ioc_params.i_lcoefs[], keep last */
 347 };
 348
 /*
  * Indices into ioc_params.lcoefs[].  The entries mirror the I_LCOEF_* list
  * one to one (page/seqio/randio for R and W).  NOTE(review): lcoefs[] is
  * presumably computed from i_lcoefs[]; the conversion is not in this part
  * of the file - confirm.
  */
 349 enum {
 350         LCOEF_RPAGE,
 351         LCOEF_RSEQIO,
 352         LCOEF_RRANDIO,
 353         LCOEF_WPAGE,
 354         LCOEF_WSEQIO,
 355         LCOEF_WRANDIO,
 356         NR_LCOEFS,              /* size of ioc_params.lcoefs[], keep last */
 357 };
 358
 /*
  * Indices into the builtin autop[] parameter table.  AUTOP_INVALID is 0 and
  * has no initializer in autop[], so that slot is all zeroes.
  */
 359 enum {
 360         AUTOP_INVALID,
 361         AUTOP_HDD,
 362         AUTOP_SSD_QD1,
 363         AUTOP_SSD_DFL,
 364         AUTOP_SSD_FAST,
 365 };
 366
 367 struct ioc_gq;
 368
 /*
  * One set of device parameters.  qos[], i_lcoefs[] and lcoefs[] are indexed
  * by the QOS_*, I_LCOEF_* and LCOEF_* enums respectively.  The builtin
  * autop[] table provides qos[], i_lcoefs[] and, for some entries, one of
  * the too_{fast|slow}_vrate_pct thresholds; it never sets lcoefs[].
  */
 369 struct ioc_params {
 370         u32                             qos[NR_QOS_PARAMS];
 371         u64                             i_lcoefs[NR_I_LCOEFS];
 372         u64                             lcoefs[NR_LCOEFS];
 373         u32                             too_fast_vrate_pct;
 374         u32                             too_slow_vrate_pct;
 375 };
 376
 /*
  * Vtime margins of a device.  ioc_refresh_margins() sets @min and @max to
  * MARGIN_MIN_PCT and MARGIN_MAX_PCT percent of ioc->period_us multiplied by
  * the current ioc->vtime_rate.
  */
 377 struct ioc_margins {
 378         s64                             min;
 379         s64                             max;
 380 };
 381
 /*
  * Met/missed counters, kept as CPU-local counters (local_t).
  * NOTE(review): last_met/last_missed look like snapshots of the two
  * counters taken at the previous read so that deltas can be computed -
  * confirm against the code which consumes them.
  */
 382 struct ioc_missed {
 383         local_t                         nr_met;
 384         local_t                         nr_missed;
 385         u32                             last_met;
 386         u32                             last_missed;
 387 };
 388
 /*
  * Per-cpu statistics of a device; struct ioc points at it through a
  * __percpu pointer.  NOTE(review): the two missed[] slots are presumably
  * indexed by IO direction (read/write) - confirm at the use sites.
  */
 389 struct ioc_pcpu_stat {
 390         struct ioc_missed               missed[2];
 391
 392         local64_t                       rq_wait_ns;
 393         u64                             last_rq_wait_ns;
 394 };
 395
 396 /*
      * per device
      *
      * The controller state of one device.  @rqos is the embedded rq_qos;
      * rqos_to_ioc() maps it back to the containing ioc with container_of().
      */
 397 struct ioc {
 398         struct rq_qos                   rqos;
 399
 400         bool                            enabled;
 401
 402         struct ioc_params               params;
             /* recomputed by ioc_refresh_margins() from period_us and vtime_rate */
 403         struct ioc_margins              margins;
             /* both set by ioc_refresh_period_us(); slack is TIMER_SLACK_PCT of the period */
 404         u32                             period_us;
 405         u32                             timer_slack_ns;
 406         u64                             vrate_min;
 407         u64                             vrate_max;
 408
             /* iocg_lock() takes this before iocg->waitq.lock when asked to */
 409         spinlock_t                      lock;
 410         struct timer_list               timer;
 411         struct list_head                active_iocgs;   /* active cgroups */
 412         struct ioc_pcpu_stat __percpu   *pcpu_stat;
 413
 414         enum ioc_running                running;
 415         atomic64_t                      vtime_rate;
 416
 417         seqcount_spinlock_t             period_seqcount;
 418         u64                             period_at;      /* wallclock starttime */
 419         u64                             period_at_vtime; /* vtime starttime */
 420
 421         atomic64_t                      cur_period;     /* inc'd each period */
 422         int                             busy_level;     /* saturation history */
 423
 424         bool                            weights_updated;
 425         atomic_t                        hweight_gen;    /* for lazy hweights */
 426
 427         u64                             autop_too_fast_at;
 428         u64                             autop_too_slow_at;
 429         int                             autop_idx;
 430         bool                            user_qos_params:1;
 431         bool                            user_cost_model:1;
 432 };
 433
 /*
  * Per-cpu statistics of an iocg.  iocg_commit_bio() adds the absolute cost
  * of every committed bio to @abs_vusage on the local CPU.
  */
 434 struct iocg_pcpu_stat {
 435         local64_t                       abs_vusage;
 436 };
 437
 /*
  * Usage statistics of an iocg.  struct ioc_gq keeps three instances:
  * local_stat, desc_stat and last_stat.
  */
 438 struct iocg_stat {
 439         u64                             usage_us;
 440 };
 441
 442 /* per device-cgroup pair */
 443 struct ioc_gq {
             /* embedded policy data; pd_to_iocg() maps it back with container_of() */
 444         struct blkg_policy_data         pd;
 445         struct ioc                      *ioc;
 446
 447         /*
 448          * A iocg can get its weight from two sources - an explicit
 449          * per-device-cgroup configuration or the default weight of the
 450          * cgroup.  `cfg_weight` is the explicit per-device-cgroup
 451          * configuration.  `weight` is the effective considering both
 452          * sources.
 453          *
 454          * When an idle cgroup becomes active its `active` goes from 0 to
 455          * `weight`.  `inuse` is the surplus adjusted active weight.
 456          * `active` and `inuse` are used to calculate `hweight_active` and
 457          * `hweight_inuse`.
 458          *
 459          * `last_inuse` remembers `inuse` while an iocg is idle to persist
 460          * surplus adjustments.
 461          */
 462         u32                             cfg_weight;
 463         u32                             weight;
 464         u32                             active;
 465         u32                             inuse;
 466         u32                             last_inuse;
 467
 468         sector_t                        cursor;         /* to detect randio */
 469
 470         /*
 471          * `vtime` is this iocg's vtime cursor which progresses as IOs are
 472          * issued.  If lagging behind device vtime, the delta represents
 473          * the currently available IO budget.  If running ahead, the
 474          * overage.
 475          *
 476          * `done_vtime` is the same but progressed on completion rather
 477          * than issue.  The delta behind `vtime` represents the cost of
 478          * currently in-flight IOs.
 479          */
 480         atomic64_t                      vtime;
 481         atomic64_t                      done_vtime;
 482         u64                             abs_vdebt;
 483
 484         /*
 485          * The period this iocg was last active in.  Used for deactivation
 486          * and invalidating `vtime`.
 487          */
 488         atomic64_t                      active_period;
 489         struct list_head                active_list;
 490
 491         /* see __propagate_weights() and current_hweight() for details */
 492         u64                             child_active_sum;
 493         u64                             child_inuse_sum;
 494         int                             hweight_gen;
 495         u32                             hweight_active;
 496         u32                             hweight_inuse;
 497
 498         struct list_head                walk_list;
 499         struct list_head                surplus_list;
 500
             /* waitq.lock doubles as the iocg lock, see iocg_lock()/iocg_unlock() */
 501         struct wait_queue_head          waitq;
 502         struct hrtimer                  waitq_timer;
 503         struct hrtimer                  delay_timer;
 504
 505         /* timestamp at the latest activation */
 506         u64                             activated_at;
 507
 508         /* statistics */
 509         struct iocg_pcpu_stat __percpu  *pcpu_stat;
 510         struct iocg_stat                local_stat;
 511         struct iocg_stat                desc_stat;
 512         struct iocg_stat                last_stat;
 513         u64                             last_stat_abs_vusage;
 514
 515         /* usage is recorded as fractions of WEIGHT_ONE */
 516         u32                             usage_delta_us;
 517         int                             usage_idx;
 518         u32                             usages[NR_USAGE_SLOTS];
 519
 520         /* this iocg's depth in the hierarchy and ancestors including self */
 521         int                             level;
             /* flexible array member: must stay the last field */
 522         struct ioc_gq                   *ancestors[];
 523 };
 524
 525 /*
      * per cgroup
      *
      * @cpd is the embedded policy data; blkcg_to_iocc() maps it back to the
      * ioc_cgrp with container_of().  @dfl_weight is presumably the cgroup's
      * default weight which struct ioc_gq falls back on when no per-device
      * weight is configured - confirm against the weight handlers.
      */
 526 struct ioc_cgrp {
 527         struct blkcg_policy_data        cpd;
 528         unsigned int                    dfl_weight;
 529 };
 530
 /*
  * A snapshot of the controller clocks.  NOTE(review): going by the names,
  * now_ns/now are the wall clock in nsecs/usecs and vnow/vrate the device
  * vtime and its rate at that instant - the code which fills this in is not
  * in this part of the file, confirm there.
  */
 531 struct ioc_now {
 532         u64                             now_ns;
 533         u64                             now;
 534         u64                             vnow;
 535         u64                             vrate;
 536 };
 537
 /*
  * A waiter carrying a bio and its absolute cost.  NOTE(review): presumably
  * queued on ioc_gq->waitq while the bio is over budget, with @committed set
  * once the cost has been charged on the waiter's behalf - confirm against
  * the wake-up path.
  */
 538 struct iocg_wait {
 539         struct wait_queue_entry         wait;
 540         struct bio                      *bio;
 541         u64                             abs_cost;
 542         bool                            committed;
 543 };
 544
 /*
  * Context for waking waiters of @iocg.  NOTE(review): hw_inuse and vbudget
  * are presumably the hierarchical in-use weight and the remaining vtime
  * budget (signed, so it can go negative) used while walking the wait queue
  * - confirm against the wake function.
  */
 545 struct iocg_wake_ctx {
 546         struct ioc_gq                   *iocg;
 547         u32                             hw_inuse;
 548         s64                             vbudget;
 549 };
 550
 /*
  * Builtin parameter sets, indexed by AUTOP_*.  AUTOP_INVALID has no entry
  * and therefore stays zeroed.  Each entry provides the QoS latency targets
  * (in usecs, see the per-entry comments), the vrate bounds in ppm and the
  * linear model input coefficients; lcoefs[] is left zeroed here.  Only
  * AUTOP_SSD_DFL sets too_fast_vrate_pct and only AUTOP_SSD_FAST sets
  * too_slow_vrate_pct; the other thresholds are 0.
  */
 551 static const struct ioc_params autop[] = {
 552         [AUTOP_HDD] = {
 553                 .qos                            = {
 554                         [QOS_RLAT]              =        250000, /* 250ms */
 555                         [QOS_WLAT]              =        250000,
 556                         [QOS_MIN]               = VRATE_MIN_PPM,
 557                         [QOS_MAX]               = VRATE_MAX_PPM,
 558                 },
 559                 .i_lcoefs                       = {
 560                         [I_LCOEF_RBPS]          =     174019176,
 561                         [I_LCOEF_RSEQIOPS]      =         41708,
 562                         [I_LCOEF_RRANDIOPS]     =           370,
 563                         [I_LCOEF_WBPS]          =     178075866,
 564                         [I_LCOEF_WSEQIOPS]      =         42705,
 565                         [I_LCOEF_WRANDIOPS]     =           378,
 566                 },
 567         },
 568         [AUTOP_SSD_QD1] = {
 569                 .qos                            = {
 570                         [QOS_RLAT]              =         25000, /* 25ms */
 571                         [QOS_WLAT]              =         25000,
 572                         [QOS_MIN]               = VRATE_MIN_PPM,
 573                         [QOS_MAX]               = VRATE_MAX_PPM,
 574                 },
 575                 .i_lcoefs                       = {
 576                         [I_LCOEF_RBPS]          =     245855193,
 577                         [I_LCOEF_RSEQIOPS]      =         61575,
 578                         [I_LCOEF_RRANDIOPS]     =          6946,
 579                         [I_LCOEF_WBPS]          =     141365009,
 580                         [I_LCOEF_WSEQIOPS]      =         33716,
 581                         [I_LCOEF_WRANDIOPS]     =         26796,
 582                 },
 583         },
 584         [AUTOP_SSD_DFL] = {
 585                 .qos                            = {
 586                         [QOS_RLAT]              =         25000, /* 25ms */
 587                         [QOS_WLAT]              =         25000,
 588                         [QOS_MIN]               = VRATE_MIN_PPM,
 589                         [QOS_MAX]               = VRATE_MAX_PPM,
 590                 },
 591                 .i_lcoefs                       = {
 592                         [I_LCOEF_RBPS]          =     488636629,
 593                         [I_LCOEF_RSEQIOPS]      =          8932,
 594                         [I_LCOEF_RRANDIOPS]     =          8518,
 595                         [I_LCOEF_WBPS]          =     427891549,
 596                         [I_LCOEF_WSEQIOPS]      =         28755,
 597                         [I_LCOEF_WRANDIOPS]     =         21940,
 598                 },
 599                 .too_fast_vrate_pct             =           500,
 600         },
 601         [AUTOP_SSD_FAST] = {
 602                 .qos                            = {
 603                         [QOS_RLAT]              =          5000, /* 5ms */
 604                         [QOS_WLAT]              =          5000,
 605                         [QOS_MIN]               = VRATE_MIN_PPM,
 606                         [QOS_MAX]               = VRATE_MAX_PPM,
 607                 },
 608                 .i_lcoefs                       = {
                             /* the two BPS values exceed 32 bits' signed range, hence LLU */
 609                         [I_LCOEF_RBPS]          =    3102524156LLU,
 610                         [I_LCOEF_RSEQIOPS]      =        724816,
 611                         [I_LCOEF_RRANDIOPS]     =        778122,
 612                         [I_LCOEF_WBPS]          =    1742780862LLU,
 613                         [I_LCOEF_WSEQIOPS]      =        425702,
 614                         [I_LCOEF_WRANDIOPS]     =        443193,
 615                 },
 616                 .too_slow_vrate_pct             =            10,
 617         },
 618 };
 619
 620 /*
 621  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 622  * vtime credit shortage and down on device saturation.
      *
      * The table only holds magnitudes: 0% for the first four levels, then
      * 1%, 2%, 4% and 8% steps, ending in a single 16% entry (53 entries in
      * total).  NOTE(review): ioc->busy_level is a signed int, so the lookup
      * presumably uses its absolute value clamped to the table size - confirm
      * at the use site.  Nothing in this part of the file writes to the
      * table; it may be a candidate for const.
 623  */
 624 static u32 vrate_adj_pct[] =
 625         { 0, 0, 0, 0,
 626           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 627           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 628           4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 629
 630 static struct blkcg_policy blkcg_policy_iocost;
 631
 632 /* accessors and helpers */
 633 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 634 {
 635         return container_of(rqos, struct ioc, rqos);
 636 }
 637
 638 static struct ioc *q_to_ioc(struct request_queue *q)
 639 {
 640         return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 641 }
 642
 643 static const char *q_name(struct request_queue *q)
 644 {
 645         if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
 646                 return kobject_name(q->kobj.parent);
 647         else
 648                 return "<unknown>";
 649 }
 650
 651 static const char __maybe_unused *ioc_name(struct ioc *ioc)
 652 {
 653         return q_name(ioc->rqos.q);
 654 }
 655
 656 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 657 {
 658         return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 659 }
 660
 661 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 662 {
 663         return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 664 }
 665
 666 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 667 {
 668         return pd_to_blkg(&iocg->pd);
 669 }
 670
 671 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 672 {
 673         return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 674                             struct ioc_cgrp, cpd);
 675 }
 676
 677 /*
 678  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 679  * weight, the more expensive each IO.  Must round up.
 680  */
 681 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 682 {
 683         return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
 684 }
 685
 686 /*
 687  * The inverse of abs_cost_to_cost().  Must round up.
 688  */
 689 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 690 {
 691         return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
 692 }
 693
 694 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
 695                             u64 abs_cost, u64 cost)
 696 {
 697         struct iocg_pcpu_stat *gcs;
 698
 699         bio->bi_iocost_cost = cost;
 700         atomic64_add(cost, &iocg->vtime);
 701
 702         gcs = get_cpu_ptr(iocg->pcpu_stat);
 703         local64_add(abs_cost, &gcs->abs_vusage);
 704         put_cpu_ptr(gcs);
 705 }
 706
 707 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
 708 {
 709         if (lock_ioc) {
 710                 spin_lock_irqsave(&iocg->ioc->lock, *flags);
 711                 spin_lock(&iocg->waitq.lock);
 712         } else {
 713                 spin_lock_irqsave(&iocg->waitq.lock, *flags);
 714         }
 715 }
 716
 717 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
 718 {
 719         if (unlock_ioc) {
 720                 spin_unlock(&iocg->waitq.lock);
 721                 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
 722         } else {
 723                 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
 724         }
 725 }
 726
 727 #define CREATE_TRACE_POINTS
 728 #include <trace/events/iocost.h>
 729
 730 static void ioc_refresh_margins(struct ioc *ioc)
 731 {
 732         struct ioc_margins *margins = &ioc->margins;
 733         u32 period_us = ioc->period_us;
 734         u64 vrate = atomic64_read(&ioc->vtime_rate);
 735
 736         margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
 737         margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
 738 }
 739
 740 /* latency Qos params changed, update period_us and all the dependent params */
 741 static void ioc_refresh_period_us(struct ioc *ioc)
 742 {
 743         u32 ppm, lat, multi, period_us;
 744
 745         lockdep_assert_held(&ioc->lock);
 746
 747         /* pick the higher latency target */
 748         if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 749                 ppm = ioc->params.qos[QOS_RPPM];
 750                 lat = ioc->params.qos[QOS_RLAT];
 751         } else {
 752                 ppm = ioc->params.qos[QOS_WPPM];
 753                 lat = ioc->params.qos[QOS_WLAT];
 754         }
 755
 756         /*
 757          * We want the period to be long enough to contain a healthy number
 758          * of IOs while short enough for granular control.  Define it as a
 759          * multiple of the latency target.  Ideally, the multiplier should
 760          * be scaled according to the percentile so that it would nominally
 761          * contain a certain number of requests.  Let's be simpler and
 762          * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
 763          */
 764         if (ppm)
 765                 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 766         else
 767                 multi = 2;
 768         period_us = multi * lat;
 769         period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 770
 771         /* calculate dependent params */
 772         ioc->period_us = period_us;
 773         ioc->timer_slack_ns = div64_u64(
 774                 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
 775                 100);
 776         ioc_refresh_margins(ioc);
 777 }
 778
 779 static int ioc_autop_idx(struct ioc *ioc)
 780 {
 781         int idx = ioc->autop_idx;
 782         const struct ioc_params *p = &autop[idx];
 783         u32 vrate_pct;
 784         u64 now_ns;
 785
 786         /* rotational? */
 787         if (!blk_queue_nonrot(ioc->rqos.q))
 788                 return AUTOP_HDD;
 789
 790         /* handle SATA SSDs w/ broken NCQ */
 791         if (blk_queue_depth(ioc->rqos.q) == 1)
 792                 return AUTOP_SSD_QD1;
 793
 794         /* use one of the normal ssd sets */
 795         if (idx < AUTOP_SSD_DFL)
 796                 return AUTOP_SSD_DFL;
 797
 798         /* if user is overriding anything, maintain what was there */
 799         if (ioc->user_qos_params || ioc->user_cost_model)
 800                 return idx;
 801
 802         /* step up/down based on the vrate */
 803         vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
 804                               VTIME_PER_USEC);
 805         now_ns = ktime_get_ns();
 806
 807         if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 808                 if (!ioc->autop_too_fast_at)
 809                         ioc->autop_too_fast_at = now_ns;
 810                 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 811                         return idx + 1;
 812         } else {
 813                 ioc->autop_too_fast_at = 0;
 814         }
 815
 816         if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 817                 if (!ioc->autop_too_slow_at)
 818                         ioc->autop_too_slow_at = now_ns;
 819                 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 820                         return idx - 1;
 821         } else {
 822                 ioc->autop_too_slow_at = 0;
 823         }
 824
 825         return idx;
 826 }
 827
 828 /*
 829  * Take the followings as input
 830  *
 831  *  @bps        maximum sequential throughput
 832  *  @seqiops    maximum sequential 4k iops
 833  *  @randiops   maximum random 4k iops
 834  *
 835  * and calculate the linear model cost coefficients.
 836  *
 837  *  *@page      per-page cost           1s / (@bps / 4096)
 838  *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
 839  *  @randiops   base cost of a rand IO  max((1s / @randiops) - *@page, 0)
 840  */
 841 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 842                         u64 *page, u64 *seqio, u64 *randio)
 843 {
 844         u64 v;
 845
 846         *page = *seqio = *randio = 0;
 847
 848         if (bps)
 849                 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
 850                                            DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
 851
 852         if (seqiops) {
 853                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 854                 if (v > *page)
 855                         *seqio = v - *page;
 856         }
 857
 858         if (randiops) {
 859                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 860                 if (v > *page)
 861                         *randio = v - *page;
 862         }
 863 }
 864
 865 static void ioc_refresh_lcoefs(struct ioc *ioc)
 866 {
 867         u64 *u = ioc->params.i_lcoefs;
 868         u64 *c = ioc->params.lcoefs;
 869
 870         calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 871                     &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 872         calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 873                     &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 874 }
 875
 876 static bool ioc_refresh_params(struct ioc *ioc, bool force)
 877 {
 878         const struct ioc_params *p;
 879         int idx;
 880
 881         lockdep_assert_held(&ioc->lock);
 882
 883         idx = ioc_autop_idx(ioc);
 884         p = &autop[idx];
 885
 886         if (idx == ioc->autop_idx && !force)
 887                 return false;
 888
 889         if (idx != ioc->autop_idx)
 890                 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 891
 892         ioc->autop_idx = idx;
 893         ioc->autop_too_fast_at = 0;
 894         ioc->autop_too_slow_at = 0;
 895
 896         if (!ioc->user_qos_params)
 897                 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 898         if (!ioc->user_cost_model)
 899                 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 900
 901         ioc_refresh_period_us(ioc);
 902         ioc_refresh_lcoefs(ioc);
 903
 904         ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 905                                             VTIME_PER_USEC, MILLION);
 906         ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
 907                                    VTIME_PER_USEC, MILLION);
 908
 909         return true;
 910 }
 911
/*
 * Take a snapshot of the current [v]time and vrate.
 *
 * Fills in every field of @now that is set here: ->now_ns from ktime_get(),
 * ->now as the same time converted to microseconds, ->vrate from
 * ioc->vtime_rate and ->vnow, the virtual time derived from the three.
 */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
        unsigned seq;

        now->now_ns = ktime_get();
        now->now = ktime_to_us(now->now_ns);
        now->vrate = atomic64_read(&ioc->vtime_rate);

        /*
         * The current vtime is
         *
         *   vtime at period start + (wallclock time since the start) * vrate
         *
         * As a consistent snapshot of `period_at_vtime` and `period_at` is
         * needed, they're seqcount protected.
         */
        do {
                /* retry if ioc_start_period() updated the pair meanwhile */
                seq = read_seqcount_begin(&ioc->period_seqcount);
                now->vnow = ioc->period_at_vtime +
                        (now->now - ioc->period_at) * now->vrate;
        } while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
 935
 936 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
 937 {
 938         WARN_ON_ONCE(ioc->running != IOC_RUNNING);
 939
 940         write_seqcount_begin(&ioc->period_seqcount);
 941         ioc->period_at = now->now;
 942         ioc->period_at_vtime = now->vnow;
 943         write_seqcount_end(&ioc->period_seqcount);
 944
 945         ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
 946         add_timer(&ioc->timer);
 947 }
 948
 949 /*
 950  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 951  * weight sums and propagate upwards accordingly.
 952  */
 953 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
 954 {
 955         struct ioc *ioc = iocg->ioc;
 956         int lvl;
 957
 958         lockdep_assert_held(&ioc->lock);
 959
 960         inuse = clamp_t(u32, inuse, 1, active);
 961
 962         if (active == iocg->active && inuse == iocg->inuse)
 963                 return;
 964
 965         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
 966                 struct ioc_gq *parent = iocg->ancestors[lvl];
 967                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
 968                 u32 parent_active = 0, parent_inuse = 0;
 969
 970                 /* update the level sums */
 971                 parent->child_active_sum += (s32)(active - child->active);
 972                 parent->child_inuse_sum += (s32)(inuse - child->inuse);
 973                 /* apply the udpates */
 974                 child->active = active;
 975                 child->inuse = inuse;
 976
 977                 /*
 978                  * The delta between inuse and active sums indicates that
 979                  * that much of weight is being given away.  Parent's inuse
 980                  * and active should reflect the ratio.
 981                  */
 982                 if (parent->child_active_sum) {
 983                         parent_active = parent->weight;
 984                         parent_inuse = DIV64_U64_ROUND_UP(
 985                                 parent_active * parent->child_inuse_sum,
 986                                 parent->child_active_sum);
 987                 }
 988
 989                 /* do we need to keep walking up? */
 990                 if (parent_active == parent->active &&
 991                     parent_inuse == parent->inuse)
 992                         break;
 993
 994                 active = parent_active;
 995                 inuse = parent_inuse;
 996         }
 997
 998         ioc->weights_updated = true;
 999 }
1000
1001 static void commit_weights(struct ioc *ioc)
1002 {
1003         lockdep_assert_held(&ioc->lock);
1004
1005         if (ioc->weights_updated) {
1006                 /* paired with rmb in current_hweight(), see there */
1007                 smp_wmb();
1008                 atomic_inc(&ioc->hweight_gen);
1009                 ioc->weights_updated = false;
1010         }
1011 }
1012
1013 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
1014 {
1015         __propagate_weights(iocg, active, inuse);
1016         commit_weights(iocg->ioc);
1017 }
1018
/*
 * Return @iocg's hierarchical active and inuse weights through @hw_activep
 * and @hw_inusep.  Either pointer may be NULL if the caller doesn't need
 * that value.
 *
 * The results are cached in @iocg together with the ioc->hweight_gen they
 * were computed for and are recomputed only when the generation has moved.
 */
static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
{
        struct ioc *ioc = iocg->ioc;
        int lvl;
        u32 hwa, hwi;
        int ioc_gen;

        /* hot path - if uptodate, use cached */
        ioc_gen = atomic_read(&ioc->hweight_gen);
        if (ioc_gen == iocg->hweight_gen)
                goto out;

        /*
         * Paired with wmb in commit_weights(). If we saw the updated
         * hweight_gen, all the weight updates from __propagate_weights() are
         * visible too.
         *
         * We can race with weight updates during calculation and get it
         * wrong.  However, hweight_gen would have changed and a future
         * reader will recalculate and we're guaranteed to discard the
         * wrong result soon.
         */
        smp_rmb();

        /*
         * Start from WEIGHT_ONE and walk from ancestors[0] down to @iocg,
         * scaling by each level's share of its siblings' sum.
         */
        hwa = hwi = WEIGHT_ONE;
        for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
                struct ioc_gq *parent = iocg->ancestors[lvl];
                struct ioc_gq *child = iocg->ancestors[lvl + 1];
                u64 active_sum = READ_ONCE(parent->child_active_sum);
                u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
                u32 active = READ_ONCE(child->active);
                u32 inuse = READ_ONCE(child->inuse);

                /* we can race with deactivations and either may read as zero */
                if (!active_sum || !inuse_sum)
                        continue;

                /* a racy sum may be below the child's own value, cap the share at 1 */
                active_sum = max_t(u64, active, active_sum);
                hwa = div64_u64((u64)hwa * active, active_sum);

                inuse_sum = max_t(u64, inuse, inuse_sum);
                hwi = div64_u64((u64)hwi * inuse, inuse_sum);
        }

        /* never cache zero */
        iocg->hweight_active = max_t(u32, hwa, 1);
        iocg->hweight_inuse = max_t(u32, hwi, 1);
        iocg->hweight_gen = ioc_gen;
out:
        if (hw_activep)
                *hw_activep = iocg->hweight_active;
        if (hw_inusep)
                *hw_inusep = iocg->hweight_inuse;
}
1072
1073 static void weight_updated(struct ioc_gq *iocg)
1074 {
1075         struct ioc *ioc = iocg->ioc;
1076         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1077         struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1078         u32 weight;
1079
1080         lockdep_assert_held(&ioc->lock);
1081
1082         weight = iocg->cfg_weight ?: iocc->dfl_weight;
1083         if (weight != iocg->weight && iocg->active)
1084                 propagate_weights(iocg, weight,
1085                                   DIV64_U64_ROUND_UP((u64)iocg->inuse * weight,
1086                                                      iocg->weight));
1087         iocg->weight = weight;
1088 }
1089
/*
 * Make sure @iocg is on ioc->active_iocgs and stamped with the current
 * period.
 *
 * Returns true if @iocg is active on return (it already was, or it was
 * activated here).  Returns false if it has active children or if one of
 * its ancestors at levels 1..level-1 is itself on the active list, i.e.
 * activating it would break the leaf-only constraint.
 *
 * @now is filled in through ioc_now() on every path except the early
 * unlocked child_active_sum bail-out.  Takes ioc->lock with
 * spin_lock_irq() on the slow path.
 */
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
        struct ioc *ioc = iocg->ioc;
        u64 last_period, cur_period, max_period_delta;
        u64 vtime, vmin;
        int i;

        /*
         * If seem to be already active, just update the stamp to tell the
         * timer that we're still active.  We don't mind occasional races.
         */
        if (!list_empty(&iocg->active_list)) {
                ioc_now(ioc, now);
                cur_period = atomic64_read(&ioc->cur_period);
                if (atomic64_read(&iocg->active_period) != cur_period)
                        atomic64_set(&iocg->active_period, cur_period);
                return true;
        }

        /* racy check on internal node IOs, treat as root level IOs */
        if (iocg->child_active_sum)
                return false;

        spin_lock_irq(&ioc->lock);

        ioc_now(ioc, now);

        /* update period */
        cur_period = atomic64_read(&ioc->cur_period);
        last_period = atomic64_read(&iocg->active_period);
        atomic64_set(&iocg->active_period, cur_period);

        /* already activated or breaking leaf-only constraint? */
        if (!list_empty(&iocg->active_list))
                goto succeed_unlock;
        for (i = iocg->level - 1; i > 0; i--)
                if (!list_empty(&iocg->ancestors[i]->active_list))
                        goto fail_unlock;

        /* same check as the racy one above, now under ioc->lock */
        if (iocg->child_active_sum)
                goto fail_unlock;

        /*
         * vtime may wrap when vrate is raised substantially due to
         * underestimated IO costs.  Look at the period and ignore its
         * vtime if the iocg has been idle for too long.  Also, cap the
         * budget it can start with to the margin.
         */
        max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
        vtime = atomic64_read(&iocg->vtime);
        vmin = now->vnow - ioc->margins.max;

        if (last_period + max_period_delta < cur_period ||
            time_before64(vtime, vmin)) {
                /* shift vtime and done_vtime together so that vtime == vmin */
                atomic64_add(vmin - vtime, &iocg->vtime);
                atomic64_add(vmin - vtime, &iocg->done_vtime);
                vtime = vmin;
        }

        /*
         * Activate, propagate weight and start period timer if not
         * running.  Reset hweight_gen to avoid accidental match from
         * wrapping.
         */
        iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
        list_add(&iocg->active_list, &ioc->active_iocgs);
        propagate_weights(iocg, iocg->weight,
                          iocg->last_inuse ?: iocg->weight);

        TRACE_IOCG_PATH(iocg_activate, iocg, now,
                        last_period, cur_period, vtime);

        iocg->activated_at = now->now;

        if (ioc->running == IOC_IDLE) {
                ioc->running = IOC_RUNNING;
                ioc_start_period(ioc, now);
        }

succeed_unlock:
        spin_unlock_irq(&ioc->lock);
        return true;

fail_unlock:
        spin_unlock_irq(&ioc->lock);
        return false;
}
1177
/*
 * Set or clear the blkcg delay of @iocg's blkg depending on how far its
 * debt-adjusted vtime is ahead of @now->vnow.
 *
 * Returns true if a delay was set, in which case delay_timer is either
 * (re)armed or already queued close enough to the new expiry.  Returns
 * false if the delay was cleared or is not needed yet.  The caller must
 * hold iocg->waitq.lock.
 */
static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
{
        struct ioc *ioc = iocg->ioc;
        struct blkcg_gq *blkg = iocg_to_blkg(iocg);
        u64 vtime = atomic64_read(&iocg->vtime);
        u64 delta_ns, expires, oexpires;
        u32 hw_inuse;

        lockdep_assert_held(&iocg->waitq.lock);

        /* debt-adjust vtime */
        current_hweight(iocg, NULL, &hw_inuse);
        vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);

        /*
         * Clear or maintain depending on the overage. Non-zero vdebt is what
         * guarantees that @iocg is online and future iocg_kick_delay() will
         * clear use_delay. Don't leave it on when there's no vdebt.
         */
        if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
                blkcg_clear_delay(blkg);
                return false;
        }
        /* delay not in use yet and the overage is within margins.max, hold off */
        if (!atomic_read(&blkg->use_delay) &&
            time_before_eq64(vtime, now->vnow + ioc->margins.max))
                return false;

        /* use delay */
        delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
                                      now->vrate) * NSEC_PER_USEC;
        blkcg_set_delay(blkg, delta_ns);
        expires = now->now_ns + delta_ns;

        /* if already active and close enough, don't bother */
        oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
        if (hrtimer_is_queued(&iocg->delay_timer) &&
            abs(oexpires - expires) <= ioc->timer_slack_ns)
                return true;

        hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
                               ioc->timer_slack_ns, HRTIMER_MODE_ABS);
        return true;
}
1221
1222 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1223 {
1224         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1225         struct ioc_now now;
1226         unsigned long flags;
1227
1228         spin_lock_irqsave(&iocg->waitq.lock, flags);
1229         ioc_now(iocg->ioc, &now);
1230         iocg_kick_delay(iocg, &now);
1231         spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1232
1233         return HRTIMER_NORESTART;
1234 }
1235
1236 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1237                         int flags, void *key)
1238 {
1239         struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1240         struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1241         u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1242
1243         ctx->vbudget -= cost;
1244
1245         if (ctx->vbudget < 0)
1246                 return -1;
1247
1248         iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1249
1250         /*
1251          * autoremove_wake_function() removes the wait entry only when it
1252          * actually changed the task state.  We want the wait always
1253          * removed.  Remove explicitly and use default_wake_function().
1254          */
1255         list_del_init(&wq_entry->entry);
1256         wait->committed = true;
1257
1258         default_wake_function(wq_entry, mode, flags, key);
1259         return 0;
1260 }
1261
1262 /*
1263  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1264  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1265  * addition to iocg->waitq.lock.
1266  */
1267 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1268                             struct ioc_now *now)
1269 {
1270         struct ioc *ioc = iocg->ioc;
1271         struct iocg_wake_ctx ctx = { .iocg = iocg };
1272         u64 vshortage, expires, oexpires;
1273         s64 vbudget;
1274         u32 hw_inuse;
1275
1276         lockdep_assert_held(&iocg->waitq.lock);
1277
1278         current_hweight(iocg, NULL, &hw_inuse);
1279         vbudget = now->vnow - atomic64_read(&iocg->vtime);
1280
1281         /* pay off debt */
1282         if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1283                 u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1284                 u64 delta = min_t(u64, vbudget, vdebt);
1285                 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1286                                     iocg->abs_vdebt);
1287
1288                 lockdep_assert_held(&ioc->lock);
1289
1290                 atomic64_add(delta, &iocg->vtime);
1291                 atomic64_add(delta, &iocg->done_vtime);
1292                 iocg->abs_vdebt -= abs_delta;
1293                 vbudget -= vdebt;
1294
1295                 iocg_kick_delay(iocg, now);
1296         }
1297
1298         /*
1299          * Debt can still be outstanding if we haven't paid all yet or the
1300          * caller raced and called without @pay_debt. Shouldn't wake up waiters
1301          * under debt. Make sure @vbudget reflects the outstanding amount and is
1302          * not positive.
1303          */
1304         if (iocg->abs_vdebt) {
1305                 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1306                 vbudget = min_t(s64, 0, vbudget - vdebt);
1307         }
1308
1309         /*
1310          * Wake up the ones which are due and see how much vtime we'll need
1311          * for the next one.
1312          */
1313         ctx.hw_inuse = hw_inuse;
1314         ctx.vbudget = vbudget;
1315         __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1316         if (!waitqueue_active(&iocg->waitq))
1317                 return;
1318         if (WARN_ON_ONCE(ctx.vbudget >= 0))
1319                 return;
1320
1321         /* determine next wakeup, add a timer margin to guarantee chunking */
1322         vshortage = -ctx.vbudget;
1323         expires = now->now_ns +
1324                 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1325         expires += ioc->timer_slack_ns;
1326
1327         /* if already active and close enough, don't bother */
1328         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1329         if (hrtimer_is_queued(&iocg->waitq_timer) &&
1330             abs(oexpires - expires) <= ioc->timer_slack_ns)
1331                 return;
1332
1333         hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1334                                ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1335 }
1336
1337 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1338 {
1339         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1340         bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1341         struct ioc_now now;
1342         unsigned long flags;
1343
1344         ioc_now(iocg->ioc, &now);
1345
1346         iocg_lock(iocg, pay_debt, &flags);
1347         iocg_kick_waitq(iocg, pay_debt, &now);
1348         iocg_unlock(iocg, pay_debt, &flags);
1349
1350         return HRTIMER_NORESTART;
1351 }
1352
1353 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1354 {
1355         u32 nr_met[2] = { };
1356         u32 nr_missed[2] = { };
1357         u64 rq_wait_ns = 0;
1358         int cpu, rw;
1359
1360         for_each_online_cpu(cpu) {
1361                 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1362                 u64 this_rq_wait_ns;
1363
1364                 for (rw = READ; rw <= WRITE; rw++) {
1365                         u32 this_met = local_read(&stat->missed[rw].nr_met);
1366                         u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1367
1368                         nr_met[rw] += this_met - stat->missed[rw].last_met;
1369                         nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1370                         stat->missed[rw].last_met = this_met;
1371                         stat->missed[rw].last_missed = this_missed;
1372                 }
1373
1374                 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1375                 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1376                 stat->last_rq_wait_ns = this_rq_wait_ns;
1377         }
1378
1379         for (rw = READ; rw <= WRITE; rw++) {
1380                 if (nr_met[rw] + nr_missed[rw])
1381                         missed_ppm_ar[rw] =
1382                                 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1383                                                    nr_met[rw] + nr_missed[rw]);
1384                 else
1385                         missed_ppm_ar[rw] = 0;
1386         }
1387
1388         *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1389                                    ioc->period_us * NSEC_PER_USEC);
1390 }
1391
1392 /* was iocg idle this period? */
1393 static bool iocg_is_idle(struct ioc_gq *iocg)
1394 {
1395         struct ioc *ioc = iocg->ioc;
1396
1397         /* did something get issued this period? */
1398         if (atomic64_read(&iocg->active_period) ==
1399             atomic64_read(&ioc->cur_period))
1400                 return false;
1401
1402         /* is something in flight? */
1403         if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 /*
1410  * Call this function on the target leaf @iocg's to build pre-order traversal
1411  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1412  * ->walk_list and the caller is responsible for dissolving the list after use.
1413  */
1414 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1415                                   struct list_head *inner_walk)
1416 {
1417         int lvl;
1418
1419         WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1420
1421         /* find the first ancestor which hasn't been visited yet */
1422         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1423                 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1424                         break;
1425         }
1426
1427         /* walk down and visit the inner nodes to get pre-order traversal */
1428         while (++lvl <= iocg->level - 1) {
1429                 struct ioc_gq *inner = iocg->ancestors[lvl];
1430
1431                 /* record traversal order */
1432                 list_add_tail(&inner->walk_list, inner_walk);
1433         }
1434 }
1435
1436 /* collect per-cpu counters and propagate the deltas to the parent */
1437 static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1438 {
1439         struct iocg_stat new_stat;
1440         u64 abs_vusage = 0;
1441         u64 vusage_delta;
1442         int cpu;
1443
1444         lockdep_assert_held(&iocg->ioc->lock);
1445
1446         /* collect per-cpu counters */
1447         for_each_possible_cpu(cpu) {
1448                 abs_vusage += local64_read(
1449                                 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1450         }
1451         vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1452         iocg->last_stat_abs_vusage = abs_vusage;
1453
1454         iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate);
1455         iocg->local_stat.usage_us += iocg->usage_delta_us;
1456
1457         new_stat.usage_us =
1458                 iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1459
1460         /* propagate the deltas to the parent */
1461         if (iocg->level > 0) {
1462                 struct iocg_stat *parent_stat =
1463                         &iocg->ancestors[iocg->level - 1]->desc_stat;
1464
1465                 parent_stat->usage_us +=
1466                         new_stat.usage_us - iocg->last_stat.usage_us;
1467         }
1468
1469         iocg->last_stat = new_stat;
1470 }
1471
1472 /* get stat counters ready for reading on all active iocgs */
1473 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1474 {
1475         LIST_HEAD(inner_walk);
1476         struct ioc_gq *iocg, *tiocg;
1477
1478         /* flush leaves and build inner node walk list */
1479         list_for_each_entry(iocg, target_iocgs, active_list) {
1480                 iocg_flush_stat_one(iocg, now);
1481                 iocg_build_inner_walk(iocg, &inner_walk);
1482         }
1483
1484         /* keep flushing upwards by walking the inner list backwards */
1485         list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1486                 iocg_flush_stat_one(iocg, now);
1487                 list_del_init(&iocg->walk_list);
1488         }
1489 }
1490
1491 /* returns usage with margin added if surplus is large enough */
1492 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1493 {
1494         /* add margin */
1495         usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1496         usage += SURPLUS_SCALE_ABS;
1497
1498         /* don't bother if the surplus is too small */
1499         if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1500                 return 0;
1501
1502         return usage;
1503 }
1504
1505 static void ioc_timer_fn(struct timer_list *timer)
1506 {
1507         struct ioc *ioc = container_of(timer, struct ioc, timer);
1508         struct ioc_gq *iocg, *tiocg;
1509         struct ioc_now now;
1510         LIST_HEAD(surpluses);
1511         int nr_shortages = 0, nr_lagging = 0;
1512         u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1513         u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1514         u32 missed_ppm[2], rq_wait_pct;
1515         u64 period_vtime;
1516         int prev_busy_level, i;
1517
1518         /* how were the latencies during the period? */
1519         ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1520
1521         /* take care of active iocgs */
1522         spin_lock_irq(&ioc->lock);
1523
1524         ioc_now(ioc, &now);
1525
1526         period_vtime = now.vnow - ioc->period_at_vtime;
1527         if (WARN_ON_ONCE(!period_vtime)) {
1528                 spin_unlock_irq(&ioc->lock);
1529                 return;
1530         }
1531
1532         iocg_flush_stat(&ioc->active_iocgs, &now);
1533
1534         /*
1535          * Waiters determine the sleep durations based on the vrate they
1536          * saw at the time of sleep.  If vrate has increased, some waiters
1537          * could be sleeping for too long.  Wake up tardy waiters which
1538          * should have woken up in the last period and expire idle iocgs.
1539          */
1540         list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1541                 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1542                     !iocg_is_idle(iocg))
1543                         continue;
1544
1545                 spin_lock(&iocg->waitq.lock);
1546
1547                 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1548                         /* might be oversleeping vtime / hweight changes, kick */
1549                         iocg_kick_waitq(iocg, true, &now);
1550                 } else if (iocg_is_idle(iocg)) {
1551                         /* no waiter and idle, deactivate */
1552                         iocg->last_inuse = iocg->inuse;
1553                         __propagate_weights(iocg, 0, 0);
1554                         list_del_init(&iocg->active_list);
1555                 }
1556
1557                 spin_unlock(&iocg->waitq.lock);
1558         }
1559         commit_weights(ioc);
1560
1561         /* calc usages and see whether some weights need to be moved around */
1562         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1563                 u64 vdone, vtime, usage_us, vmin;
1564                 u32 hw_active, hw_inuse, usage;
1565                 int uidx;
1566
1567                 /*
1568                  * Collect unused and wind vtime closer to vnow to prevent
1569                  * iocgs from accumulating a large amount of budget.
1570                  */
1571                 vdone = atomic64_read(&iocg->done_vtime);
1572                 vtime = atomic64_read(&iocg->vtime);
1573                 current_hweight(iocg, &hw_active, &hw_inuse);
1574
1575                 /*
1576                  * Latency QoS detection doesn't account for IOs which are
1577                  * in-flight for longer than a period.  Detect them by
1578                  * comparing vdone against period start.  If lagging behind
1579                  * IOs from past periods, don't increase vrate.
1580                  */
1581                 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1582                     !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1583                     time_after64(vtime, vdone) &&
1584                     time_after64(vtime, now.vnow -
1585                                  MAX_LAGGING_PERIODS * period_vtime) &&
1586                     time_before64(vdone, now.vnow - period_vtime))
1587                         nr_lagging++;
1588
1589                 /*
1590                  * Determine absolute usage factoring in pending and in-flight
1591                  * IOs to avoid stalls and high-latency completions appearing as
1592                  * idle.
1593                  */
1594                 usage_us = iocg->usage_delta_us;
1595                 if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
1596                         usage_us += DIV64_U64_ROUND_UP(
1597                                 cost_to_abs_cost(now.vnow - vtime, hw_inuse),
1598                                 now.vrate);
1599                 if (vdone != vtime) {
1600                         u64 inflight_us = DIV64_U64_ROUND_UP(
1601                                 cost_to_abs_cost(vtime - vdone, hw_inuse),
1602                                 now.vrate);
1603                         usage_us = max(usage_us, inflight_us);
1604                 }
1605
1606                 /* convert to hweight based usage ratio and record */
1607                 uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1608
1609                 if (time_after64(vtime, now.vnow - ioc->margins.min)) {
1610                         iocg->usage_idx = uidx;
1611                         iocg->usages[uidx] = WEIGHT_ONE;
1612                 } else if (usage_us) {
1613                         u64 started_at, dur;
1614
1615                         if (time_after64(iocg->activated_at, ioc->period_at))
1616                                 started_at = iocg->activated_at;
1617                         else
1618                                 started_at = ioc->period_at;
1619
1620                         dur = max_t(u64, now.now - started_at, 1);
1621                         usage = clamp_t(u32,
1622                                 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
1623                                 1, WEIGHT_ONE);
1624
1625                         iocg->usage_idx = uidx;
1626                         iocg->usages[uidx] = usage;
1627                 } else {
1628                         usage = 0;
1629                 }
1630
1631                 /* see whether there's surplus vtime */
1632                 vmin = now.vnow - ioc->margins.max;
1633
1634                 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
1635                 if (!waitqueue_active(&iocg->waitq) &&
1636                     time_before64(vtime, vmin)) {
1637                         u64 delta = vmin - vtime;
1638
1639                         /* throw away surplus vtime */
1640                         atomic64_add(delta, &iocg->vtime);
1641                         atomic64_add(delta, &iocg->done_vtime);
1642                         /* if usage is sufficiently low, maybe it can donate */
1643                         if (surplus_adjusted_hweight_inuse(usage, hw_inuse))
1644                                 list_add(&iocg->surplus_list, &surpluses);
1645                 } else if (hw_inuse < hw_active) {
1646                         u32 new_hwi, new_inuse;
1647
1648                         /* was donating but might need to take back some */
1649                         if (waitqueue_active(&iocg->waitq)) {
1650                                 new_hwi = hw_active;
1651                         } else {
1652                                 new_hwi = max(hw_inuse,
1653                                               usage * SURPLUS_SCALE_PCT / 100 +
1654                                               SURPLUS_SCALE_ABS);
1655                         }
1656
1657                         new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1658                                               hw_inuse);
1659                         new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1660
1661                         if (new_inuse > iocg->inuse) {
1662                                 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1663                                                 iocg->inuse, new_inuse,
1664                                                 hw_inuse, new_hwi);
1665                                 __propagate_weights(iocg, iocg->weight,
1666                                                     new_inuse);
1667                         }
1668                 } else {
1669                         /* genuninely out of vtime */
1670                         nr_shortages++;
1671                 }
1672         }
1673
1674         if (!nr_shortages || list_empty(&surpluses))
1675                 goto skip_surplus_transfers;
1676
1677         /* there are both shortages and surpluses, transfer surpluses */
1678         list_for_each_entry(iocg, &surpluses, surplus_list) {
1679                 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1680                 int nr_valid = 0;
1681
1682                 /* base the decision on max historical usage */
1683                 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1684                         if (iocg->usages[i]) {
1685                                 usage = max(usage, iocg->usages[i]);
1686                                 nr_valid++;
1687                         }
1688                 }
1689                 if (nr_valid < MIN_VALID_USAGES)
1690                         continue;
1691
1692                 current_hweight(iocg, &hw_active, &hw_inuse);
1693                 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1694                 if (!new_hwi)
1695                         continue;
1696
1697                 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1698                                                hw_inuse);
1699                 if (new_inuse < iocg->inuse) {
1700                         TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1701                                         iocg->inuse, new_inuse,
1702                                         hw_inuse, new_hwi);
1703                         __propagate_weights(iocg, iocg->weight, new_inuse);
1704                 }
1705         }
1706 skip_surplus_transfers:
1707         commit_weights(ioc);
1708
1709         /* surplus list should be dissolved after use */
1710         list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
1711                 list_del_init(&iocg->surplus_list);
1712
1713         /*
1714          * If q is getting clogged or we're missing too much, we're issuing
1715          * too much IO and should lower vtime rate.  If we're not missing
1716          * and experiencing shortages but not surpluses, we're too stingy
1717          * and should increase vtime rate.
1718          */
1719         prev_busy_level = ioc->busy_level;
1720         if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1721             missed_ppm[READ] > ppm_rthr ||
1722             missed_ppm[WRITE] > ppm_wthr) {
1723                 /* clearly missing QoS targets, slow down vrate */
1724                 ioc->busy_level = max(ioc->busy_level, 0);
1725                 ioc->busy_level++;
1726         } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1727                    missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1728                    missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1729                 /* QoS targets are being met with >25% margin */
1730                 if (nr_shortages) {
1731                         /*
1732                          * We're throttling while the device has spare
1733                          * capacity.  If vrate was being slowed down, stop.
1734                          */
1735                         ioc->busy_level = min(ioc->busy_level, 0);
1736
1737                         /*
1738                          * If there are IOs spanning multiple periods, wait
1739                          * them out before pushing the device harder.
1740                          */
1741                         if (!nr_lagging)
1742                                 ioc->busy_level--;
1743                 } else {
1744                         /*
1745                          * Nobody is being throttled and the users aren't
1746                          * issuing enough IOs to saturate the device.  We
1747                          * simply don't know how close the device is to
1748                          * saturation.  Coast.
1749                          */
1750                         ioc->busy_level = 0;
1751                 }
1752         } else {
1753                 /* inside the hysterisis margin, we're good */
1754                 ioc->busy_level = 0;
1755         }
1756
1757         ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1758
1759         if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1760                 u64 vrate = atomic64_read(&ioc->vtime_rate);
1761                 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1762
1763                 /* rq_wait signal is always reliable, ignore user vrate_min */
1764                 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1765                         vrate_min = VRATE_MIN;
1766
1767                 /*
1768                  * If vrate is out of bounds, apply clamp gradually as the
1769                  * bounds can change abruptly.  Otherwise, apply busy_level
1770                  * based adjustment.
1771                  */
1772                 if (vrate < vrate_min) {
1773                         vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1774                                           100);
1775                         vrate = min(vrate, vrate_min);
1776                 } else if (vrate > vrate_max) {
1777                         vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1778                                           100);
1779                         vrate = max(vrate, vrate_max);
1780                 } else {
1781                         int idx = min_t(int, abs(ioc->busy_level),
1782                                         ARRAY_SIZE(vrate_adj_pct) - 1);
1783                         u32 adj_pct = vrate_adj_pct[idx];
1784
1785                         if (ioc->busy_level > 0)
1786                                 adj_pct = 100 - adj_pct;
1787                         else
1788                                 adj_pct = 100 + adj_pct;
1789
1790                         vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1791                                       vrate_min, vrate_max);
1792                 }
1793
1794                 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1795                                            nr_lagging, nr_shortages);
1796
1797                 atomic64_set(&ioc->vtime_rate, vrate);
1798                 ioc_refresh_margins(ioc);
1799         } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1800                 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1801                                            missed_ppm, rq_wait_pct, nr_lagging,
1802                                            nr_shortages);
1803         }
1804
1805         ioc_refresh_params(ioc, false);
1806
1807         /*
1808          * This period is done.  Move onto the next one.  If nothing's
1809          * going on with the device, stop the timer.
1810          */
1811         atomic64_inc(&ioc->cur_period);
1812
1813         if (ioc->running != IOC_STOP) {
1814                 if (!list_empty(&ioc->active_iocgs)) {
1815                         ioc_start_period(ioc, &now);
1816                 } else {
1817                         ioc->busy_level = 0;
1818                         ioc->running = IOC_IDLE;
1819                 }
1820         }
1821
1822         spin_unlock_irq(&ioc->lock);
1823 }
1824
1825 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1826                                     bool is_merge, u64 *costp)
1827 {
1828         struct ioc *ioc = iocg->ioc;
1829         u64 coef_seqio, coef_randio, coef_page;
1830         u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1831         u64 seek_pages = 0;
1832         u64 cost = 0;
1833
1834         switch (bio_op(bio)) {
1835         case REQ_OP_READ:
1836                 coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
1837                 coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
1838                 coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
1839                 break;
1840         case REQ_OP_WRITE:
1841                 coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
1842                 coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
1843                 coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
1844                 break;
1845         default:
1846                 goto out;
1847         }
1848
1849         if (iocg->cursor) {
1850                 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1851                 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1852         }
1853
1854         if (!is_merge) {
1855                 if (seek_pages > LCOEF_RANDIO_PAGES) {
1856                         cost += coef_randio;
1857                 } else {
1858                         cost += coef_seqio;
1859                 }
1860         }
1861         cost += pages * coef_page;
1862 out:
1863         *costp = cost;
1864 }
1865
1866 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1867 {
1868         u64 cost;
1869
1870         calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1871         return cost;
1872 }
1873
1874 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1875                                          u64 *costp)
1876 {
1877         unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1878
1879         switch (req_op(rq)) {
1880         case REQ_OP_READ:
1881                 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1882                 break;
1883         case REQ_OP_WRITE:
1884                 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1885                 break;
1886         default:
1887                 *costp = 0;
1888         }
1889 }
1890
1891 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1892 {
1893         u64 cost;
1894
1895         calc_size_vtime_cost_builtin(rq, ioc, &cost);
1896         return cost;
1897 }
1898
/*
 * rq_qos ->throttle hook.
 *
 * Charges @bio's cost against the vtime budget of its iocg.  If the iocg is
 * within budget and has neither waiters nor debt, the bio is committed and
 * issued right away.  Otherwise the cost is either recorded as debt (bios
 * which must be issued as root or whose issuer has a fatal signal pending)
 * or the caller sleeps on @iocg->waitq until a waker marks it committed.
 * Does nothing when the controller is disabled, for the root cgroup
 * (level 0), when activation fails or when the bio costs nothing.
 */
static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
        struct blkcg_gq *blkg = bio->bi_blkg;
        struct ioc *ioc = rqos_to_ioc(rqos);
        struct ioc_gq *iocg = blkg_to_iocg(blkg);
        struct ioc_now now;
        struct iocg_wait wait;
        u32 hw_active, hw_inuse;
        u64 abs_cost, cost, vtime;
        bool use_debt, ioc_locked;
        unsigned long flags;

        /* bypass IOs if disabled or for root cgroup */
        if (!ioc->enabled || !iocg->level)
                return;

        /* always activate so that even 0 cost IOs get protected to some level */
        if (!iocg_activate(iocg, &now))
                return;

        /* calculate the absolute vtime cost */
        abs_cost = calc_vtime_cost(bio, iocg, false);
        if (!abs_cost)
                return;

        /* remember where this IO ends for the next seq/rand classification */
        iocg->cursor = bio_end_sector(bio);

        vtime = atomic64_read(&iocg->vtime);
        current_hweight(iocg, &hw_active, &hw_inuse);

        /*
         * inuse is lowered but the budget is nearly used up (vtime within
         * margins.min of vnow).  Reset inuse to the full weight and re-read
         * the hweights.
         */
        if (hw_inuse < hw_active &&
            time_after_eq64(vtime + ioc->margins.min, now.vnow)) {
                TRACE_IOCG_PATH(inuse_reset, iocg, &now,
                                iocg->inuse, iocg->weight, hw_inuse, hw_active);
                spin_lock_irq(&ioc->lock);
                propagate_weights(iocg, iocg->weight, iocg->weight);
                spin_unlock_irq(&ioc->lock);
                current_hweight(iocg, &hw_active, &hw_inuse);
        }

        /* scale the absolute cost by the current hierarchical inuse weight */
        cost = abs_cost_to_cost(abs_cost, hw_inuse);

        /*
         * If no one's waiting and within budget, issue right away.  The
         * tests are racy but the races aren't systemic - we only miss once
         * in a while which is fine.
         */
        if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
            time_before_eq64(vtime + cost, now.vnow)) {
                iocg_commit_bio(iocg, bio, abs_cost, cost);
                return;
        }

        /*
         * We're over budget. This can be handled in two ways. IOs which may
         * cause priority inversions are punted to @ioc->aux_iocg and charged as
         * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
         * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
         * whether debt handling is needed and acquire locks accordingly.
         */
        use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
        ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);

        iocg_lock(iocg, ioc_locked, &flags);

        /*
         * @iocg must stay activated for debt and waitq handling. Deactivation
         * is synchronized against both ioc->lock and waitq.lock and we won't
         * get deactivated as long as we're waiting or has debt, so we're good
         * if we're activated here. In the unlikely cases that we aren't, just
         * issue the IO.
         */
        if (unlikely(list_empty(&iocg->active_list))) {
                iocg_unlock(iocg, ioc_locked, &flags);
                iocg_commit_bio(iocg, bio, abs_cost, cost);
                return;
        }

        /*
         * We're over budget. If @bio has to be issued regardless, remember
         * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
         * off the debt before waking more IOs.
         *
         * This way, the debt is continuously paid off each period with the
         * actual budget available to the cgroup. If we just wound vtime, we
         * would incorrectly use the current hw_inuse for the entire amount
         * which, for example, can lead to the cgroup staying blocked for a
         * long time even with substantially raised hw_inuse.
         *
         * An iocg with vdebt should stay online so that the timer can keep
         * deducting its vdebt and [de]activate use_delay mechanism
         * accordingly. We don't want to race against the timer trying to
         * clear them and leave @iocg inactive w/ dangling use_delay heavily
         * penalizing the cgroup and its descendants.
         */
        if (use_debt) {
                iocg->abs_vdebt += abs_cost;
                if (iocg_kick_delay(iocg, &now))
                        blkcg_schedule_throttle(rqos->q,
                                        (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
                iocg_unlock(iocg, ioc_locked, &flags);
                return;
        }

        /*
         * Append self to the waitq and schedule the wakeup timer if we're
         * the first waiter.  The timer duration is calculated based on the
         * current vrate.  vtime and hweight changes can make it too short
         * or too long.  Each wait entry records the absolute cost it's
         * waiting for to allow re-evaluation using a custom wait entry.
         *
         * If too short, the timer simply reschedules itself.  If too long,
         * the period timer will notice and trigger wakeups.
         *
         * All waiters are on iocg->waitq and the wait states are
         * synchronized using waitq.lock.
         */
        init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
        wait.wait.private = current;
        wait.bio = bio;
        wait.abs_cost = abs_cost;
        wait.committed = false; /* will be set true by waker */

        __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
        iocg_kick_waitq(iocg, ioc_locked, &now);

        iocg_unlock(iocg, ioc_locked, &flags);

        /* sleep uninterruptibly until the waker has committed this bio */
        while (true) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (wait.committed)
                        break;
                io_schedule();
        }

        /* waker already committed us, proceed */
        finish_wait(&iocg->waitq, &wait.wait);
}
2037
/*
 * rq_qos ->merge hook, called when @bio is merged into @rq.
 *
 * Computes the merge cost of @bio (no per-IO base cost) and either commits
 * it against the iocg's vtime when there is enough budget and @rq's first
 * bio carries a cost, or records it as debt.  The merge itself is never
 * blocked here.  Does nothing when the controller is disabled, for the root
 * cgroup (level 0) or when the bio costs nothing.
 */
static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
                           struct bio *bio)
{
        struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
        struct ioc *ioc = iocg->ioc;
        sector_t bio_end = bio_end_sector(bio);
        struct ioc_now now;
        u32 hw_inuse;
        u64 abs_cost, cost;
        unsigned long flags;

        /* bypass if disabled or for root cgroup */
        if (!ioc->enabled || !iocg->level)
                return;

        abs_cost = calc_vtime_cost(bio, iocg, true);
        if (!abs_cost)
                return;

        /* scale the absolute cost by the current hierarchical inuse weight */
        ioc_now(ioc, &now);
        current_hweight(iocg, NULL, &hw_inuse);
        cost = abs_cost_to_cost(abs_cost, hw_inuse);

        /* update cursor if backmerging into the request at the cursor */
        if (blk_rq_pos(rq) < bio_end &&
            blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
                iocg->cursor = bio_end;

        /*
         * Charge if there's enough vtime budget and the existing request has
         * cost assigned.
         */
        if (rq->bio && rq->bio->bi_iocost_cost &&
            time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
                iocg_commit_bio(iocg, bio, abs_cost, cost);
                return;
        }

        /*
         * Otherwise, account it as debt if @iocg is online, which it should
         * be for the vast majority of cases. See debt handling in
         * ioc_rqos_throttle() for details.
         */
        spin_lock_irqsave(&iocg->waitq.lock, flags);
        if (likely(!list_empty(&iocg->active_list))) {
                iocg->abs_vdebt += abs_cost;
                iocg_kick_delay(iocg, &now);
        } else {
                /* @iocg isn't active, just charge the bio directly */
                iocg_commit_bio(iocg, bio, abs_cost, cost);
        }
        spin_unlock_irqrestore(&iocg->waitq.lock, flags);
}
2090
2091 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2092 {
2093         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2094
2095         if (iocg && bio->bi_iocost_cost)
2096                 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2097 }
2098
2099 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2100 {
2101         struct ioc *ioc = rqos_to_ioc(rqos);
2102         struct ioc_pcpu_stat *ccs;
2103         u64 on_q_ns, rq_wait_ns, size_nsec;
2104         int pidx, rw;
2105
2106         if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2107                 return;
2108
2109         switch (req_op(rq) & REQ_OP_MASK) {
2110         case REQ_OP_READ:
2111                 pidx = QOS_RLAT;
2112                 rw = READ;
2113                 break;
2114         case REQ_OP_WRITE:
2115                 pidx = QOS_WLAT;
2116                 rw = WRITE;
2117                 break;
2118         default:
2119                 return;
2120         }
2121
2122         on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2123         rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2124         size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2125
2126         ccs = get_cpu_ptr(ioc->pcpu_stat);
2127
2128         if (on_q_ns <= size_nsec ||
2129             on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2130                 local_inc(&ccs->missed[rw].nr_met);
2131         else
2132                 local_inc(&ccs->missed[rw].nr_missed);
2133
2134         local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2135
2136         put_cpu_ptr(ccs);
2137 }
2138
2139 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2140 {
2141         struct ioc *ioc = rqos_to_ioc(rqos);
2142
2143         spin_lock_irq(&ioc->lock);
2144         ioc_refresh_params(ioc, false);
2145         spin_unlock_irq(&ioc->lock);
2146 }
2147
/*
 * rq_qos ->exit hook.  Deactivates the iocost blkcg policy on the queue,
 * stops the period timer and frees the ioc along with its percpu stats.
 */
static void ioc_rqos_exit(struct rq_qos *rqos)
{
        struct ioc *ioc = rqos_to_ioc(rqos);

        blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);

        /*
         * Mark the ioc stopped under the lock; ioc_timer_fn() checks for
         * IOC_STOP before starting another period.
         */
        spin_lock_irq(&ioc->lock);
        ioc->running = IOC_STOP;
        spin_unlock_irq(&ioc->lock);

        /* wait out a running timer callback before freeing what it uses */
        del_timer_sync(&ioc->timer);
        free_percpu(ioc->pcpu_stat);
        kfree(ioc);
}
2162
2163 static struct rq_qos_ops ioc_rqos_ops = {
2164         .throttle = ioc_rqos_throttle,
2165         .merge = ioc_rqos_merge,
2166         .done_bio = ioc_rqos_done_bio,
2167         .done = ioc_rqos_done,
2168         .queue_depth_changed = ioc_rqos_queue_depth_changed,
2169         .exit = ioc_rqos_exit,
2170 };
2171
2172 static int blk_iocost_init(struct request_queue *q)
2173 {
2174         struct ioc *ioc;
2175         struct rq_qos *rqos;
2176         int i, cpu, ret;
2177
2178         ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2179         if (!ioc)
2180                 return -ENOMEM;
2181
2182         ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2183         if (!ioc->pcpu_stat) {
2184                 kfree(ioc);
2185                 return -ENOMEM;
2186         }
2187
2188         for_each_possible_cpu(cpu) {
2189                 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2190
2191                 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2192                         local_set(&ccs->missed[i].nr_met, 0);
2193                         local_set(&ccs->missed[i].nr_missed, 0);
2194                 }
2195                 local64_set(&ccs->rq_wait_ns, 0);
2196         }
2197
2198         rqos = &ioc->rqos;
2199         rqos->id = RQ_QOS_COST;
2200         rqos->ops = &ioc_rqos_ops;
2201         rqos->q = q;
2202
2203         spin_lock_init(&ioc->lock);
2204         timer_setup(&ioc->timer, ioc_timer_fn, 0);
2205         INIT_LIST_HEAD(&ioc->active_iocgs);
2206
2207         ioc->running = IOC_IDLE;
2208         atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2209         seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2210         ioc->period_at = ktime_to_us(ktime_get());
2211         atomic64_set(&ioc->cur_period, 0);
2212         atomic_set(&ioc->hweight_gen, 0);
2213
2214         spin_lock_irq(&ioc->lock);
2215         ioc->autop_idx = AUTOP_INVALID;
2216         ioc_refresh_params(ioc, true);
2217         spin_unlock_irq(&ioc->lock);
2218
2219         rq_qos_add(q, rqos);
2220         ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2221         if (ret) {
2222                 rq_qos_del(q, rqos);
2223                 free_percpu(ioc->pcpu_stat);
2224                 kfree(ioc);
2225                 return ret;
2226         }
2227         return 0;
2228 }
2229
2230 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2231 {
2232         struct ioc_cgrp *iocc;
2233
2234         iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2235         if (!iocc)
2236                 return NULL;
2237
2238         iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2239         return &iocc->cpd;
2240 }
2241
2242 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2243 {
2244         kfree(container_of(cpd, struct ioc_cgrp, cpd));
2245 }
2246
2247 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2248                                              struct blkcg *blkcg)
2249 {
2250         int levels = blkcg->css.cgroup->level + 1;
2251         struct ioc_gq *iocg;
2252
2253         iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2254         if (!iocg)
2255                 return NULL;
2256
2257         iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2258         if (!iocg->pcpu_stat) {
2259                 kfree(iocg);
2260                 return NULL;
2261         }
2262
2263         return &iocg->pd;
2264 }
2265
2266 static void ioc_pd_init(struct blkg_policy_data *pd)
2267 {
2268         struct ioc_gq *iocg = pd_to_iocg(pd);
2269         struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2270         struct ioc *ioc = q_to_ioc(blkg->q);
2271         struct ioc_now now;
2272         struct blkcg_gq *tblkg;
2273         unsigned long flags;
2274
2275         ioc_now(ioc, &now);
2276
2277         iocg->ioc = ioc;
2278         atomic64_set(&iocg->vtime, now.vnow);
2279         atomic64_set(&iocg->done_vtime, now.vnow);
2280         atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2281         INIT_LIST_HEAD(&iocg->active_list);
2282         INIT_LIST_HEAD(&iocg->walk_list);
2283         INIT_LIST_HEAD(&iocg->surplus_list);
2284         iocg->hweight_active = WEIGHT_ONE;
2285         iocg->hweight_inuse = WEIGHT_ONE;
2286
2287         init_waitqueue_head(&iocg->waitq);
2288         hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2289         iocg->waitq_timer.function = iocg_waitq_timer_fn;
2290         hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2291         iocg->delay_timer.function = iocg_delay_timer_fn;
2292
2293         iocg->level = blkg->blkcg->css.cgroup->level;
2294
2295         for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2296                 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2297                 iocg->ancestors[tiocg->level] = tiocg;
2298         }
2299
2300         spin_lock_irqsave(&ioc->lock, flags);
2301         weight_updated(iocg);
2302         spin_unlock_irqrestore(&ioc->lock, flags);
2303 }
2304
2305 static void ioc_pd_free(struct blkg_policy_data *pd)
2306 {
2307         struct ioc_gq *iocg = pd_to_iocg(pd);
2308         struct ioc *ioc = iocg->ioc;
2309         unsigned long flags;
2310
2311         if (ioc) {
2312                 spin_lock_irqsave(&ioc->lock, flags);
2313
2314                 if (!list_empty(&iocg->active_list)) {
2315                         propagate_weights(iocg, 0, 0);
2316                         list_del_init(&iocg->active_list);
2317                 }
2318
2319                 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
2320                 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2321
2322                 spin_unlock_irqrestore(&ioc->lock, flags);
2323
2324                 hrtimer_cancel(&iocg->waitq_timer);
2325                 hrtimer_cancel(&iocg->delay_timer);
2326         }
2327         free_percpu(iocg->pcpu_stat);
2328         kfree(iocg);
2329 }
2330
2331 static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2332 {
2333         struct ioc_gq *iocg = pd_to_iocg(pd);
2334         struct ioc *ioc = iocg->ioc;
2335         size_t pos = 0;
2336
2337         if (!ioc->enabled)
2338                 return 0;
2339
2340         if (iocg->level == 0) {
2341                 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
2342                         atomic64_read(&ioc->vtime_rate) * 10000,
2343                         VTIME_PER_USEC);
2344                 pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2345                                   vp10k / 100, vp10k % 100);
2346         }
2347
2348         pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2349                          iocg->last_stat.usage_us);
2350
2351         return pos;
2352 }
2353
2354 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2355                              int off)
2356 {
2357         const char *dname = blkg_dev_name(pd->blkg);
2358         struct ioc_gq *iocg = pd_to_iocg(pd);
2359
2360         if (dname && iocg->cfg_weight)
2361                 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
2362         return 0;
2363 }
2364
2365
2366 static int ioc_weight_show(struct seq_file *sf, void *v)
2367 {
2368         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2369         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2370
2371         seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
2372         blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2373                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2374         return 0;
2375 }
2376
2377 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2378                                 size_t nbytes, loff_t off)
2379 {
2380         struct blkcg *blkcg = css_to_blkcg(of_css(of));
2381         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2382         struct blkg_conf_ctx ctx;
2383         struct ioc_gq *iocg;
2384         u32 v;
2385         int ret;
2386
2387         if (!strchr(buf, ':')) {
2388                 struct blkcg_gq *blkg;
2389
2390                 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2391                         return -EINVAL;
2392
2393                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2394                         return -EINVAL;
2395
2396                 spin_lock(&blkcg->lock);
2397                 iocc->dfl_weight = v * WEIGHT_ONE;
2398                 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2399                         struct ioc_gq *iocg = blkg_to_iocg(blkg);
2400
2401                         if (iocg) {
2402                                 spin_lock_irq(&iocg->ioc->lock);
2403                                 weight_updated(iocg);
2404                                 spin_unlock_irq(&iocg->ioc->lock);
2405                         }
2406                 }
2407                 spin_unlock(&blkcg->lock);
2408
2409                 return nbytes;
2410         }
2411
2412         ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2413         if (ret)
2414                 return ret;
2415
2416         iocg = blkg_to_iocg(ctx.blkg);
2417
2418         if (!strncmp(ctx.body, "default", 7)) {
2419                 v = 0;
2420         } else {
2421                 if (!sscanf(ctx.body, "%u", &v))
2422                         goto einval;
2423                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2424                         goto einval;
2425         }
2426
2427         spin_lock(&iocg->ioc->lock);
2428         iocg->cfg_weight = v * WEIGHT_ONE;
2429         weight_updated(iocg);
2430         spin_unlock(&iocg->ioc->lock);
2431
2432         blkg_conf_finish(&ctx);
2433         return nbytes;
2434
2435 einval:
2436         blkg_conf_finish(&ctx);
2437         return -EINVAL;
2438 }
2439
2440 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2441                           int off)
2442 {
2443         const char *dname = blkg_dev_name(pd->blkg);
2444         struct ioc *ioc = pd_to_iocg(pd)->ioc;
2445
2446         if (!dname)
2447                 return 0;
2448
2449         seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2450                    dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2451                    ioc->params.qos[QOS_RPPM] / 10000,
2452                    ioc->params.qos[QOS_RPPM] % 10000 / 100,
2453                    ioc->params.qos[QOS_RLAT],
2454                    ioc->params.qos[QOS_WPPM] / 10000,
2455                    ioc->params.qos[QOS_WPPM] % 10000 / 100,
2456                    ioc->params.qos[QOS_WLAT],
2457                    ioc->params.qos[QOS_MIN] / 10000,
2458                    ioc->params.qos[QOS_MIN] % 10000 / 100,
2459                    ioc->params.qos[QOS_MAX] / 10000,
2460                    ioc->params.qos[QOS_MAX] % 10000 / 100);
2461         return 0;
2462 }
2463
2464 static int ioc_qos_show(struct seq_file *sf, void *v)
2465 {
2466         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2467
2468         blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2469                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2470         return 0;
2471 }
2472
2473 static const match_table_t qos_ctrl_tokens = {
2474         { QOS_ENABLE,           "enable=%u"     },
2475         { QOS_CTRL,             "ctrl=%s"       },
2476         { NR_QOS_CTRL_PARAMS,   NULL            },
2477 };
2478
2479 static const match_table_t qos_tokens = {
2480         { QOS_RPPM,             "rpct=%s"       },
2481         { QOS_RLAT,             "rlat=%u"       },
2482         { QOS_WPPM,             "wpct=%s"       },
2483         { QOS_WLAT,             "wlat=%u"       },
2484         { QOS_MIN,              "min=%s"        },
2485         { QOS_MAX,              "max=%s"        },
2486         { NR_QOS_PARAMS,        NULL            },
2487 };
2488
2489 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2490                              size_t nbytes, loff_t off)
2491 {
2492         struct gendisk *disk;
2493         struct ioc *ioc;
2494         u32 qos[NR_QOS_PARAMS];
2495         bool enable, user;
2496         char *p;
2497         int ret;
2498
2499         disk = blkcg_conf_get_disk(&input);
2500         if (IS_ERR(disk))
2501                 return PTR_ERR(disk);
2502
2503         ioc = q_to_ioc(disk->queue);
2504         if (!ioc) {
2505                 ret = blk_iocost_init(disk->queue);
2506                 if (ret)
2507                         goto err;
2508                 ioc = q_to_ioc(disk->queue);
2509         }
2510
2511         spin_lock_irq(&ioc->lock);
2512         memcpy(qos, ioc->params.qos, sizeof(qos));
2513         enable = ioc->enabled;
2514         user = ioc->user_qos_params;
2515         spin_unlock_irq(&ioc->lock);
2516
2517         while ((p = strsep(&input, " \t\n"))) {
2518                 substring_t args[MAX_OPT_ARGS];
2519                 char buf[32];
2520                 int tok;
2521                 s64 v;
2522
2523                 if (!*p)
2524                         continue;
2525
2526                 switch (match_token(p, qos_ctrl_tokens, args)) {
2527                 case QOS_ENABLE:
2528                         match_u64(&args[0], &v);
2529                         enable = v;
2530                         continue;
2531                 case QOS_CTRL:
2532                         match_strlcpy(buf, &args[0], sizeof(buf));
2533                         if (!strcmp(buf, "auto"))
2534                                 user = false;
2535                         else if (!strcmp(buf, "user"))
2536                                 user = true;
2537                         else
2538                                 goto einval;
2539                         continue;
2540                 }
2541
2542                 tok = match_token(p, qos_tokens, args);
2543                 switch (tok) {
2544                 case QOS_RPPM:
2545                 case QOS_WPPM:
2546                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2547                             sizeof(buf))
2548                                 goto einval;
2549                         if (cgroup_parse_float(buf, 2, &v))
2550                                 goto einval;
2551                         if (v < 0 || v > 10000)
2552                                 goto einval;
2553                         qos[tok] = v * 100;
2554                         break;
2555                 case QOS_RLAT:
2556                 case QOS_WLAT:
2557                         if (match_u64(&args[0], &v))
2558                                 goto einval;
2559                         qos[tok] = v;
2560                         break;
2561                 case QOS_MIN:
2562                 case QOS_MAX:
2563                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2564                             sizeof(buf))
2565                                 goto einval;
2566                         if (cgroup_parse_float(buf, 2, &v))
2567                                 goto einval;
2568                         if (v < 0)
2569                                 goto einval;
2570                         qos[tok] = clamp_t(s64, v * 100,
2571                                            VRATE_MIN_PPM, VRATE_MAX_PPM);
2572                         break;
2573                 default:
2574                         goto einval;
2575                 }
2576                 user = true;
2577         }
2578
2579         if (qos[QOS_MIN] > qos[QOS_MAX])
2580                 goto einval;
2581
2582         spin_lock_irq(&ioc->lock);
2583
2584         if (enable) {
2585                 blk_stat_enable_accounting(ioc->rqos.q);
2586                 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2587                 ioc->enabled = true;
2588         } else {
2589                 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2590                 ioc->enabled = false;
2591         }
2592
2593         if (user) {
2594                 memcpy(ioc->params.qos, qos, sizeof(qos));
2595                 ioc->user_qos_params = true;
2596         } else {
2597                 ioc->user_qos_params = false;
2598         }
2599
2600         ioc_refresh_params(ioc, true);
2601         spin_unlock_irq(&ioc->lock);
2602
2603         put_disk_and_module(disk);
2604         return nbytes;
2605 einval:
2606         ret = -EINVAL;
2607 err:
2608         put_disk_and_module(disk);
2609         return ret;
2610 }
2611
2612 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2613                                  struct blkg_policy_data *pd, int off)
2614 {
2615         const char *dname = blkg_dev_name(pd->blkg);
2616         struct ioc *ioc = pd_to_iocg(pd)->ioc;
2617         u64 *u = ioc->params.i_lcoefs;
2618
2619         if (!dname)
2620                 return 0;
2621
2622         seq_printf(sf, "%s ctrl=%s model=linear "
2623                    "rbps=%llu rseqiops=%llu rrandiops=%llu "
2624                    "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2625                    dname, ioc->user_cost_model ? "user" : "auto",
2626                    u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2627                    u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2628         return 0;
2629 }
2630
2631 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2632 {
2633         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2634
2635         blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2636                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
2637         return 0;
2638 }
2639
2640 static const match_table_t cost_ctrl_tokens = {
2641         { COST_CTRL,            "ctrl=%s"       },
2642         { COST_MODEL,           "model=%s"      },
2643         { NR_COST_CTRL_PARAMS,  NULL            },
2644 };
2645
2646 static const match_table_t i_lcoef_tokens = {
2647         { I_LCOEF_RBPS,         "rbps=%u"       },
2648         { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
2649         { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
2650         { I_LCOEF_WBPS,         "wbps=%u"       },
2651         { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
2652         { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
2653         { NR_I_LCOEFS,          NULL            },
2654 };
2655
2656 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2657                                     size_t nbytes, loff_t off)
2658 {
2659         struct gendisk *disk;
2660         struct ioc *ioc;
2661         u64 u[NR_I_LCOEFS];
2662         bool user;
2663         char *p;
2664         int ret;
2665
2666         disk = blkcg_conf_get_disk(&input);
2667         if (IS_ERR(disk))
2668                 return PTR_ERR(disk);
2669
2670         ioc = q_to_ioc(disk->queue);
2671         if (!ioc) {
2672                 ret = blk_iocost_init(disk->queue);
2673                 if (ret)
2674                         goto err;
2675                 ioc = q_to_ioc(disk->queue);
2676         }
2677
2678         spin_lock_irq(&ioc->lock);
2679         memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2680         user = ioc->user_cost_model;
2681         spin_unlock_irq(&ioc->lock);
2682
2683         while ((p = strsep(&input, " \t\n"))) {
2684                 substring_t args[MAX_OPT_ARGS];
2685                 char buf[32];
2686                 int tok;
2687                 u64 v;
2688
2689                 if (!*p)
2690                         continue;
2691
2692                 switch (match_token(p, cost_ctrl_tokens, args)) {
2693                 case COST_CTRL:
2694                         match_strlcpy(buf, &args[0], sizeof(buf));
2695                         if (!strcmp(buf, "auto"))
2696                                 user = false;
2697                         else if (!strcmp(buf, "user"))
2698                                 user = true;
2699                         else
2700                                 goto einval;
2701                         continue;
2702                 case COST_MODEL:
2703                         match_strlcpy(buf, &args[0], sizeof(buf));
2704                         if (strcmp(buf, "linear"))
2705                                 goto einval;
2706                         continue;
2707                 }
2708
2709                 tok = match_token(p, i_lcoef_tokens, args);
2710                 if (tok == NR_I_LCOEFS)
2711                         goto einval;
2712                 if (match_u64(&args[0], &v))
2713                         goto einval;
2714                 u[tok] = v;
2715                 user = true;
2716         }
2717
2718         spin_lock_irq(&ioc->lock);
2719         if (user) {
2720                 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2721                 ioc->user_cost_model = true;
2722         } else {
2723                 ioc->user_cost_model = false;
2724         }
2725         ioc_refresh_params(ioc, true);
2726         spin_unlock_irq(&ioc->lock);
2727
2728         put_disk_and_module(disk);
2729         return nbytes;
2730
2731 einval:
2732         ret = -EINVAL;
2733 err:
2734         put_disk_and_module(disk);
2735         return ret;
2736 }
2737
2738 static struct cftype ioc_files[] = {
2739         {
2740                 .name = "weight",
2741                 .flags = CFTYPE_NOT_ON_ROOT,
2742                 .seq_show = ioc_weight_show,
2743                 .write = ioc_weight_write,
2744         },
2745         {
2746                 .name = "cost.qos",
2747                 .flags = CFTYPE_ONLY_ON_ROOT,
2748                 .seq_show = ioc_qos_show,
2749                 .write = ioc_qos_write,
2750         },
2751         {
2752                 .name = "cost.model",
2753                 .flags = CFTYPE_ONLY_ON_ROOT,
2754                 .seq_show = ioc_cost_model_show,
2755                 .write = ioc_cost_model_write,
2756         },
2757         {}
2758 };
2759
2760 static struct blkcg_policy blkcg_policy_iocost = {
2761         .dfl_cftypes    = ioc_files,
2762         .cpd_alloc_fn   = ioc_cpd_alloc,
2763         .cpd_free_fn    = ioc_cpd_free,
2764         .pd_alloc_fn    = ioc_pd_alloc,
2765         .pd_init_fn     = ioc_pd_init,
2766         .pd_free_fn     = ioc_pd_free,
2767         .pd_stat_fn     = ioc_pd_stat,
2768 };
2769
2770 static int __init ioc_init(void)
2771 {
2772         return blkcg_policy_register(&blkcg_policy_iocost);
2773 }
2774
2775 static void __exit ioc_exit(void)
2776 {
2777         return blkcg_policy_unregister(&blkcg_policy_iocost);
2778 }
2779
2780 module_init(ioc_init);
2781 module_exit(ioc_exit);