mm/backing-dev.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2
   3 #include <linux/wait.h>
   4 #include <linux/rbtree.h>
   5 #include <linux/backing-dev.h>
   6 #include <linux/kthread.h>
   7 #include <linux/freezer.h>
   8 #include <linux/fs.h>
   9 #include <linux/pagemap.h>
  10 #include <linux/mm.h>
  11 #include <linux/sched/mm.h>
  12 #include <linux/sched.h>
  13 #include <linux/module.h>
  14 #include <linux/writeback.h>
  15 #include <linux/device.h>
  16 #include <trace/events/writeback.h>
  17
  18 struct backing_dev_info noop_backing_dev_info;
  19 EXPORT_SYMBOL_GPL(noop_backing_dev_info);
  20
  21 static struct class *bdi_class;
  22 static const char *bdi_unknown_name = "(unknown)";
  23
  24 /*
  25  * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
  26  * reader side locking.
  27  */
  28 DEFINE_SPINLOCK(bdi_lock);
  29 static u64 bdi_id_cursor;
  30 static struct rb_root bdi_tree = RB_ROOT;
  31 LIST_HEAD(bdi_list);
  32
  33 /* bdi_wq serves all asynchronous writeback tasks */
  34 struct workqueue_struct *bdi_wq;
  35
  36 #define K(x) ((x) << (PAGE_SHIFT - 10))
  37
  38 #ifdef CONFIG_DEBUG_FS
  39 #include <linux/debugfs.h>
  40 #include <linux/seq_file.h>
  41
  42 static struct dentry *bdi_debug_root;
  43
  44 static void bdi_debug_init(void)
  45 {
  46         bdi_debug_root = debugfs_create_dir("bdi", NULL);
  47 }
  48
  49 static int bdi_debug_stats_show(struct seq_file *m, void *v)
  50 {
  51         struct backing_dev_info *bdi = m->private;
  52         struct bdi_writeback *wb = &bdi->wb;
  53         unsigned long background_thresh;
  54         unsigned long dirty_thresh;
  55         unsigned long wb_thresh;
  56         unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
  57         struct inode *inode;
  58
  59         nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
  60         spin_lock(&wb->list_lock);
  61         list_for_each_entry(inode, &wb->b_dirty, i_io_list)
  62                 nr_dirty++;
  63         list_for_each_entry(inode, &wb->b_io, i_io_list)
  64                 nr_io++;
  65         list_for_each_entry(inode, &wb->b_more_io, i_io_list)
  66                 nr_more_io++;
  67         list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
  68                 if (inode->i_state & I_DIRTY_TIME)
  69                         nr_dirty_time++;
  70         spin_unlock(&wb->list_lock);
  71
  72         global_dirty_limits(&background_thresh, &dirty_thresh);
  73         wb_thresh = wb_calc_thresh(wb, dirty_thresh);
  74
  75         seq_printf(m,
  76                    "BdiWriteback:       %10lu kB\n"
  77                    "BdiReclaimable:     %10lu kB\n"
  78                    "BdiDirtyThresh:     %10lu kB\n"
  79                    "DirtyThresh:        %10lu kB\n"
  80                    "BackgroundThresh:   %10lu kB\n"
  81                    "BdiDirtied:         %10lu kB\n"
  82                    "BdiWritten:         %10lu kB\n"
  83                    "BdiWriteBandwidth:  %10lu kBps\n"
  84                    "b_dirty:            %10lu\n"
  85                    "b_io:               %10lu\n"
  86                    "b_more_io:          %10lu\n"
  87                    "b_dirty_time:       %10lu\n"
  88                    "bdi_list:           %10u\n"
  89                    "state:              %10lx\n",
  90                    (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
  91                    (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
  92                    K(wb_thresh),
  93                    K(dirty_thresh),
  94                    K(background_thresh),
  95                    (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
  96                    (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
  97                    (unsigned long) K(wb->write_bandwidth),
  98                    nr_dirty,
  99                    nr_io,
 100                    nr_more_io,
 101                    nr_dirty_time,
 102                    !list_empty(&bdi->bdi_list), bdi->wb.state);
 103
 104         return 0;
 105 }
 106 DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
 107
 108 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 109 {
 110         bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
 111
 112         debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
 113                             &bdi_debug_stats_fops);
 114 }
 115
 116 static void bdi_debug_unregister(struct backing_dev_info *bdi)
 117 {
 118         debugfs_remove_recursive(bdi->debug_dir);
 119 }
 120 #else
 121 static inline void bdi_debug_init(void)
 122 {
 123 }
 124 static inline void bdi_debug_register(struct backing_dev_info *bdi,
 125                                       const char *name)
 126 {
 127 }
 128 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
 129 {
 130 }
 131 #endif
 132
 133 static ssize_t read_ahead_kb_store(struct device *dev,
 134                                   struct device_attribute *attr,
 135                                   const char *buf, size_t count)
 136 {
 137         struct backing_dev_info *bdi = dev_get_drvdata(dev);
 138         unsigned long read_ahead_kb;
 139         ssize_t ret;
 140
 141         ret = kstrtoul(buf, 10, &read_ahead_kb);
 142         if (ret < 0)
 143                 return ret;
 144
 145         bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
 146
 147         return count;
 148 }
 149
 150 #define BDI_SHOW(name, expr)                                            \
 151 static ssize_t name##_show(struct device *dev,                          \
 152                            struct device_attribute *attr, char *buf)    \
 153 {                                                                       \
 154         struct backing_dev_info *bdi = dev_get_drvdata(dev);            \
 155                                                                         \
 156         return sysfs_emit(buf, "%lld\n", (long long)expr);              \
 157 }                                                                       \
 158 static DEVICE_ATTR_RW(name);
 159
 160 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
 161
 162 static ssize_t min_ratio_store(struct device *dev,
 163                 struct device_attribute *attr, const char *buf, size_t count)
 164 {
 165         struct backing_dev_info *bdi = dev_get_drvdata(dev);
 166         unsigned int ratio;
 167         ssize_t ret;
 168
 169         ret = kstrtouint(buf, 10, &ratio);
 170         if (ret < 0)
 171                 return ret;
 172
 173         ret = bdi_set_min_ratio(bdi, ratio);
 174         if (!ret)
 175                 ret = count;
 176
 177         return ret;
 178 }
 179 BDI_SHOW(min_ratio, bdi->min_ratio)
 180
 181 static ssize_t max_ratio_store(struct device *dev,
 182                 struct device_attribute *attr, const char *buf, size_t count)
 183 {
 184         struct backing_dev_info *bdi = dev_get_drvdata(dev);
 185         unsigned int ratio;
 186         ssize_t ret;
 187
 188         ret = kstrtouint(buf, 10, &ratio);
 189         if (ret < 0)
 190                 return ret;
 191
 192         ret = bdi_set_max_ratio(bdi, ratio);
 193         if (!ret)
 194                 ret = count;
 195
 196         return ret;
 197 }
 198 BDI_SHOW(max_ratio, bdi->max_ratio)
 199
 200 static ssize_t stable_pages_required_show(struct device *dev,
 201                                           struct device_attribute *attr,
 202                                           char *buf)
 203 {
 204         dev_warn_once(dev,
 205                 "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
 206         return sysfs_emit(buf, "%d\n", 0);
 207 }
 208 static DEVICE_ATTR_RO(stable_pages_required);
 209
/*
 * Attributes attached to every device of the bdi class.  ATTRIBUTE_GROUPS()
 * wraps this NULL-terminated table into bdi_dev_groups, which
 * bdi_class_init() installs as the class' dev_groups.
 */
static struct attribute *bdi_dev_attrs[] = {
        &dev_attr_read_ahead_kb.attr,
        &dev_attr_min_ratio.attr,
        &dev_attr_max_ratio.attr,
        &dev_attr_stable_pages_required.attr,
        NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);
 218
 219 static __init int bdi_class_init(void)
 220 {
 221         bdi_class = class_create(THIS_MODULE, "bdi");
 222         if (IS_ERR(bdi_class))
 223                 return PTR_ERR(bdi_class);
 224
 225         bdi_class->dev_groups = bdi_dev_groups;
 226         bdi_debug_init();
 227
 228         return 0;
 229 }
 230 postcore_initcall(bdi_class_init);
 231
 232 static int bdi_init(struct backing_dev_info *bdi);
 233
 234 static int __init default_bdi_init(void)
 235 {
 236         int err;
 237
 238         bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
 239                                  WQ_SYSFS, 0);
 240         if (!bdi_wq)
 241                 return -ENOMEM;
 242
 243         err = bdi_init(&noop_backing_dev_info);
 244
 245         return err;
 246 }
 247 subsys_initcall(default_bdi_init);
 248
 249 /*
 250  * This function is used when the first inode for this wb is marked dirty. It
 251  * wakes-up the corresponding bdi thread which should then take care of the
 252  * periodic background write-out of dirty inodes. Since the write-out would
 253  * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
 254  * set up a timer which wakes the bdi thread up later.
 255  *
 256  * Note, we wouldn't bother setting up the timer, but this function is on the
 257  * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
 258  * by delaying the wake-up.
 259  *
 260  * We have to be careful not to postpone flush work if it is scheduled for
 261  * earlier. Thus we use queue_delayed_work().
 262  */
 263 void wb_wakeup_delayed(struct bdi_writeback *wb)
 264 {
 265         unsigned long timeout;
 266
 267         timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
 268         spin_lock_bh(&wb->work_lock);
 269         if (test_bit(WB_registered, &wb->state))
 270                 queue_delayed_work(bdi_wq, &wb->dwork, timeout);
 271         spin_unlock_bh(&wb->work_lock);
 272 }
 273
 274 /*
 275  * Initial write bandwidth: 100 MB/s
 276  */
 277 #define INIT_BW         (100 << (20 - PAGE_SHIFT))
 278
 279 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 280                    gfp_t gfp)
 281 {
 282         int i, err;
 283
 284         memset(wb, 0, sizeof(*wb));
 285
 286         if (wb != &bdi->wb)
 287                 bdi_get(bdi);
 288         wb->bdi = bdi;
 289         wb->last_old_flush = jiffies;
 290         INIT_LIST_HEAD(&wb->b_dirty);
 291         INIT_LIST_HEAD(&wb->b_io);
 292         INIT_LIST_HEAD(&wb->b_more_io);
 293         INIT_LIST_HEAD(&wb->b_dirty_time);
 294         spin_lock_init(&wb->list_lock);
 295
 296         atomic_set(&wb->writeback_inodes, 0);
 297         wb->bw_time_stamp = jiffies;
 298         wb->balanced_dirty_ratelimit = INIT_BW;
 299         wb->dirty_ratelimit = INIT_BW;
 300         wb->write_bandwidth = INIT_BW;
 301         wb->avg_write_bandwidth = INIT_BW;
 302
 303         spin_lock_init(&wb->work_lock);
 304         INIT_LIST_HEAD(&wb->work_list);
 305         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 306         wb->dirty_sleep = jiffies;
 307
 308         err = fprop_local_init_percpu(&wb->completions, gfp);
 309         if (err)
 310                 goto out_put_bdi;
 311
 312         for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
 313                 err = percpu_counter_init(&wb->stat[i], 0, gfp);
 314                 if (err)
 315                         goto out_destroy_stat;
 316         }
 317
 318         return 0;
 319
 320 out_destroy_stat:
 321         while (i--)
 322                 percpu_counter_destroy(&wb->stat[i]);
 323         fprop_local_destroy_percpu(&wb->completions);
 324 out_put_bdi:
 325         if (wb != &bdi->wb)
 326                 bdi_put(bdi);
 327         return err;
 328 }
 329
 330 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
 331
 332 /*
 333  * Remove bdi from the global list and shutdown any threads we have running
 334  */
 335 static void wb_shutdown(struct bdi_writeback *wb)
 336 {
 337         /* Make sure nobody queues further work */
 338         spin_lock_bh(&wb->work_lock);
 339         if (!test_and_clear_bit(WB_registered, &wb->state)) {
 340                 spin_unlock_bh(&wb->work_lock);
 341                 return;
 342         }
 343         spin_unlock_bh(&wb->work_lock);
 344
 345         cgwb_remove_from_bdi_list(wb);
 346         /*
 347          * Drain work list and shutdown the delayed_work.  !WB_registered
 348          * tells wb_workfn() that @wb is dying and its work_list needs to
 349          * be drained no matter what.
 350          */
 351         mod_delayed_work(bdi_wq, &wb->dwork, 0);
 352         flush_delayed_work(&wb->dwork);
 353         WARN_ON(!list_empty(&wb->work_list));
 354 }
 355
 356 static void wb_exit(struct bdi_writeback *wb)
 357 {
 358         int i;
 359
 360         WARN_ON(delayed_work_pending(&wb->dwork));
 361
 362         for (i = 0; i < NR_WB_STAT_ITEMS; i++)
 363                 percpu_counter_destroy(&wb->stat[i]);
 364
 365         fprop_local_destroy_percpu(&wb->completions);
 366         if (wb != &wb->bdi->wb)
 367                 bdi_put(wb->bdi);
 368 }
 369
 370 #ifdef CONFIG_CGROUP_WRITEBACK
 371
 372 #include <linux/memcontrol.h>
 373
 374 /*
 375  * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 376  * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
 377  */
 378 static DEFINE_SPINLOCK(cgwb_lock);
 379 static struct workqueue_struct *cgwb_release_wq;
 380
 381 static LIST_HEAD(offline_cgwbs);
 382 static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
 383 static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
 384
 385 static void cgwb_release_workfn(struct work_struct *work)
 386 {
 387         struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
 388                                                 release_work);
 389         struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
 390
 391         mutex_lock(&wb->bdi->cgwb_release_mutex);
 392         wb_shutdown(wb);
 393
 394         css_put(wb->memcg_css);
 395         css_put(wb->blkcg_css);
 396         mutex_unlock(&wb->bdi->cgwb_release_mutex);
 397
 398         /* triggers blkg destruction if no online users left */
 399         blkcg_unpin_online(blkcg);
 400
 401         fprop_local_destroy_percpu(&wb->memcg_completions);
 402
 403         spin_lock_irq(&cgwb_lock);
 404         list_del(&wb->offline_node);
 405         spin_unlock_irq(&cgwb_lock);
 406
 407         percpu_ref_exit(&wb->refcnt);
 408         wb_exit(wb);
 409         WARN_ON_ONCE(!list_empty(&wb->b_attached));
 410         kfree_rcu(wb, rcu);
 411 }
 412
 413 static void cgwb_release(struct percpu_ref *refcnt)
 414 {
 415         struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
 416                                                 refcnt);
 417         queue_work(cgwb_release_wq, &wb->release_work);
 418 }
 419
 420 static void cgwb_kill(struct bdi_writeback *wb)
 421 {
 422         lockdep_assert_held(&cgwb_lock);
 423
 424         WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
 425         list_del(&wb->memcg_node);
 426         list_del(&wb->blkcg_node);
 427         list_add(&wb->offline_node, &offline_cgwbs);
 428         percpu_ref_kill(&wb->refcnt);
 429 }
 430
 431 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
 432 {
 433         spin_lock_irq(&cgwb_lock);
 434         list_del_rcu(&wb->bdi_node);
 435         spin_unlock_irq(&cgwb_lock);
 436 }
 437
 438 static int cgwb_create(struct backing_dev_info *bdi,
 439                        struct cgroup_subsys_state *memcg_css, gfp_t gfp)
 440 {
 441         struct mem_cgroup *memcg;
 442         struct cgroup_subsys_state *blkcg_css;
 443         struct blkcg *blkcg;
 444         struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
 445         struct bdi_writeback *wb;
 446         unsigned long flags;
 447         int ret = 0;
 448
 449         memcg = mem_cgroup_from_css(memcg_css);
 450         blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
 451         blkcg = css_to_blkcg(blkcg_css);
 452         memcg_cgwb_list = &memcg->cgwb_list;
 453         blkcg_cgwb_list = &blkcg->cgwb_list;
 454
 455         /* look up again under lock and discard on blkcg mismatch */
 456         spin_lock_irqsave(&cgwb_lock, flags);
 457         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
 458         if (wb && wb->blkcg_css != blkcg_css) {
 459                 cgwb_kill(wb);
 460                 wb = NULL;
 461         }
 462         spin_unlock_irqrestore(&cgwb_lock, flags);
 463         if (wb)
 464                 goto out_put;
 465
 466         /* need to create a new one */
 467         wb = kmalloc(sizeof(*wb), gfp);
 468         if (!wb) {
 469                 ret = -ENOMEM;
 470                 goto out_put;
 471         }
 472
 473         ret = wb_init(wb, bdi, gfp);
 474         if (ret)
 475                 goto err_free;
 476
 477         ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
 478         if (ret)
 479                 goto err_wb_exit;
 480
 481         ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
 482         if (ret)
 483                 goto err_ref_exit;
 484
 485         wb->memcg_css = memcg_css;
 486         wb->blkcg_css = blkcg_css;
 487         INIT_LIST_HEAD(&wb->b_attached);
 488         INIT_WORK(&wb->release_work, cgwb_release_workfn);
 489         set_bit(WB_registered, &wb->state);
 490
 491         /*
 492          * The root wb determines the registered state of the whole bdi and
 493          * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
 494          * whether they're still online.  Don't link @wb if any is dead.
 495          * See wb_memcg_offline() and wb_blkcg_offline().
 496          */
 497         ret = -ENODEV;
 498         spin_lock_irqsave(&cgwb_lock, flags);
 499         if (test_bit(WB_registered, &bdi->wb.state) &&
 500             blkcg_cgwb_list->next && memcg_cgwb_list->next) {
 501                 /* we might have raced another instance of this function */
 502                 ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
 503                 if (!ret) {
 504                         list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
 505                         list_add(&wb->memcg_node, memcg_cgwb_list);
 506                         list_add(&wb->blkcg_node, blkcg_cgwb_list);
 507                         blkcg_pin_online(blkcg);
 508                         css_get(memcg_css);
 509                         css_get(blkcg_css);
 510                 }
 511         }
 512         spin_unlock_irqrestore(&cgwb_lock, flags);
 513         if (ret) {
 514                 if (ret == -EEXIST)
 515                         ret = 0;
 516                 goto err_fprop_exit;
 517         }
 518         goto out_put;
 519
 520 err_fprop_exit:
 521         fprop_local_destroy_percpu(&wb->memcg_completions);
 522 err_ref_exit:
 523         percpu_ref_exit(&wb->refcnt);
 524 err_wb_exit:
 525         wb_exit(wb);
 526 err_free:
 527         kfree(wb);
 528 out_put:
 529         css_put(blkcg_css);
 530         return ret;
 531 }
 532
 533 /**
 534  * wb_get_lookup - get wb for a given memcg
 535  * @bdi: target bdi
 536  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 537  *
 538  * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 539  * refcount incremented.
 540  *
 541  * This function uses css_get() on @memcg_css and thus expects its refcnt
 542  * to be positive on invocation.  IOW, rcu_read_lock() protection on
 543  * @memcg_css isn't enough.  try_get it before calling this function.
 544  *
 545  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 546  * memcg on the default hierarchy, memcg association is guaranteed to be
 547  * more specific (equal or descendant to the associated blkcg) and thus can
 548  * identify both the memcg and blkcg associations.
 549  *
 550  * Because the blkcg associated with a memcg may change as blkcg is enabled
 551  * and disabled closer to root in the hierarchy, each wb keeps track of
 552  * both the memcg and blkcg associated with it and verifies the blkcg on
 553  * each lookup.  On mismatch, the existing wb is discarded and a new one is
 554  * created.
 555  */
 556 struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
 557                                     struct cgroup_subsys_state *memcg_css)
 558 {
 559         struct bdi_writeback *wb;
 560
 561         if (!memcg_css->parent)
 562                 return &bdi->wb;
 563
 564         rcu_read_lock();
 565         wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
 566         if (wb) {
 567                 struct cgroup_subsys_state *blkcg_css;
 568
 569                 /* see whether the blkcg association has changed */
 570                 blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
 571                 if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
 572                         wb = NULL;
 573                 css_put(blkcg_css);
 574         }
 575         rcu_read_unlock();
 576
 577         return wb;
 578 }
 579
 580 /**
 581  * wb_get_create - get wb for a given memcg, create if necessary
 582  * @bdi: target bdi
 583  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 584  * @gfp: allocation mask to use
 585  *
 586  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 587  * create one.  See wb_get_lookup() for more details.
 588  */
 589 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 590                                     struct cgroup_subsys_state *memcg_css,
 591                                     gfp_t gfp)
 592 {
 593         struct bdi_writeback *wb;
 594
 595         might_alloc(gfp);
 596
 597         if (!memcg_css->parent)
 598                 return &bdi->wb;
 599
 600         do {
 601                 wb = wb_get_lookup(bdi, memcg_css);
 602         } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
 603
 604         return wb;
 605 }
 606
 607 static int cgwb_bdi_init(struct backing_dev_info *bdi)
 608 {
 609         int ret;
 610
 611         INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 612         mutex_init(&bdi->cgwb_release_mutex);
 613         init_rwsem(&bdi->wb_switch_rwsem);
 614
 615         ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
 616         if (!ret) {
 617                 bdi->wb.memcg_css = &root_mem_cgroup->css;
 618                 bdi->wb.blkcg_css = blkcg_root_css;
 619         }
 620         return ret;
 621 }
 622
 623 static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
 624 {
 625         struct radix_tree_iter iter;
 626         void **slot;
 627         struct bdi_writeback *wb;
 628
 629         WARN_ON(test_bit(WB_registered, &bdi->wb.state));
 630
 631         spin_lock_irq(&cgwb_lock);
 632         radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
 633                 cgwb_kill(*slot);
 634         spin_unlock_irq(&cgwb_lock);
 635
 636         mutex_lock(&bdi->cgwb_release_mutex);
 637         spin_lock_irq(&cgwb_lock);
 638         while (!list_empty(&bdi->wb_list)) {
 639                 wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
 640                                       bdi_node);
 641                 spin_unlock_irq(&cgwb_lock);
 642                 wb_shutdown(wb);
 643                 spin_lock_irq(&cgwb_lock);
 644         }
 645         spin_unlock_irq(&cgwb_lock);
 646         mutex_unlock(&bdi->cgwb_release_mutex);
 647 }
 648
 649 /*
 650  * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 651  *
 652  * Try to release dying cgwbs by switching attached inodes to the nearest
 653  * living ancestor's writeback. Processed wbs are placed at the end
 654  * of the list to guarantee the forward progress.
 655  */
 656 static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
 657 {
 658         struct bdi_writeback *wb;
 659         LIST_HEAD(processed);
 660
 661         spin_lock_irq(&cgwb_lock);
 662
 663         while (!list_empty(&offline_cgwbs)) {
 664                 wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
 665                                       offline_node);
 666                 list_move(&wb->offline_node, &processed);
 667
 668                 /*
 669                  * If wb is dirty, cleaning up the writeback by switching
 670                  * attached inodes will result in an effective removal of any
 671                  * bandwidth restrictions, which isn't the goal.  Instead,
 672                  * it can be postponed until the next time, when all io
 673                  * will be likely completed.  If in the meantime some inodes
 674                  * will get re-dirtied, they should be eventually switched to
 675                  * a new cgwb.
 676                  */
 677                 if (wb_has_dirty_io(wb))
 678                         continue;
 679
 680                 if (!wb_tryget(wb))
 681                         continue;
 682
 683                 spin_unlock_irq(&cgwb_lock);
 684                 while (cleanup_offline_cgwb(wb))
 685                         cond_resched();
 686                 spin_lock_irq(&cgwb_lock);
 687
 688                 wb_put(wb);
 689         }
 690
 691         if (!list_empty(&processed))
 692                 list_splice_tail(&processed, &offline_cgwbs);
 693
 694         spin_unlock_irq(&cgwb_lock);
 695 }
 696
 697 /**
 698  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 699  * @memcg: memcg being offlined
 700  *
 701  * Also prevents creation of any new wb's associated with @memcg.
 702  */
 703 void wb_memcg_offline(struct mem_cgroup *memcg)
 704 {
 705         struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
 706         struct bdi_writeback *wb, *next;
 707
 708         spin_lock_irq(&cgwb_lock);
 709         list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
 710                 cgwb_kill(wb);
 711         memcg_cgwb_list->next = NULL;   /* prevent new wb's */
 712         spin_unlock_irq(&cgwb_lock);
 713
 714         queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
 715 }
 716
 717 /**
 718  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 719  * @blkcg: blkcg being offlined
 720  *
 721  * Also prevents creation of any new wb's associated with @blkcg.
 722  */
 723 void wb_blkcg_offline(struct blkcg *blkcg)
 724 {
 725         struct bdi_writeback *wb, *next;
 726
 727         spin_lock_irq(&cgwb_lock);
 728         list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
 729                 cgwb_kill(wb);
 730         blkcg->cgwb_list.next = NULL;   /* prevent new wb's */
 731         spin_unlock_irq(&cgwb_lock);
 732 }
 733
 734 static void cgwb_bdi_register(struct backing_dev_info *bdi)
 735 {
 736         spin_lock_irq(&cgwb_lock);
 737         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
 738         spin_unlock_irq(&cgwb_lock);
 739 }
 740
 741 static int __init cgwb_init(void)
 742 {
 743         /*
 744          * There can be many concurrent release work items overwhelming
 745          * system_wq.  Put them in a separate wq and limit concurrency.
 746          * There's no point in executing many of these in parallel.
 747          */
 748         cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
 749         if (!cgwb_release_wq)
 750                 return -ENOMEM;
 751
 752         return 0;
 753 }
 754 subsys_initcall(cgwb_init);
 755
 756 #else   /* CONFIG_CGROUP_WRITEBACK */
 757
 758 static int cgwb_bdi_init(struct backing_dev_info *bdi)
 759 {
 760         return wb_init(&bdi->wb, bdi, GFP_KERNEL);
 761 }
 762
 763 static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
 764
 765 static void cgwb_bdi_register(struct backing_dev_info *bdi)
 766 {
 767         list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
 768 }
 769
 770 static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
 771 {
 772         list_del_rcu(&wb->bdi_node);
 773 }
 774
 775 #endif  /* CONFIG_CGROUP_WRITEBACK */
 776
 777 static int bdi_init(struct backing_dev_info *bdi)
 778 {
 779         int ret;
 780
 781         bdi->dev = NULL;
 782
 783         kref_init(&bdi->refcnt);
 784         bdi->min_ratio = 0;
 785         bdi->max_ratio = 100;
 786         bdi->max_prop_frac = FPROP_FRAC_BASE;
 787         INIT_LIST_HEAD(&bdi->bdi_list);
 788         INIT_LIST_HEAD(&bdi->wb_list);
 789         init_waitqueue_head(&bdi->wb_waitq);
 790
 791         ret = cgwb_bdi_init(bdi);
 792
 793         return ret;
 794 }
 795
 796 struct backing_dev_info *bdi_alloc(int node_id)
 797 {
 798         struct backing_dev_info *bdi;
 799
 800         bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
 801         if (!bdi)
 802                 return NULL;
 803
 804         if (bdi_init(bdi)) {
 805                 kfree(bdi);
 806                 return NULL;
 807         }
 808         bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
 809         bdi->ra_pages = VM_READAHEAD_PAGES;
 810         bdi->io_pages = VM_READAHEAD_PAGES;
 811         return bdi;
 812 }
 813 EXPORT_SYMBOL(bdi_alloc);
 814
 815 static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
 816 {
 817         struct rb_node **p = &bdi_tree.rb_node;
 818         struct rb_node *parent = NULL;
 819         struct backing_dev_info *bdi;
 820
 821         lockdep_assert_held(&bdi_lock);
 822
 823         while (*p) {
 824                 parent = *p;
 825                 bdi = rb_entry(parent, struct backing_dev_info, rb_node);
 826
 827                 if (bdi->id > id)
 828                         p = &(*p)->rb_left;
 829                 else if (bdi->id < id)
 830                         p = &(*p)->rb_right;
 831                 else
 832                         break;
 833         }
 834
 835         if (parentp)
 836                 *parentp = parent;
 837         return p;
 838 }
 839
 840 /**
 841  * bdi_get_by_id - lookup and get bdi from its id
 842  * @id: bdi id to lookup
 843  *
 844  * Find bdi matching @id and get it.  Returns NULL if the matching bdi
 845  * doesn't exist or is already unregistered.
 846  */
 847 struct backing_dev_info *bdi_get_by_id(u64 id)
 848 {
 849         struct backing_dev_info *bdi = NULL;
 850         struct rb_node **p;
 851
 852         spin_lock_bh(&bdi_lock);
 853         p = bdi_lookup_rb_node(id, NULL);
 854         if (*p) {
 855                 bdi = rb_entry(*p, struct backing_dev_info, rb_node);
 856                 bdi_get(bdi);
 857         }
 858         spin_unlock_bh(&bdi_lock);
 859
 860         return bdi;
 861 }
 862
 863 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 864 {
 865         struct device *dev;
 866         struct rb_node *parent, **p;
 867
 868         if (bdi->dev)   /* The driver needs to use separate queues per device */
 869                 return 0;
 870
 871         vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
 872         dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
 873         if (IS_ERR(dev))
 874                 return PTR_ERR(dev);
 875
 876         cgwb_bdi_register(bdi);
 877         bdi->dev = dev;
 878
 879         bdi_debug_register(bdi, dev_name(dev));
 880         set_bit(WB_registered, &bdi->wb.state);
 881
 882         spin_lock_bh(&bdi_lock);
 883
 884         bdi->id = ++bdi_id_cursor;
 885
 886         p = bdi_lookup_rb_node(bdi->id, &parent);
 887         rb_link_node(&bdi->rb_node, parent, p);
 888         rb_insert_color(&bdi->rb_node, &bdi_tree);
 889
 890         list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
 891
 892         spin_unlock_bh(&bdi_lock);
 893
 894         trace_writeback_bdi_register(bdi);
 895         return 0;
 896 }
 897
 898 int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
 899 {
 900         va_list args;
 901         int ret;
 902
 903         va_start(args, fmt);
 904         ret = bdi_register_va(bdi, fmt, args);
 905         va_end(args);
 906         return ret;
 907 }
 908 EXPORT_SYMBOL(bdi_register);
 909
 910 void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
 911 {
 912         WARN_ON_ONCE(bdi->owner);
 913         bdi->owner = owner;
 914         get_device(owner);
 915 }
 916
 917 /*
 918  * Remove bdi from bdi_list, and ensure that it is no longer visible
 919  */
 920 static void bdi_remove_from_list(struct backing_dev_info *bdi)
 921 {
 922         spin_lock_bh(&bdi_lock);
 923         rb_erase(&bdi->rb_node, &bdi_tree);
 924         list_del_rcu(&bdi->bdi_list);
 925         spin_unlock_bh(&bdi_lock);
 926
 927         synchronize_rcu_expedited();
 928 }
 929
 930 void bdi_unregister(struct backing_dev_info *bdi)
 931 {
 932         /* make sure nobody finds us on the bdi_list anymore */
 933         bdi_remove_from_list(bdi);
 934         wb_shutdown(&bdi->wb);
 935         cgwb_bdi_unregister(bdi);
 936
 937         if (bdi->dev) {
 938                 bdi_debug_unregister(bdi);
 939                 device_unregister(bdi->dev);
 940                 bdi->dev = NULL;
 941         }
 942
 943         if (bdi->owner) {
 944                 put_device(bdi->owner);
 945                 bdi->owner = NULL;
 946         }
 947 }
 948
 949 static void release_bdi(struct kref *ref)
 950 {
 951         struct backing_dev_info *bdi =
 952                         container_of(ref, struct backing_dev_info, refcnt);
 953
 954         if (test_bit(WB_registered, &bdi->wb.state))
 955                 bdi_unregister(bdi);
 956         WARN_ON_ONCE(bdi->dev);
 957         wb_exit(&bdi->wb);
 958         kfree(bdi);
 959 }
 960
 961 void bdi_put(struct backing_dev_info *bdi)
 962 {
 963         kref_put(&bdi->refcnt, release_bdi);
 964 }
 965 EXPORT_SYMBOL(bdi_put);
 966
 967 const char *bdi_dev_name(struct backing_dev_info *bdi)
 968 {
 969         if (!bdi || !bdi->dev)
 970                 return bdi_unknown_name;
 971         return bdi->dev_name;
 972 }
 973 EXPORT_SYMBOL_GPL(bdi_dev_name);
 974
 975 static wait_queue_head_t congestion_wqh[2] = {
 976                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 977                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 978         };
 979 static atomic_t nr_wb_congested[2];
 980
 981 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 982 {
 983         wait_queue_head_t *wqh = &congestion_wqh[sync];
 984         enum wb_congested_state bit;
 985
 986         bit = sync ? WB_sync_congested : WB_async_congested;
 987         if (test_and_clear_bit(bit, &bdi->wb.congested))
 988                 atomic_dec(&nr_wb_congested[sync]);
 989         smp_mb__after_atomic();
 990         if (waitqueue_active(wqh))
 991                 wake_up(wqh);
 992 }
 993 EXPORT_SYMBOL(clear_bdi_congested);
 994
 995 void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 996 {
 997         enum wb_congested_state bit;
 998
 999         bit = sync ? WB_sync_congested : WB_async_congested;
1000         if (!test_and_set_bit(bit, &bdi->wb.congested))
1001                 atomic_inc(&nr_wb_congested[sync]);
1002 }
1003 EXPORT_SYMBOL(set_bdi_congested);
1004
1005 /**
1006  * congestion_wait - wait for a backing_dev to become uncongested
1007  * @sync: SYNC or ASYNC IO
1008  * @timeout: timeout in jiffies
1009  *
1010  * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
1011  * write congestion.  If no backing_devs are congested then just wait for the
1012  * next write to be completed.
1013  */
1014 long congestion_wait(int sync, long timeout)
1015 {
1016         long ret;
1017         unsigned long start = jiffies;
1018         DEFINE_WAIT(wait);
1019         wait_queue_head_t *wqh = &congestion_wqh[sync];
1020
1021         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1022         ret = io_schedule_timeout(timeout);
1023         finish_wait(wqh, &wait);
1024
1025         trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
1026                                         jiffies_to_usecs(jiffies - start));
1027
1028         return ret;
1029 }
1030 EXPORT_SYMBOL(congestion_wait);
1031
1032 /**
1033  * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
1034  * @sync: SYNC or ASYNC IO
1035  * @timeout: timeout in jiffies
1036  *
1037  * In the event of a congested backing_dev (any backing_dev) this waits
1038  * for up to @timeout jiffies for either a BDI to exit congestion of the
1039  * given @sync queue or a write to complete.
1040  *
1041  * The return value is 0 if the sleep is for the full timeout. Otherwise,
1042  * it is the number of jiffies that were still remaining when the function
1043  * returned. return_value == timeout implies the function did not sleep.
1044  */
1045 long wait_iff_congested(int sync, long timeout)
1046 {
1047         long ret;
1048         unsigned long start = jiffies;
1049         DEFINE_WAIT(wait);
1050         wait_queue_head_t *wqh = &congestion_wqh[sync];
1051
1052         /*
1053          * If there is no congestion, yield if necessary instead
1054          * of sleeping on the congestion queue
1055          */
1056         if (atomic_read(&nr_wb_congested[sync]) == 0) {
1057                 cond_resched();
1058
1059                 /* In case we scheduled, work out time remaining */
1060                 ret = timeout - (jiffies - start);
1061                 if (ret < 0)
1062                         ret = 0;
1063
1064                 goto out;
1065         }
1066
1067         /* Sleep until uncongested or a write happens */
1068         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1069         ret = io_schedule_timeout(timeout);
1070         finish_wait(wqh, &wait);
1071
1072 out:
1073         trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
1074                                         jiffies_to_usecs(jiffies - start));
1075
1076         return ret;
1077 }
1078 EXPORT_SYMBOL(wait_iff_congested);