fs/btrfs/qgroup.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2011 STRATO.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/writeback.h>
   9 #include <linux/blkdev.h>
  10 #include <linux/rbtree.h>
  11 #include <linux/slab.h>
  12 #include <linux/workqueue.h>
  13 #include <linux/btrfs.h>
  14 #include <linux/sizes.h>
  15
  16 #include "ctree.h"
  17 #include "transaction.h"
  18 #include "disk-io.h"
  19 #include "locking.h"
  20 #include "ulist.h"
  21 #include "backref.h"
  22 #include "extent_io.h"
  23 #include "qgroup.h"
  24 #include "block-group.h"
  25 #include "sysfs.h"
  26
  27 /* TODO XXX FIXME
  28  *  - subvol delete -> delete when ref goes to 0? delete limits also?
  29  *  - reorganize keys
  30  *  - compressed
  31  *  - sync
  32  *  - copy also limits on subvol creation
  33  *  - limit
  34  *  - caches for ulists
  35  *  - performance benchmarks
  36  *  - check all ioctl parameters
  37  */
  38
  39 /*
  40  * Helpers to access qgroup reservation
  41  *
  42  * Callers should ensure the lock context and type are valid
  43  */
  44
  45 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
  46 {
  47         u64 ret = 0;
  48         int i;
  49
  50         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
  51                 ret += qgroup->rsv.values[i];
  52
  53         return ret;
  54 }
  55
  56 #ifdef CONFIG_BTRFS_DEBUG
  57 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
  58 {
  59         if (type == BTRFS_QGROUP_RSV_DATA)
  60                 return "data";
  61         if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
  62                 return "meta_pertrans";
  63         if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
  64                 return "meta_prealloc";
  65         return NULL;
  66 }
  67 #endif
  68
  69 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
  70                            struct btrfs_qgroup *qgroup, u64 num_bytes,
  71                            enum btrfs_qgroup_rsv_type type)
  72 {
  73         trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
  74         qgroup->rsv.values[type] += num_bytes;
  75 }
  76
  77 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
  78                                struct btrfs_qgroup *qgroup, u64 num_bytes,
  79                                enum btrfs_qgroup_rsv_type type)
  80 {
  81         trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
  82         if (qgroup->rsv.values[type] >= num_bytes) {
  83                 qgroup->rsv.values[type] -= num_bytes;
  84                 return;
  85         }
  86 #ifdef CONFIG_BTRFS_DEBUG
  87         WARN_RATELIMIT(1,
  88                 "qgroup %llu %s reserved space underflow, have %llu to free %llu",
  89                 qgroup->qgroupid, qgroup_rsv_type_str(type),
  90                 qgroup->rsv.values[type], num_bytes);
  91 #endif
  92         qgroup->rsv.values[type] = 0;
  93 }
  94
  95 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
  96                                      struct btrfs_qgroup *dest,
  97                                      struct btrfs_qgroup *src)
  98 {
  99         int i;
 100
 101         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
 102                 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
 103 }
 104
 105 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
 106                                          struct btrfs_qgroup *dest,
 107                                           struct btrfs_qgroup *src)
 108 {
 109         int i;
 110
 111         for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
 112                 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
 113 }
 114
 115 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
 116                                            int mod)
 117 {
 118         if (qg->old_refcnt < seq)
 119                 qg->old_refcnt = seq;
 120         qg->old_refcnt += mod;
 121 }
 122
 123 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
 124                                            int mod)
 125 {
 126         if (qg->new_refcnt < seq)
 127                 qg->new_refcnt = seq;
 128         qg->new_refcnt += mod;
 129 }
 130
 131 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
 132 {
 133         if (qg->old_refcnt < seq)
 134                 return 0;
 135         return qg->old_refcnt - seq;
 136 }
 137
 138 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
 139 {
 140         if (qg->new_refcnt < seq)
 141                 return 0;
 142         return qg->new_refcnt - seq;
 143 }
 144
 145 /*
 146  * glue structure to represent the relations between qgroups.
 147  */
 148 struct btrfs_qgroup_list {
 149         struct list_head next_group;
 150         struct list_head next_member;
 151         struct btrfs_qgroup *group;
 152         struct btrfs_qgroup *member;
 153 };
 154
 155 static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
 156 {
 157         return (u64)(uintptr_t)qg;
 158 }
 159
 160 static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
 161 {
 162         return (struct btrfs_qgroup *)(uintptr_t)n->aux;
 163 }
 164
 165 static int
 166 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 167                    int init_flags);
 168 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
 169
 170 /* must be called with qgroup_ioctl_lock held */
 171 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
 172                                            u64 qgroupid)
 173 {
 174         struct rb_node *n = fs_info->qgroup_tree.rb_node;
 175         struct btrfs_qgroup *qgroup;
 176
 177         while (n) {
 178                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
 179                 if (qgroup->qgroupid < qgroupid)
 180                         n = n->rb_left;
 181                 else if (qgroup->qgroupid > qgroupid)
 182                         n = n->rb_right;
 183                 else
 184                         return qgroup;
 185         }
 186         return NULL;
 187 }
 188
 189 /* must be called with qgroup_lock held */
 190 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
 191                                           u64 qgroupid)
 192 {
 193         struct rb_node **p = &fs_info->qgroup_tree.rb_node;
 194         struct rb_node *parent = NULL;
 195         struct btrfs_qgroup *qgroup;
 196
 197         while (*p) {
 198                 parent = *p;
 199                 qgroup = rb_entry(parent, struct btrfs_qgroup, node);
 200
 201                 if (qgroup->qgroupid < qgroupid)
 202                         p = &(*p)->rb_left;
 203                 else if (qgroup->qgroupid > qgroupid)
 204                         p = &(*p)->rb_right;
 205                 else
 206                         return qgroup;
 207         }
 208
 209         qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
 210         if (!qgroup)
 211                 return ERR_PTR(-ENOMEM);
 212
 213         qgroup->qgroupid = qgroupid;
 214         INIT_LIST_HEAD(&qgroup->groups);
 215         INIT_LIST_HEAD(&qgroup->members);
 216         INIT_LIST_HEAD(&qgroup->dirty);
 217
 218         rb_link_node(&qgroup->node, parent, p);
 219         rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
 220
 221         return qgroup;
 222 }
 223
 224 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
 225                             struct btrfs_qgroup *qgroup)
 226 {
 227         struct btrfs_qgroup_list *list;
 228
 229         btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
 230         list_del(&qgroup->dirty);
 231         while (!list_empty(&qgroup->groups)) {
 232                 list = list_first_entry(&qgroup->groups,
 233                                         struct btrfs_qgroup_list, next_group);
 234                 list_del(&list->next_group);
 235                 list_del(&list->next_member);
 236                 kfree(list);
 237         }
 238
 239         while (!list_empty(&qgroup->members)) {
 240                 list = list_first_entry(&qgroup->members,
 241                                         struct btrfs_qgroup_list, next_member);
 242                 list_del(&list->next_group);
 243                 list_del(&list->next_member);
 244                 kfree(list);
 245         }
 246         kfree(qgroup);
 247 }
 248
 249 /* must be called with qgroup_lock held */
 250 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
 251 {
 252         struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
 253
 254         if (!qgroup)
 255                 return -ENOENT;
 256
 257         rb_erase(&qgroup->node, &fs_info->qgroup_tree);
 258         __del_qgroup_rb(fs_info, qgroup);
 259         return 0;
 260 }
 261
 262 /* must be called with qgroup_lock held */
 263 static int add_relation_rb(struct btrfs_fs_info *fs_info,
 264                            u64 memberid, u64 parentid)
 265 {
 266         struct btrfs_qgroup *member;
 267         struct btrfs_qgroup *parent;
 268         struct btrfs_qgroup_list *list;
 269
 270         member = find_qgroup_rb(fs_info, memberid);
 271         parent = find_qgroup_rb(fs_info, parentid);
 272         if (!member || !parent)
 273                 return -ENOENT;
 274
 275         list = kzalloc(sizeof(*list), GFP_ATOMIC);
 276         if (!list)
 277                 return -ENOMEM;
 278
 279         list->group = parent;
 280         list->member = member;
 281         list_add_tail(&list->next_group, &member->groups);
 282         list_add_tail(&list->next_member, &parent->members);
 283
 284         return 0;
 285 }
 286
 287 /* must be called with qgroup_lock held */
 288 static int del_relation_rb(struct btrfs_fs_info *fs_info,
 289                            u64 memberid, u64 parentid)
 290 {
 291         struct btrfs_qgroup *member;
 292         struct btrfs_qgroup *parent;
 293         struct btrfs_qgroup_list *list;
 294
 295         member = find_qgroup_rb(fs_info, memberid);
 296         parent = find_qgroup_rb(fs_info, parentid);
 297         if (!member || !parent)
 298                 return -ENOENT;
 299
 300         list_for_each_entry(list, &member->groups, next_group) {
 301                 if (list->group == parent) {
 302                         list_del(&list->next_group);
 303                         list_del(&list->next_member);
 304                         kfree(list);
 305                         return 0;
 306                 }
 307         }
 308         return -ENOENT;
 309 }
 310
 311 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 312 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 313                                u64 rfer, u64 excl)
 314 {
 315         struct btrfs_qgroup *qgroup;
 316
 317         qgroup = find_qgroup_rb(fs_info, qgroupid);
 318         if (!qgroup)
 319                 return -EINVAL;
 320         if (qgroup->rfer != rfer || qgroup->excl != excl)
 321                 return -EINVAL;
 322         return 0;
 323 }
 324 #endif
 325
 326 /*
 327  * The full config is read in one go, only called from open_ctree()
 328  * It doesn't use any locking, as at this point we're still single-threaded
 329  */
 330 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 331 {
 332         struct btrfs_key key;
 333         struct btrfs_key found_key;
 334         struct btrfs_root *quota_root = fs_info->quota_root;
 335         struct btrfs_path *path = NULL;
 336         struct extent_buffer *l;
 337         int slot;
 338         int ret = 0;
 339         u64 flags = 0;
 340         u64 rescan_progress = 0;
 341
 342         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 343                 return 0;
 344
 345         fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
 346         if (!fs_info->qgroup_ulist) {
 347                 ret = -ENOMEM;
 348                 goto out;
 349         }
 350
 351         path = btrfs_alloc_path();
 352         if (!path) {
 353                 ret = -ENOMEM;
 354                 goto out;
 355         }
 356
 357         ret = btrfs_sysfs_add_qgroups(fs_info);
 358         if (ret < 0)
 359                 goto out;
 360         /* default this to quota off, in case no status key is found */
 361         fs_info->qgroup_flags = 0;
 362
 363         /*
 364          * pass 1: read status, all qgroup infos and limits
 365          */
 366         key.objectid = 0;
 367         key.type = 0;
 368         key.offset = 0;
 369         ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
 370         if (ret)
 371                 goto out;
 372
 373         while (1) {
 374                 struct btrfs_qgroup *qgroup;
 375
 376                 slot = path->slots[0];
 377                 l = path->nodes[0];
 378                 btrfs_item_key_to_cpu(l, &found_key, slot);
 379
 380                 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
 381                         struct btrfs_qgroup_status_item *ptr;
 382
 383                         ptr = btrfs_item_ptr(l, slot,
 384                                              struct btrfs_qgroup_status_item);
 385
 386                         if (btrfs_qgroup_status_version(l, ptr) !=
 387                             BTRFS_QGROUP_STATUS_VERSION) {
 388                                 btrfs_err(fs_info,
 389                                  "old qgroup version, quota disabled");
 390                                 goto out;
 391                         }
 392                         if (btrfs_qgroup_status_generation(l, ptr) !=
 393                             fs_info->generation) {
 394                                 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 395                                 btrfs_err(fs_info,
 396                                         "qgroup generation mismatch, marked as inconsistent");
 397                         }
 398                         fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
 399                                                                           ptr);
 400                         rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
 401                         goto next1;
 402                 }
 403
 404                 if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
 405                     found_key.type != BTRFS_QGROUP_LIMIT_KEY)
 406                         goto next1;
 407
 408                 qgroup = find_qgroup_rb(fs_info, found_key.offset);
 409                 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
 410                     (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
 411                         btrfs_err(fs_info, "inconsistent qgroup config");
 412                         flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 413                 }
 414                 if (!qgroup) {
 415                         qgroup = add_qgroup_rb(fs_info, found_key.offset);
 416                         if (IS_ERR(qgroup)) {
 417                                 ret = PTR_ERR(qgroup);
 418                                 goto out;
 419                         }
 420                 }
 421                 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 422                 if (ret < 0)
 423                         goto out;
 424
 425                 switch (found_key.type) {
 426                 case BTRFS_QGROUP_INFO_KEY: {
 427                         struct btrfs_qgroup_info_item *ptr;
 428
 429                         ptr = btrfs_item_ptr(l, slot,
 430                                              struct btrfs_qgroup_info_item);
 431                         qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
 432                         qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
 433                         qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
 434                         qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
 435                         /* generation currently unused */
 436                         break;
 437                 }
 438                 case BTRFS_QGROUP_LIMIT_KEY: {
 439                         struct btrfs_qgroup_limit_item *ptr;
 440
 441                         ptr = btrfs_item_ptr(l, slot,
 442                                              struct btrfs_qgroup_limit_item);
 443                         qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
 444                         qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
 445                         qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
 446                         qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
 447                         qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
 448                         break;
 449                 }
 450                 }
 451 next1:
 452                 ret = btrfs_next_item(quota_root, path);
 453                 if (ret < 0)
 454                         goto out;
 455                 if (ret)
 456                         break;
 457         }
 458         btrfs_release_path(path);
 459
 460         /*
 461          * pass 2: read all qgroup relations
 462          */
 463         key.objectid = 0;
 464         key.type = BTRFS_QGROUP_RELATION_KEY;
 465         key.offset = 0;
 466         ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
 467         if (ret)
 468                 goto out;
 469         while (1) {
 470                 slot = path->slots[0];
 471                 l = path->nodes[0];
 472                 btrfs_item_key_to_cpu(l, &found_key, slot);
 473
 474                 if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
 475                         goto next2;
 476
 477                 if (found_key.objectid > found_key.offset) {
 478                         /* parent <- member, not needed to build config */
 479                         /* FIXME should we omit the key completely? */
 480                         goto next2;
 481                 }
 482
 483                 ret = add_relation_rb(fs_info, found_key.objectid,
 484                                       found_key.offset);
 485                 if (ret == -ENOENT) {
 486                         btrfs_warn(fs_info,
 487                                 "orphan qgroup relation 0x%llx->0x%llx",
 488                                 found_key.objectid, found_key.offset);
 489                         ret = 0;        /* ignore the error */
 490                 }
 491                 if (ret)
 492                         goto out;
 493 next2:
 494                 ret = btrfs_next_item(quota_root, path);
 495                 if (ret < 0)
 496                         goto out;
 497                 if (ret)
 498                         break;
 499         }
 500 out:
 501         fs_info->qgroup_flags |= flags;
 502         if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
 503                 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 504         else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
 505                  ret >= 0)
 506                 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
 507         btrfs_free_path(path);
 508
 509         if (ret < 0) {
 510                 ulist_free(fs_info->qgroup_ulist);
 511                 fs_info->qgroup_ulist = NULL;
 512                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 513                 btrfs_sysfs_del_qgroups(fs_info);
 514         }
 515
 516         return ret < 0 ? ret : 0;
 517 }
 518
 519 /*
 520  * Called in close_ctree() when quota is still enabled.  This verifies we don't
 521  * leak some reserved space.
 522  *
 523  * Return false if no reserved space is left.
 524  * Return true if some reserved space is leaked.
 525  */
 526 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
 527 {
 528         struct rb_node *node;
 529         bool ret = false;
 530
 531         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 532                 return ret;
 533         /*
 534          * Since we're unmounting, there is no race and no need to grab qgroup
 535          * lock.  And here we don't go post-order to provide a more user
 536          * friendly sorted result.
 537          */
 538         for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
 539                 struct btrfs_qgroup *qgroup;
 540                 int i;
 541
 542                 qgroup = rb_entry(node, struct btrfs_qgroup, node);
 543                 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
 544                         if (qgroup->rsv.values[i]) {
 545                                 ret = true;
 546                                 btrfs_warn(fs_info,
 547                 "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
 548                                    btrfs_qgroup_level(qgroup->qgroupid),
 549                                    btrfs_qgroup_subvolid(qgroup->qgroupid),
 550                                    i, qgroup->rsv.values[i]);
 551                         }
 552                 }
 553         }
 554         return ret;
 555 }
 556
 557 /*
 558  * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
 559  * first two are in single-threaded paths.And for the third one, we have set
 560  * quota_root to be null with qgroup_lock held before, so it is safe to clean
 561  * up the in-memory structures without qgroup_lock held.
 562  */
 563 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 564 {
 565         struct rb_node *n;
 566         struct btrfs_qgroup *qgroup;
 567
 568         while ((n = rb_first(&fs_info->qgroup_tree))) {
 569                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
 570                 rb_erase(n, &fs_info->qgroup_tree);
 571                 __del_qgroup_rb(fs_info, qgroup);
 572         }
 573         /*
 574          * We call btrfs_free_qgroup_config() when unmounting
 575          * filesystem and disabling quota, so we set qgroup_ulist
 576          * to be null here to avoid double free.
 577          */
 578         ulist_free(fs_info->qgroup_ulist);
 579         fs_info->qgroup_ulist = NULL;
 580         btrfs_sysfs_del_qgroups(fs_info);
 581 }
 582
 583 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
 584                                     u64 dst)
 585 {
 586         int ret;
 587         struct btrfs_root *quota_root = trans->fs_info->quota_root;
 588         struct btrfs_path *path;
 589         struct btrfs_key key;
 590
 591         path = btrfs_alloc_path();
 592         if (!path)
 593                 return -ENOMEM;
 594
 595         key.objectid = src;
 596         key.type = BTRFS_QGROUP_RELATION_KEY;
 597         key.offset = dst;
 598
 599         ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
 600
 601         btrfs_mark_buffer_dirty(path->nodes[0]);
 602
 603         btrfs_free_path(path);
 604         return ret;
 605 }
 606
 607 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
 608                                     u64 dst)
 609 {
 610         int ret;
 611         struct btrfs_root *quota_root = trans->fs_info->quota_root;
 612         struct btrfs_path *path;
 613         struct btrfs_key key;
 614
 615         path = btrfs_alloc_path();
 616         if (!path)
 617                 return -ENOMEM;
 618
 619         key.objectid = src;
 620         key.type = BTRFS_QGROUP_RELATION_KEY;
 621         key.offset = dst;
 622
 623         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
 624         if (ret < 0)
 625                 goto out;
 626
 627         if (ret > 0) {
 628                 ret = -ENOENT;
 629                 goto out;
 630         }
 631
 632         ret = btrfs_del_item(trans, quota_root, path);
 633 out:
 634         btrfs_free_path(path);
 635         return ret;
 636 }
 637
 638 static int add_qgroup_item(struct btrfs_trans_handle *trans,
 639                            struct btrfs_root *quota_root, u64 qgroupid)
 640 {
 641         int ret;
 642         struct btrfs_path *path;
 643         struct btrfs_qgroup_info_item *qgroup_info;
 644         struct btrfs_qgroup_limit_item *qgroup_limit;
 645         struct extent_buffer *leaf;
 646         struct btrfs_key key;
 647
 648         if (btrfs_is_testing(quota_root->fs_info))
 649                 return 0;
 650
 651         path = btrfs_alloc_path();
 652         if (!path)
 653                 return -ENOMEM;
 654
 655         key.objectid = 0;
 656         key.type = BTRFS_QGROUP_INFO_KEY;
 657         key.offset = qgroupid;
 658
 659         /*
 660          * Avoid a transaction abort by catching -EEXIST here. In that
 661          * case, we proceed by re-initializing the existing structure
 662          * on disk.
 663          */
 664
 665         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
 666                                       sizeof(*qgroup_info));
 667         if (ret && ret != -EEXIST)
 668                 goto out;
 669
 670         leaf = path->nodes[0];
 671         qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
 672                                  struct btrfs_qgroup_info_item);
 673         btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
 674         btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
 675         btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
 676         btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
 677         btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
 678
 679         btrfs_mark_buffer_dirty(leaf);
 680
 681         btrfs_release_path(path);
 682
 683         key.type = BTRFS_QGROUP_LIMIT_KEY;
 684         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
 685                                       sizeof(*qgroup_limit));
 686         if (ret && ret != -EEXIST)
 687                 goto out;
 688
 689         leaf = path->nodes[0];
 690         qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
 691                                   struct btrfs_qgroup_limit_item);
 692         btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
 693         btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
 694         btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
 695         btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
 696         btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
 697
 698         btrfs_mark_buffer_dirty(leaf);
 699
 700         ret = 0;
 701 out:
 702         btrfs_free_path(path);
 703         return ret;
 704 }
 705
 706 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
 707 {
 708         int ret;
 709         struct btrfs_root *quota_root = trans->fs_info->quota_root;
 710         struct btrfs_path *path;
 711         struct btrfs_key key;
 712
 713         path = btrfs_alloc_path();
 714         if (!path)
 715                 return -ENOMEM;
 716
 717         key.objectid = 0;
 718         key.type = BTRFS_QGROUP_INFO_KEY;
 719         key.offset = qgroupid;
 720         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
 721         if (ret < 0)
 722                 goto out;
 723
 724         if (ret > 0) {
 725                 ret = -ENOENT;
 726                 goto out;
 727         }
 728
 729         ret = btrfs_del_item(trans, quota_root, path);
 730         if (ret)
 731                 goto out;
 732
 733         btrfs_release_path(path);
 734
 735         key.type = BTRFS_QGROUP_LIMIT_KEY;
 736         ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
 737         if (ret < 0)
 738                 goto out;
 739
 740         if (ret > 0) {
 741                 ret = -ENOENT;
 742                 goto out;
 743         }
 744
 745         ret = btrfs_del_item(trans, quota_root, path);
 746
 747 out:
 748         btrfs_free_path(path);
 749         return ret;
 750 }
 751
 752 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 753                                     struct btrfs_qgroup *qgroup)
 754 {
 755         struct btrfs_root *quota_root = trans->fs_info->quota_root;
 756         struct btrfs_path *path;
 757         struct btrfs_key key;
 758         struct extent_buffer *l;
 759         struct btrfs_qgroup_limit_item *qgroup_limit;
 760         int ret;
 761         int slot;
 762
 763         key.objectid = 0;
 764         key.type = BTRFS_QGROUP_LIMIT_KEY;
 765         key.offset = qgroup->qgroupid;
 766
 767         path = btrfs_alloc_path();
 768         if (!path)
 769                 return -ENOMEM;
 770
 771         ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
 772         if (ret > 0)
 773                 ret = -ENOENT;
 774
 775         if (ret)
 776                 goto out;
 777
 778         l = path->nodes[0];
 779         slot = path->slots[0];
 780         qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
 781         btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
 782         btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
 783         btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
 784         btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
 785         btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
 786
 787         btrfs_mark_buffer_dirty(l);
 788
 789 out:
 790         btrfs_free_path(path);
 791         return ret;
 792 }
 793
 794 static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
 795                                    struct btrfs_qgroup *qgroup)
 796 {
 797         struct btrfs_fs_info *fs_info = trans->fs_info;
 798         struct btrfs_root *quota_root = fs_info->quota_root;
 799         struct btrfs_path *path;
 800         struct btrfs_key key;
 801         struct extent_buffer *l;
 802         struct btrfs_qgroup_info_item *qgroup_info;
 803         int ret;
 804         int slot;
 805
 806         if (btrfs_is_testing(fs_info))
 807                 return 0;
 808
 809         key.objectid = 0;
 810         key.type = BTRFS_QGROUP_INFO_KEY;
 811         key.offset = qgroup->qgroupid;
 812
 813         path = btrfs_alloc_path();
 814         if (!path)
 815                 return -ENOMEM;
 816
 817         ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
 818         if (ret > 0)
 819                 ret = -ENOENT;
 820
 821         if (ret)
 822                 goto out;
 823
 824         l = path->nodes[0];
 825         slot = path->slots[0];
 826         qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
 827         btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
 828         btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
 829         btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
 830         btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
 831         btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
 832
 833         btrfs_mark_buffer_dirty(l);
 834
 835 out:
 836         btrfs_free_path(path);
 837         return ret;
 838 }
 839
 840 static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
 841 {
 842         struct btrfs_fs_info *fs_info = trans->fs_info;
 843         struct btrfs_root *quota_root = fs_info->quota_root;
 844         struct btrfs_path *path;
 845         struct btrfs_key key;
 846         struct extent_buffer *l;
 847         struct btrfs_qgroup_status_item *ptr;
 848         int ret;
 849         int slot;
 850
 851         key.objectid = 0;
 852         key.type = BTRFS_QGROUP_STATUS_KEY;
 853         key.offset = 0;
 854
 855         path = btrfs_alloc_path();
 856         if (!path)
 857                 return -ENOMEM;
 858
 859         ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
 860         if (ret > 0)
 861                 ret = -ENOENT;
 862
 863         if (ret)
 864                 goto out;
 865
 866         l = path->nodes[0];
 867         slot = path->slots[0];
 868         ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
 869         btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
 870         btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
 871         btrfs_set_qgroup_status_rescan(l, ptr,
 872                                 fs_info->qgroup_rescan_progress.objectid);
 873
 874         btrfs_mark_buffer_dirty(l);
 875
 876 out:
 877         btrfs_free_path(path);
 878         return ret;
 879 }
 880
 881 /*
 882  * called with qgroup_lock held
 883  */
 884 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
 885                                   struct btrfs_root *root)
 886 {
 887         struct btrfs_path *path;
 888         struct btrfs_key key;
 889         struct extent_buffer *leaf = NULL;
 890         int ret;
 891         int nr = 0;
 892
 893         path = btrfs_alloc_path();
 894         if (!path)
 895                 return -ENOMEM;
 896
 897         path->leave_spinning = 1;
 898
 899         key.objectid = 0;
 900         key.offset = 0;
 901         key.type = 0;
 902
 903         while (1) {
 904                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 905                 if (ret < 0)
 906                         goto out;
 907                 leaf = path->nodes[0];
 908                 nr = btrfs_header_nritems(leaf);
 909                 if (!nr)
 910                         break;
 911                 /*
 912                  * delete the leaf one by one
 913                  * since the whole tree is going
 914                  * to be deleted.
 915                  */
 916                 path->slots[0] = 0;
 917                 ret = btrfs_del_items(trans, root, path, 0, nr);
 918                 if (ret)
 919                         goto out;
 920
 921                 btrfs_release_path(path);
 922         }
 923         ret = 0;
 924 out:
 925         btrfs_free_path(path);
 926         return ret;
 927 }
 928
 929 int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 930 {
 931         struct btrfs_root *quota_root;
 932         struct btrfs_root *tree_root = fs_info->tree_root;
 933         struct btrfs_path *path = NULL;
 934         struct btrfs_qgroup_status_item *ptr;
 935         struct extent_buffer *leaf;
 936         struct btrfs_key key;
 937         struct btrfs_key found_key;
 938         struct btrfs_qgroup *qgroup = NULL;
 939         struct btrfs_trans_handle *trans = NULL;
 940         int ret = 0;
 941         int slot;
 942
 943         mutex_lock(&fs_info->qgroup_ioctl_lock);
 944         if (fs_info->quota_root)
 945                 goto out;
 946
 947         fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
 948         if (!fs_info->qgroup_ulist) {
 949                 ret = -ENOMEM;
 950                 goto out;
 951         }
 952
 953         ret = btrfs_sysfs_add_qgroups(fs_info);
 954         if (ret < 0)
 955                 goto out;
 956         /*
 957          * 1 for quota root item
 958          * 1 for BTRFS_QGROUP_STATUS item
 959          *
 960          * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
 961          * per subvolume. However those are not currently reserved since it
 962          * would be a lot of overkill.
 963          */
 964         trans = btrfs_start_transaction(tree_root, 2);
 965         if (IS_ERR(trans)) {
 966                 ret = PTR_ERR(trans);
 967                 trans = NULL;
 968                 goto out;
 969         }
 970
 971         /*
 972          * initially create the quota tree
 973          */
 974         quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
 975         if (IS_ERR(quota_root)) {
 976                 ret =  PTR_ERR(quota_root);
 977                 btrfs_abort_transaction(trans, ret);
 978                 goto out;
 979         }
 980
 981         path = btrfs_alloc_path();
 982         if (!path) {
 983                 ret = -ENOMEM;
 984                 btrfs_abort_transaction(trans, ret);
 985                 goto out_free_root;
 986         }
 987
 988         key.objectid = 0;
 989         key.type = BTRFS_QGROUP_STATUS_KEY;
 990         key.offset = 0;
 991
 992         ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
 993                                       sizeof(*ptr));
 994         if (ret) {
 995                 btrfs_abort_transaction(trans, ret);
 996                 goto out_free_path;
 997         }
 998
 999         leaf = path->nodes[0];
1000         ptr = btrfs_item_ptr(leaf, path->slots[0],
1001                                  struct btrfs_qgroup_status_item);
1002         btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
1003         btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
1004         fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
1005                                 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1006         btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
1007         btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
1008
1009         btrfs_mark_buffer_dirty(leaf);
1010
1011         key.objectid = 0;
1012         key.type = BTRFS_ROOT_REF_KEY;
1013         key.offset = 0;
1014
1015         btrfs_release_path(path);
1016         ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
1017         if (ret > 0)
1018                 goto out_add_root;
1019         if (ret < 0) {
1020                 btrfs_abort_transaction(trans, ret);
1021                 goto out_free_path;
1022         }
1023
1024         while (1) {
1025                 slot = path->slots[0];
1026                 leaf = path->nodes[0];
1027                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1028
1029                 if (found_key.type == BTRFS_ROOT_REF_KEY) {
1030                         ret = add_qgroup_item(trans, quota_root,
1031                                               found_key.offset);
1032                         if (ret) {
1033                                 btrfs_abort_transaction(trans, ret);
1034                                 goto out_free_path;
1035                         }
1036
1037                         qgroup = add_qgroup_rb(fs_info, found_key.offset);
1038                         if (IS_ERR(qgroup)) {
1039                                 ret = PTR_ERR(qgroup);
1040                                 btrfs_abort_transaction(trans, ret);
1041                                 goto out_free_path;
1042                         }
1043                         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1044                         if (ret < 0) {
1045                                 btrfs_abort_transaction(trans, ret);
1046                                 goto out_free_path;
1047                         }
1048                 }
1049                 ret = btrfs_next_item(tree_root, path);
1050                 if (ret < 0) {
1051                         btrfs_abort_transaction(trans, ret);
1052                         goto out_free_path;
1053                 }
1054                 if (ret)
1055                         break;
1056         }
1057
1058 out_add_root:
1059         btrfs_release_path(path);
1060         ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
1061         if (ret) {
1062                 btrfs_abort_transaction(trans, ret);
1063                 goto out_free_path;
1064         }
1065
1066         qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
1067         if (IS_ERR(qgroup)) {
1068                 ret = PTR_ERR(qgroup);
1069                 btrfs_abort_transaction(trans, ret);
1070                 goto out_free_path;
1071         }
1072         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1073         if (ret < 0) {
1074                 btrfs_abort_transaction(trans, ret);
1075                 goto out_free_path;
1076         }
1077
1078         ret = btrfs_commit_transaction(trans);
1079         trans = NULL;
1080         if (ret)
1081                 goto out_free_path;
1082
1083         /*
1084          * Set quota enabled flag after committing the transaction, to avoid
1085          * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
1086          * creation.
1087          */
1088         spin_lock(&fs_info->qgroup_lock);
1089         fs_info->quota_root = quota_root;
1090         set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1091         spin_unlock(&fs_info->qgroup_lock);
1092
1093         ret = qgroup_rescan_init(fs_info, 0, 1);
1094         if (!ret) {
1095                 qgroup_rescan_zero_tracking(fs_info);
1096                 fs_info->qgroup_rescan_running = true;
1097                 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1098                                  &fs_info->qgroup_rescan_work);
1099         }
1100
1101 out_free_path:
1102         btrfs_free_path(path);
1103 out_free_root:
1104         if (ret)
1105                 btrfs_put_root(quota_root);
1106 out:
1107         if (ret) {
1108                 ulist_free(fs_info->qgroup_ulist);
1109                 fs_info->qgroup_ulist = NULL;
1110                 if (trans)
1111                         btrfs_end_transaction(trans);
1112                 btrfs_sysfs_del_qgroups(fs_info);
1113         }
1114         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1115         return ret;
1116 }
1117
1118 int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
1119 {
1120         struct btrfs_root *quota_root;
1121         struct btrfs_trans_handle *trans = NULL;
1122         int ret = 0;
1123
1124         mutex_lock(&fs_info->qgroup_ioctl_lock);
1125         if (!fs_info->quota_root)
1126                 goto out;
1127
1128         /*
1129          * 1 For the root item
1130          *
1131          * We should also reserve enough items for the quota tree deletion in
1132          * btrfs_clean_quota_tree but this is not done.
1133          */
1134         trans = btrfs_start_transaction(fs_info->tree_root, 1);
1135         if (IS_ERR(trans)) {
1136                 ret = PTR_ERR(trans);
1137                 goto out;
1138         }
1139
1140         clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1141         btrfs_qgroup_wait_for_completion(fs_info, false);
1142         spin_lock(&fs_info->qgroup_lock);
1143         quota_root = fs_info->quota_root;
1144         fs_info->quota_root = NULL;
1145         fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1146         spin_unlock(&fs_info->qgroup_lock);
1147
1148         btrfs_free_qgroup_config(fs_info);
1149
1150         ret = btrfs_clean_quota_tree(trans, quota_root);
1151         if (ret) {
1152                 btrfs_abort_transaction(trans, ret);
1153                 goto end_trans;
1154         }
1155
1156         ret = btrfs_del_root(trans, &quota_root->root_key);
1157         if (ret) {
1158                 btrfs_abort_transaction(trans, ret);
1159                 goto end_trans;
1160         }
1161
1162         list_del(&quota_root->dirty_list);
1163
1164         btrfs_tree_lock(quota_root->node);
1165         btrfs_clean_tree_block(quota_root->node);
1166         btrfs_tree_unlock(quota_root->node);
1167         btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
1168
1169         btrfs_put_root(quota_root);
1170
1171 end_trans:
1172         ret = btrfs_end_transaction(trans);
1173 out:
1174         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1175         return ret;
1176 }
1177
1178 static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1179                          struct btrfs_qgroup *qgroup)
1180 {
1181         if (list_empty(&qgroup->dirty))
1182                 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1183 }
1184
1185 /*
1186  * The easy accounting, we're updating qgroup relationship whose child qgroup
1187  * only has exclusive extents.
1188  *
1189  * In this case, all exclusive extents will also be exclusive for parent, so
1190  * excl/rfer just get added/removed.
1191  *
1192  * So is qgroup reservation space, which should also be added/removed to
1193  * parent.
1194  * Or when child tries to release reservation space, parent will underflow its
1195  * reservation (for relationship adding case).
1196  *
1197  * Caller should hold fs_info->qgroup_lock.
1198  */
1199 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1200                                     struct ulist *tmp, u64 ref_root,
1201                                     struct btrfs_qgroup *src, int sign)
1202 {
1203         struct btrfs_qgroup *qgroup;
1204         struct btrfs_qgroup_list *glist;
1205         struct ulist_node *unode;
1206         struct ulist_iterator uiter;
1207         u64 num_bytes = src->excl;
1208         int ret = 0;
1209
1210         qgroup = find_qgroup_rb(fs_info, ref_root);
1211         if (!qgroup)
1212                 goto out;
1213
1214         qgroup->rfer += sign * num_bytes;
1215         qgroup->rfer_cmpr += sign * num_bytes;
1216
1217         WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1218         qgroup->excl += sign * num_bytes;
1219         qgroup->excl_cmpr += sign * num_bytes;
1220
1221         if (sign > 0)
1222                 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
1223         else
1224                 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
1225
1226         qgroup_dirty(fs_info, qgroup);
1227
1228         /* Get all of the parent groups that contain this qgroup */
1229         list_for_each_entry(glist, &qgroup->groups, next_group) {
1230                 ret = ulist_add(tmp, glist->group->qgroupid,
1231                                 qgroup_to_aux(glist->group), GFP_ATOMIC);
1232                 if (ret < 0)
1233                         goto out;
1234         }
1235
1236         /* Iterate all of the parents and adjust their reference counts */
1237         ULIST_ITER_INIT(&uiter);
1238         while ((unode = ulist_next(tmp, &uiter))) {
1239                 qgroup = unode_aux_to_qgroup(unode);
1240                 qgroup->rfer += sign * num_bytes;
1241                 qgroup->rfer_cmpr += sign * num_bytes;
1242                 WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1243                 qgroup->excl += sign * num_bytes;
1244                 if (sign > 0)
1245                         qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
1246                 else
1247                         qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
1248                 qgroup->excl_cmpr += sign * num_bytes;
1249                 qgroup_dirty(fs_info, qgroup);
1250
1251                 /* Add any parents of the parents */
1252                 list_for_each_entry(glist, &qgroup->groups, next_group) {
1253                         ret = ulist_add(tmp, glist->group->qgroupid,
1254                                         qgroup_to_aux(glist->group), GFP_ATOMIC);
1255                         if (ret < 0)
1256                                 goto out;
1257                 }
1258         }
1259         ret = 0;
1260 out:
1261         return ret;
1262 }
1263
1264
1265 /*
1266  * Quick path for updating qgroup with only excl refs.
1267  *
1268  * In that case, just update all parent will be enough.
1269  * Or we needs to do a full rescan.
1270  * Caller should also hold fs_info->qgroup_lock.
1271  *
1272  * Return 0 for quick update, return >0 for need to full rescan
1273  * and mark INCONSISTENT flag.
1274  * Return < 0 for other error.
1275  */
1276 static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1277                                    struct ulist *tmp, u64 src, u64 dst,
1278                                    int sign)
1279 {
1280         struct btrfs_qgroup *qgroup;
1281         int ret = 1;
1282         int err = 0;
1283
1284         qgroup = find_qgroup_rb(fs_info, src);
1285         if (!qgroup)
1286                 goto out;
1287         if (qgroup->excl == qgroup->rfer) {
1288                 ret = 0;
1289                 err = __qgroup_excl_accounting(fs_info, tmp, dst,
1290                                                qgroup, sign);
1291                 if (err < 0) {
1292                         ret = err;
1293                         goto out;
1294                 }
1295         }
1296 out:
1297         if (ret)
1298                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1299         return ret;
1300 }
1301
1302 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1303                               u64 dst)
1304 {
1305         struct btrfs_fs_info *fs_info = trans->fs_info;
1306         struct btrfs_qgroup *parent;
1307         struct btrfs_qgroup *member;
1308         struct btrfs_qgroup_list *list;
1309         struct ulist *tmp;
1310         int ret = 0;
1311
1312         /* Check the level of src and dst first */
1313         if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
1314                 return -EINVAL;
1315
1316         tmp = ulist_alloc(GFP_KERNEL);
1317         if (!tmp)
1318                 return -ENOMEM;
1319
1320         mutex_lock(&fs_info->qgroup_ioctl_lock);
1321         if (!fs_info->quota_root) {
1322                 ret = -ENOTCONN;
1323                 goto out;
1324         }
1325         member = find_qgroup_rb(fs_info, src);
1326         parent = find_qgroup_rb(fs_info, dst);
1327         if (!member || !parent) {
1328                 ret = -EINVAL;
1329                 goto out;
1330         }
1331
1332         /* check if such qgroup relation exist firstly */
1333         list_for_each_entry(list, &member->groups, next_group) {
1334                 if (list->group == parent) {
1335                         ret = -EEXIST;
1336                         goto out;
1337                 }
1338         }
1339
1340         ret = add_qgroup_relation_item(trans, src, dst);
1341         if (ret)
1342                 goto out;
1343
1344         ret = add_qgroup_relation_item(trans, dst, src);
1345         if (ret) {
1346                 del_qgroup_relation_item(trans, src, dst);
1347                 goto out;
1348         }
1349
1350         spin_lock(&fs_info->qgroup_lock);
1351         ret = add_relation_rb(fs_info, src, dst);
1352         if (ret < 0) {
1353                 spin_unlock(&fs_info->qgroup_lock);
1354                 goto out;
1355         }
1356         ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1357         spin_unlock(&fs_info->qgroup_lock);
1358 out:
1359         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1360         ulist_free(tmp);
1361         return ret;
1362 }
1363
1364 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1365                                  u64 dst)
1366 {
1367         struct btrfs_fs_info *fs_info = trans->fs_info;
1368         struct btrfs_qgroup *parent;
1369         struct btrfs_qgroup *member;
1370         struct btrfs_qgroup_list *list;
1371         struct ulist *tmp;
1372         bool found = false;
1373         int ret = 0;
1374         int ret2;
1375
1376         tmp = ulist_alloc(GFP_KERNEL);
1377         if (!tmp)
1378                 return -ENOMEM;
1379
1380         if (!fs_info->quota_root) {
1381                 ret = -ENOTCONN;
1382                 goto out;
1383         }
1384
1385         member = find_qgroup_rb(fs_info, src);
1386         parent = find_qgroup_rb(fs_info, dst);
1387         /*
1388          * The parent/member pair doesn't exist, then try to delete the dead
1389          * relation items only.
1390          */
1391         if (!member || !parent)
1392                 goto delete_item;
1393
1394         /* check if such qgroup relation exist firstly */
1395         list_for_each_entry(list, &member->groups, next_group) {
1396                 if (list->group == parent) {
1397                         found = true;
1398                         break;
1399                 }
1400         }
1401
1402 delete_item:
1403         ret = del_qgroup_relation_item(trans, src, dst);
1404         if (ret < 0 && ret != -ENOENT)
1405                 goto out;
1406         ret2 = del_qgroup_relation_item(trans, dst, src);
1407         if (ret2 < 0 && ret2 != -ENOENT)
1408                 goto out;
1409
1410         /* At least one deletion succeeded, return 0 */
1411         if (!ret || !ret2)
1412                 ret = 0;
1413
1414         if (found) {
1415                 spin_lock(&fs_info->qgroup_lock);
1416                 del_relation_rb(fs_info, src, dst);
1417                 ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1418                 spin_unlock(&fs_info->qgroup_lock);
1419         }
1420 out:
1421         ulist_free(tmp);
1422         return ret;
1423 }
1424
1425 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1426                               u64 dst)
1427 {
1428         struct btrfs_fs_info *fs_info = trans->fs_info;
1429         int ret = 0;
1430
1431         mutex_lock(&fs_info->qgroup_ioctl_lock);
1432         ret = __del_qgroup_relation(trans, src, dst);
1433         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1434
1435         return ret;
1436 }
1437
1438 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1439 {
1440         struct btrfs_fs_info *fs_info = trans->fs_info;
1441         struct btrfs_root *quota_root;
1442         struct btrfs_qgroup *qgroup;
1443         int ret = 0;
1444
1445         mutex_lock(&fs_info->qgroup_ioctl_lock);
1446         if (!fs_info->quota_root) {
1447                 ret = -ENOTCONN;
1448                 goto out;
1449         }
1450         quota_root = fs_info->quota_root;
1451         qgroup = find_qgroup_rb(fs_info, qgroupid);
1452         if (qgroup) {
1453                 ret = -EEXIST;
1454                 goto out;
1455         }
1456
1457         ret = add_qgroup_item(trans, quota_root, qgroupid);
1458         if (ret)
1459                 goto out;
1460
1461         spin_lock(&fs_info->qgroup_lock);
1462         qgroup = add_qgroup_rb(fs_info, qgroupid);
1463         spin_unlock(&fs_info->qgroup_lock);
1464
1465         if (IS_ERR(qgroup)) {
1466                 ret = PTR_ERR(qgroup);
1467                 goto out;
1468         }
1469         ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1470 out:
1471         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1472         return ret;
1473 }
1474
/*
 * Remove the qgroup with id @qgroupid.
 *
 * The qgroup must not have any member qgroups. Its tree item is deleted,
 * every relation to a parent qgroup is removed, and finally the in-memory
 * qgroup is dropped.
 *
 * Returns -ENOTCONN if quotas are not enabled, -ENOENT if the qgroup is not
 * known in memory, -EBUSY if it still has members, or the result of the
 * item/relation deletion helpers.
 *
 * NOTE(review): when del_qgroup_item() returns -ENOENT and the qgroup has no
 * parents, the in-memory qgroup is still removed but -ENOENT is what gets
 * returned - confirm this is intended.
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *list;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup) {
                ret = -ENOENT;
                goto out;
        }

        /* Check if there are no children of this qgroup */
        if (!list_empty(&qgroup->members)) {
                ret = -EBUSY;
                goto out;
        }

        /* A missing on-disk item is tolerated; any other error is fatal. */
        ret = del_qgroup_item(trans, qgroupid);
        if (ret && ret != -ENOENT)
                goto out;

        /*
         * Drop the relations to all parents, one at a time. The loop relies
         * on __del_qgroup_relation() unlinking the first entry of
         * qgroup->groups; any non-zero result (including a positive
         * "rescan needed") stops the removal.
         */
        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                ret = __del_qgroup_relation(trans, qgroupid,
                                            list->group->qgroupid);
                if (ret)
                        goto out;
        }

        /* Finally remove the qgroup from the in-memory tree. */
        spin_lock(&fs_info->qgroup_lock);
        del_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}
1520
1521 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
1522                        struct btrfs_qgroup_limit *limit)
1523 {
1524         struct btrfs_fs_info *fs_info = trans->fs_info;
1525         struct btrfs_qgroup *qgroup;
1526         int ret = 0;
1527         /* Sometimes we would want to clear the limit on this qgroup.
1528          * To meet this requirement, we treat the -1 as a special value
1529          * which tell kernel to clear the limit on this qgroup.
1530          */
1531         const u64 CLEAR_VALUE = -1;
1532
1533         mutex_lock(&fs_info->qgroup_ioctl_lock);
1534         if (!fs_info->quota_root) {
1535                 ret = -ENOTCONN;
1536                 goto out;
1537         }
1538
1539         qgroup = find_qgroup_rb(fs_info, qgroupid);
1540         if (!qgroup) {
1541                 ret = -ENOENT;
1542                 goto out;
1543         }
1544
1545         spin_lock(&fs_info->qgroup_lock);
1546         if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1547                 if (limit->max_rfer == CLEAR_VALUE) {
1548                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1549                         limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1550                         qgroup->max_rfer = 0;
1551                 } else {
1552                         qgroup->max_rfer = limit->max_rfer;
1553                 }
1554         }
1555         if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1556                 if (limit->max_excl == CLEAR_VALUE) {
1557                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1558                         limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1559                         qgroup->max_excl = 0;
1560                 } else {
1561                         qgroup->max_excl = limit->max_excl;
1562                 }
1563         }
1564         if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1565                 if (limit->rsv_rfer == CLEAR_VALUE) {
1566                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1567                         limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1568                         qgroup->rsv_rfer = 0;
1569                 } else {
1570                         qgroup->rsv_rfer = limit->rsv_rfer;
1571                 }
1572         }
1573         if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1574                 if (limit->rsv_excl == CLEAR_VALUE) {
1575                         qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1576                         limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1577                         qgroup->rsv_excl = 0;
1578                 } else {
1579                         qgroup->rsv_excl = limit->rsv_excl;
1580                 }
1581         }
1582         qgroup->lim_flags |= limit->flags;
1583
1584         spin_unlock(&fs_info->qgroup_lock);
1585
1586         ret = update_qgroup_limit_item(trans, qgroup);
1587         if (ret) {
1588                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1589                 btrfs_info(fs_info, "unable to update quota limit for %llu",
1590                        qgroupid);
1591         }
1592
1593 out:
1594         mutex_unlock(&fs_info->qgroup_ioctl_lock);
1595         return ret;
1596 }
1597
1598 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1599                                 struct btrfs_delayed_ref_root *delayed_refs,
1600                                 struct btrfs_qgroup_extent_record *record)
1601 {
1602         struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1603         struct rb_node *parent_node = NULL;
1604         struct btrfs_qgroup_extent_record *entry;
1605         u64 bytenr = record->bytenr;
1606
1607         lockdep_assert_held(&delayed_refs->lock);
1608         trace_btrfs_qgroup_trace_extent(fs_info, record);
1609
1610         while (*p) {
1611                 parent_node = *p;
1612                 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1613                                  node);
1614                 if (bytenr < entry->bytenr) {
1615                         p = &(*p)->rb_left;
1616                 } else if (bytenr > entry->bytenr) {
1617                         p = &(*p)->rb_right;
1618                 } else {
1619                         if (record->data_rsv && !entry->data_rsv) {
1620                                 entry->data_rsv = record->data_rsv;
1621                                 entry->data_rsv_refroot =
1622                                         record->data_rsv_refroot;
1623                         }
1624                         return 1;
1625                 }
1626         }
1627
1628         rb_link_node(&record->node, parent_node, p);
1629         rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1630         return 0;
1631 }
1632
1633 int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
1634                                    struct btrfs_qgroup_extent_record *qrecord)
1635 {
1636         struct ulist *old_root;
1637         u64 bytenr = qrecord->bytenr;
1638         int ret;
1639
1640         ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
1641         if (ret < 0) {
1642                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1643                 btrfs_warn(fs_info,
1644 "error accounting new delayed refs extent (err code: %d), quota inconsistent",
1645                         ret);
1646                 return 0;
1647         }
1648
1649         /*
1650          * Here we don't need to get the lock of
1651          * trans->transaction->delayed_refs, since inserted qrecord won't
1652          * be deleted, only qrecord->node may be modified (new qrecord insert)
1653          *
1654          * So modifying qrecord->old_roots is safe here
1655          */
1656         qrecord->old_roots = old_root;
1657         return 0;
1658 }
1659
1660 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1661                               u64 num_bytes, gfp_t gfp_flag)
1662 {
1663         struct btrfs_fs_info *fs_info = trans->fs_info;
1664         struct btrfs_qgroup_extent_record *record;
1665         struct btrfs_delayed_ref_root *delayed_refs;
1666         int ret;
1667
1668         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1669             || bytenr == 0 || num_bytes == 0)
1670                 return 0;
1671         record = kzalloc(sizeof(*record), gfp_flag);
1672         if (!record)
1673                 return -ENOMEM;
1674
1675         delayed_refs = &trans->transaction->delayed_refs;
1676         record->bytenr = bytenr;
1677         record->num_bytes = num_bytes;
1678         record->old_roots = NULL;
1679
1680         spin_lock(&delayed_refs->lock);
1681         ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1682         spin_unlock(&delayed_refs->lock);
1683         if (ret > 0) {
1684                 kfree(record);
1685                 return 0;
1686         }
1687         return btrfs_qgroup_trace_extent_post(fs_info, record);
1688 }
1689
1690 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1691                                   struct extent_buffer *eb)
1692 {
1693         struct btrfs_fs_info *fs_info = trans->fs_info;
1694         int nr = btrfs_header_nritems(eb);
1695         int i, extent_type, ret;
1696         struct btrfs_key key;
1697         struct btrfs_file_extent_item *fi;
1698         u64 bytenr, num_bytes;
1699
1700         /* We can be called directly from walk_up_proc() */
1701         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1702                 return 0;
1703
1704         for (i = 0; i < nr; i++) {
1705                 btrfs_item_key_to_cpu(eb, &key, i);
1706
1707                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1708                         continue;
1709
1710                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1711                 /* filter out non qgroup-accountable extents  */
1712                 extent_type = btrfs_file_extent_type(eb, fi);
1713
1714                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1715                         continue;
1716
1717                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1718                 if (!bytenr)
1719                         continue;
1720
1721                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1722
1723                 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
1724                                                 GFP_NOFS);
1725                 if (ret)
1726                         return ret;
1727         }
1728         cond_resched();
1729         return 0;
1730 }
1731
1732 /*
1733  * Walk up the tree from the bottom, freeing leaves and any interior
1734  * nodes which have had all slots visited. If a node (leaf or
1735  * interior) is freed, the node above it will have it's slot
1736  * incremented. The root node will never be freed.
1737  *
1738  * At the end of this function, we should have a path which has all
1739  * slots incremented to the next position for a search. If we need to
1740  * read a new node it will be NULL and the node above it will have the
1741  * correct slot selected for a later read.
1742  *
1743  * If we increment the root nodes slot counter past the number of
1744  * elements, 1 is returned to signal completion of the search.
1745  */
1746 static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1747 {
1748         int level = 0;
1749         int nr, slot;
1750         struct extent_buffer *eb;
1751
1752         if (root_level == 0)
1753                 return 1;
1754
1755         while (level <= root_level) {
1756                 eb = path->nodes[level];
1757                 nr = btrfs_header_nritems(eb);
1758                 path->slots[level]++;
1759                 slot = path->slots[level];
1760                 if (slot >= nr || level == 0) {
1761                         /*
1762                          * Don't free the root -  we will detect this
1763                          * condition after our loop and return a
1764                          * positive value for caller to stop walking the tree.
1765                          */
1766                         if (level != root_level) {
1767                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
1768                                 path->locks[level] = 0;
1769
1770                                 free_extent_buffer(eb);
1771                                 path->nodes[level] = NULL;
1772                                 path->slots[level] = 0;
1773                         }
1774                 } else {
1775                         /*
1776                          * We have a valid slot to walk back down
1777                          * from. Stop here so caller can process these
1778                          * new nodes.
1779                          */
1780                         break;
1781                 }
1782
1783                 level++;
1784         }
1785
1786         eb = path->nodes[root_level];
1787         if (path->slots[root_level] >= btrfs_header_nritems(eb))
1788                 return 1;
1789
1790         return 0;
1791 }
1792
1793 /*
1794  * Helper function to trace a subtree tree block swap.
1795  *
1796  * The swap will happen in highest tree block, but there may be a lot of
1797  * tree blocks involved.
1798  *
1799  * For example:
1800  *  OO = Old tree blocks
1801  *  NN = New tree blocks allocated during balance
1802  *
1803  *           File tree (257)                  Reloc tree for 257
1804  * L2              OO                                NN
1805  *               /    \                            /    \
1806  * L1          OO      OO (a)                    OO      NN (a)
1807  *            / \     / \                       / \     / \
1808  * L0       OO   OO OO   OO                   OO   OO NN   NN
1809  *                  (b)  (c)                          (b)  (c)
1810  *
1811  * When calling qgroup_trace_extent_swap(), we will pass:
1812  * @src_eb = OO(a)
1813  * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1814  * @dst_level = 0
1815  * @root_level = 1
1816  *
1817  * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1818  * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1819  *
1820  * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1821  *
1822  * 1) Tree search from @src_eb
1823  *    It should acts as a simplified btrfs_search_slot().
1824  *    The key for search can be extracted from @dst_path->nodes[dst_level]
1825  *    (first key).
1826  *
1827  * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1828  *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
1829  *    They should be marked during previous (@dst_level = 1) iteration.
1830  *
1831  * 3) Mark file extents in leaves dirty
1832  *    We don't have good way to pick out new file extents only.
1833  *    So we still follow the old method by scanning all file extents in
1834  *    the leave.
1835  *
1836  * This function can free us from keeping two paths, thus later we only need
1837  * to care about how to iterate all new tree blocks in reloc tree.
1838  */
1839 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
1840                                     struct extent_buffer *src_eb,
1841                                     struct btrfs_path *dst_path,
1842                                     int dst_level, int root_level,
1843                                     bool trace_leaf)
1844 {
1845         struct btrfs_key key;
1846         struct btrfs_path *src_path;
1847         struct btrfs_fs_info *fs_info = trans->fs_info;
1848         u32 nodesize = fs_info->nodesize;
1849         int cur_level = root_level;
1850         int ret;
1851
1852         BUG_ON(dst_level > root_level);
1853         /* Level mismatch */
1854         if (btrfs_header_level(src_eb) != root_level)
1855                 return -EINVAL;
1856
1857         src_path = btrfs_alloc_path();
1858         if (!src_path) {
1859                 ret = -ENOMEM;
1860                 goto out;
1861         }
1862
1863         if (dst_level)
1864                 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1865         else
1866                 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1867
1868         /* For src_path */
1869         atomic_inc(&src_eb->refs);
1870         src_path->nodes[root_level] = src_eb;
1871         src_path->slots[root_level] = dst_path->slots[root_level];
1872         src_path->locks[root_level] = 0;
1873
1874         /* A simplified version of btrfs_search_slot() */
1875         while (cur_level >= dst_level) {
1876                 struct btrfs_key src_key;
1877                 struct btrfs_key dst_key;
1878
1879                 if (src_path->nodes[cur_level] == NULL) {
1880                         struct btrfs_key first_key;
1881                         struct extent_buffer *eb;
1882                         int parent_slot;
1883                         u64 child_gen;
1884                         u64 child_bytenr;
1885
1886                         eb = src_path->nodes[cur_level + 1];
1887                         parent_slot = src_path->slots[cur_level + 1];
1888                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
1889                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
1890                         btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
1891
1892                         eb = read_tree_block(fs_info, child_bytenr, child_gen,
1893                                              cur_level, &first_key);
1894                         if (IS_ERR(eb)) {
1895                                 ret = PTR_ERR(eb);
1896                                 goto out;
1897                         } else if (!extent_buffer_uptodate(eb)) {
1898                                 free_extent_buffer(eb);
1899                                 ret = -EIO;
1900                                 goto out;
1901                         }
1902
1903                         src_path->nodes[cur_level] = eb;
1904
1905                         btrfs_tree_read_lock(eb);
1906                         btrfs_set_lock_blocking_read(eb);
1907                         src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
1908                 }
1909
1910                 src_path->slots[cur_level] = dst_path->slots[cur_level];
1911                 if (cur_level) {
1912                         btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
1913                                         &dst_key, dst_path->slots[cur_level]);
1914                         btrfs_node_key_to_cpu(src_path->nodes[cur_level],
1915                                         &src_key, src_path->slots[cur_level]);
1916                 } else {
1917                         btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
1918                                         &dst_key, dst_path->slots[cur_level]);
1919                         btrfs_item_key_to_cpu(src_path->nodes[cur_level],
1920                                         &src_key, src_path->slots[cur_level]);
1921                 }
1922                 /* Content mismatch, something went wrong */
1923                 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
1924                         ret = -ENOENT;
1925                         goto out;
1926                 }
1927                 cur_level--;
1928         }
1929
1930         /*
1931          * Now both @dst_path and @src_path have been populated, record the tree
1932          * blocks for qgroup accounting.
1933          */
1934         ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
1935                         nodesize, GFP_NOFS);
1936         if (ret < 0)
1937                 goto out;
1938         ret = btrfs_qgroup_trace_extent(trans,
1939                         dst_path->nodes[dst_level]->start,
1940                         nodesize, GFP_NOFS);
1941         if (ret < 0)
1942                 goto out;
1943
1944         /* Record leaf file extents */
1945         if (dst_level == 0 && trace_leaf) {
1946                 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
1947                 if (ret < 0)
1948                         goto out;
1949                 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
1950         }
1951 out:
1952         btrfs_free_path(src_path);
1953         return ret;
1954 }
1955
1956 /*
1957  * Helper function to do recursive generation-aware depth-first search, to
1958  * locate all new tree blocks in a subtree of reloc tree.
1959  *
1960  * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
1961  *         reloc tree
1962  * L2         NN (a)
1963  *          /    \
1964  * L1    OO        NN (b)
1965  *      /  \      /  \
1966  * L0  OO  OO    OO  NN
1967  *               (c) (d)
1968  * If we pass:
1969  * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
1970  * @cur_level = 1
1971  * @root_level = 1
1972  *
1973  * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
1974  * above tree blocks along with their counter parts in file tree.
1975  * While during search, old tree blocks OO(c) will be skipped as tree block swap
1976  * won't affect OO(c).
1977  */
1978 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
1979                                            struct extent_buffer *src_eb,
1980                                            struct btrfs_path *dst_path,
1981                                            int cur_level, int root_level,
1982                                            u64 last_snapshot, bool trace_leaf)
1983 {
1984         struct btrfs_fs_info *fs_info = trans->fs_info;
1985         struct extent_buffer *eb;
1986         bool need_cleanup = false;
1987         int ret = 0;
1988         int i;
1989
1990         /* Level sanity check */
1991         if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
1992             root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
1993             root_level < cur_level) {
1994                 btrfs_err_rl(fs_info,
1995                         "%s: bad levels, cur_level=%d root_level=%d",
1996                         __func__, cur_level, root_level);
1997                 return -EUCLEAN;
1998         }
1999
2000         /* Read the tree block if needed */
2001         if (dst_path->nodes[cur_level] == NULL) {
2002                 struct btrfs_key first_key;
2003                 int parent_slot;
2004                 u64 child_gen;
2005                 u64 child_bytenr;
2006
2007                 /*
2008                  * dst_path->nodes[root_level] must be initialized before
2009                  * calling this function.
2010                  */
2011                 if (cur_level == root_level) {
2012                         btrfs_err_rl(fs_info,
2013         "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2014                                 __func__, root_level, root_level, cur_level);
2015                         return -EUCLEAN;
2016                 }
2017
2018                 /*
2019                  * We need to get child blockptr/gen from parent before we can
2020                  * read it.
2021                   */
2022                 eb = dst_path->nodes[cur_level + 1];
2023                 parent_slot = dst_path->slots[cur_level + 1];
2024                 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2025                 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2026                 btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2027
2028                 /* This node is old, no need to trace */
2029                 if (child_gen < last_snapshot)
2030                         goto out;
2031
2032                 eb = read_tree_block(fs_info, child_bytenr, child_gen,
2033                                      cur_level, &first_key);
2034                 if (IS_ERR(eb)) {
2035                         ret = PTR_ERR(eb);
2036                         goto out;
2037                 } else if (!extent_buffer_uptodate(eb)) {
2038                         free_extent_buffer(eb);
2039                         ret = -EIO;
2040                         goto out;
2041                 }
2042
2043                 dst_path->nodes[cur_level] = eb;
2044                 dst_path->slots[cur_level] = 0;
2045
2046                 btrfs_tree_read_lock(eb);
2047                 btrfs_set_lock_blocking_read(eb);
2048                 dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
2049                 need_cleanup = true;
2050         }
2051
2052         /* Now record this tree block and its counter part for qgroups */
2053         ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2054                                        root_level, trace_leaf);
2055         if (ret < 0)
2056                 goto cleanup;
2057
2058         eb = dst_path->nodes[cur_level];
2059
2060         if (cur_level > 0) {
2061                 /* Iterate all child tree blocks */
2062                 for (i = 0; i < btrfs_header_nritems(eb); i++) {
2063                         /* Skip old tree blocks as they won't be swapped */
2064                         if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2065                                 continue;
2066                         dst_path->slots[cur_level] = i;
2067
2068                         /* Recursive call (at most 7 times) */
2069                         ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2070                                         dst_path, cur_level - 1, root_level,
2071                                         last_snapshot, trace_leaf);
2072                         if (ret < 0)
2073                                 goto cleanup;
2074                 }
2075         }
2076
2077 cleanup:
2078         if (need_cleanup) {
2079                 /* Clean up */
2080                 btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2081                                      dst_path->locks[cur_level]);
2082                 free_extent_buffer(dst_path->nodes[cur_level]);
2083                 dst_path->nodes[cur_level] = NULL;
2084                 dst_path->slots[cur_level] = 0;
2085                 dst_path->locks[cur_level] = 0;
2086         }
2087 out:
2088         return ret;
2089 }
2090
2091 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2092                                 struct extent_buffer *src_eb,
2093                                 struct extent_buffer *dst_eb,
2094                                 u64 last_snapshot, bool trace_leaf)
2095 {
2096         struct btrfs_fs_info *fs_info = trans->fs_info;
2097         struct btrfs_path *dst_path = NULL;
2098         int level;
2099         int ret;
2100
2101         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2102                 return 0;
2103
2104         /* Wrong parameter order */
2105         if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2106                 btrfs_err_rl(fs_info,
2107                 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2108                              btrfs_header_generation(src_eb),
2109                              btrfs_header_generation(dst_eb));
2110                 return -EUCLEAN;
2111         }
2112
2113         if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2114                 ret = -EIO;
2115                 goto out;
2116         }
2117
2118         level = btrfs_header_level(dst_eb);
2119         dst_path = btrfs_alloc_path();
2120         if (!dst_path) {
2121                 ret = -ENOMEM;
2122                 goto out;
2123         }
2124         /* For dst_path */
2125         atomic_inc(&dst_eb->refs);
2126         dst_path->nodes[level] = dst_eb;
2127         dst_path->slots[level] = 0;
2128         dst_path->locks[level] = 0;
2129
2130         /* Do the generation aware breadth-first search */
2131         ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2132                                               level, last_snapshot, trace_leaf);
2133         if (ret < 0)
2134                 goto out;
2135         ret = 0;
2136
2137 out:
2138         btrfs_free_path(dst_path);
2139         if (ret < 0)
2140                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2141         return ret;
2142 }
2143
2144 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2145                                struct extent_buffer *root_eb,
2146                                u64 root_gen, int root_level)
2147 {
2148         struct btrfs_fs_info *fs_info = trans->fs_info;
2149         int ret = 0;
2150         int level;
2151         struct extent_buffer *eb = root_eb;
2152         struct btrfs_path *path = NULL;
2153
2154         BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2155         BUG_ON(root_eb == NULL);
2156
2157         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2158                 return 0;
2159
2160         if (!extent_buffer_uptodate(root_eb)) {
2161                 ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
2162                 if (ret)
2163                         goto out;
2164         }
2165
2166         if (root_level == 0) {
2167                 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2168                 goto out;
2169         }
2170
2171         path = btrfs_alloc_path();
2172         if (!path)
2173                 return -ENOMEM;
2174
2175         /*
2176          * Walk down the tree.  Missing extent blocks are filled in as
2177          * we go. Metadata is accounted every time we read a new
2178          * extent block.
2179          *
2180          * When we reach a leaf, we account for file extent items in it,
2181          * walk back up the tree (adjusting slot pointers as we go)
2182          * and restart the search process.
2183          */
2184         atomic_inc(&root_eb->refs);     /* For path */
2185         path->nodes[root_level] = root_eb;
2186         path->slots[root_level] = 0;
2187         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2188 walk_down:
2189         level = root_level;
2190         while (level >= 0) {
2191                 if (path->nodes[level] == NULL) {
2192                         struct btrfs_key first_key;
2193                         int parent_slot;
2194                         u64 child_gen;
2195                         u64 child_bytenr;
2196
2197                         /*
2198                          * We need to get child blockptr/gen from parent before
2199                          * we can read it.
2200                           */
2201                         eb = path->nodes[level + 1];
2202                         parent_slot = path->slots[level + 1];
2203                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2204                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2205                         btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2206
2207                         eb = read_tree_block(fs_info, child_bytenr, child_gen,
2208                                              level, &first_key);
2209                         if (IS_ERR(eb)) {
2210                                 ret = PTR_ERR(eb);
2211                                 goto out;
2212                         } else if (!extent_buffer_uptodate(eb)) {
2213                                 free_extent_buffer(eb);
2214                                 ret = -EIO;
2215                                 goto out;
2216                         }
2217
2218                         path->nodes[level] = eb;
2219                         path->slots[level] = 0;
2220
2221                         btrfs_tree_read_lock(eb);
2222                         btrfs_set_lock_blocking_read(eb);
2223                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
2224
2225                         ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2226                                                         fs_info->nodesize,
2227                                                         GFP_NOFS);
2228                         if (ret)
2229                                 goto out;
2230                 }
2231
2232                 if (level == 0) {
2233                         ret = btrfs_qgroup_trace_leaf_items(trans,
2234                                                             path->nodes[level]);
2235                         if (ret)
2236                                 goto out;
2237
2238                         /* Nonzero return here means we completed our search */
2239                         ret = adjust_slots_upwards(path, root_level);
2240                         if (ret)
2241                                 break;
2242
2243                         /* Restart search with new slots */
2244                         goto walk_down;
2245                 }
2246
2247                 level--;
2248         }
2249
2250         ret = 0;
2251 out:
2252         btrfs_free_path(path);
2253
2254         return ret;
2255 }
2256
2257 #define UPDATE_NEW      0
2258 #define UPDATE_OLD      1
2259 /*
2260  * Walk all of the roots that points to the bytenr and adjust their refcnts.
2261  */
2262 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2263                                 struct ulist *roots, struct ulist *tmp,
2264                                 struct ulist *qgroups, u64 seq, int update_old)
2265 {
2266         struct ulist_node *unode;
2267         struct ulist_iterator uiter;
2268         struct ulist_node *tmp_unode;
2269         struct ulist_iterator tmp_uiter;
2270         struct btrfs_qgroup *qg;
2271         int ret = 0;
2272
2273         if (!roots)
2274                 return 0;
2275         ULIST_ITER_INIT(&uiter);
2276         while ((unode = ulist_next(roots, &uiter))) {
2277                 qg = find_qgroup_rb(fs_info, unode->val);
2278                 if (!qg)
2279                         continue;
2280
2281                 ulist_reinit(tmp);
2282                 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2283                                 GFP_ATOMIC);
2284                 if (ret < 0)
2285                         return ret;
2286                 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2287                 if (ret < 0)
2288                         return ret;
2289                 ULIST_ITER_INIT(&tmp_uiter);
2290                 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2291                         struct btrfs_qgroup_list *glist;
2292
2293                         qg = unode_aux_to_qgroup(tmp_unode);
2294                         if (update_old)
2295                                 btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2296                         else
2297                                 btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2298                         list_for_each_entry(glist, &qg->groups, next_group) {
2299                                 ret = ulist_add(qgroups, glist->group->qgroupid,
2300                                                 qgroup_to_aux(glist->group),
2301                                                 GFP_ATOMIC);
2302                                 if (ret < 0)
2303                                         return ret;
2304                                 ret = ulist_add(tmp, glist->group->qgroupid,
2305                                                 qgroup_to_aux(glist->group),
2306                                                 GFP_ATOMIC);
2307                                 if (ret < 0)
2308                                         return ret;
2309                         }
2310                 }
2311         }
2312         return 0;
2313 }
2314
2315 /*
2316  * Update qgroup rfer/excl counters.
2317  * Rfer update is easy, codes can explain themselves.
2318  *
2319  * Excl update is tricky, the update is split into 2 part.
2320  * Part 1: Possible exclusive <-> sharing detect:
2321  *      |       A       |       !A      |
2322  *  -------------------------------------
2323  *  B   |       *       |       -       |
2324  *  -------------------------------------
2325  *  !B  |       +       |       **      |
2326  *  -------------------------------------
2327  *
2328  * Conditions:
2329  * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2330  * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2331  * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2332  * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2333  *
2334  * Results:
2335  * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
2336  * *: Definitely not changed.           **: Possible unchanged.
2337  *
2338  * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
2339  *
2340  * To make the logic clear, we first use condition A and B to split
2341  * combination into 4 results.
2342  *
2343  * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
2344  * only on variant maybe 0.
2345  *
2346  * Lastly, check result **, since there are 2 variants maybe 0, split them
2347  * again(2x2).
2348  * But this time we don't need to consider other things, the codes and logic
2349  * is easy to understand now.
2350  */
2351 static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2352                                   struct ulist *qgroups,
2353                                   u64 nr_old_roots,
2354                                   u64 nr_new_roots,
2355                                   u64 num_bytes, u64 seq)
2356 {
2357         struct ulist_node *unode;
2358         struct ulist_iterator uiter;
2359         struct btrfs_qgroup *qg;
2360         u64 cur_new_count, cur_old_count;
2361
2362         ULIST_ITER_INIT(&uiter);
2363         while ((unode = ulist_next(qgroups, &uiter))) {
2364                 bool dirty = false;
2365
2366                 qg = unode_aux_to_qgroup(unode);
2367                 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2368                 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2369
2370                 trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2371                                              cur_new_count);
2372
2373                 /* Rfer update part */
2374                 if (cur_old_count == 0 && cur_new_count > 0) {
2375                         qg->rfer += num_bytes;
2376                         qg->rfer_cmpr += num_bytes;
2377                         dirty = true;
2378                 }
2379                 if (cur_old_count > 0 && cur_new_count == 0) {
2380                         qg->rfer -= num_bytes;
2381                         qg->rfer_cmpr -= num_bytes;
2382                         dirty = true;
2383                 }
2384
2385                 /* Excl update part */
2386                 /* Exclusive/none -> shared case */
2387                 if (cur_old_count == nr_old_roots &&
2388                     cur_new_count < nr_new_roots) {
2389                         /* Exclusive -> shared */
2390                         if (cur_old_count != 0) {
2391                                 qg->excl -= num_bytes;
2392                                 qg->excl_cmpr -= num_bytes;
2393                                 dirty = true;
2394                         }
2395                 }
2396
2397                 /* Shared -> exclusive/none case */
2398                 if (cur_old_count < nr_old_roots &&
2399                     cur_new_count == nr_new_roots) {
2400                         /* Shared->exclusive */
2401                         if (cur_new_count != 0) {
2402                                 qg->excl += num_bytes;
2403                                 qg->excl_cmpr += num_bytes;
2404                                 dirty = true;
2405                         }
2406                 }
2407
2408                 /* Exclusive/none -> exclusive/none case */
2409                 if (cur_old_count == nr_old_roots &&
2410                     cur_new_count == nr_new_roots) {
2411                         if (cur_old_count == 0) {
2412                                 /* None -> exclusive/none */
2413
2414                                 if (cur_new_count != 0) {
2415                                         /* None -> exclusive */
2416                                         qg->excl += num_bytes;
2417                                         qg->excl_cmpr += num_bytes;
2418                                         dirty = true;
2419                                 }
2420                                 /* None -> none, nothing changed */
2421                         } else {
2422                                 /* Exclusive -> exclusive/none */
2423
2424                                 if (cur_new_count == 0) {
2425                                         /* Exclusive -> none */
2426                                         qg->excl -= num_bytes;
2427                                         qg->excl_cmpr -= num_bytes;
2428                                         dirty = true;
2429                                 }
2430                                 /* Exclusive -> exclusive, nothing changed */
2431                         }
2432                 }
2433
2434                 if (dirty)
2435                         qgroup_dirty(fs_info, qg);
2436         }
2437         return 0;
2438 }
2439
2440 /*
2441  * Check if the @roots potentially is a list of fs tree roots
2442  *
2443  * Return 0 for definitely not a fs/subvol tree roots ulist
2444  * Return 1 for possible fs/subvol tree roots in the list (considering an empty
2445  *          one as well)
2446  */
2447 static int maybe_fs_roots(struct ulist *roots)
2448 {
2449         struct ulist_node *unode;
2450         struct ulist_iterator uiter;
2451
2452         /* Empty one, still possible for fs roots */
2453         if (!roots || roots->nnodes == 0)
2454                 return 1;
2455
2456         ULIST_ITER_INIT(&uiter);
2457         unode = ulist_next(roots, &uiter);
2458         if (!unode)
2459                 return 1;
2460
2461         /*
2462          * If it contains fs tree roots, then it must belong to fs/subvol
2463          * trees.
2464          * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2465          */
2466         return is_fstree(unode->val);
2467 }
2468
2469 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2470                                 u64 num_bytes, struct ulist *old_roots,
2471                                 struct ulist *new_roots)
2472 {
2473         struct btrfs_fs_info *fs_info = trans->fs_info;
2474         struct ulist *qgroups = NULL;
2475         struct ulist *tmp = NULL;
2476         u64 seq;
2477         u64 nr_new_roots = 0;
2478         u64 nr_old_roots = 0;
2479         int ret = 0;
2480
2481         /*
2482          * If quotas get disabled meanwhile, the resouces need to be freed and
2483          * we can't just exit here.
2484          */
2485         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2486                 goto out_free;
2487
2488         if (new_roots) {
2489                 if (!maybe_fs_roots(new_roots))
2490                         goto out_free;
2491                 nr_new_roots = new_roots->nnodes;
2492         }
2493         if (old_roots) {
2494                 if (!maybe_fs_roots(old_roots))
2495                         goto out_free;
2496                 nr_old_roots = old_roots->nnodes;
2497         }
2498
2499         /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2500         if (nr_old_roots == 0 && nr_new_roots == 0)
2501                 goto out_free;
2502
2503         BUG_ON(!fs_info->quota_root);
2504
2505         trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2506                                         num_bytes, nr_old_roots, nr_new_roots);
2507
2508         qgroups = ulist_alloc(GFP_NOFS);
2509         if (!qgroups) {
2510                 ret = -ENOMEM;
2511                 goto out_free;
2512         }
2513         tmp = ulist_alloc(GFP_NOFS);
2514         if (!tmp) {
2515                 ret = -ENOMEM;
2516                 goto out_free;
2517         }
2518
2519         mutex_lock(&fs_info->qgroup_rescan_lock);
2520         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2521                 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2522                         mutex_unlock(&fs_info->qgroup_rescan_lock);
2523                         ret = 0;
2524                         goto out_free;
2525                 }
2526         }
2527         mutex_unlock(&fs_info->qgroup_rescan_lock);
2528
2529         spin_lock(&fs_info->qgroup_lock);
2530         seq = fs_info->qgroup_seq;
2531
2532         /* Update old refcnts using old_roots */
2533         ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2534                                    UPDATE_OLD);
2535         if (ret < 0)
2536                 goto out;
2537
2538         /* Update new refcnts using new_roots */
2539         ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2540                                    UPDATE_NEW);
2541         if (ret < 0)
2542                 goto out;
2543
2544         qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2545                                num_bytes, seq);
2546
2547         /*
2548          * Bump qgroup_seq to avoid seq overlap
2549          */
2550         fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2551 out:
2552         spin_unlock(&fs_info->qgroup_lock);
2553 out_free:
2554         ulist_free(tmp);
2555         ulist_free(qgroups);
2556         ulist_free(old_roots);
2557         ulist_free(new_roots);
2558         return ret;
2559 }
2560
2561 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2562 {
2563         struct btrfs_fs_info *fs_info = trans->fs_info;
2564         struct btrfs_qgroup_extent_record *record;
2565         struct btrfs_delayed_ref_root *delayed_refs;
2566         struct ulist *new_roots = NULL;
2567         struct rb_node *node;
2568         u64 num_dirty_extents = 0;
2569         u64 qgroup_to_skip;
2570         int ret = 0;
2571
2572         delayed_refs = &trans->transaction->delayed_refs;
2573         qgroup_to_skip = delayed_refs->qgroup_to_skip;
2574         while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2575                 record = rb_entry(node, struct btrfs_qgroup_extent_record,
2576                                   node);
2577
2578                 num_dirty_extents++;
2579                 trace_btrfs_qgroup_account_extents(fs_info, record);
2580
2581                 if (!ret) {
2582                         /*
2583                          * Old roots should be searched when inserting qgroup
2584                          * extent record
2585                          */
2586                         if (WARN_ON(!record->old_roots)) {
2587                                 /* Search commit root to find old_roots */
2588                                 ret = btrfs_find_all_roots(NULL, fs_info,
2589                                                 record->bytenr, 0,
2590                                                 &record->old_roots, false);
2591                                 if (ret < 0)
2592                                         goto cleanup;
2593                         }
2594
2595                         /* Free the reserved data space */
2596                         btrfs_qgroup_free_refroot(fs_info,
2597                                         record->data_rsv_refroot,
2598                                         record->data_rsv,
2599                                         BTRFS_QGROUP_RSV_DATA);
2600                         /*
2601                          * Use SEQ_LAST as time_seq to do special search, which
2602                          * doesn't lock tree or delayed_refs and search current
2603                          * root. It's safe inside commit_transaction().
2604                          */
2605                         ret = btrfs_find_all_roots(trans, fs_info,
2606                                 record->bytenr, SEQ_LAST, &new_roots, false);
2607                         if (ret < 0)
2608                                 goto cleanup;
2609                         if (qgroup_to_skip) {
2610                                 ulist_del(new_roots, qgroup_to_skip, 0);
2611                                 ulist_del(record->old_roots, qgroup_to_skip,
2612                                           0);
2613                         }
2614                         ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2615                                                           record->num_bytes,
2616                                                           record->old_roots,
2617                                                           new_roots);
2618                         record->old_roots = NULL;
2619                         new_roots = NULL;
2620                 }
2621 cleanup:
2622                 ulist_free(record->old_roots);
2623                 ulist_free(new_roots);
2624                 new_roots = NULL;
2625                 rb_erase(node, &delayed_refs->dirty_extent_root);
2626                 kfree(record);
2627
2628         }
2629         trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2630                                        num_dirty_extents);
2631         return ret;
2632 }
2633
2634 /*
2635  * called from commit_transaction. Writes all changed qgroups to disk.
2636  */
2637 int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2638 {
2639         struct btrfs_fs_info *fs_info = trans->fs_info;
2640         int ret = 0;
2641
2642         if (!fs_info->quota_root)
2643                 return ret;
2644
2645         spin_lock(&fs_info->qgroup_lock);
2646         while (!list_empty(&fs_info->dirty_qgroups)) {
2647                 struct btrfs_qgroup *qgroup;
2648                 qgroup = list_first_entry(&fs_info->dirty_qgroups,
2649                                           struct btrfs_qgroup, dirty);
2650                 list_del_init(&qgroup->dirty);
2651                 spin_unlock(&fs_info->qgroup_lock);
2652                 ret = update_qgroup_info_item(trans, qgroup);
2653                 if (ret)
2654                         fs_info->qgroup_flags |=
2655                                         BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2656                 ret = update_qgroup_limit_item(trans, qgroup);
2657                 if (ret)
2658                         fs_info->qgroup_flags |=
2659                                         BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2660                 spin_lock(&fs_info->qgroup_lock);
2661         }
2662         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2663                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2664         else
2665                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2666         spin_unlock(&fs_info->qgroup_lock);
2667
2668         ret = update_qgroup_status_item(trans);
2669         if (ret)
2670                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2671
2672         return ret;
2673 }
2674
2675 /*
2676  * Copy the accounting information between qgroups. This is necessary
2677  * when a snapshot or a subvolume is created. Throwing an error will
2678  * cause a transaction abort so we take extra care here to only error
2679  * when a readonly fs is a reasonable outcome.
2680  */
2681 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2682                          u64 objectid, struct btrfs_qgroup_inherit *inherit)
2683 {
2684         int ret = 0;
2685         int i;
2686         u64 *i_qgroups;
2687         bool committing = false;
2688         struct btrfs_fs_info *fs_info = trans->fs_info;
2689         struct btrfs_root *quota_root;
2690         struct btrfs_qgroup *srcgroup;
2691         struct btrfs_qgroup *dstgroup;
2692         bool need_rescan = false;
2693         u32 level_size = 0;
2694         u64 nums;
2695
2696         /*
2697          * There are only two callers of this function.
2698          *
2699          * One in create_subvol() in the ioctl context, which needs to hold
2700          * the qgroup_ioctl_lock.
2701          *
2702          * The other one in create_pending_snapshot() where no other qgroup
2703          * code can modify the fs as they all need to either start a new trans
2704          * or hold a trans handler, thus we don't need to hold
2705          * qgroup_ioctl_lock.
2706          * This would avoid long and complex lock chain and make lockdep happy.
2707          */
2708         spin_lock(&fs_info->trans_lock);
2709         if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2710                 committing = true;
2711         spin_unlock(&fs_info->trans_lock);
2712
2713         if (!committing)
2714                 mutex_lock(&fs_info->qgroup_ioctl_lock);
2715         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2716                 goto out;
2717
2718         quota_root = fs_info->quota_root;
2719         if (!quota_root) {
2720                 ret = -EINVAL;
2721                 goto out;
2722         }
2723
2724         if (inherit) {
2725                 i_qgroups = (u64 *)(inherit + 1);
2726                 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2727                        2 * inherit->num_excl_copies;
2728                 for (i = 0; i < nums; ++i) {
2729                         srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2730
2731                         /*
2732                          * Zero out invalid groups so we can ignore
2733                          * them later.
2734                          */
2735                         if (!srcgroup ||
2736                             ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2737                                 *i_qgroups = 0ULL;
2738
2739                         ++i_qgroups;
2740                 }
2741         }
2742
2743         /*
2744          * create a tracking group for the subvol itself
2745          */
2746         ret = add_qgroup_item(trans, quota_root, objectid);
2747         if (ret)
2748                 goto out;
2749
2750         /*
2751          * add qgroup to all inherited groups
2752          */
2753         if (inherit) {
2754                 i_qgroups = (u64 *)(inherit + 1);
2755                 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2756                         if (*i_qgroups == 0)
2757                                 continue;
2758                         ret = add_qgroup_relation_item(trans, objectid,
2759                                                        *i_qgroups);
2760                         if (ret && ret != -EEXIST)
2761                                 goto out;
2762                         ret = add_qgroup_relation_item(trans, *i_qgroups,
2763                                                        objectid);
2764                         if (ret && ret != -EEXIST)
2765                                 goto out;
2766                 }
2767                 ret = 0;
2768         }
2769
2770
2771         spin_lock(&fs_info->qgroup_lock);
2772
2773         dstgroup = add_qgroup_rb(fs_info, objectid);
2774         if (IS_ERR(dstgroup)) {
2775                 ret = PTR_ERR(dstgroup);
2776                 goto unlock;
2777         }
2778
2779         if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2780                 dstgroup->lim_flags = inherit->lim.flags;
2781                 dstgroup->max_rfer = inherit->lim.max_rfer;
2782                 dstgroup->max_excl = inherit->lim.max_excl;
2783                 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2784                 dstgroup->rsv_excl = inherit->lim.rsv_excl;
2785
2786                 ret = update_qgroup_limit_item(trans, dstgroup);
2787                 if (ret) {
2788                         fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2789                         btrfs_info(fs_info,
2790                                    "unable to update quota limit for %llu",
2791                                    dstgroup->qgroupid);
2792                         goto unlock;
2793                 }
2794         }
2795
2796         if (srcid) {
2797                 srcgroup = find_qgroup_rb(fs_info, srcid);
2798                 if (!srcgroup)
2799                         goto unlock;
2800
2801                 /*
2802                  * We call inherit after we clone the root in order to make sure
2803                  * our counts don't go crazy, so at this point the only
2804                  * difference between the two roots should be the root node.
2805                  */
2806                 level_size = fs_info->nodesize;
2807                 dstgroup->rfer = srcgroup->rfer;
2808                 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2809                 dstgroup->excl = level_size;
2810                 dstgroup->excl_cmpr = level_size;
2811                 srcgroup->excl = level_size;
2812                 srcgroup->excl_cmpr = level_size;
2813
2814                 /* inherit the limit info */
2815                 dstgroup->lim_flags = srcgroup->lim_flags;
2816                 dstgroup->max_rfer = srcgroup->max_rfer;
2817                 dstgroup->max_excl = srcgroup->max_excl;
2818                 dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2819                 dstgroup->rsv_excl = srcgroup->rsv_excl;
2820
2821                 qgroup_dirty(fs_info, dstgroup);
2822                 qgroup_dirty(fs_info, srcgroup);
2823         }
2824
2825         if (!inherit)
2826                 goto unlock;
2827
2828         i_qgroups = (u64 *)(inherit + 1);
2829         for (i = 0; i < inherit->num_qgroups; ++i) {
2830                 if (*i_qgroups) {
2831                         ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2832                         if (ret)
2833                                 goto unlock;
2834                 }
2835                 ++i_qgroups;
2836
2837                 /*
2838                  * If we're doing a snapshot, and adding the snapshot to a new
2839                  * qgroup, the numbers are guaranteed to be incorrect.
2840                  */
2841                 if (srcid)
2842                         need_rescan = true;
2843         }
2844
2845         for (i = 0; i <  inherit->num_ref_copies; ++i, i_qgroups += 2) {
2846                 struct btrfs_qgroup *src;
2847                 struct btrfs_qgroup *dst;
2848
2849                 if (!i_qgroups[0] || !i_qgroups[1])
2850                         continue;
2851
2852                 src = find_qgroup_rb(fs_info, i_qgroups[0]);
2853                 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2854
2855                 if (!src || !dst) {
2856                         ret = -EINVAL;
2857                         goto unlock;
2858                 }
2859
2860                 dst->rfer = src->rfer - level_size;
2861                 dst->rfer_cmpr = src->rfer_cmpr - level_size;
2862
2863                 /* Manually tweaking numbers certainly needs a rescan */
2864                 need_rescan = true;
2865         }
2866         for (i = 0; i <  inherit->num_excl_copies; ++i, i_qgroups += 2) {
2867                 struct btrfs_qgroup *src;
2868                 struct btrfs_qgroup *dst;
2869
2870                 if (!i_qgroups[0] || !i_qgroups[1])
2871                         continue;
2872
2873                 src = find_qgroup_rb(fs_info, i_qgroups[0]);
2874                 dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2875
2876                 if (!src || !dst) {
2877                         ret = -EINVAL;
2878                         goto unlock;
2879                 }
2880
2881                 dst->excl = src->excl + level_size;
2882                 dst->excl_cmpr = src->excl_cmpr + level_size;
2883                 need_rescan = true;
2884         }
2885
2886 unlock:
2887         spin_unlock(&fs_info->qgroup_lock);
2888         if (!ret)
2889                 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
2890 out:
2891         if (!committing)
2892                 mutex_unlock(&fs_info->qgroup_ioctl_lock);
2893         if (need_rescan)
2894                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2895         return ret;
2896 }
2897
2898 /*
2899  * Two limits to commit transaction in advance.
2900  *
2901  * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
2902  * For SIZE, it will be in byte unit as threshold.
2903  */
2904 #define QGROUP_FREE_RATIO               32
2905 #define QGROUP_FREE_SIZE                SZ_32M
2906 static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
2907                                 const struct btrfs_qgroup *qg, u64 num_bytes)
2908 {
2909         u64 free;
2910         u64 threshold;
2911
2912         if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
2913             qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
2914                 return false;
2915
2916         if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
2917             qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
2918                 return false;
2919
2920         /*
2921          * Even if we passed the check, it's better to check if reservation
2922          * for meta_pertrans is pushing us near limit.
2923          * If there is too much pertrans reservation or it's near the limit,
2924          * let's try commit transaction to free some, using transaction_kthread
2925          */
2926         if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
2927                               BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
2928                 if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
2929                         free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
2930                         threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
2931                                           QGROUP_FREE_SIZE);
2932                 } else {
2933                         free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
2934                         threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
2935                                           QGROUP_FREE_SIZE);
2936                 }
2937
2938                 /*
2939                  * Use transaction_kthread to commit transaction, so we no
2940                  * longer need to bother nested transaction nor lock context.
2941                  */
2942                 if (free < threshold)
2943                         btrfs_commit_transaction_locksafe(fs_info);
2944         }
2945
2946         return true;
2947 }
2948
2949 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
2950                           enum btrfs_qgroup_rsv_type type)
2951 {
2952         struct btrfs_qgroup *qgroup;
2953         struct btrfs_fs_info *fs_info = root->fs_info;
2954         u64 ref_root = root->root_key.objectid;
2955         int ret = 0;
2956         struct ulist_node *unode;
2957         struct ulist_iterator uiter;
2958
2959         if (!is_fstree(ref_root))
2960                 return 0;
2961
2962         if (num_bytes == 0)
2963                 return 0;
2964
2965         if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
2966             capable(CAP_SYS_RESOURCE))
2967                 enforce = false;
2968
2969         spin_lock(&fs_info->qgroup_lock);
2970         if (!fs_info->quota_root)
2971                 goto out;
2972
2973         qgroup = find_qgroup_rb(fs_info, ref_root);
2974         if (!qgroup)
2975                 goto out;
2976
2977         /*
2978          * in a first step, we check all affected qgroups if any limits would
2979          * be exceeded
2980          */
2981         ulist_reinit(fs_info->qgroup_ulist);
2982         ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2983                         qgroup_to_aux(qgroup), GFP_ATOMIC);
2984         if (ret < 0)
2985                 goto out;
2986         ULIST_ITER_INIT(&uiter);
2987         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2988                 struct btrfs_qgroup *qg;
2989                 struct btrfs_qgroup_list *glist;
2990
2991                 qg = unode_aux_to_qgroup(unode);
2992
2993                 if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
2994                         ret = -EDQUOT;
2995                         goto out;
2996                 }
2997
2998                 list_for_each_entry(glist, &qg->groups, next_group) {
2999                         ret = ulist_add(fs_info->qgroup_ulist,
3000                                         glist->group->qgroupid,
3001                                         qgroup_to_aux(glist->group), GFP_ATOMIC);
3002                         if (ret < 0)
3003                                 goto out;
3004                 }
3005         }
3006         ret = 0;
3007         /*
3008          * no limits exceeded, now record the reservation into all qgroups
3009          */
3010         ULIST_ITER_INIT(&uiter);
3011         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3012                 struct btrfs_qgroup *qg;
3013
3014                 qg = unode_aux_to_qgroup(unode);
3015
3016                 qgroup_rsv_add(fs_info, qg, num_bytes, type);
3017         }
3018
3019 out:
3020         spin_unlock(&fs_info->qgroup_lock);
3021         return ret;
3022 }
3023
3024 /*
3025  * Free @num_bytes of reserved space with @type for qgroup.  (Normally level 0
3026  * qgroup).
3027  *
3028  * Will handle all higher level qgroup too.
3029  *
3030  * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3031  * This special case is only used for META_PERTRANS type.
3032  */
3033 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3034                                u64 ref_root, u64 num_bytes,
3035                                enum btrfs_qgroup_rsv_type type)
3036 {
3037         struct btrfs_qgroup *qgroup;
3038         struct ulist_node *unode;
3039         struct ulist_iterator uiter;
3040         int ret = 0;
3041
3042         if (!is_fstree(ref_root))
3043                 return;
3044
3045         if (num_bytes == 0)
3046                 return;
3047
3048         if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3049                 WARN(1, "%s: Invalid type to free", __func__);
3050                 return;
3051         }
3052         spin_lock(&fs_info->qgroup_lock);
3053
3054         if (!fs_info->quota_root)
3055                 goto out;
3056
3057         qgroup = find_qgroup_rb(fs_info, ref_root);
3058         if (!qgroup)
3059                 goto out;
3060
3061         if (num_bytes == (u64)-1)
3062                 /*
3063                  * We're freeing all pertrans rsv, get reserved value from
3064                  * level 0 qgroup as real num_bytes to free.
3065                  */
3066                 num_bytes = qgroup->rsv.values[type];
3067
3068         ulist_reinit(fs_info->qgroup_ulist);
3069         ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3070                         qgroup_to_aux(qgroup), GFP_ATOMIC);
3071         if (ret < 0)
3072                 goto out;
3073         ULIST_ITER_INIT(&uiter);
3074         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3075                 struct btrfs_qgroup *qg;
3076                 struct btrfs_qgroup_list *glist;
3077
3078                 qg = unode_aux_to_qgroup(unode);
3079
3080                 qgroup_rsv_release(fs_info, qg, num_bytes, type);
3081
3082                 list_for_each_entry(glist, &qg->groups, next_group) {
3083                         ret = ulist_add(fs_info->qgroup_ulist,
3084                                         glist->group->qgroupid,
3085                                         qgroup_to_aux(glist->group), GFP_ATOMIC);
3086                         if (ret < 0)
3087                                 goto out;
3088                 }
3089         }
3090
3091 out:
3092         spin_unlock(&fs_info->qgroup_lock);
3093 }
3094
3095 /*
3096  * Check if the leaf is the last leaf. Which means all node pointers
3097  * are at their last position.
3098  */
3099 static bool is_last_leaf(struct btrfs_path *path)
3100 {
3101         int i;
3102
3103         for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3104                 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3105                         return false;
3106         }
3107         return true;
3108 }
3109
3110 /*
3111  * returns < 0 on error, 0 when more leafs are to be scanned.
3112  * returns 1 when done.
3113  */
3114 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3115                               struct btrfs_path *path)
3116 {
3117         struct btrfs_fs_info *fs_info = trans->fs_info;
3118         struct btrfs_key found;
3119         struct extent_buffer *scratch_leaf = NULL;
3120         struct ulist *roots = NULL;
3121         u64 num_bytes;
3122         bool done;
3123         int slot;
3124         int ret;
3125
3126         mutex_lock(&fs_info->qgroup_rescan_lock);
3127         ret = btrfs_search_slot_for_read(fs_info->extent_root,
3128                                          &fs_info->qgroup_rescan_progress,
3129                                          path, 1, 0);
3130
3131         btrfs_debug(fs_info,
3132                 "current progress key (%llu %u %llu), search_slot ret %d",
3133                 fs_info->qgroup_rescan_progress.objectid,
3134                 fs_info->qgroup_rescan_progress.type,
3135                 fs_info->qgroup_rescan_progress.offset, ret);
3136
3137         if (ret) {
3138                 /*
3139                  * The rescan is about to end, we will not be scanning any
3140                  * further blocks. We cannot unset the RESCAN flag here, because
3141                  * we want to commit the transaction if everything went well.
3142                  * To make the live accounting work in this phase, we set our
3143                  * scan progress pointer such that every real extent objectid
3144                  * will be smaller.
3145                  */
3146                 fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3147                 btrfs_release_path(path);
3148                 mutex_unlock(&fs_info->qgroup_rescan_lock);
3149                 return ret;
3150         }
3151         done = is_last_leaf(path);
3152
3153         btrfs_item_key_to_cpu(path->nodes[0], &found,
3154                               btrfs_header_nritems(path->nodes[0]) - 1);
3155         fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3156
3157         scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3158         if (!scratch_leaf) {
3159                 ret = -ENOMEM;
3160                 mutex_unlock(&fs_info->qgroup_rescan_lock);
3161                 goto out;
3162         }
3163         slot = path->slots[0];
3164         btrfs_release_path(path);
3165         mutex_unlock(&fs_info->qgroup_rescan_lock);
3166
3167         for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3168                 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3169                 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3170                     found.type != BTRFS_METADATA_ITEM_KEY)
3171                         continue;
3172                 if (found.type == BTRFS_METADATA_ITEM_KEY)
3173                         num_bytes = fs_info->nodesize;
3174                 else
3175                         num_bytes = found.offset;
3176
3177                 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3178                                            &roots, false);
3179                 if (ret < 0)
3180                         goto out;
3181                 /* For rescan, just pass old_roots as NULL */
3182                 ret = btrfs_qgroup_account_extent(trans, found.objectid,
3183                                                   num_bytes, NULL, roots);
3184                 if (ret < 0)
3185                         goto out;
3186         }
3187 out:
3188         if (scratch_leaf)
3189                 free_extent_buffer(scratch_leaf);
3190
3191         if (done && !ret) {
3192                 ret = 1;
3193                 fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3194         }
3195         return ret;
3196 }
3197
3198 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3199 {
3200         struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3201                                                      qgroup_rescan_work);
3202         struct btrfs_path *path;
3203         struct btrfs_trans_handle *trans = NULL;
3204         int err = -ENOMEM;
3205         int ret = 0;
3206
3207         path = btrfs_alloc_path();
3208         if (!path)
3209                 goto out;
3210         /*
3211          * Rescan should only search for commit root, and any later difference
3212          * should be recorded by qgroup
3213          */
3214         path->search_commit_root = 1;
3215         path->skip_locking = 1;
3216
3217         err = 0;
3218         while (!err && !btrfs_fs_closing(fs_info)) {
3219                 trans = btrfs_start_transaction(fs_info->fs_root, 0);
3220                 if (IS_ERR(trans)) {
3221                         err = PTR_ERR(trans);
3222                         break;
3223                 }
3224                 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3225                         err = -EINTR;
3226                 } else {
3227                         err = qgroup_rescan_leaf(trans, path);
3228                 }
3229                 if (err > 0)
3230                         btrfs_commit_transaction(trans);
3231                 else
3232                         btrfs_end_transaction(trans);
3233         }
3234
3235 out:
3236         btrfs_free_path(path);
3237
3238         mutex_lock(&fs_info->qgroup_rescan_lock);
3239         if (err > 0 &&
3240             fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3241                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3242         } else if (err < 0) {
3243                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3244         }
3245         mutex_unlock(&fs_info->qgroup_rescan_lock);
3246
3247         /*
3248          * only update status, since the previous part has already updated the
3249          * qgroup info.
3250          */
3251         trans = btrfs_start_transaction(fs_info->quota_root, 1);
3252         if (IS_ERR(trans)) {
3253                 err = PTR_ERR(trans);
3254                 trans = NULL;
3255                 btrfs_err(fs_info,
3256                           "fail to start transaction for status update: %d",
3257                           err);
3258         }
3259
3260         mutex_lock(&fs_info->qgroup_rescan_lock);
3261         if (!btrfs_fs_closing(fs_info))
3262                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3263         if (trans) {
3264                 ret = update_qgroup_status_item(trans);
3265                 if (ret < 0) {
3266                         err = ret;
3267                         btrfs_err(fs_info, "fail to update qgroup status: %d",
3268                                   err);
3269                 }
3270         }
3271         fs_info->qgroup_rescan_running = false;
3272         complete_all(&fs_info->qgroup_rescan_completion);
3273         mutex_unlock(&fs_info->qgroup_rescan_lock);
3274
3275         if (!trans)
3276                 return;
3277
3278         btrfs_end_transaction(trans);
3279
3280         if (btrfs_fs_closing(fs_info)) {
3281                 btrfs_info(fs_info, "qgroup scan paused");
3282         } else if (err >= 0) {
3283                 btrfs_info(fs_info, "qgroup scan completed%s",
3284                         err > 0 ? " (inconsistency flag cleared)" : "");
3285         } else {
3286                 btrfs_err(fs_info, "qgroup scan failed with %d", err);
3287         }
3288 }
3289
3290 /*
3291  * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3292  * memory required for the rescan context.
3293  */
3294 static int
3295 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3296                    int init_flags)
3297 {
3298         int ret = 0;
3299
3300         if (!init_flags) {
3301                 /* we're resuming qgroup rescan at mount time */
3302                 if (!(fs_info->qgroup_flags &
3303                       BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3304                         btrfs_warn(fs_info,
3305                         "qgroup rescan init failed, qgroup rescan is not queued");
3306                         ret = -EINVAL;
3307                 } else if (!(fs_info->qgroup_flags &
3308                              BTRFS_QGROUP_STATUS_FLAG_ON)) {
3309                         btrfs_warn(fs_info,
3310                         "qgroup rescan init failed, qgroup is not enabled");
3311                         ret = -EINVAL;
3312                 }
3313
3314                 if (ret)
3315                         return ret;
3316         }
3317
3318         mutex_lock(&fs_info->qgroup_rescan_lock);
3319
3320         if (init_flags) {
3321                 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3322                         btrfs_warn(fs_info,
3323                                    "qgroup rescan is already in progress");
3324                         ret = -EINPROGRESS;
3325                 } else if (!(fs_info->qgroup_flags &
3326                              BTRFS_QGROUP_STATUS_FLAG_ON)) {
3327                         btrfs_warn(fs_info,
3328                         "qgroup rescan init failed, qgroup is not enabled");
3329                         ret = -EINVAL;
3330                 }
3331
3332                 if (ret) {
3333                         mutex_unlock(&fs_info->qgroup_rescan_lock);
3334                         return ret;
3335                 }
3336                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3337         }
3338
3339         memset(&fs_info->qgroup_rescan_progress, 0,
3340                 sizeof(fs_info->qgroup_rescan_progress));
3341         fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3342         init_completion(&fs_info->qgroup_rescan_completion);
3343         mutex_unlock(&fs_info->qgroup_rescan_lock);
3344
3345         btrfs_init_work(&fs_info->qgroup_rescan_work,
3346                         btrfs_qgroup_rescan_worker, NULL, NULL);
3347         return 0;
3348 }
3349
3350 static void
3351 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3352 {
3353         struct rb_node *n;
3354         struct btrfs_qgroup *qgroup;
3355
3356         spin_lock(&fs_info->qgroup_lock);
3357         /* clear all current qgroup tracking information */
3358         for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3359                 qgroup = rb_entry(n, struct btrfs_qgroup, node);
3360                 qgroup->rfer = 0;
3361                 qgroup->rfer_cmpr = 0;
3362                 qgroup->excl = 0;
3363                 qgroup->excl_cmpr = 0;
3364                 qgroup_dirty(fs_info, qgroup);
3365         }
3366         spin_unlock(&fs_info->qgroup_lock);
3367 }
3368
3369 int
3370 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3371 {
3372         int ret = 0;
3373         struct btrfs_trans_handle *trans;
3374
3375         ret = qgroup_rescan_init(fs_info, 0, 1);
3376         if (ret)
3377                 return ret;
3378
3379         /*
3380          * We have set the rescan_progress to 0, which means no more
3381          * delayed refs will be accounted by btrfs_qgroup_account_ref.
3382          * However, btrfs_qgroup_account_ref may be right after its call
3383          * to btrfs_find_all_roots, in which case it would still do the
3384          * accounting.
3385          * To solve this, we're committing the transaction, which will
3386          * ensure we run all delayed refs and only after that, we are
3387          * going to clear all tracking information for a clean start.
3388          */
3389
3390         trans = btrfs_join_transaction(fs_info->fs_root);
3391         if (IS_ERR(trans)) {
3392                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3393                 return PTR_ERR(trans);
3394         }
3395         ret = btrfs_commit_transaction(trans);
3396         if (ret) {
3397                 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3398                 return ret;
3399         }
3400
3401         qgroup_rescan_zero_tracking(fs_info);
3402
3403         mutex_lock(&fs_info->qgroup_rescan_lock);
3404         fs_info->qgroup_rescan_running = true;
3405         btrfs_queue_work(fs_info->qgroup_rescan_workers,
3406                          &fs_info->qgroup_rescan_work);
3407         mutex_unlock(&fs_info->qgroup_rescan_lock);
3408
3409         return 0;
3410 }
3411
3412 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3413                                      bool interruptible)
3414 {
3415         int running;
3416         int ret = 0;
3417
3418         mutex_lock(&fs_info->qgroup_rescan_lock);
3419         running = fs_info->qgroup_rescan_running;
3420         mutex_unlock(&fs_info->qgroup_rescan_lock);
3421
3422         if (!running)
3423                 return 0;
3424
3425         if (interruptible)
3426                 ret = wait_for_completion_interruptible(
3427                                         &fs_info->qgroup_rescan_completion);
3428         else
3429                 wait_for_completion(&fs_info->qgroup_rescan_completion);
3430
3431         return ret;
3432 }
3433
3434 /*
3435  * this is only called from open_ctree where we're still single threaded, thus
3436  * locking is omitted here.
3437  */
3438 void
3439 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3440 {
3441         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3442                 mutex_lock(&fs_info->qgroup_rescan_lock);
3443                 fs_info->qgroup_rescan_running = true;
3444                 btrfs_queue_work(fs_info->qgroup_rescan_workers,
3445                                  &fs_info->qgroup_rescan_work);
3446                 mutex_unlock(&fs_info->qgroup_rescan_lock);
3447         }
3448 }
3449
3450 /*
3451  * Reserve qgroup space for range [start, start + len).
3452  *
3453  * This function will either reserve space from related qgroups or doing
3454  * nothing if the range is already reserved.
3455  *
3456  * Return 0 for successful reserve
3457  * Return <0 for error (including -EQUOT)
3458  *
3459  * NOTE: this function may sleep for memory allocation.
3460  *       if btrfs_qgroup_reserve_data() is called multiple times with
3461  *       same @reserved, caller must ensure when error happens it's OK
3462  *       to free *ALL* reserved space.
3463  */
3464 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3465                         struct extent_changeset **reserved_ret, u64 start,
3466                         u64 len)
3467 {
3468         struct btrfs_root *root = inode->root;
3469         struct ulist_node *unode;
3470         struct ulist_iterator uiter;
3471         struct extent_changeset *reserved;
3472         u64 orig_reserved;
3473         u64 to_reserve;
3474         int ret;
3475
3476         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3477             !is_fstree(root->root_key.objectid) || len == 0)
3478                 return 0;
3479
3480         /* @reserved parameter is mandatory for qgroup */
3481         if (WARN_ON(!reserved_ret))
3482                 return -EINVAL;
3483         if (!*reserved_ret) {
3484                 *reserved_ret = extent_changeset_alloc();
3485                 if (!*reserved_ret)
3486                         return -ENOMEM;
3487         }
3488         reserved = *reserved_ret;
3489         /* Record already reserved space */
3490         orig_reserved = reserved->bytes_changed;
3491         ret = set_record_extent_bits(&inode->io_tree, start,
3492                         start + len -1, EXTENT_QGROUP_RESERVED, reserved);
3493
3494         /* Newly reserved space */
3495         to_reserve = reserved->bytes_changed - orig_reserved;
3496         trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3497                                         to_reserve, QGROUP_RESERVE);
3498         if (ret < 0)
3499                 goto cleanup;
3500         ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3501         if (ret < 0)
3502                 goto cleanup;
3503
3504         return ret;
3505
3506 cleanup:
3507         /* cleanup *ALL* already reserved ranges */
3508         ULIST_ITER_INIT(&uiter);
3509         while ((unode = ulist_next(&reserved->range_changed, &uiter)))
3510                 clear_extent_bit(&inode->io_tree, unode->val,
3511                                  unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
3512         /* Also free data bytes of already reserved one */
3513         btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
3514                                   orig_reserved, BTRFS_QGROUP_RSV_DATA);
3515         extent_changeset_release(reserved);
3516         return ret;
3517 }
3518
3519 /* Free ranges specified by @reserved, normally in error path */
3520 static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3521                         struct extent_changeset *reserved, u64 start, u64 len)
3522 {
3523         struct btrfs_root *root = inode->root;
3524         struct ulist_node *unode;
3525         struct ulist_iterator uiter;
3526         struct extent_changeset changeset;
3527         int freed = 0;
3528         int ret;
3529
3530         extent_changeset_init(&changeset);
3531         len = round_up(start + len, root->fs_info->sectorsize);
3532         start = round_down(start, root->fs_info->sectorsize);
3533
3534         ULIST_ITER_INIT(&uiter);
3535         while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3536                 u64 range_start = unode->val;
3537                 /* unode->aux is the inclusive end */
3538                 u64 range_len = unode->aux - range_start + 1;
3539                 u64 free_start;
3540                 u64 free_len;
3541
3542                 extent_changeset_release(&changeset);
3543
3544                 /* Only free range in range [start, start + len) */
3545                 if (range_start >= start + len ||
3546                     range_start + range_len <= start)
3547                         continue;
3548                 free_start = max(range_start, start);
3549                 free_len = min(start + len, range_start + range_len) -
3550                            free_start;
3551                 /*
3552                  * TODO: To also modify reserved->ranges_reserved to reflect
3553                  * the modification.
3554                  *
3555                  * However as long as we free qgroup reserved according to
3556                  * EXTENT_QGROUP_RESERVED, we won't double free.
3557                  * So not need to rush.
3558                  */
3559                 ret = clear_record_extent_bits(&inode->io_tree, free_start,
3560                                 free_start + free_len - 1,
3561                                 EXTENT_QGROUP_RESERVED, &changeset);
3562                 if (ret < 0)
3563                         goto out;
3564                 freed += changeset.bytes_changed;
3565         }
3566         btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3567                                   BTRFS_QGROUP_RSV_DATA);
3568         ret = freed;
3569 out:
3570         extent_changeset_release(&changeset);
3571         return ret;
3572 }
3573
3574 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
3575                         struct extent_changeset *reserved, u64 start, u64 len,
3576                         int free)
3577 {
3578         struct extent_changeset changeset;
3579         int trace_op = QGROUP_RELEASE;
3580         int ret;
3581
3582         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
3583                 return 0;
3584
3585         /* In release case, we shouldn't have @reserved */
3586         WARN_ON(!free && reserved);
3587         if (free && reserved)
3588                 return qgroup_free_reserved_data(inode, reserved, start, len);
3589         extent_changeset_init(&changeset);
3590         ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
3591                                        EXTENT_QGROUP_RESERVED, &changeset);
3592         if (ret < 0)
3593                 goto out;
3594
3595         if (free)
3596                 trace_op = QGROUP_FREE;
3597         trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
3598                                         changeset.bytes_changed, trace_op);
3599         if (free)
3600                 btrfs_qgroup_free_refroot(inode->root->fs_info,
3601                                 inode->root->root_key.objectid,
3602                                 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3603         ret = changeset.bytes_changed;
3604 out:
3605         extent_changeset_release(&changeset);
3606         return ret;
3607 }
3608
3609 /*
3610  * Free a reserved space range from io_tree and related qgroups
3611  *
3612  * Should be called when a range of pages get invalidated before reaching disk.
3613  * Or for error cleanup case.
3614  * if @reserved is given, only reserved range in [@start, @start + @len) will
3615  * be freed.
3616  *
3617  * For data written to disk, use btrfs_qgroup_release_data().
3618  *
3619  * NOTE: This function may sleep for memory allocation.
3620  */
3621 int btrfs_qgroup_free_data(struct btrfs_inode *inode,
3622                         struct extent_changeset *reserved, u64 start, u64 len)
3623 {
3624         return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3625 }
3626
3627 /*
3628  * Release a reserved space range from io_tree only.
3629  *
3630  * Should be called when a range of pages get written to disk and corresponding
3631  * FILE_EXTENT is inserted into corresponding root.
3632  *
3633  * Since new qgroup accounting framework will only update qgroup numbers at
3634  * commit_transaction() time, its reserved space shouldn't be freed from
3635  * related qgroups.
3636  *
3637  * But we should release the range from io_tree, to allow further write to be
3638  * COWed.
3639  *
3640  * NOTE: This function may sleep for memory allocation.
3641  */
3642 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
3643 {
3644         return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3645 }
3646
3647 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3648                               enum btrfs_qgroup_rsv_type type)
3649 {
3650         if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3651             type != BTRFS_QGROUP_RSV_META_PERTRANS)
3652                 return;
3653         if (num_bytes == 0)
3654                 return;
3655
3656         spin_lock(&root->qgroup_meta_rsv_lock);
3657         if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3658                 root->qgroup_meta_rsv_prealloc += num_bytes;
3659         else
3660                 root->qgroup_meta_rsv_pertrans += num_bytes;
3661         spin_unlock(&root->qgroup_meta_rsv_lock);
3662 }
3663
3664 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3665                              enum btrfs_qgroup_rsv_type type)
3666 {
3667         if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3668             type != BTRFS_QGROUP_RSV_META_PERTRANS)
3669                 return 0;
3670         if (num_bytes == 0)
3671                 return 0;
3672
3673         spin_lock(&root->qgroup_meta_rsv_lock);
3674         if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3675                 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3676                                   num_bytes);
3677                 root->qgroup_meta_rsv_prealloc -= num_bytes;
3678         } else {
3679                 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3680                                   num_bytes);
3681                 root->qgroup_meta_rsv_pertrans -= num_bytes;
3682         }
3683         spin_unlock(&root->qgroup_meta_rsv_lock);
3684         return num_bytes;
3685 }
3686
3687 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3688                                 enum btrfs_qgroup_rsv_type type, bool enforce)
3689 {
3690         struct btrfs_fs_info *fs_info = root->fs_info;
3691         int ret;
3692
3693         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3694             !is_fstree(root->root_key.objectid) || num_bytes == 0)
3695                 return 0;
3696
3697         BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3698         trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
3699         ret = qgroup_reserve(root, num_bytes, enforce, type);
3700         if (ret < 0)
3701                 return ret;
3702         /*
3703          * Record what we have reserved into root.
3704          *
3705          * To avoid quota disabled->enabled underflow.
3706          * In that case, we may try to free space we haven't reserved
3707          * (since quota was disabled), so record what we reserved into root.
3708          * And ensure later release won't underflow this number.
3709          */
3710         add_root_meta_rsv(root, num_bytes, type);
3711         return ret;
3712 }
3713
3714 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3715 {
3716         struct btrfs_fs_info *fs_info = root->fs_info;
3717
3718         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3719             !is_fstree(root->root_key.objectid))
3720                 return;
3721
3722         /* TODO: Update trace point to handle such free */
3723         trace_qgroup_meta_free_all_pertrans(root);
3724         /* Special value -1 means to free all reserved space */
3725         btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3726                                   BTRFS_QGROUP_RSV_META_PERTRANS);
3727 }
3728
3729 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3730                               enum btrfs_qgroup_rsv_type type)
3731 {
3732         struct btrfs_fs_info *fs_info = root->fs_info;
3733
3734         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3735             !is_fstree(root->root_key.objectid))
3736                 return;
3737
3738         /*
3739          * reservation for META_PREALLOC can happen before quota is enabled,
3740          * which can lead to underflow.
3741          * Here ensure we will only free what we really have reserved.
3742          */
3743         num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3744         BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3745         trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3746         btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
3747                                   num_bytes, type);
3748 }
3749
3750 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
3751                                 int num_bytes)
3752 {
3753         struct btrfs_qgroup *qgroup;
3754         struct ulist_node *unode;
3755         struct ulist_iterator uiter;
3756         int ret = 0;
3757
3758         if (num_bytes == 0)
3759                 return;
3760         if (!fs_info->quota_root)
3761                 return;
3762
3763         spin_lock(&fs_info->qgroup_lock);
3764         qgroup = find_qgroup_rb(fs_info, ref_root);
3765         if (!qgroup)
3766                 goto out;
3767         ulist_reinit(fs_info->qgroup_ulist);
3768         ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3769                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3770         if (ret < 0)
3771                 goto out;
3772         ULIST_ITER_INIT(&uiter);
3773         while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3774                 struct btrfs_qgroup *qg;
3775                 struct btrfs_qgroup_list *glist;
3776
3777                 qg = unode_aux_to_qgroup(unode);
3778
3779                 qgroup_rsv_release(fs_info, qg, num_bytes,
3780                                 BTRFS_QGROUP_RSV_META_PREALLOC);
3781                 qgroup_rsv_add(fs_info, qg, num_bytes,
3782                                 BTRFS_QGROUP_RSV_META_PERTRANS);
3783                 list_for_each_entry(glist, &qg->groups, next_group) {
3784                         ret = ulist_add(fs_info->qgroup_ulist,
3785                                         glist->group->qgroupid,
3786                                         qgroup_to_aux(glist->group), GFP_ATOMIC);
3787                         if (ret < 0)
3788                                 goto out;
3789                 }
3790         }
3791 out:
3792         spin_unlock(&fs_info->qgroup_lock);
3793 }
3794
3795 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
3796 {
3797         struct btrfs_fs_info *fs_info = root->fs_info;
3798
3799         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3800             !is_fstree(root->root_key.objectid))
3801                 return;
3802         /* Same as btrfs_qgroup_free_meta_prealloc() */
3803         num_bytes = sub_root_meta_rsv(root, num_bytes,
3804                                       BTRFS_QGROUP_RSV_META_PREALLOC);
3805         trace_qgroup_meta_convert(root, num_bytes);
3806         qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
3807 }
3808
3809 /*
3810  * Check qgroup reserved space leaking, normally at destroy inode
3811  * time
3812  */
3813 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
3814 {
3815         struct extent_changeset changeset;
3816         struct ulist_node *unode;
3817         struct ulist_iterator iter;
3818         int ret;
3819
3820         extent_changeset_init(&changeset);
3821         ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
3822                         EXTENT_QGROUP_RESERVED, &changeset);
3823
3824         WARN_ON(ret < 0);
3825         if (WARN_ON(changeset.bytes_changed)) {
3826                 ULIST_ITER_INIT(&iter);
3827                 while ((unode = ulist_next(&changeset.range_changed, &iter))) {
3828                         btrfs_warn(inode->root->fs_info,
3829                 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
3830                                 btrfs_ino(inode), unode->val, unode->aux);
3831                 }
3832                 btrfs_qgroup_free_refroot(inode->root->fs_info,
3833                                 inode->root->root_key.objectid,
3834                                 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3835
3836         }
3837         extent_changeset_release(&changeset);
3838 }
3839
3840 void btrfs_qgroup_init_swapped_blocks(
3841         struct btrfs_qgroup_swapped_blocks *swapped_blocks)
3842 {
3843         int i;
3844
3845         spin_lock_init(&swapped_blocks->lock);
3846         for (i = 0; i < BTRFS_MAX_LEVEL; i++)
3847                 swapped_blocks->blocks[i] = RB_ROOT;
3848         swapped_blocks->swapped = false;
3849 }
3850
3851 /*
3852  * Delete all swapped blocks record of @root.
3853  * Every record here means we skipped a full subtree scan for qgroup.
3854  *
3855  * Gets called when committing one transaction.
3856  */
3857 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
3858 {
3859         struct btrfs_qgroup_swapped_blocks *swapped_blocks;
3860         int i;
3861
3862         swapped_blocks = &root->swapped_blocks;
3863
3864         spin_lock(&swapped_blocks->lock);
3865         if (!swapped_blocks->swapped)
3866                 goto out;
3867         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3868                 struct rb_root *cur_root = &swapped_blocks->blocks[i];
3869                 struct btrfs_qgroup_swapped_block *entry;
3870                 struct btrfs_qgroup_swapped_block *next;
3871
3872                 rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
3873                                                      node)
3874                         kfree(entry);
3875                 swapped_blocks->blocks[i] = RB_ROOT;
3876         }
3877         swapped_blocks->swapped = false;
3878 out:
3879         spin_unlock(&swapped_blocks->lock);
3880 }
3881
3882 /*
3883  * Add subtree roots record into @subvol_root.
3884  *
3885  * @subvol_root:        tree root of the subvolume tree get swapped
3886  * @bg:                 block group under balance
3887  * @subvol_parent/slot: pointer to the subtree root in subvolume tree
3888  * @reloc_parent/slot:  pointer to the subtree root in reloc tree
3889  *                      BOTH POINTERS ARE BEFORE TREE SWAP
3890  * @last_snapshot:      last snapshot generation of the subvolume tree
3891  */
3892 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
3893                 struct btrfs_root *subvol_root,
3894                 struct btrfs_block_group *bg,
3895                 struct extent_buffer *subvol_parent, int subvol_slot,
3896                 struct extent_buffer *reloc_parent, int reloc_slot,
3897                 u64 last_snapshot)
3898 {
3899         struct btrfs_fs_info *fs_info = subvol_root->fs_info;
3900         struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
3901         struct btrfs_qgroup_swapped_block *block;
3902         struct rb_node **cur;
3903         struct rb_node *parent = NULL;
3904         int level = btrfs_header_level(subvol_parent) - 1;
3905         int ret = 0;
3906
3907         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3908                 return 0;
3909
3910         if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
3911             btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
3912                 btrfs_err_rl(fs_info,
3913                 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
3914                         __func__,
3915                         btrfs_node_ptr_generation(subvol_parent, subvol_slot),
3916                         btrfs_node_ptr_generation(reloc_parent, reloc_slot));
3917                 return -EUCLEAN;
3918         }
3919
3920         block = kmalloc(sizeof(*block), GFP_NOFS);
3921         if (!block) {
3922                 ret = -ENOMEM;
3923                 goto out;
3924         }
3925
3926         /*
3927          * @reloc_parent/slot is still before swap, while @block is going to
3928          * record the bytenr after swap, so we do the swap here.
3929          */
3930         block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
3931         block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
3932                                                              reloc_slot);
3933         block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
3934         block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
3935                                                             subvol_slot);
3936         block->last_snapshot = last_snapshot;
3937         block->level = level;
3938
3939         /*
3940          * If we have bg == NULL, we're called from btrfs_recover_relocation(),
3941          * no one else can modify tree blocks thus we qgroup will not change
3942          * no matter the value of trace_leaf.
3943          */
3944         if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
3945                 block->trace_leaf = true;
3946         else
3947                 block->trace_leaf = false;
3948         btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
3949
3950         /* Insert @block into @blocks */
3951         spin_lock(&blocks->lock);
3952         cur = &blocks->blocks[level].rb_node;
3953         while (*cur) {
3954                 struct btrfs_qgroup_swapped_block *entry;
3955
3956                 parent = *cur;
3957                 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
3958                                  node);
3959
3960                 if (entry->subvol_bytenr < block->subvol_bytenr) {
3961                         cur = &(*cur)->rb_left;
3962                 } else if (entry->subvol_bytenr > block->subvol_bytenr) {
3963                         cur = &(*cur)->rb_right;
3964                 } else {
3965                         if (entry->subvol_generation !=
3966                                         block->subvol_generation ||
3967                             entry->reloc_bytenr != block->reloc_bytenr ||
3968                             entry->reloc_generation !=
3969                                         block->reloc_generation) {
3970                                 /*
3971                                  * Duplicated but mismatch entry found.
3972                                  * Shouldn't happen.
3973                                  *
3974                                  * Marking qgroup inconsistent should be enough
3975                                  * for end users.
3976                                  */
3977                                 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
3978                                 ret = -EEXIST;
3979                         }
3980                         kfree(block);
3981                         goto out_unlock;
3982                 }
3983         }
3984         rb_link_node(&block->node, parent, cur);
3985         rb_insert_color(&block->node, &blocks->blocks[level]);
3986         blocks->swapped = true;
3987 out_unlock:
3988         spin_unlock(&blocks->lock);
3989 out:
3990         if (ret < 0)
3991                 fs_info->qgroup_flags |=
3992                         BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3993         return ret;
3994 }
3995
3996 /*
3997  * Check if the tree block is a subtree root, and if so do the needed
3998  * delayed subtree trace for qgroup.
3999  *
4000  * This is called during btrfs_cow_block().
4001  */
4002 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4003                                          struct btrfs_root *root,
4004                                          struct extent_buffer *subvol_eb)
4005 {
4006         struct btrfs_fs_info *fs_info = root->fs_info;
4007         struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4008         struct btrfs_qgroup_swapped_block *block;
4009         struct extent_buffer *reloc_eb = NULL;
4010         struct rb_node *node;
4011         bool found = false;
4012         bool swapped = false;
4013         int level = btrfs_header_level(subvol_eb);
4014         int ret = 0;
4015         int i;
4016
4017         if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4018                 return 0;
4019         if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4020                 return 0;
4021
4022         spin_lock(&blocks->lock);
4023         if (!blocks->swapped) {
4024                 spin_unlock(&blocks->lock);
4025                 return 0;
4026         }
4027         node = blocks->blocks[level].rb_node;
4028
4029         while (node) {
4030                 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4031                 if (block->subvol_bytenr < subvol_eb->start) {
4032                         node = node->rb_left;
4033                 } else if (block->subvol_bytenr > subvol_eb->start) {
4034                         node = node->rb_right;
4035                 } else {
4036                         found = true;
4037                         break;
4038                 }
4039         }
4040         if (!found) {
4041                 spin_unlock(&blocks->lock);
4042                 goto out;
4043         }
4044         /* Found one, remove it from @blocks first and update blocks->swapped */
4045         rb_erase(&block->node, &blocks->blocks[level]);
4046         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4047                 if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4048                         swapped = true;
4049                         break;
4050                 }
4051         }
4052         blocks->swapped = swapped;
4053         spin_unlock(&blocks->lock);
4054
4055         /* Read out reloc subtree root */
4056         reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
4057                                    block->reloc_generation, block->level,
4058                                    &block->first_key);
4059         if (IS_ERR(reloc_eb)) {
4060                 ret = PTR_ERR(reloc_eb);
4061                 reloc_eb = NULL;
4062                 goto free_out;
4063         }
4064         if (!extent_buffer_uptodate(reloc_eb)) {
4065                 ret = -EIO;
4066                 goto free_out;
4067         }
4068
4069         ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4070                         block->last_snapshot, block->trace_leaf);
4071 free_out:
4072         kfree(block);
4073         free_extent_buffer(reloc_eb);
4074 out:
4075         if (ret < 0) {
4076                 btrfs_err_rl(fs_info,
4077                              "failed to account subtree at bytenr %llu: %d",
4078                              subvol_eb->start, ret);
4079                 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4080         }
4081         return ret;
4082 }
4083
4084 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4085 {
4086         struct btrfs_qgroup_extent_record *entry;
4087         struct btrfs_qgroup_extent_record *next;
4088         struct rb_root *root;
4089
4090         root = &trans->delayed_refs.dirty_extent_root;
4091         rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4092                 ulist_free(entry->old_roots);
4093                 kfree(entry);
4094         }
4095 }