fs/btrfs/volumes.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/mm.h>
   8 #include <linux/bio.h>
   9 #include <linux/slab.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/ratelimit.h>
  12 #include <linux/kthread.h>
  13 #include <linux/raid/pq.h>
  14 #include <linux/semaphore.h>
  15 #include <linux/uuid.h>
  16 #include <linux/list_sort.h>
  17 #include "misc.h"
  18 #include "ctree.h"
  19 #include "extent_map.h"
  20 #include "disk-io.h"
  21 #include "transaction.h"
  22 #include "print-tree.h"
  23 #include "volumes.h"
  24 #include "raid56.h"
  25 #include "async-thread.h"
  26 #include "check-integrity.h"
  27 #include "rcu-string.h"
  28 #include "dev-replace.h"
  29 #include "sysfs.h"
  30 #include "tree-checker.h"
  31 #include "space-info.h"
  32 #include "block-group.h"
  33 #include "discard.h"
  34
  35 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  36         [BTRFS_RAID_RAID10] = {
  37                 .sub_stripes    = 2,
  38                 .dev_stripes    = 1,
  39                 .devs_max       = 0,    /* 0 == as many as possible */
  40                 .devs_min       = 4,
  41                 .tolerated_failures = 1,
  42                 .devs_increment = 2,
  43                 .ncopies        = 2,
  44                 .nparity        = 0,
  45                 .raid_name      = "raid10",
  46                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  47                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  48         },
  49         [BTRFS_RAID_RAID1] = {
  50                 .sub_stripes    = 1,
  51                 .dev_stripes    = 1,
  52                 .devs_max       = 2,
  53                 .devs_min       = 2,
  54                 .tolerated_failures = 1,
  55                 .devs_increment = 2,
  56                 .ncopies        = 2,
  57                 .nparity        = 0,
  58                 .raid_name      = "raid1",
  59                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  60                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  61         },
  62         [BTRFS_RAID_RAID1C3] = {
  63                 .sub_stripes    = 1,
  64                 .dev_stripes    = 1,
  65                 .devs_max       = 3,
  66                 .devs_min       = 3,
  67                 .tolerated_failures = 2,
  68                 .devs_increment = 3,
  69                 .ncopies        = 3,
  70                 .nparity        = 0,
  71                 .raid_name      = "raid1c3",
  72                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  73                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  74         },
  75         [BTRFS_RAID_RAID1C4] = {
  76                 .sub_stripes    = 1,
  77                 .dev_stripes    = 1,
  78                 .devs_max       = 4,
  79                 .devs_min       = 4,
  80                 .tolerated_failures = 3,
  81                 .devs_increment = 4,
  82                 .ncopies        = 4,
  83                 .nparity        = 0,
  84                 .raid_name      = "raid1c4",
  85                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
  86                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  87         },
  88         [BTRFS_RAID_DUP] = {
  89                 .sub_stripes    = 1,
  90                 .dev_stripes    = 2,
  91                 .devs_max       = 1,
  92                 .devs_min       = 1,
  93                 .tolerated_failures = 0,
  94                 .devs_increment = 1,
  95                 .ncopies        = 2,
  96                 .nparity        = 0,
  97                 .raid_name      = "dup",
  98                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
  99                 .mindev_error   = 0,
 100         },
 101         [BTRFS_RAID_RAID0] = {
 102                 .sub_stripes    = 1,
 103                 .dev_stripes    = 1,
 104                 .devs_max       = 0,
 105                 .devs_min       = 2,
 106                 .tolerated_failures = 0,
 107                 .devs_increment = 1,
 108                 .ncopies        = 1,
 109                 .nparity        = 0,
 110                 .raid_name      = "raid0",
 111                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 112                 .mindev_error   = 0,
 113         },
 114         [BTRFS_RAID_SINGLE] = {
 115                 .sub_stripes    = 1,
 116                 .dev_stripes    = 1,
 117                 .devs_max       = 1,
 118                 .devs_min       = 1,
 119                 .tolerated_failures = 0,
 120                 .devs_increment = 1,
 121                 .ncopies        = 1,
 122                 .nparity        = 0,
 123                 .raid_name      = "single",
 124                 .bg_flag        = 0,
 125                 .mindev_error   = 0,
 126         },
 127         [BTRFS_RAID_RAID5] = {
 128                 .sub_stripes    = 1,
 129                 .dev_stripes    = 1,
 130                 .devs_max       = 0,
 131                 .devs_min       = 2,
 132                 .tolerated_failures = 1,
 133                 .devs_increment = 1,
 134                 .ncopies        = 1,
 135                 .nparity        = 1,
 136                 .raid_name      = "raid5",
 137                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 138                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 139         },
 140         [BTRFS_RAID_RAID6] = {
 141                 .sub_stripes    = 1,
 142                 .dev_stripes    = 1,
 143                 .devs_max       = 0,
 144                 .devs_min       = 3,
 145                 .tolerated_failures = 2,
 146                 .devs_increment = 1,
 147                 .ncopies        = 1,
 148                 .nparity        = 2,
 149                 .raid_name      = "raid6",
 150                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 151                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 152         },
 153 };
 154
 155 const char *btrfs_bg_type_to_raid_name(u64 flags)
 156 {
 157         const int index = btrfs_bg_flags_to_raid_index(flags);
 158
 159         if (index >= BTRFS_NR_RAID_TYPES)
 160                 return NULL;
 161
 162         return btrfs_raid_array[index].raid_name;
 163 }
 164
 165 /*
 166  * Fill @buf with textual description of @bg_flags, no more than @size_buf
 167  * bytes including terminating null byte.
 168  */
 169 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 170 {
 171         int i;
 172         int ret;
 173         char *bp = buf;
 174         u64 flags = bg_flags;
 175         u32 size_bp = size_buf;
 176
 177         if (!flags) {
 178                 strcpy(bp, "NONE");
 179                 return;
 180         }
 181
 182 #define DESCRIBE_FLAG(flag, desc)                                               \
 183         do {                                                            \
 184                 if (flags & (flag)) {                                   \
 185                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
 186                         if (ret < 0 || ret >= size_bp)                  \
 187                                 goto out_overflow;                      \
 188                         size_bp -= ret;                                 \
 189                         bp += ret;                                      \
 190                         flags &= ~(flag);                               \
 191                 }                                                       \
 192         } while (0)
 193
 194         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 195         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 196         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 197
 198         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 199         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 200                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 201                               btrfs_raid_array[i].raid_name);
 202 #undef DESCRIBE_FLAG
 203
 204         if (flags) {
 205                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
 206                 size_bp -= ret;
 207         }
 208
 209         if (size_bp < size_buf)
 210                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 211
 212         /*
 213          * The text is trimmed, it's up to the caller to provide sufficiently
 214          * large buffer
 215          */
 216 out_overflow:;
 217 }
 218
 219 static int init_first_rw_device(struct btrfs_trans_handle *trans);
 220 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 221 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 222 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 223 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 224                              enum btrfs_map_op op,
 225                              u64 logical, u64 *length,
 226                              struct btrfs_bio **bbio_ret,
 227                              int mirror_num, int need_raid_map);
 228
 229 /*
 230  * Device locking
 231  * ==============
 232  *
 233  * There are several mutexes that protect manipulation of devices and low-level
 234  * structures like chunks but not block groups, extents or files
 235  *
 236  * uuid_mutex (global lock)
 237  * ------------------------
 238  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 239  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 240  * device) or requested by the device= mount option
 241  *
 242  * the mutex can be very coarse and can cover long-running operations
 243  *
 244  * protects: updates to fs_devices counters like missing devices, rw devices,
 245  * seeding, structure cloning, opening/closing devices at mount/umount time
 246  *
 247  * global::fs_devs - add, remove, updates to the global list
 248  *
 249  * does not protect: manipulation of the fs_devices::devices list in general
 250  * but in mount context it could be used to exclude list modifications by eg.
 251  * scan ioctl
 252  *
 253  * btrfs_device::name - renames (write side), read is RCU
 254  *
 255  * fs_devices::device_list_mutex (per-fs, with RCU)
 256  * ------------------------------------------------
 257  * protects updates to fs_devices::devices, ie. adding and deleting
 258  *
 259  * simple list traversal with read-only actions can be done with RCU protection
 260  *
 261  * may be used to exclude some operations from running concurrently without any
 262  * modifications to the list (see write_all_supers)
 263  *
 264  * Is not required at mount and close times, because our device list is
 265  * protected by the uuid_mutex at that point.
 266  *
 267  * balance_mutex
 268  * -------------
 269  * protects balance structures (status, state) and context accessed from
 270  * several places (internally, ioctl)
 271  *
 272  * chunk_mutex
 273  * -----------
 274  * protects chunks, adding or removing during allocation, trim or when a new
 275  * device is added/removed. Additionally it also protects post_commit_list of
 276  * individual devices, since they can be added to the transaction's
 277  * post_commit_list only with chunk_mutex held.
 278  *
 279  * cleaner_mutex
 280  * -------------
 281  * a big lock that is held by the cleaner thread and prevents running subvolume
 282  * cleaning together with relocation or delayed iputs
 283  *
 284  *
 285  * Lock nesting
 286  * ============
 287  *
 288  * uuid_mutex
 289  *   device_list_mutex
 290  *     chunk_mutex
 291  *   balance_mutex
 292  *
 293  *
 294  * Exclusive operations
 295  * ====================
 296  *
 297  * Maintains the exclusivity of the following operations that apply to the
 298  * whole filesystem and cannot run in parallel.
 299  *
 300  * - Balance (*)
 301  * - Device add
 302  * - Device remove
 303  * - Device replace (*)
 304  * - Resize
 305  *
 306  * The device operations (as above) can be in one of the following states:
 307  *
 308  * - Running state
 309  * - Paused state
 310  * - Completed state
 311  *
 312  * Only device operations marked with (*) can go into the Paused state for the
 313  * following reasons:
 314  *
 315  * - ioctl (only Balance can be Paused through ioctl)
 316  * - filesystem remounted as read-only
 317  * - filesystem unmounted and mounted as read-only
 318  * - system power-cycle and filesystem mounted as read-only
 319  * - filesystem or device errors leading to forced read-only
 320  *
 321  * The status of exclusive operation is set and cleared atomically.
 322  * During the course of Paused state, fs_info::exclusive_operation remains set.
 323  * A device operation in Paused or Running state can be canceled or resumed
 324  * either by ioctl (Balance only) or when remounted as read-write.
 325  * The exclusive status is cleared when the device operation is canceled or
 326  * completed.
 327  */
 328
 329 DEFINE_MUTEX(uuid_mutex);
 330 static LIST_HEAD(fs_uuids);
 331 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 332 {
 333         return &fs_uuids;
 334 }
 335
 336 /*
 337  * alloc_fs_devices - allocate struct btrfs_fs_devices
 338  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 339  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 340  *
 341  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 342  * The returned struct is not linked onto any lists and can be destroyed with
 343  * kfree() right away.
 344  */
 345 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 346                                                  const u8 *metadata_fsid)
 347 {
 348         struct btrfs_fs_devices *fs_devs;
 349
 350         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 351         if (!fs_devs)
 352                 return ERR_PTR(-ENOMEM);
 353
 354         mutex_init(&fs_devs->device_list_mutex);
 355
 356         INIT_LIST_HEAD(&fs_devs->devices);
 357         INIT_LIST_HEAD(&fs_devs->alloc_list);
 358         INIT_LIST_HEAD(&fs_devs->fs_list);
 359         INIT_LIST_HEAD(&fs_devs->seed_list);
 360         if (fsid)
 361                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 362
 363         if (metadata_fsid)
 364                 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
 365         else if (fsid)
 366                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 367
 368         return fs_devs;
 369 }
 370
 371 void btrfs_free_device(struct btrfs_device *device)
 372 {
 373         WARN_ON(!list_empty(&device->post_commit_list));
 374         rcu_string_free(device->name);
 375         extent_io_tree_release(&device->alloc_state);
 376         bio_put(device->flush_bio);
 377         kfree(device);
 378 }
 379
 380 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 381 {
 382         struct btrfs_device *device;
 383         WARN_ON(fs_devices->opened);
 384         while (!list_empty(&fs_devices->devices)) {
 385                 device = list_entry(fs_devices->devices.next,
 386                                     struct btrfs_device, dev_list);
 387                 list_del(&device->dev_list);
 388                 btrfs_free_device(device);
 389         }
 390         kfree(fs_devices);
 391 }
 392
 393 void __exit btrfs_cleanup_fs_uuids(void)
 394 {
 395         struct btrfs_fs_devices *fs_devices;
 396
 397         while (!list_empty(&fs_uuids)) {
 398                 fs_devices = list_entry(fs_uuids.next,
 399                                         struct btrfs_fs_devices, fs_list);
 400                 list_del(&fs_devices->fs_list);
 401                 free_fs_devices(fs_devices);
 402         }
 403 }
 404
 405 /*
 406  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 407  * Returned struct is not linked onto any lists and must be destroyed using
 408  * btrfs_free_device.
 409  */
 410 static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 411 {
 412         struct btrfs_device *dev;
 413
 414         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 415         if (!dev)
 416                 return ERR_PTR(-ENOMEM);
 417
 418         /*
 419          * Preallocate a bio that's always going to be used for flushing device
 420          * barriers and matches the device lifespan
 421          */
 422         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
 423         if (!dev->flush_bio) {
 424                 kfree(dev);
 425                 return ERR_PTR(-ENOMEM);
 426         }
 427
 428         INIT_LIST_HEAD(&dev->dev_list);
 429         INIT_LIST_HEAD(&dev->dev_alloc_list);
 430         INIT_LIST_HEAD(&dev->post_commit_list);
 431
 432         atomic_set(&dev->reada_in_flight, 0);
 433         atomic_set(&dev->dev_stats_ccnt, 0);
 434         btrfs_device_data_ordered_init(dev, fs_info);
 435         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 436         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 437         extent_io_tree_init(fs_info, &dev->alloc_state,
 438                             IO_TREE_DEVICE_ALLOC_STATE, NULL);
 439
 440         return dev;
 441 }
 442
 443 static noinline struct btrfs_fs_devices *find_fsid(
 444                 const u8 *fsid, const u8 *metadata_fsid)
 445 {
 446         struct btrfs_fs_devices *fs_devices;
 447
 448         ASSERT(fsid);
 449
 450         /* Handle non-split brain cases */
 451         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 452                 if (metadata_fsid) {
 453                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
 454                             && memcmp(metadata_fsid, fs_devices->metadata_uuid,
 455                                       BTRFS_FSID_SIZE) == 0)
 456                                 return fs_devices;
 457                 } else {
 458                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 459                                 return fs_devices;
 460                 }
 461         }
 462         return NULL;
 463 }
 464
 465 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 466                                 struct btrfs_super_block *disk_super)
 467 {
 468
 469         struct btrfs_fs_devices *fs_devices;
 470
 471         /*
 472          * Handle scanned device having completed its fsid change but
 473          * belonging to a fs_devices that was created by first scanning
 474          * a device which didn't have its fsid/metadata_uuid changed
 475          * at all and the CHANGING_FSID_V2 flag set.
 476          */
 477         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 478                 if (fs_devices->fsid_change &&
 479                     memcmp(disk_super->metadata_uuid, fs_devices->fsid,
 480                            BTRFS_FSID_SIZE) == 0 &&
 481                     memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 482                            BTRFS_FSID_SIZE) == 0) {
 483                         return fs_devices;
 484                 }
 485         }
 486         /*
 487          * Handle scanned device having completed its fsid change but
 488          * belonging to a fs_devices that was created by a device that
 489          * has an outdated pair of fsid/metadata_uuid and
 490          * CHANGING_FSID_V2 flag set.
 491          */
 492         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 493                 if (fs_devices->fsid_change &&
 494                     memcmp(fs_devices->metadata_uuid,
 495                            fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
 496                     memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
 497                            BTRFS_FSID_SIZE) == 0) {
 498                         return fs_devices;
 499                 }
 500         }
 501
 502         return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
 503 }
 504
 505
 506 static int
 507 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 508                       int flush, struct block_device **bdev,
 509                       struct btrfs_super_block **disk_super)
 510 {
 511         int ret;
 512
 513         *bdev = blkdev_get_by_path(device_path, flags, holder);
 514
 515         if (IS_ERR(*bdev)) {
 516                 ret = PTR_ERR(*bdev);
 517                 goto error;
 518         }
 519
 520         if (flush)
 521                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 522         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 523         if (ret) {
 524                 blkdev_put(*bdev, flags);
 525                 goto error;
 526         }
 527         invalidate_bdev(*bdev);
 528         *disk_super = btrfs_read_dev_super(*bdev);
 529         if (IS_ERR(*disk_super)) {
 530                 ret = PTR_ERR(*disk_super);
 531                 blkdev_put(*bdev, flags);
 532                 goto error;
 533         }
 534
 535         return 0;
 536
 537 error:
 538         *bdev = NULL;
 539         return ret;
 540 }
 541
 542 static bool device_path_matched(const char *path, struct btrfs_device *device)
 543 {
 544         int found;
 545
 546         rcu_read_lock();
 547         found = strcmp(rcu_str_deref(device->name), path);
 548         rcu_read_unlock();
 549
 550         return found == 0;
 551 }
 552
 553 /*
 554  *  Search and remove all stale (devices which are not mounted) devices.
 555  *  When both inputs are NULL, it will search and release all stale devices.
 556  *  path:       Optional. When provided will it release all unmounted devices
 557  *              matching this path only.
 558  *  skip_dev:   Optional. Will skip this device when searching for the stale
 559  *              devices.
 560  *  Return:     0 for success or if @path is NULL.
 561  *              -EBUSY if @path is a mounted device.
 562  *              -ENOENT if @path does not match any device in the list.
 563  */
 564 static int btrfs_free_stale_devices(const char *path,
 565                                      struct btrfs_device *skip_device)
 566 {
 567         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 568         struct btrfs_device *device, *tmp_device;
 569         int ret = 0;
 570
 571         if (path)
 572                 ret = -ENOENT;
 573
 574         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 575
 576                 mutex_lock(&fs_devices->device_list_mutex);
 577                 list_for_each_entry_safe(device, tmp_device,
 578                                          &fs_devices->devices, dev_list) {
 579                         if (skip_device && skip_device == device)
 580                                 continue;
 581                         if (path && !device->name)
 582                                 continue;
 583                         if (path && !device_path_matched(path, device))
 584                                 continue;
 585                         if (fs_devices->opened) {
 586                                 /* for an already deleted device return 0 */
 587                                 if (path && ret != 0)
 588                                         ret = -EBUSY;
 589                                 break;
 590                         }
 591
 592                         /* delete the stale device */
 593                         fs_devices->num_devices--;
 594                         list_del(&device->dev_list);
 595                         btrfs_free_device(device);
 596
 597                         ret = 0;
 598                 }
 599                 mutex_unlock(&fs_devices->device_list_mutex);
 600
 601                 if (fs_devices->num_devices == 0) {
 602                         btrfs_sysfs_remove_fsid(fs_devices);
 603                         list_del(&fs_devices->fs_list);
 604                         free_fs_devices(fs_devices);
 605                 }
 606         }
 607
 608         return ret;
 609 }
 610
 611 /*
 612  * This is only used on mount, and we are protected from competing things
 613  * messing with our fs_devices by the uuid_mutex, thus we do not need the
 614  * fs_devices->device_list_mutex here.
 615  */
 616 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 617                         struct btrfs_device *device, fmode_t flags,
 618                         void *holder)
 619 {
 620         struct request_queue *q;
 621         struct block_device *bdev;
 622         struct btrfs_super_block *disk_super;
 623         u64 devid;
 624         int ret;
 625
 626         if (device->bdev)
 627                 return -EINVAL;
 628         if (!device->name)
 629                 return -EINVAL;
 630
 631         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 632                                     &bdev, &disk_super);
 633         if (ret)
 634                 return ret;
 635
 636         devid = btrfs_stack_device_id(&disk_super->dev_item);
 637         if (devid != device->devid)
 638                 goto error_free_page;
 639
 640         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 641                 goto error_free_page;
 642
 643         device->generation = btrfs_super_generation(disk_super);
 644
 645         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 646                 if (btrfs_super_incompat_flags(disk_super) &
 647                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 648                         pr_err(
 649                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
 650                         goto error_free_page;
 651                 }
 652
 653                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 654                 fs_devices->seeding = true;
 655         } else {
 656                 if (bdev_read_only(bdev))
 657                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 658                 else
 659                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 660         }
 661
 662         q = bdev_get_queue(bdev);
 663         if (!blk_queue_nonrot(q))
 664                 fs_devices->rotating = true;
 665
 666         device->bdev = bdev;
 667         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 668         device->mode = flags;
 669
 670         fs_devices->open_devices++;
 671         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 672             device->devid != BTRFS_DEV_REPLACE_DEVID) {
 673                 fs_devices->rw_devices++;
 674                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 675         }
 676         btrfs_release_disk_super(disk_super);
 677
 678         return 0;
 679
 680 error_free_page:
 681         btrfs_release_disk_super(disk_super);
 682         blkdev_put(bdev, flags);
 683
 684         return -EINVAL;
 685 }
 686
 687 /*
 688  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 689  * being created with a disk that has already completed its fsid change. Such
 690  * disk can belong to an fs which has its FSID changed or to one which doesn't.
 691  * Handle both cases here.
 692  */
 693 static struct btrfs_fs_devices *find_fsid_inprogress(
 694                                         struct btrfs_super_block *disk_super)
 695 {
 696         struct btrfs_fs_devices *fs_devices;
 697
 698         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 699                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 700                            BTRFS_FSID_SIZE) != 0 &&
 701                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 702                            BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
 703                         return fs_devices;
 704                 }
 705         }
 706
 707         return find_fsid(disk_super->fsid, NULL);
 708 }
 709
 710
 711 static struct btrfs_fs_devices *find_fsid_changed(
 712                                         struct btrfs_super_block *disk_super)
 713 {
 714         struct btrfs_fs_devices *fs_devices;
 715
 716         /*
 717          * Handles the case where scanned device is part of an fs that had
 718          * multiple successful changes of FSID but curently device didn't
 719          * observe it. Meaning our fsid will be different than theirs. We need
 720          * to handle two subcases :
 721          *  1 - The fs still continues to have different METADATA/FSID uuids.
 722          *  2 - The fs is switched back to its original FSID (METADATA/FSID
 723          *  are equal).
 724          */
 725         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 726                 /* Changed UUIDs */
 727                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 728                            BTRFS_FSID_SIZE) != 0 &&
 729                     memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
 730                            BTRFS_FSID_SIZE) == 0 &&
 731                     memcmp(fs_devices->fsid, disk_super->fsid,
 732                            BTRFS_FSID_SIZE) != 0)
 733                         return fs_devices;
 734
 735                 /* Unchanged UUIDs */
 736                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 737                            BTRFS_FSID_SIZE) == 0 &&
 738                     memcmp(fs_devices->fsid, disk_super->metadata_uuid,
 739                            BTRFS_FSID_SIZE) == 0)
 740                         return fs_devices;
 741         }
 742
 743         return NULL;
 744 }
 745
 746 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
 747                                 struct btrfs_super_block *disk_super)
 748 {
 749         struct btrfs_fs_devices *fs_devices;
 750
 751         /*
 752          * Handle the case where the scanned device is part of an fs whose last
 753          * metadata UUID change reverted it to the original FSID. At the same
 754          * time * fs_devices was first created by another constitutent device
 755          * which didn't fully observe the operation. This results in an
 756          * btrfs_fs_devices created with metadata/fsid different AND
 757          * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
 758          * fs_devices equal to the FSID of the disk.
 759          */
 760         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 761                 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 762                            BTRFS_FSID_SIZE) != 0 &&
 763                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 764                            BTRFS_FSID_SIZE) == 0 &&
 765                     fs_devices->fsid_change)
 766                         return fs_devices;
 767         }
 768
 769         return NULL;
 770 }
 771 /*
 772  * Add new device to list of registered devices
 773  *
 774  * Returns:
 775  * device pointer which was just added or updated when successful
 776  * error pointer when failed
 777  */
 778 static noinline struct btrfs_device *device_list_add(const char *path,
 779                            struct btrfs_super_block *disk_super,
 780                            bool *new_device_added)
 781 {
 782         struct btrfs_device *device;
 783         struct btrfs_fs_devices *fs_devices = NULL;
 784         struct rcu_string *name;
 785         u64 found_transid = btrfs_super_generation(disk_super);
 786         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 787         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 788                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 789         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 790                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 791
 792         if (fsid_change_in_progress) {
 793                 if (!has_metadata_uuid)
 794                         fs_devices = find_fsid_inprogress(disk_super);
 795                 else
 796                         fs_devices = find_fsid_changed(disk_super);
 797         } else if (has_metadata_uuid) {
 798                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
 799         } else {
 800                 fs_devices = find_fsid_reverted_metadata(disk_super);
 801                 if (!fs_devices)
 802                         fs_devices = find_fsid(disk_super->fsid, NULL);
 803         }
 804
 805
 806         if (!fs_devices) {
 807                 if (has_metadata_uuid)
 808                         fs_devices = alloc_fs_devices(disk_super->fsid,
 809                                                       disk_super->metadata_uuid);
 810                 else
 811                         fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 812
 813                 if (IS_ERR(fs_devices))
 814                         return ERR_CAST(fs_devices);
 815
 816                 fs_devices->fsid_change = fsid_change_in_progress;
 817
 818                 mutex_lock(&fs_devices->device_list_mutex);
 819                 list_add(&fs_devices->fs_list, &fs_uuids);
 820
 821                 device = NULL;
 822         } else {
 823                 mutex_lock(&fs_devices->device_list_mutex);
 824                 device = btrfs_find_device(fs_devices, devid,
 825                                 disk_super->dev_item.uuid, NULL, false);
 826
 827                 /*
 828                  * If this disk has been pulled into an fs devices created by
 829                  * a device which had the CHANGING_FSID_V2 flag then replace the
 830                  * metadata_uuid/fsid values of the fs_devices.
 831                  */
 832                 if (fs_devices->fsid_change &&
 833                     found_transid > fs_devices->latest_generation) {
 834                         memcpy(fs_devices->fsid, disk_super->fsid,
 835                                         BTRFS_FSID_SIZE);
 836
 837                         if (has_metadata_uuid)
 838                                 memcpy(fs_devices->metadata_uuid,
 839                                        disk_super->metadata_uuid,
 840                                        BTRFS_FSID_SIZE);
 841                         else
 842                                 memcpy(fs_devices->metadata_uuid,
 843                                        disk_super->fsid, BTRFS_FSID_SIZE);
 844
 845                         fs_devices->fsid_change = false;
 846                 }
 847         }
 848
 849         if (!device) {
 850                 if (fs_devices->opened) {
 851                         mutex_unlock(&fs_devices->device_list_mutex);
 852                         return ERR_PTR(-EBUSY);
 853                 }
 854
 855                 device = btrfs_alloc_device(NULL, &devid,
 856                                             disk_super->dev_item.uuid);
 857                 if (IS_ERR(device)) {
 858                         mutex_unlock(&fs_devices->device_list_mutex);
 859                         /* we can safely leave the fs_devices entry around */
 860                         return device;
 861                 }
 862
 863                 name = rcu_string_strdup(path, GFP_NOFS);
 864                 if (!name) {
 865                         btrfs_free_device(device);
 866                         mutex_unlock(&fs_devices->device_list_mutex);
 867                         return ERR_PTR(-ENOMEM);
 868                 }
 869                 rcu_assign_pointer(device->name, name);
 870
 871                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 872                 fs_devices->num_devices++;
 873
 874                 device->fs_devices = fs_devices;
 875                 *new_device_added = true;
 876
 877                 if (disk_super->label[0])
 878                         pr_info(
 879         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
 880                                 disk_super->label, devid, found_transid, path,
 881                                 current->comm, task_pid_nr(current));
 882                 else
 883                         pr_info(
 884         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
 885                                 disk_super->fsid, devid, found_transid, path,
 886                                 current->comm, task_pid_nr(current));
 887
 888         } else if (!device->name || strcmp(device->name->str, path)) {
 889                 /*
 890                  * When FS is already mounted.
 891                  * 1. If you are here and if the device->name is NULL that
 892                  *    means this device was missing at time of FS mount.
 893                  * 2. If you are here and if the device->name is different
 894                  *    from 'path' that means either
 895                  *      a. The same device disappeared and reappeared with
 896                  *         different name. or
 897                  *      b. The missing-disk-which-was-replaced, has
 898                  *         reappeared now.
 899                  *
 900                  * We must allow 1 and 2a above. But 2b would be a spurious
 901                  * and unintentional.
 902                  *
 903                  * Further in case of 1 and 2a above, the disk at 'path'
 904                  * would have missed some transaction when it was away and
 905                  * in case of 2a the stale bdev has to be updated as well.
 906                  * 2b must not be allowed at all time.
 907                  */
 908
 909                 /*
 910                  * For now, we do allow update to btrfs_fs_device through the
 911                  * btrfs dev scan cli after FS has been mounted.  We're still
 912                  * tracking a problem where systems fail mount by subvolume id
 913                  * when we reject replacement on a mounted FS.
 914                  */
 915                 if (!fs_devices->opened && found_transid < device->generation) {
 916                         /*
 917                          * That is if the FS is _not_ mounted and if you
 918                          * are here, that means there is more than one
 919                          * disk with same uuid and devid.We keep the one
 920                          * with larger generation number or the last-in if
 921                          * generation are equal.
 922                          */
 923                         mutex_unlock(&fs_devices->device_list_mutex);
 924                         return ERR_PTR(-EEXIST);
 925                 }
 926
 927                 /*
 928                  * We are going to replace the device path for a given devid,
 929                  * make sure it's the same device if the device is mounted
 930                  */
 931                 if (device->bdev) {
 932                         struct block_device *path_bdev;
 933
 934                         path_bdev = lookup_bdev(path);
 935                         if (IS_ERR(path_bdev)) {
 936                                 mutex_unlock(&fs_devices->device_list_mutex);
 937                                 return ERR_CAST(path_bdev);
 938                         }
 939
 940                         if (device->bdev != path_bdev) {
 941                                 bdput(path_bdev);
 942                                 mutex_unlock(&fs_devices->device_list_mutex);
 943                                 btrfs_warn_in_rcu(device->fs_info,
 944         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 945                                                   path, devid, found_transid,
 946                                                   current->comm,
 947                                                   task_pid_nr(current));
 948                                 return ERR_PTR(-EEXIST);
 949                         }
 950                         bdput(path_bdev);
 951                         btrfs_info_in_rcu(device->fs_info,
 952         "devid %llu device path %s changed to %s scanned by %s (%d)",
 953                                           devid, rcu_str_deref(device->name),
 954                                           path, current->comm,
 955                                           task_pid_nr(current));
 956                 }
 957
 958                 name = rcu_string_strdup(path, GFP_NOFS);
 959                 if (!name) {
 960                         mutex_unlock(&fs_devices->device_list_mutex);
 961                         return ERR_PTR(-ENOMEM);
 962                 }
 963                 rcu_string_free(device->name);
 964                 rcu_assign_pointer(device->name, name);
 965                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 966                         fs_devices->missing_devices--;
 967                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 968                 }
 969         }
 970
 971         /*
 972          * Unmount does not free the btrfs_device struct but would zero
 973          * generation along with most of the other members. So just update
 974          * it back. We need it to pick the disk with largest generation
 975          * (as above).
 976          */
 977         if (!fs_devices->opened) {
 978                 device->generation = found_transid;
 979                 fs_devices->latest_generation = max_t(u64, found_transid,
 980                                                 fs_devices->latest_generation);
 981         }
 982
 983         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 984
 985         mutex_unlock(&fs_devices->device_list_mutex);
 986         return device;
 987 }
 988
 989 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 990 {
 991         struct btrfs_fs_devices *fs_devices;
 992         struct btrfs_device *device;
 993         struct btrfs_device *orig_dev;
 994         int ret = 0;
 995
 996         fs_devices = alloc_fs_devices(orig->fsid, NULL);
 997         if (IS_ERR(fs_devices))
 998                 return fs_devices;
 999
1000         mutex_lock(&orig->device_list_mutex);
1001         fs_devices->total_devices = orig->total_devices;
1002
1003         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1004                 struct rcu_string *name;
1005
1006                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1007                                             orig_dev->uuid);
1008                 if (IS_ERR(device)) {
1009                         ret = PTR_ERR(device);
1010                         goto error;
1011                 }
1012
1013                 /*
1014                  * This is ok to do without rcu read locked because we hold the
1015                  * uuid mutex so nothing we touch in here is going to disappear.
1016                  */
1017                 if (orig_dev->name) {
1018                         name = rcu_string_strdup(orig_dev->name->str,
1019                                         GFP_KERNEL);
1020                         if (!name) {
1021                                 btrfs_free_device(device);
1022                                 ret = -ENOMEM;
1023                                 goto error;
1024                         }
1025                         rcu_assign_pointer(device->name, name);
1026                 }
1027
1028                 list_add(&device->dev_list, &fs_devices->devices);
1029                 device->fs_devices = fs_devices;
1030                 fs_devices->num_devices++;
1031         }
1032         mutex_unlock(&orig->device_list_mutex);
1033         return fs_devices;
1034 error:
1035         mutex_unlock(&orig->device_list_mutex);
1036         free_fs_devices(fs_devices);
1037         return ERR_PTR(ret);
1038 }
1039
1040 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1041                                       int step, struct btrfs_device **latest_dev)
1042 {
1043         struct btrfs_device *device, *next;
1044
1045         /* This is the initialized path, it is safe to release the devices. */
1046         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1047                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1048                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1049                                       &device->dev_state) &&
1050                             !test_bit(BTRFS_DEV_STATE_MISSING,
1051                                       &device->dev_state) &&
1052                             (!*latest_dev ||
1053                              device->generation > (*latest_dev)->generation)) {
1054                                 *latest_dev = device;
1055                         }
1056                         continue;
1057                 }
1058
1059                 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
1060                         /*
1061                          * In the first step, keep the device which has
1062                          * the correct fsid and the devid that is used
1063                          * for the dev_replace procedure.
1064                          * In the second step, the dev_replace state is
1065                          * read from the device tree and it is known
1066                          * whether the procedure is really active or
1067                          * not, which means whether this device is
1068                          * used or whether it should be removed.
1069                          */
1070                         if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1071                                                   &device->dev_state)) {
1072                                 continue;
1073                         }
1074                 }
1075                 if (device->bdev) {
1076                         blkdev_put(device->bdev, device->mode);
1077                         device->bdev = NULL;
1078                         fs_devices->open_devices--;
1079                 }
1080                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1081                         list_del_init(&device->dev_alloc_list);
1082                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1083                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1084                                       &device->dev_state))
1085                                 fs_devices->rw_devices--;
1086                 }
1087                 list_del_init(&device->dev_list);
1088                 fs_devices->num_devices--;
1089                 btrfs_free_device(device);
1090         }
1091
1092 }
1093
1094 /*
1095  * After we have read the system tree and know devids belonging to this
1096  * filesystem, remove the device which does not belong there.
1097  */
1098 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1099 {
1100         struct btrfs_device *latest_dev = NULL;
1101         struct btrfs_fs_devices *seed_dev;
1102
1103         mutex_lock(&uuid_mutex);
1104         __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1105
1106         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1107                 __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
1108
1109         fs_devices->latest_bdev = latest_dev->bdev;
1110
1111         mutex_unlock(&uuid_mutex);
1112 }
1113
1114 static void btrfs_close_bdev(struct btrfs_device *device)
1115 {
1116         if (!device->bdev)
1117                 return;
1118
1119         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1120                 sync_blockdev(device->bdev);
1121                 invalidate_bdev(device->bdev);
1122         }
1123
1124         blkdev_put(device->bdev, device->mode);
1125 }
1126
1127 static void btrfs_close_one_device(struct btrfs_device *device)
1128 {
1129         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1130
1131         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1132             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1133                 list_del_init(&device->dev_alloc_list);
1134                 fs_devices->rw_devices--;
1135         }
1136
1137         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1138                 fs_devices->missing_devices--;
1139
1140         btrfs_close_bdev(device);
1141         if (device->bdev) {
1142                 fs_devices->open_devices--;
1143                 device->bdev = NULL;
1144         }
1145         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1146
1147         device->fs_info = NULL;
1148         atomic_set(&device->dev_stats_ccnt, 0);
1149         extent_io_tree_release(&device->alloc_state);
1150
1151         /* Verify the device is back in a pristine state  */
1152         ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1153         ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1154         ASSERT(list_empty(&device->dev_alloc_list));
1155         ASSERT(list_empty(&device->post_commit_list));
1156         ASSERT(atomic_read(&device->reada_in_flight) == 0);
1157 }
1158
1159 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1160 {
1161         struct btrfs_device *device, *tmp;
1162
1163         lockdep_assert_held(&uuid_mutex);
1164
1165         if (--fs_devices->opened > 0)
1166                 return;
1167
1168         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1169                 btrfs_close_one_device(device);
1170
1171         WARN_ON(fs_devices->open_devices);
1172         WARN_ON(fs_devices->rw_devices);
1173         fs_devices->opened = 0;
1174         fs_devices->seeding = false;
1175         fs_devices->fs_info = NULL;
1176 }
1177
1178 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1179 {
1180         LIST_HEAD(list);
1181         struct btrfs_fs_devices *tmp;
1182
1183         mutex_lock(&uuid_mutex);
1184         close_fs_devices(fs_devices);
1185         if (!fs_devices->opened)
1186                 list_splice_init(&fs_devices->seed_list, &list);
1187
1188         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1189                 close_fs_devices(fs_devices);
1190                 list_del(&fs_devices->seed_list);
1191                 free_fs_devices(fs_devices);
1192         }
1193         mutex_unlock(&uuid_mutex);
1194 }
1195
1196 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1197                                 fmode_t flags, void *holder)
1198 {
1199         struct btrfs_device *device;
1200         struct btrfs_device *latest_dev = NULL;
1201         struct btrfs_device *tmp_device;
1202
1203         flags |= FMODE_EXCL;
1204
1205         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1206                                  dev_list) {
1207                 int ret;
1208
1209                 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1210                 if (ret == 0 &&
1211                     (!latest_dev || device->generation > latest_dev->generation)) {
1212                         latest_dev = device;
1213                 } else if (ret == -ENODATA) {
1214                         fs_devices->num_devices--;
1215                         list_del(&device->dev_list);
1216                         btrfs_free_device(device);
1217                 }
1218         }
1219         if (fs_devices->open_devices == 0)
1220                 return -EINVAL;
1221
1222         fs_devices->opened = 1;
1223         fs_devices->latest_bdev = latest_dev->bdev;
1224         fs_devices->total_rw_bytes = 0;
1225         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1226
1227         return 0;
1228 }
1229
1230 static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1231 {
1232         struct btrfs_device *dev1, *dev2;
1233
1234         dev1 = list_entry(a, struct btrfs_device, dev_list);
1235         dev2 = list_entry(b, struct btrfs_device, dev_list);
1236
1237         if (dev1->devid < dev2->devid)
1238                 return -1;
1239         else if (dev1->devid > dev2->devid)
1240                 return 1;
1241         return 0;
1242 }
1243
1244 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1245                        fmode_t flags, void *holder)
1246 {
1247         int ret;
1248
1249         lockdep_assert_held(&uuid_mutex);
1250         /*
1251          * The device_list_mutex cannot be taken here in case opening the
1252          * underlying device takes further locks like bd_mutex.
1253          *
1254          * We also don't need the lock here as this is called during mount and
1255          * exclusion is provided by uuid_mutex
1256          */
1257
1258         if (fs_devices->opened) {
1259                 fs_devices->opened++;
1260                 ret = 0;
1261         } else {
1262                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1263                 ret = open_fs_devices(fs_devices, flags, holder);
1264         }
1265
1266         return ret;
1267 }
1268
/*
 * Drop the reference on the page that backs @super.
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	put_page(virt_to_page(super));
}
1275
1276 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1277                                                        u64 bytenr)
1278 {
1279         struct btrfs_super_block *disk_super;
1280         struct page *page;
1281         void *p;
1282         pgoff_t index;
1283
1284         /* make sure our super fits in the device */
1285         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1286                 return ERR_PTR(-EINVAL);
1287
1288         /* make sure our super fits in the page */
1289         if (sizeof(*disk_super) > PAGE_SIZE)
1290                 return ERR_PTR(-EINVAL);
1291
1292         /* make sure our super doesn't straddle pages on disk */
1293         index = bytenr >> PAGE_SHIFT;
1294         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1295                 return ERR_PTR(-EINVAL);
1296
1297         /* pull in the page with our super */
1298         page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1299
1300         if (IS_ERR(page))
1301                 return ERR_CAST(page);
1302
1303         p = page_address(page);
1304
1305         /* align our pointer to the offset of the super block */
1306         disk_super = p + offset_in_page(bytenr);
1307
1308         if (btrfs_super_bytenr(disk_super) != bytenr ||
1309             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1310                 btrfs_release_disk_super(p);
1311                 return ERR_PTR(-EINVAL);
1312         }
1313
1314         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1315                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1316
1317         return disk_super;
1318 }
1319
1320 int btrfs_forget_devices(const char *path)
1321 {
1322         int ret;
1323
1324         mutex_lock(&uuid_mutex);
1325         ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1326         mutex_unlock(&uuid_mutex);
1327
1328         return ret;
1329 }
1330
1331 /*
1332  * Look for a btrfs signature on a device. This may be called out of the mount path
1333  * and we are not allowed to call set_blocksize during the scan. The superblock
1334  * is read via pagecache
1335  */
1336 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1337                                            void *holder)
1338 {
1339         struct btrfs_super_block *disk_super;
1340         bool new_device_added = false;
1341         struct btrfs_device *device = NULL;
1342         struct block_device *bdev;
1343         u64 bytenr;
1344
1345         lockdep_assert_held(&uuid_mutex);
1346
1347         /*
1348          * we would like to check all the supers, but that would make
1349          * a btrfs mount succeed after a mkfs from a different FS.
1350          * So, we need to add a special mount option to scan for
1351          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1352          */
1353         bytenr = btrfs_sb_offset(0);
1354         flags |= FMODE_EXCL;
1355
1356         bdev = blkdev_get_by_path(path, flags, holder);
1357         if (IS_ERR(bdev))
1358                 return ERR_CAST(bdev);
1359
1360         disk_super = btrfs_read_disk_super(bdev, bytenr);
1361         if (IS_ERR(disk_super)) {
1362                 device = ERR_CAST(disk_super);
1363                 goto error_bdev_put;
1364         }
1365
1366         device = device_list_add(path, disk_super, &new_device_added);
1367         if (!IS_ERR(device)) {
1368                 if (new_device_added)
1369                         btrfs_free_stale_devices(path, device);
1370         }
1371
1372         btrfs_release_disk_super(disk_super);
1373
1374 error_bdev_put:
1375         blkdev_put(bdev, flags);
1376
1377         return device;
1378 }
1379
1380 /*
1381  * Try to find a chunk that intersects [start, start + len] range and when one
1382  * such is found, record the end of it in *start
1383  */
1384 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1385                                     u64 len)
1386 {
1387         u64 physical_start, physical_end;
1388
1389         lockdep_assert_held(&device->fs_info->chunk_mutex);
1390
1391         if (!find_first_extent_bit(&device->alloc_state, *start,
1392                                    &physical_start, &physical_end,
1393                                    CHUNK_ALLOCATED, NULL)) {
1394
1395                 if (in_range(physical_start, *start, len) ||
1396                     in_range(*start, physical_start,
1397                              physical_end - physical_start)) {
1398                         *start = physical_end + 1;
1399                         return true;
1400                 }
1401         }
1402         return false;
1403 }
1404
1405 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1406 {
1407         switch (device->fs_devices->chunk_alloc_policy) {
1408         case BTRFS_CHUNK_ALLOC_REGULAR:
1409                 /*
1410                  * We don't want to overwrite the superblock on the drive nor
1411                  * any area used by the boot loader (grub for example), so we
1412                  * make sure to start at an offset of at least 1MB.
1413                  */
1414                 return max_t(u64, start, SZ_1M);
1415         default:
1416                 BUG();
1417         }
1418 }
1419
1420 /**
1421  * dev_extent_hole_check - check if specified hole is suitable for allocation
1422  * @device:     the device which we have the hole
1423  * @hole_start: starting position of the hole
1424  * @hole_size:  the size of the hole
1425  * @num_bytes:  the size of the free space that we need
1426  *
1427  * This function may modify @hole_start and @hole_end to reflect the suitable
1428  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1429  */
1430 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1431                                   u64 *hole_size, u64 num_bytes)
1432 {
1433         bool changed = false;
1434         u64 hole_end = *hole_start + *hole_size;
1435
1436         /*
1437          * Check before we set max_hole_start, otherwise we could end up
1438          * sending back this offset anyway.
1439          */
1440         if (contains_pending_extent(device, hole_start, *hole_size)) {
1441                 if (hole_end >= *hole_start)
1442                         *hole_size = hole_end - *hole_start;
1443                 else
1444                         *hole_size = 0;
1445                 changed = true;
1446         }
1447
1448         switch (device->fs_devices->chunk_alloc_policy) {
1449         case BTRFS_CHUNK_ALLOC_REGULAR:
1450                 /* No extra check */
1451                 break;
1452         default:
1453                 BUG();
1454         }
1455
1456         return changed;
1457 }
1458
1459 /*
1460  * find_free_dev_extent_start - find free space in the specified device
1461  * @device:       the device which we search the free space in
1462  * @num_bytes:    the size of the free space that we need
1463  * @search_start: the position from which to begin the search
1464  * @start:        store the start of the free space.
1465  * @len:          the size of the free space. that we find, or the size
1466  *                of the max free space if we don't find suitable free space
1467  *
1468  * this uses a pretty simple search, the expectation is that it is
1469  * called very infrequently and that a given device has a small number
1470  * of extents
1471  *
1472  * @start is used to store the start of the free space if we find. But if we
1473  * don't find suitable free space, it will be used to store the start position
1474  * of the max free space.
1475  *
1476  * @len is used to store the size of the free space that we find.
1477  * But if we don't find suitable free space, it is used to store the size of
1478  * the max free space.
1479  *
1480  * NOTE: This function will search *commit* root of device tree, and does extra
1481  * check to ensure dev extents are not double allocated.
1482  * This makes the function safe to allocate dev extents but may not report
1483  * correct usable device space, as device extent freed in current transaction
 1484  * is not reported as available.
1485  */
1486 static int find_free_dev_extent_start(struct btrfs_device *device,
1487                                 u64 num_bytes, u64 search_start, u64 *start,
1488                                 u64 *len)
1489 {
1490         struct btrfs_fs_info *fs_info = device->fs_info;
1491         struct btrfs_root *root = fs_info->dev_root;
1492         struct btrfs_key key;
1493         struct btrfs_dev_extent *dev_extent;
1494         struct btrfs_path *path;
1495         u64 hole_size;
1496         u64 max_hole_start;
1497         u64 max_hole_size;
1498         u64 extent_end;
1499         u64 search_end = device->total_bytes;
1500         int ret;
1501         int slot;
1502         struct extent_buffer *l;
1503
1504         search_start = dev_extent_search_start(device, search_start);
1505
1506         path = btrfs_alloc_path();
1507         if (!path)
1508                 return -ENOMEM;
1509
1510         max_hole_start = search_start;
1511         max_hole_size = 0;
1512
1513 again:
1514         if (search_start >= search_end ||
1515                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1516                 ret = -ENOSPC;
1517                 goto out;
1518         }
1519
1520         path->reada = READA_FORWARD;
1521         path->search_commit_root = 1;
1522         path->skip_locking = 1;
1523
1524         key.objectid = device->devid;
1525         key.offset = search_start;
1526         key.type = BTRFS_DEV_EXTENT_KEY;
1527
1528         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1529         if (ret < 0)
1530                 goto out;
1531         if (ret > 0) {
1532                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1533                 if (ret < 0)
1534                         goto out;
1535         }
1536
1537         while (1) {
1538                 l = path->nodes[0];
1539                 slot = path->slots[0];
1540                 if (slot >= btrfs_header_nritems(l)) {
1541                         ret = btrfs_next_leaf(root, path);
1542                         if (ret == 0)
1543                                 continue;
1544                         if (ret < 0)
1545                                 goto out;
1546
1547                         break;
1548                 }
1549                 btrfs_item_key_to_cpu(l, &key, slot);
1550
1551                 if (key.objectid < device->devid)
1552                         goto next;
1553
1554                 if (key.objectid > device->devid)
1555                         break;
1556
1557                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1558                         goto next;
1559
1560                 if (key.offset > search_start) {
1561                         hole_size = key.offset - search_start;
1562                         dev_extent_hole_check(device, &search_start, &hole_size,
1563                                               num_bytes);
1564
1565                         if (hole_size > max_hole_size) {
1566                                 max_hole_start = search_start;
1567                                 max_hole_size = hole_size;
1568                         }
1569
1570                         /*
1571                          * If this free space is greater than which we need,
1572                          * it must be the max free space that we have found
1573                          * until now, so max_hole_start must point to the start
1574                          * of this free space and the length of this free space
1575                          * is stored in max_hole_size. Thus, we return
1576                          * max_hole_start and max_hole_size and go back to the
1577                          * caller.
1578                          */
1579                         if (hole_size >= num_bytes) {
1580                                 ret = 0;
1581                                 goto out;
1582                         }
1583                 }
1584
1585                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1586                 extent_end = key.offset + btrfs_dev_extent_length(l,
1587                                                                   dev_extent);
1588                 if (extent_end > search_start)
1589                         search_start = extent_end;
1590 next:
1591                 path->slots[0]++;
1592                 cond_resched();
1593         }
1594
1595         /*
1596          * At this point, search_start should be the end of
1597          * allocated dev extents, and when shrinking the device,
1598          * search_end may be smaller than search_start.
1599          */
1600         if (search_end > search_start) {
1601                 hole_size = search_end - search_start;
1602                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1603                                           num_bytes)) {
1604                         btrfs_release_path(path);
1605                         goto again;
1606                 }
1607
1608                 if (hole_size > max_hole_size) {
1609                         max_hole_start = search_start;
1610                         max_hole_size = hole_size;
1611                 }
1612         }
1613
1614         /* See above. */
1615         if (max_hole_size < num_bytes)
1616                 ret = -ENOSPC;
1617         else
1618                 ret = 0;
1619
1620 out:
1621         btrfs_free_path(path);
1622         *start = max_hole_start;
1623         if (len)
1624                 *len = max_hole_size;
1625         return ret;
1626 }
1627
1628 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1629                          u64 *start, u64 *len)
1630 {
1631         /* FIXME use last free of some kind */
1632         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1633 }
1634
1635 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1636                           struct btrfs_device *device,
1637                           u64 start, u64 *dev_extent_len)
1638 {
1639         struct btrfs_fs_info *fs_info = device->fs_info;
1640         struct btrfs_root *root = fs_info->dev_root;
1641         int ret;
1642         struct btrfs_path *path;
1643         struct btrfs_key key;
1644         struct btrfs_key found_key;
1645         struct extent_buffer *leaf = NULL;
1646         struct btrfs_dev_extent *extent = NULL;
1647
1648         path = btrfs_alloc_path();
1649         if (!path)
1650                 return -ENOMEM;
1651
1652         key.objectid = device->devid;
1653         key.offset = start;
1654         key.type = BTRFS_DEV_EXTENT_KEY;
1655 again:
1656         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1657         if (ret > 0) {
1658                 ret = btrfs_previous_item(root, path, key.objectid,
1659                                           BTRFS_DEV_EXTENT_KEY);
1660                 if (ret)
1661                         goto out;
1662                 leaf = path->nodes[0];
1663                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1664                 extent = btrfs_item_ptr(leaf, path->slots[0],
1665                                         struct btrfs_dev_extent);
1666                 BUG_ON(found_key.offset > start || found_key.offset +
1667                        btrfs_dev_extent_length(leaf, extent) < start);
1668                 key = found_key;
1669                 btrfs_release_path(path);
1670                 goto again;
1671         } else if (ret == 0) {
1672                 leaf = path->nodes[0];
1673                 extent = btrfs_item_ptr(leaf, path->slots[0],
1674                                         struct btrfs_dev_extent);
1675         } else {
1676                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1677                 goto out;
1678         }
1679
1680         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1681
1682         ret = btrfs_del_item(trans, root, path);
1683         if (ret) {
1684                 btrfs_handle_fs_error(fs_info, ret,
1685                                       "Failed to remove dev extent item");
1686         } else {
1687                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1688         }
1689 out:
1690         btrfs_free_path(path);
1691         return ret;
1692 }
1693
1694 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1695                                   struct btrfs_device *device,
1696                                   u64 chunk_offset, u64 start, u64 num_bytes)
1697 {
1698         int ret;
1699         struct btrfs_path *path;
1700         struct btrfs_fs_info *fs_info = device->fs_info;
1701         struct btrfs_root *root = fs_info->dev_root;
1702         struct btrfs_dev_extent *extent;
1703         struct extent_buffer *leaf;
1704         struct btrfs_key key;
1705
1706         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1707         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1708         path = btrfs_alloc_path();
1709         if (!path)
1710                 return -ENOMEM;
1711
1712         key.objectid = device->devid;
1713         key.offset = start;
1714         key.type = BTRFS_DEV_EXTENT_KEY;
1715         ret = btrfs_insert_empty_item(trans, root, path, &key,
1716                                       sizeof(*extent));
1717         if (ret)
1718                 goto out;
1719
1720         leaf = path->nodes[0];
1721         extent = btrfs_item_ptr(leaf, path->slots[0],
1722                                 struct btrfs_dev_extent);
1723         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1724                                         BTRFS_CHUNK_TREE_OBJECTID);
1725         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1726                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1727         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1728
1729         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1730         btrfs_mark_buffer_dirty(leaf);
1731 out:
1732         btrfs_free_path(path);
1733         return ret;
1734 }
1735
1736 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1737 {
1738         struct extent_map_tree *em_tree;
1739         struct extent_map *em;
1740         struct rb_node *n;
1741         u64 ret = 0;
1742
1743         em_tree = &fs_info->mapping_tree;
1744         read_lock(&em_tree->lock);
1745         n = rb_last(&em_tree->map.rb_root);
1746         if (n) {
1747                 em = rb_entry(n, struct extent_map, rb_node);
1748                 ret = em->start + em->len;
1749         }
1750         read_unlock(&em_tree->lock);
1751
1752         return ret;
1753 }
1754
1755 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1756                                     u64 *devid_ret)
1757 {
1758         int ret;
1759         struct btrfs_key key;
1760         struct btrfs_key found_key;
1761         struct btrfs_path *path;
1762
1763         path = btrfs_alloc_path();
1764         if (!path)
1765                 return -ENOMEM;
1766
1767         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1768         key.type = BTRFS_DEV_ITEM_KEY;
1769         key.offset = (u64)-1;
1770
1771         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1772         if (ret < 0)
1773                 goto error;
1774
1775         if (ret == 0) {
1776                 /* Corruption */
1777                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1778                 ret = -EUCLEAN;
1779                 goto error;
1780         }
1781
1782         ret = btrfs_previous_item(fs_info->chunk_root, path,
1783                                   BTRFS_DEV_ITEMS_OBJECTID,
1784                                   BTRFS_DEV_ITEM_KEY);
1785         if (ret) {
1786                 *devid_ret = 1;
1787         } else {
1788                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1789                                       path->slots[0]);
1790                 *devid_ret = found_key.offset + 1;
1791         }
1792         ret = 0;
1793 error:
1794         btrfs_free_path(path);
1795         return ret;
1796 }
1797
1798 /*
1799  * the device information is stored in the chunk root
1800  * the btrfs_device struct should be fully filled in
1801  */
1802 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1803                             struct btrfs_device *device)
1804 {
1805         int ret;
1806         struct btrfs_path *path;
1807         struct btrfs_dev_item *dev_item;
1808         struct extent_buffer *leaf;
1809         struct btrfs_key key;
1810         unsigned long ptr;
1811
1812         path = btrfs_alloc_path();
1813         if (!path)
1814                 return -ENOMEM;
1815
1816         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1817         key.type = BTRFS_DEV_ITEM_KEY;
1818         key.offset = device->devid;
1819
1820         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1821                                       &key, sizeof(*dev_item));
1822         if (ret)
1823                 goto out;
1824
1825         leaf = path->nodes[0];
1826         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1827
1828         btrfs_set_device_id(leaf, dev_item, device->devid);
1829         btrfs_set_device_generation(leaf, dev_item, 0);
1830         btrfs_set_device_type(leaf, dev_item, device->type);
1831         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1832         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1833         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1834         btrfs_set_device_total_bytes(leaf, dev_item,
1835                                      btrfs_device_get_disk_total_bytes(device));
1836         btrfs_set_device_bytes_used(leaf, dev_item,
1837                                     btrfs_device_get_bytes_used(device));
1838         btrfs_set_device_group(leaf, dev_item, 0);
1839         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1840         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1841         btrfs_set_device_start_offset(leaf, dev_item, 0);
1842
1843         ptr = btrfs_device_uuid(dev_item);
1844         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1845         ptr = btrfs_device_fsid(dev_item);
1846         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1847                             ptr, BTRFS_FSID_SIZE);
1848         btrfs_mark_buffer_dirty(leaf);
1849
1850         ret = 0;
1851 out:
1852         btrfs_free_path(path);
1853         return ret;
1854 }
1855
1856 /*
1857  * Function to update ctime/mtime for a given device path.
1858  * Mainly used for ctime/mtime based probe like libblkid.
1859  */
1860 static void update_dev_time(const char *path_name)
1861 {
1862         struct file *filp;
1863
1864         filp = filp_open(path_name, O_RDWR, 0);
1865         if (IS_ERR(filp))
1866                 return;
1867         file_update_time(filp);
1868         filp_close(filp, NULL);
1869 }
1870
1871 static int btrfs_rm_dev_item(struct btrfs_device *device)
1872 {
1873         struct btrfs_root *root = device->fs_info->chunk_root;
1874         int ret;
1875         struct btrfs_path *path;
1876         struct btrfs_key key;
1877         struct btrfs_trans_handle *trans;
1878
1879         path = btrfs_alloc_path();
1880         if (!path)
1881                 return -ENOMEM;
1882
1883         trans = btrfs_start_transaction(root, 0);
1884         if (IS_ERR(trans)) {
1885                 btrfs_free_path(path);
1886                 return PTR_ERR(trans);
1887         }
1888         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1889         key.type = BTRFS_DEV_ITEM_KEY;
1890         key.offset = device->devid;
1891
1892         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1893         if (ret) {
1894                 if (ret > 0)
1895                         ret = -ENOENT;
1896                 btrfs_abort_transaction(trans, ret);
1897                 btrfs_end_transaction(trans);
1898                 goto out;
1899         }
1900
1901         ret = btrfs_del_item(trans, root, path);
1902         if (ret) {
1903                 btrfs_abort_transaction(trans, ret);
1904                 btrfs_end_transaction(trans);
1905         }
1906
1907 out:
1908         btrfs_free_path(path);
1909         if (!ret)
1910                 ret = btrfs_commit_transaction(trans);
1911         return ret;
1912 }
1913
1914 /*
1915  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1916  * filesystem. It's up to the caller to adjust that number regarding eg. device
1917  * replace.
1918  */
1919 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1920                 u64 num_devices)
1921 {
1922         u64 all_avail;
1923         unsigned seq;
1924         int i;
1925
1926         do {
1927                 seq = read_seqbegin(&fs_info->profiles_lock);
1928
1929                 all_avail = fs_info->avail_data_alloc_bits |
1930                             fs_info->avail_system_alloc_bits |
1931                             fs_info->avail_metadata_alloc_bits;
1932         } while (read_seqretry(&fs_info->profiles_lock, seq));
1933
1934         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1935                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1936                         continue;
1937
1938                 if (num_devices < btrfs_raid_array[i].devs_min) {
1939                         int ret = btrfs_raid_array[i].mindev_error;
1940
1941                         if (ret)
1942                                 return ret;
1943                 }
1944         }
1945
1946         return 0;
1947 }
1948
1949 static struct btrfs_device * btrfs_find_next_active_device(
1950                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1951 {
1952         struct btrfs_device *next_device;
1953
1954         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1955                 if (next_device != device &&
1956                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1957                     && next_device->bdev)
1958                         return next_device;
1959         }
1960
1961         return NULL;
1962 }
1963
1964 /*
1965  * Helper function to check if the given device is part of s_bdev / latest_bdev
1966  * and replace it with the provided or the next active device, in the context
1967  * where this function called, there should be always be another device (or
1968  * this_dev) which is active.
1969  */
1970 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1971                                             struct btrfs_device *next_device)
1972 {
1973         struct btrfs_fs_info *fs_info = device->fs_info;
1974
1975         if (!next_device)
1976                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1977                                                             device);
1978         ASSERT(next_device);
1979
1980         if (fs_info->sb->s_bdev &&
1981                         (fs_info->sb->s_bdev == device->bdev))
1982                 fs_info->sb->s_bdev = next_device->bdev;
1983
1984         if (fs_info->fs_devices->latest_bdev == device->bdev)
1985                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1986 }
1987
1988 /*
1989  * Return btrfs_fs_devices::num_devices excluding the device that's being
1990  * currently replaced.
1991  */
1992 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1993 {
1994         u64 num_devices = fs_info->fs_devices->num_devices;
1995
1996         down_read(&fs_info->dev_replace.rwsem);
1997         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1998                 ASSERT(num_devices > 1);
1999                 num_devices--;
2000         }
2001         up_read(&fs_info->dev_replace.rwsem);
2002
2003         return num_devices;
2004 }
2005
2006 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2007                                struct block_device *bdev,
2008                                const char *device_path)
2009 {
2010         struct btrfs_super_block *disk_super;
2011         int copy_num;
2012
2013         if (!bdev)
2014                 return;
2015
2016         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2017                 struct page *page;
2018                 int ret;
2019
2020                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2021                 if (IS_ERR(disk_super))
2022                         continue;
2023
2024                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2025
2026                 page = virt_to_page(disk_super);
2027                 set_page_dirty(page);
2028                 lock_page(page);
2029                 /* write_on_page() unlocks the page */
2030                 ret = write_one_page(page);
2031                 if (ret)
2032                         btrfs_warn(fs_info,
2033                                 "error clearing superblock number %d (%d)",
2034                                 copy_num, ret);
2035                 btrfs_release_disk_super(disk_super);
2036
2037         }
2038
2039         /* Notify udev that device has changed */
2040         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2041
2042         /* Update ctime/mtime for device path for libblkid */
2043         update_dev_time(device_path);
2044 }
2045
2046 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2047                     u64 devid)
2048 {
2049         struct btrfs_device *device;
2050         struct btrfs_fs_devices *cur_devices;
2051         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2052         u64 num_devices;
2053         int ret = 0;
2054
2055         mutex_lock(&uuid_mutex);
2056
2057         num_devices = btrfs_num_devices(fs_info);
2058
2059         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2060         if (ret)
2061                 goto out;
2062
2063         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2064
2065         if (IS_ERR(device)) {
2066                 if (PTR_ERR(device) == -ENOENT &&
2067                     strcmp(device_path, "missing") == 0)
2068                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2069                 else
2070                         ret = PTR_ERR(device);
2071                 goto out;
2072         }
2073
2074         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2075                 btrfs_warn_in_rcu(fs_info,
2076                   "cannot remove device %s (devid %llu) due to active swapfile",
2077                                   rcu_str_deref(device->name), device->devid);
2078                 ret = -ETXTBSY;
2079                 goto out;
2080         }
2081
2082         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2083                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2084                 goto out;
2085         }
2086
2087         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2088             fs_info->fs_devices->rw_devices == 1) {
2089                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2090                 goto out;
2091         }
2092
2093         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2094                 mutex_lock(&fs_info->chunk_mutex);
2095                 list_del_init(&device->dev_alloc_list);
2096                 device->fs_devices->rw_devices--;
2097                 mutex_unlock(&fs_info->chunk_mutex);
2098         }
2099
2100         mutex_unlock(&uuid_mutex);
2101         ret = btrfs_shrink_device(device, 0);
2102         if (!ret)
2103                 btrfs_reada_remove_dev(device);
2104         mutex_lock(&uuid_mutex);
2105         if (ret)
2106                 goto error_undo;
2107
2108         /*
2109          * TODO: the superblock still includes this device in its num_devices
2110          * counter although write_all_supers() is not locked out. This
2111          * could give a filesystem state which requires a degraded mount.
2112          */
2113         ret = btrfs_rm_dev_item(device);
2114         if (ret)
2115                 goto error_undo;
2116
2117         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2118         btrfs_scrub_cancel_dev(device);
2119
2120         /*
2121          * the device list mutex makes sure that we don't change
2122          * the device list while someone else is writing out all
2123          * the device supers. Whoever is writing all supers, should
2124          * lock the device list mutex before getting the number of
2125          * devices in the super block (super_copy). Conversely,
2126          * whoever updates the number of devices in the super block
2127          * (super_copy) should hold the device list mutex.
2128          */
2129
2130         /*
2131          * In normal cases the cur_devices == fs_devices. But in case
2132          * of deleting a seed device, the cur_devices should point to
2133          * its own fs_devices listed under the fs_devices->seed.
2134          */
2135         cur_devices = device->fs_devices;
2136         mutex_lock(&fs_devices->device_list_mutex);
2137         list_del_rcu(&device->dev_list);
2138
2139         cur_devices->num_devices--;
2140         cur_devices->total_devices--;
2141         /* Update total_devices of the parent fs_devices if it's seed */
2142         if (cur_devices != fs_devices)
2143                 fs_devices->total_devices--;
2144
2145         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2146                 cur_devices->missing_devices--;
2147
2148         btrfs_assign_next_active_device(device, NULL);
2149
2150         if (device->bdev) {
2151                 cur_devices->open_devices--;
2152                 /* remove sysfs entry */
2153                 btrfs_sysfs_remove_device(device);
2154         }
2155
2156         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2157         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2158         mutex_unlock(&fs_devices->device_list_mutex);
2159
2160         /*
2161          * at this point, the device is zero sized and detached from
2162          * the devices list.  All that's left is to zero out the old
2163          * supers and free the device.
2164          */
2165         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2166                 btrfs_scratch_superblocks(fs_info, device->bdev,
2167                                           device->name->str);
2168
2169         btrfs_close_bdev(device);
2170         synchronize_rcu();
2171         btrfs_free_device(device);
2172
2173         if (cur_devices->open_devices == 0) {
2174                 list_del_init(&cur_devices->seed_list);
2175                 close_fs_devices(cur_devices);
2176                 free_fs_devices(cur_devices);
2177         }
2178
2179 out:
2180         mutex_unlock(&uuid_mutex);
2181         return ret;
2182
2183 error_undo:
2184         btrfs_reada_undo_remove_dev(device);
2185         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2186                 mutex_lock(&fs_info->chunk_mutex);
2187                 list_add(&device->dev_alloc_list,
2188                          &fs_devices->alloc_list);
2189                 device->fs_devices->rw_devices++;
2190                 mutex_unlock(&fs_info->chunk_mutex);
2191         }
2192         goto out;
2193 }
2194
2195 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2196 {
2197         struct btrfs_fs_devices *fs_devices;
2198
2199         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2200
2201         /*
2202          * in case of fs with no seed, srcdev->fs_devices will point
2203          * to fs_devices of fs_info. However when the dev being replaced is
2204          * a seed dev it will point to the seed's local fs_devices. In short
2205          * srcdev will have its correct fs_devices in both the cases.
2206          */
2207         fs_devices = srcdev->fs_devices;
2208
2209         list_del_rcu(&srcdev->dev_list);
2210         list_del(&srcdev->dev_alloc_list);
2211         fs_devices->num_devices--;
2212         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2213                 fs_devices->missing_devices--;
2214
2215         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2216                 fs_devices->rw_devices--;
2217
2218         if (srcdev->bdev)
2219                 fs_devices->open_devices--;
2220 }
2221
2222 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2223 {
2224         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2225
2226         mutex_lock(&uuid_mutex);
2227
2228         btrfs_close_bdev(srcdev);
2229         synchronize_rcu();
2230         btrfs_free_device(srcdev);
2231
2232         /* if this is no devs we rather delete the fs_devices */
2233         if (!fs_devices->num_devices) {
2234                 /*
2235                  * On a mounted FS, num_devices can't be zero unless it's a
2236                  * seed. In case of a seed device being replaced, the replace
2237                  * target added to the sprout FS, so there will be no more
2238                  * device left under the seed FS.
2239                  */
2240                 ASSERT(fs_devices->seeding);
2241
2242                 list_del_init(&fs_devices->seed_list);
2243                 close_fs_devices(fs_devices);
2244                 free_fs_devices(fs_devices);
2245         }
2246         mutex_unlock(&uuid_mutex);
2247 }
2248
2249 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2250 {
2251         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2252
2253         mutex_lock(&fs_devices->device_list_mutex);
2254
2255         btrfs_sysfs_remove_device(tgtdev);
2256
2257         if (tgtdev->bdev)
2258                 fs_devices->open_devices--;
2259
2260         fs_devices->num_devices--;
2261
2262         btrfs_assign_next_active_device(tgtdev, NULL);
2263
2264         list_del_rcu(&tgtdev->dev_list);
2265
2266         mutex_unlock(&fs_devices->device_list_mutex);
2267
2268         /*
2269          * The update_dev_time() with in btrfs_scratch_superblocks()
2270          * may lead to a call to btrfs_show_devname() which will try
2271          * to hold device_list_mutex. And here this device
2272          * is already out of device list, so we don't have to hold
2273          * the device_list_mutex lock.
2274          */
2275         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2276                                   tgtdev->name->str);
2277
2278         btrfs_close_bdev(tgtdev);
2279         synchronize_rcu();
2280         btrfs_free_device(tgtdev);
2281 }
2282
2283 static struct btrfs_device *btrfs_find_device_by_path(
2284                 struct btrfs_fs_info *fs_info, const char *device_path)
2285 {
2286         int ret = 0;
2287         struct btrfs_super_block *disk_super;
2288         u64 devid;
2289         u8 *dev_uuid;
2290         struct block_device *bdev;
2291         struct btrfs_device *device;
2292
2293         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2294                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2295         if (ret)
2296                 return ERR_PTR(ret);
2297
2298         devid = btrfs_stack_device_id(&disk_super->dev_item);
2299         dev_uuid = disk_super->dev_item.uuid;
2300         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2301                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2302                                            disk_super->metadata_uuid, true);
2303         else
2304                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2305                                            disk_super->fsid, true);
2306
2307         btrfs_release_disk_super(disk_super);
2308         if (!device)
2309                 device = ERR_PTR(-ENOENT);
2310         blkdev_put(bdev, FMODE_READ);
2311         return device;
2312 }
2313
2314 /*
2315  * Lookup a device given by device id, or the path if the id is 0.
2316  */
2317 struct btrfs_device *btrfs_find_device_by_devspec(
2318                 struct btrfs_fs_info *fs_info, u64 devid,
2319                 const char *device_path)
2320 {
2321         struct btrfs_device *device;
2322
2323         if (devid) {
2324                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2325                                            NULL, true);
2326                 if (!device)
2327                         return ERR_PTR(-ENOENT);
2328                 return device;
2329         }
2330
2331         if (!device_path || !device_path[0])
2332                 return ERR_PTR(-EINVAL);
2333
2334         if (strcmp(device_path, "missing") == 0) {
2335                 /* Find first missing device */
2336                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2337                                     dev_list) {
2338                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2339                                      &device->dev_state) && !device->bdev)
2340                                 return device;
2341                 }
2342                 return ERR_PTR(-ENOENT);
2343         }
2344
2345         return btrfs_find_device_by_path(fs_info, device_path);
2346 }
2347
2348 /*
2349  * does all the dirty work required for changing file system's UUID.
2350  */
2351 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2352 {
2353         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2354         struct btrfs_fs_devices *old_devices;
2355         struct btrfs_fs_devices *seed_devices;
2356         struct btrfs_super_block *disk_super = fs_info->super_copy;
2357         struct btrfs_device *device;
2358         u64 super_flags;
2359
2360         lockdep_assert_held(&uuid_mutex);
2361         if (!fs_devices->seeding)
2362                 return -EINVAL;
2363
2364         /*
2365          * Private copy of the seed devices, anchored at
2366          * fs_info->fs_devices->seed_list
2367          */
2368         seed_devices = alloc_fs_devices(NULL, NULL);
2369         if (IS_ERR(seed_devices))
2370                 return PTR_ERR(seed_devices);
2371
2372         /*
2373          * It's necessary to retain a copy of the original seed fs_devices in
2374          * fs_uuids so that filesystems which have been seeded can successfully
2375          * reference the seed device from open_seed_devices. This also supports
2376          * multiple fs seed.
2377          */
2378         old_devices = clone_fs_devices(fs_devices);
2379         if (IS_ERR(old_devices)) {
2380                 kfree(seed_devices);
2381                 return PTR_ERR(old_devices);
2382         }
2383
2384         list_add(&old_devices->fs_list, &fs_uuids);
2385
2386         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2387         seed_devices->opened = 1;
2388         INIT_LIST_HEAD(&seed_devices->devices);
2389         INIT_LIST_HEAD(&seed_devices->alloc_list);
2390         mutex_init(&seed_devices->device_list_mutex);
2391
2392         mutex_lock(&fs_devices->device_list_mutex);
2393         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2394                               synchronize_rcu);
2395         list_for_each_entry(device, &seed_devices->devices, dev_list)
2396                 device->fs_devices = seed_devices;
2397
2398         fs_devices->seeding = false;
2399         fs_devices->num_devices = 0;
2400         fs_devices->open_devices = 0;
2401         fs_devices->missing_devices = 0;
2402         fs_devices->rotating = false;
2403         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2404
2405         generate_random_uuid(fs_devices->fsid);
2406         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2407         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2408         mutex_unlock(&fs_devices->device_list_mutex);
2409
2410         super_flags = btrfs_super_flags(disk_super) &
2411                       ~BTRFS_SUPER_FLAG_SEEDING;
2412         btrfs_set_super_flags(disk_super, super_flags);
2413
2414         return 0;
2415 }
2416
2417 /*
2418  * Store the expected generation for seed devices in device items.
2419  */
2420 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2421 {
2422         struct btrfs_fs_info *fs_info = trans->fs_info;
2423         struct btrfs_root *root = fs_info->chunk_root;
2424         struct btrfs_path *path;
2425         struct extent_buffer *leaf;
2426         struct btrfs_dev_item *dev_item;
2427         struct btrfs_device *device;
2428         struct btrfs_key key;
2429         u8 fs_uuid[BTRFS_FSID_SIZE];
2430         u8 dev_uuid[BTRFS_UUID_SIZE];
2431         u64 devid;
2432         int ret;
2433
2434         path = btrfs_alloc_path();
2435         if (!path)
2436                 return -ENOMEM;
2437
2438         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2439         key.offset = 0;
2440         key.type = BTRFS_DEV_ITEM_KEY;
2441
2442         while (1) {
2443                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2444                 if (ret < 0)
2445                         goto error;
2446
2447                 leaf = path->nodes[0];
2448 next_slot:
2449                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2450                         ret = btrfs_next_leaf(root, path);
2451                         if (ret > 0)
2452                                 break;
2453                         if (ret < 0)
2454                                 goto error;
2455                         leaf = path->nodes[0];
2456                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2457                         btrfs_release_path(path);
2458                         continue;
2459                 }
2460
2461                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2462                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2463                     key.type != BTRFS_DEV_ITEM_KEY)
2464                         break;
2465
2466                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2467                                           struct btrfs_dev_item);
2468                 devid = btrfs_device_id(leaf, dev_item);
2469                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2470                                    BTRFS_UUID_SIZE);
2471                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2472                                    BTRFS_FSID_SIZE);
2473                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2474                                            fs_uuid, true);
2475                 BUG_ON(!device); /* Logic error */
2476
2477                 if (device->fs_devices->seeding) {
2478                         btrfs_set_device_generation(leaf, dev_item,
2479                                                     device->generation);
2480                         btrfs_mark_buffer_dirty(leaf);
2481                 }
2482
2483                 path->slots[0]++;
2484                 goto next_slot;
2485         }
2486         ret = 0;
2487 error:
2488         btrfs_free_path(path);
2489         return ret;
2490 }
2491
2492 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2493 {
2494         struct btrfs_root *root = fs_info->dev_root;
2495         struct request_queue *q;
2496         struct btrfs_trans_handle *trans;
2497         struct btrfs_device *device;
2498         struct block_device *bdev;
2499         struct super_block *sb = fs_info->sb;
2500         struct rcu_string *name;
2501         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2502         u64 orig_super_total_bytes;
2503         u64 orig_super_num_devices;
2504         int seeding_dev = 0;
2505         int ret = 0;
2506         bool locked = false;
2507
2508         if (sb_rdonly(sb) && !fs_devices->seeding)
2509                 return -EROFS;
2510
2511         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2512                                   fs_info->bdev_holder);
2513         if (IS_ERR(bdev))
2514                 return PTR_ERR(bdev);
2515
2516         if (fs_devices->seeding) {
2517                 seeding_dev = 1;
2518                 down_write(&sb->s_umount);
2519                 mutex_lock(&uuid_mutex);
2520                 locked = true;
2521         }
2522
2523         sync_blockdev(bdev);
2524
2525         rcu_read_lock();
2526         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2527                 if (device->bdev == bdev) {
2528                         ret = -EEXIST;
2529                         rcu_read_unlock();
2530                         goto error;
2531                 }
2532         }
2533         rcu_read_unlock();
2534
2535         device = btrfs_alloc_device(fs_info, NULL, NULL);
2536         if (IS_ERR(device)) {
2537                 /* we can safely leave the fs_devices entry around */
2538                 ret = PTR_ERR(device);
2539                 goto error;
2540         }
2541
2542         name = rcu_string_strdup(device_path, GFP_KERNEL);
2543         if (!name) {
2544                 ret = -ENOMEM;
2545                 goto error_free_device;
2546         }
2547         rcu_assign_pointer(device->name, name);
2548
2549         trans = btrfs_start_transaction(root, 0);
2550         if (IS_ERR(trans)) {
2551                 ret = PTR_ERR(trans);
2552                 goto error_free_device;
2553         }
2554
2555         q = bdev_get_queue(bdev);
2556         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2557         device->generation = trans->transid;
2558         device->io_width = fs_info->sectorsize;
2559         device->io_align = fs_info->sectorsize;
2560         device->sector_size = fs_info->sectorsize;
2561         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2562                                          fs_info->sectorsize);
2563         device->disk_total_bytes = device->total_bytes;
2564         device->commit_total_bytes = device->total_bytes;
2565         device->fs_info = fs_info;
2566         device->bdev = bdev;
2567         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2568         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2569         device->mode = FMODE_EXCL;
2570         device->dev_stats_valid = 1;
2571         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2572
2573         if (seeding_dev) {
2574                 sb->s_flags &= ~SB_RDONLY;
2575                 ret = btrfs_prepare_sprout(fs_info);
2576                 if (ret) {
2577                         btrfs_abort_transaction(trans, ret);
2578                         goto error_trans;
2579                 }
2580         }
2581
2582         device->fs_devices = fs_devices;
2583
2584         mutex_lock(&fs_devices->device_list_mutex);
2585         mutex_lock(&fs_info->chunk_mutex);
2586         list_add_rcu(&device->dev_list, &fs_devices->devices);
2587         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2588         fs_devices->num_devices++;
2589         fs_devices->open_devices++;
2590         fs_devices->rw_devices++;
2591         fs_devices->total_devices++;
2592         fs_devices->total_rw_bytes += device->total_bytes;
2593
2594         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2595
2596         if (!blk_queue_nonrot(q))
2597                 fs_devices->rotating = true;
2598
2599         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2600         btrfs_set_super_total_bytes(fs_info->super_copy,
2601                 round_down(orig_super_total_bytes + device->total_bytes,
2602                            fs_info->sectorsize));
2603
2604         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2605         btrfs_set_super_num_devices(fs_info->super_copy,
2606                                     orig_super_num_devices + 1);
2607
2608         /*
2609          * we've got more storage, clear any full flags on the space
2610          * infos
2611          */
2612         btrfs_clear_space_info_full(fs_info);
2613
2614         mutex_unlock(&fs_info->chunk_mutex);
2615
2616         /* Add sysfs device entry */
2617         btrfs_sysfs_add_device(device);
2618
2619         mutex_unlock(&fs_devices->device_list_mutex);
2620
2621         if (seeding_dev) {
2622                 mutex_lock(&fs_info->chunk_mutex);
2623                 ret = init_first_rw_device(trans);
2624                 mutex_unlock(&fs_info->chunk_mutex);
2625                 if (ret) {
2626                         btrfs_abort_transaction(trans, ret);
2627                         goto error_sysfs;
2628                 }
2629         }
2630
2631         ret = btrfs_add_dev_item(trans, device);
2632         if (ret) {
2633                 btrfs_abort_transaction(trans, ret);
2634                 goto error_sysfs;
2635         }
2636
2637         if (seeding_dev) {
2638                 ret = btrfs_finish_sprout(trans);
2639                 if (ret) {
2640                         btrfs_abort_transaction(trans, ret);
2641                         goto error_sysfs;
2642                 }
2643
2644                 /*
2645                  * fs_devices now represents the newly sprouted filesystem and
2646                  * its fsid has been changed by btrfs_prepare_sprout
2647                  */
2648                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2649         }
2650
2651         ret = btrfs_commit_transaction(trans);
2652
2653         if (seeding_dev) {
2654                 mutex_unlock(&uuid_mutex);
2655                 up_write(&sb->s_umount);
2656                 locked = false;
2657
2658                 if (ret) /* transaction commit */
2659                         return ret;
2660
2661                 ret = btrfs_relocate_sys_chunks(fs_info);
2662                 if (ret < 0)
2663                         btrfs_handle_fs_error(fs_info, ret,
2664                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2665                 trans = btrfs_attach_transaction(root);
2666                 if (IS_ERR(trans)) {
2667                         if (PTR_ERR(trans) == -ENOENT)
2668                                 return 0;
2669                         ret = PTR_ERR(trans);
2670                         trans = NULL;
2671                         goto error_sysfs;
2672                 }
2673                 ret = btrfs_commit_transaction(trans);
2674         }
2675
2676         /*
2677          * Now that we have written a new super block to this device, check all
2678          * other fs_devices list if device_path alienates any other scanned
2679          * device.
2680          * We can ignore the return value as it typically returns -EINVAL and
2681          * only succeeds if the device was an alien.
2682          */
2683         btrfs_forget_devices(device_path);
2684
2685         /* Update ctime/mtime for blkid or udev */
2686         update_dev_time(device_path);
2687
2688         return ret;
2689
2690 error_sysfs:
2691         btrfs_sysfs_remove_device(device);
2692         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2693         mutex_lock(&fs_info->chunk_mutex);
2694         list_del_rcu(&device->dev_list);
2695         list_del(&device->dev_alloc_list);
2696         fs_info->fs_devices->num_devices--;
2697         fs_info->fs_devices->open_devices--;
2698         fs_info->fs_devices->rw_devices--;
2699         fs_info->fs_devices->total_devices--;
2700         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2701         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2702         btrfs_set_super_total_bytes(fs_info->super_copy,
2703                                     orig_super_total_bytes);
2704         btrfs_set_super_num_devices(fs_info->super_copy,
2705                                     orig_super_num_devices);
2706         mutex_unlock(&fs_info->chunk_mutex);
2707         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2708 error_trans:
2709         if (seeding_dev)
2710                 sb->s_flags |= SB_RDONLY;
2711         if (trans)
2712                 btrfs_end_transaction(trans);
2713 error_free_device:
2714         btrfs_free_device(device);
2715 error:
2716         blkdev_put(bdev, FMODE_EXCL);
2717         if (locked) {
2718                 mutex_unlock(&uuid_mutex);
2719                 up_write(&sb->s_umount);
2720         }
2721         return ret;
2722 }
2723
2724 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2725                                         struct btrfs_device *device)
2726 {
2727         int ret;
2728         struct btrfs_path *path;
2729         struct btrfs_root *root = device->fs_info->chunk_root;
2730         struct btrfs_dev_item *dev_item;
2731         struct extent_buffer *leaf;
2732         struct btrfs_key key;
2733
2734         path = btrfs_alloc_path();
2735         if (!path)
2736                 return -ENOMEM;
2737
2738         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2739         key.type = BTRFS_DEV_ITEM_KEY;
2740         key.offset = device->devid;
2741
2742         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2743         if (ret < 0)
2744                 goto out;
2745
2746         if (ret > 0) {
2747                 ret = -ENOENT;
2748                 goto out;
2749         }
2750
2751         leaf = path->nodes[0];
2752         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2753
2754         btrfs_set_device_id(leaf, dev_item, device->devid);
2755         btrfs_set_device_type(leaf, dev_item, device->type);
2756         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2757         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2758         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2759         btrfs_set_device_total_bytes(leaf, dev_item,
2760                                      btrfs_device_get_disk_total_bytes(device));
2761         btrfs_set_device_bytes_used(leaf, dev_item,
2762                                     btrfs_device_get_bytes_used(device));
2763         btrfs_mark_buffer_dirty(leaf);
2764
2765 out:
2766         btrfs_free_path(path);
2767         return ret;
2768 }
2769
2770 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2771                       struct btrfs_device *device, u64 new_size)
2772 {
2773         struct btrfs_fs_info *fs_info = device->fs_info;
2774         struct btrfs_super_block *super_copy = fs_info->super_copy;
2775         u64 old_total;
2776         u64 diff;
2777
2778         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2779                 return -EACCES;
2780
2781         new_size = round_down(new_size, fs_info->sectorsize);
2782
2783         mutex_lock(&fs_info->chunk_mutex);
2784         old_total = btrfs_super_total_bytes(super_copy);
2785         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2786
2787         if (new_size <= device->total_bytes ||
2788             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2789                 mutex_unlock(&fs_info->chunk_mutex);
2790                 return -EINVAL;
2791         }
2792
2793         btrfs_set_super_total_bytes(super_copy,
2794                         round_down(old_total + diff, fs_info->sectorsize));
2795         device->fs_devices->total_rw_bytes += diff;
2796
2797         btrfs_device_set_total_bytes(device, new_size);
2798         btrfs_device_set_disk_total_bytes(device, new_size);
2799         btrfs_clear_space_info_full(device->fs_info);
2800         if (list_empty(&device->post_commit_list))
2801                 list_add_tail(&device->post_commit_list,
2802                               &trans->transaction->dev_update_list);
2803         mutex_unlock(&fs_info->chunk_mutex);
2804
2805         return btrfs_update_device(trans, device);
2806 }
2807
2808 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2809 {
2810         struct btrfs_fs_info *fs_info = trans->fs_info;
2811         struct btrfs_root *root = fs_info->chunk_root;
2812         int ret;
2813         struct btrfs_path *path;
2814         struct btrfs_key key;
2815
2816         path = btrfs_alloc_path();
2817         if (!path)
2818                 return -ENOMEM;
2819
2820         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2821         key.offset = chunk_offset;
2822         key.type = BTRFS_CHUNK_ITEM_KEY;
2823
2824         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2825         if (ret < 0)
2826                 goto out;
2827         else if (ret > 0) { /* Logic error or corruption */
2828                 btrfs_handle_fs_error(fs_info, -ENOENT,
2829                                       "Failed lookup while freeing chunk.");
2830                 ret = -ENOENT;
2831                 goto out;
2832         }
2833
2834         ret = btrfs_del_item(trans, root, path);
2835         if (ret < 0)
2836                 btrfs_handle_fs_error(fs_info, ret,
2837                                       "Failed to delete chunk item.");
2838 out:
2839         btrfs_free_path(path);
2840         return ret;
2841 }
2842
2843 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2844 {
2845         struct btrfs_super_block *super_copy = fs_info->super_copy;
2846         struct btrfs_disk_key *disk_key;
2847         struct btrfs_chunk *chunk;
2848         u8 *ptr;
2849         int ret = 0;
2850         u32 num_stripes;
2851         u32 array_size;
2852         u32 len = 0;
2853         u32 cur;
2854         struct btrfs_key key;
2855
2856         mutex_lock(&fs_info->chunk_mutex);
2857         array_size = btrfs_super_sys_array_size(super_copy);
2858
2859         ptr = super_copy->sys_chunk_array;
2860         cur = 0;
2861
2862         while (cur < array_size) {
2863                 disk_key = (struct btrfs_disk_key *)ptr;
2864                 btrfs_disk_key_to_cpu(&key, disk_key);
2865
2866                 len = sizeof(*disk_key);
2867
2868                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2869                         chunk = (struct btrfs_chunk *)(ptr + len);
2870                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2871                         len += btrfs_chunk_item_size(num_stripes);
2872                 } else {
2873                         ret = -EIO;
2874                         break;
2875                 }
2876                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2877                     key.offset == chunk_offset) {
2878                         memmove(ptr, ptr + len, array_size - (cur + len));
2879                         array_size -= len;
2880                         btrfs_set_super_sys_array_size(super_copy, array_size);
2881                 } else {
2882                         ptr += len;
2883                         cur += len;
2884                 }
2885         }
2886         mutex_unlock(&fs_info->chunk_mutex);
2887         return ret;
2888 }
2889
2890 /*
2891  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2892  * @logical: Logical block offset in bytes.
2893  * @length: Length of extent in bytes.
2894  *
2895  * Return: Chunk mapping or ERR_PTR.
2896  */
2897 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2898                                        u64 logical, u64 length)
2899 {
2900         struct extent_map_tree *em_tree;
2901         struct extent_map *em;
2902
2903         em_tree = &fs_info->mapping_tree;
2904         read_lock(&em_tree->lock);
2905         em = lookup_extent_mapping(em_tree, logical, length);
2906         read_unlock(&em_tree->lock);
2907
2908         if (!em) {
2909                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2910                            logical, length);
2911                 return ERR_PTR(-EINVAL);
2912         }
2913
2914         if (em->start > logical || em->start + em->len < logical) {
2915                 btrfs_crit(fs_info,
2916                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2917                            logical, length, em->start, em->start + em->len);
2918                 free_extent_map(em);
2919                 return ERR_PTR(-EINVAL);
2920         }
2921
2922         /* callers are responsible for dropping em's ref. */
2923         return em;
2924 }
2925
2926 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2927 {
2928         struct btrfs_fs_info *fs_info = trans->fs_info;
2929         struct extent_map *em;
2930         struct map_lookup *map;
2931         u64 dev_extent_len = 0;
2932         int i, ret = 0;
2933         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2934
2935         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2936         if (IS_ERR(em)) {
2937                 /*
2938                  * This is a logic error, but we don't want to just rely on the
2939                  * user having built with ASSERT enabled, so if ASSERT doesn't
2940                  * do anything we still error out.
2941                  */
2942                 ASSERT(0);
2943                 return PTR_ERR(em);
2944         }
2945         map = em->map_lookup;
2946         mutex_lock(&fs_info->chunk_mutex);
2947         check_system_chunk(trans, map->type);
2948         mutex_unlock(&fs_info->chunk_mutex);
2949
2950         /*
2951          * Take the device list mutex to prevent races with the final phase of
2952          * a device replace operation that replaces the device object associated
2953          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2954          */
2955         mutex_lock(&fs_devices->device_list_mutex);
2956         for (i = 0; i < map->num_stripes; i++) {
2957                 struct btrfs_device *device = map->stripes[i].dev;
2958                 ret = btrfs_free_dev_extent(trans, device,
2959                                             map->stripes[i].physical,
2960                                             &dev_extent_len);
2961                 if (ret) {
2962                         mutex_unlock(&fs_devices->device_list_mutex);
2963                         btrfs_abort_transaction(trans, ret);
2964                         goto out;
2965                 }
2966
2967                 if (device->bytes_used > 0) {
2968                         mutex_lock(&fs_info->chunk_mutex);
2969                         btrfs_device_set_bytes_used(device,
2970                                         device->bytes_used - dev_extent_len);
2971                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2972                         btrfs_clear_space_info_full(fs_info);
2973                         mutex_unlock(&fs_info->chunk_mutex);
2974                 }
2975
2976                 ret = btrfs_update_device(trans, device);
2977                 if (ret) {
2978                         mutex_unlock(&fs_devices->device_list_mutex);
2979                         btrfs_abort_transaction(trans, ret);
2980                         goto out;
2981                 }
2982         }
2983         mutex_unlock(&fs_devices->device_list_mutex);
2984
2985         ret = btrfs_free_chunk(trans, chunk_offset);
2986         if (ret) {
2987                 btrfs_abort_transaction(trans, ret);
2988                 goto out;
2989         }
2990
2991         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2992
2993         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2994                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2995                 if (ret) {
2996                         btrfs_abort_transaction(trans, ret);
2997                         goto out;
2998                 }
2999         }
3000
3001         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3002         if (ret) {
3003                 btrfs_abort_transaction(trans, ret);
3004                 goto out;
3005         }
3006
3007 out:
3008         /* once for us */
3009         free_extent_map(em);
3010         return ret;
3011 }
3012
3013 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3014 {
3015         struct btrfs_root *root = fs_info->chunk_root;
3016         struct btrfs_trans_handle *trans;
3017         struct btrfs_block_group *block_group;
3018         int ret;
3019
3020         /*
3021          * Prevent races with automatic removal of unused block groups.
3022          * After we relocate and before we remove the chunk with offset
3023          * chunk_offset, automatic removal of the block group can kick in,
3024          * resulting in a failure when calling btrfs_remove_chunk() below.
3025          *
3026          * Make sure to acquire this mutex before doing a tree search (dev
3027          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3028          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3029          * we release the path used to search the chunk/dev tree and before
3030          * the current task acquires this mutex and calls us.
3031          */
3032         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3033
3034         /* step one, relocate all the extents inside this chunk */
3035         btrfs_scrub_pause(fs_info);
3036         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3037         btrfs_scrub_continue(fs_info);
3038         if (ret)
3039                 return ret;
3040
3041         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3042         if (!block_group)
3043                 return -ENOENT;
3044         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3045         btrfs_put_block_group(block_group);
3046
3047         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3048                                                      chunk_offset);
3049         if (IS_ERR(trans)) {
3050                 ret = PTR_ERR(trans);
3051                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3052                 return ret;
3053         }
3054
3055         /*
3056          * step two, delete the device extents and the
3057          * chunk tree entries
3058          */
3059         ret = btrfs_remove_chunk(trans, chunk_offset);
3060         btrfs_end_transaction(trans);
3061         return ret;
3062 }
3063
3064 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3065 {
3066         struct btrfs_root *chunk_root = fs_info->chunk_root;
3067         struct btrfs_path *path;
3068         struct extent_buffer *leaf;
3069         struct btrfs_chunk *chunk;
3070         struct btrfs_key key;
3071         struct btrfs_key found_key;
3072         u64 chunk_type;
3073         bool retried = false;
3074         int failed = 0;
3075         int ret;
3076
3077         path = btrfs_alloc_path();
3078         if (!path)
3079                 return -ENOMEM;
3080
3081 again:
3082         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3083         key.offset = (u64)-1;
3084         key.type = BTRFS_CHUNK_ITEM_KEY;
3085
3086         while (1) {
3087                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3088                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3089                 if (ret < 0) {
3090                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3091                         goto error;
3092                 }
3093                 BUG_ON(ret == 0); /* Corruption */
3094
3095                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3096                                           key.type);
3097                 if (ret)
3098                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3099                 if (ret < 0)
3100                         goto error;
3101                 if (ret > 0)
3102                         break;
3103
3104                 leaf = path->nodes[0];
3105                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3106
3107                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3108                                        struct btrfs_chunk);
3109                 chunk_type = btrfs_chunk_type(leaf, chunk);
3110                 btrfs_release_path(path);
3111
3112                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3113                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3114                         if (ret == -ENOSPC)
3115                                 failed++;
3116                         else
3117                                 BUG_ON(ret);
3118                 }
3119                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3120
3121                 if (found_key.offset == 0)
3122                         break;
3123                 key.offset = found_key.offset - 1;
3124         }
3125         ret = 0;
3126         if (failed && !retried) {
3127                 failed = 0;
3128                 retried = true;
3129                 goto again;
3130         } else if (WARN_ON(failed && retried)) {
3131                 ret = -ENOSPC;
3132         }
3133 error:
3134         btrfs_free_path(path);
3135         return ret;
3136 }
3137
3138 /*
3139  * return 1 : allocate a data chunk successfully,
3140  * return <0: errors during allocating a data chunk,
3141  * return 0 : no need to allocate a data chunk.
3142  */
3143 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3144                                       u64 chunk_offset)
3145 {
3146         struct btrfs_block_group *cache;
3147         u64 bytes_used;
3148         u64 chunk_type;
3149
3150         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3151         ASSERT(cache);
3152         chunk_type = cache->flags;
3153         btrfs_put_block_group(cache);
3154
3155         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3156                 return 0;
3157
3158         spin_lock(&fs_info->data_sinfo->lock);
3159         bytes_used = fs_info->data_sinfo->bytes_used;
3160         spin_unlock(&fs_info->data_sinfo->lock);
3161
3162         if (!bytes_used) {
3163                 struct btrfs_trans_handle *trans;
3164                 int ret;
3165
3166                 trans = btrfs_join_transaction(fs_info->tree_root);
3167                 if (IS_ERR(trans))
3168                         return PTR_ERR(trans);
3169
3170                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3171                 btrfs_end_transaction(trans);
3172                 if (ret < 0)
3173                         return ret;
3174                 return 1;
3175         }
3176
3177         return 0;
3178 }
3179
3180 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3181                                struct btrfs_balance_control *bctl)
3182 {
3183         struct btrfs_root *root = fs_info->tree_root;
3184         struct btrfs_trans_handle *trans;
3185         struct btrfs_balance_item *item;
3186         struct btrfs_disk_balance_args disk_bargs;
3187         struct btrfs_path *path;
3188         struct extent_buffer *leaf;
3189         struct btrfs_key key;
3190         int ret, err;
3191
3192         path = btrfs_alloc_path();
3193         if (!path)
3194                 return -ENOMEM;
3195
3196         trans = btrfs_start_transaction(root, 0);
3197         if (IS_ERR(trans)) {
3198                 btrfs_free_path(path);
3199                 return PTR_ERR(trans);
3200         }
3201
3202         key.objectid = BTRFS_BALANCE_OBJECTID;
3203         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3204         key.offset = 0;
3205
3206         ret = btrfs_insert_empty_item(trans, root, path, &key,
3207                                       sizeof(*item));
3208         if (ret)
3209                 goto out;
3210
3211         leaf = path->nodes[0];
3212         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3213
3214         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3215
3216         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3217         btrfs_set_balance_data(leaf, item, &disk_bargs);
3218         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3219         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3220         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3221         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3222
3223         btrfs_set_balance_flags(leaf, item, bctl->flags);
3224
3225         btrfs_mark_buffer_dirty(leaf);
3226 out:
3227         btrfs_free_path(path);
3228         err = btrfs_commit_transaction(trans);
3229         if (err && !ret)
3230                 ret = err;
3231         return ret;
3232 }
3233
3234 static int del_balance_item(struct btrfs_fs_info *fs_info)
3235 {
3236         struct btrfs_root *root = fs_info->tree_root;
3237         struct btrfs_trans_handle *trans;
3238         struct btrfs_path *path;
3239         struct btrfs_key key;
3240         int ret, err;
3241
3242         path = btrfs_alloc_path();
3243         if (!path)
3244                 return -ENOMEM;
3245
3246         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3247         if (IS_ERR(trans)) {
3248                 btrfs_free_path(path);
3249                 return PTR_ERR(trans);
3250         }
3251
3252         key.objectid = BTRFS_BALANCE_OBJECTID;
3253         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3254         key.offset = 0;
3255
3256         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3257         if (ret < 0)
3258                 goto out;
3259         if (ret > 0) {
3260                 ret = -ENOENT;
3261                 goto out;
3262         }
3263
3264         ret = btrfs_del_item(trans, root, path);
3265 out:
3266         btrfs_free_path(path);
3267         err = btrfs_commit_transaction(trans);
3268         if (err && !ret)
3269                 ret = err;
3270         return ret;
3271 }
3272
3273 /*
3274  * This is a heuristic used to reduce the number of chunks balanced on
3275  * resume after balance was interrupted.
3276  */
3277 static void update_balance_args(struct btrfs_balance_control *bctl)
3278 {
3279         /*
3280          * Turn on soft mode for chunk types that were being converted.
3281          */
3282         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3283                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3284         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3285                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3286         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3287                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3288
3289         /*
3290          * Turn on usage filter if is not already used.  The idea is
3291          * that chunks that we have already balanced should be
3292          * reasonably full.  Don't do it for chunks that are being
3293          * converted - that will keep us from relocating unconverted
3294          * (albeit full) chunks.
3295          */
3296         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3297             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3298             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3299                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3300                 bctl->data.usage = 90;
3301         }
3302         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3303             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3304             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3305                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3306                 bctl->sys.usage = 90;
3307         }
3308         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3309             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3310             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3311                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3312                 bctl->meta.usage = 90;
3313         }
3314 }
3315
3316 /*
3317  * Clear the balance status in fs_info and delete the balance item from disk.
3318  */
3319 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3320 {
3321         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3322         int ret;
3323
3324         BUG_ON(!fs_info->balance_ctl);
3325
3326         spin_lock(&fs_info->balance_lock);
3327         fs_info->balance_ctl = NULL;
3328         spin_unlock(&fs_info->balance_lock);
3329
3330         kfree(bctl);
3331         ret = del_balance_item(fs_info);
3332         if (ret)
3333                 btrfs_handle_fs_error(fs_info, ret, NULL);
3334 }
3335
3336 /*
3337  * Balance filters.  Return 1 if chunk should be filtered out
3338  * (should not be balanced).
3339  */
3340 static int chunk_profiles_filter(u64 chunk_type,
3341                                  struct btrfs_balance_args *bargs)
3342 {
3343         chunk_type = chunk_to_extended(chunk_type) &
3344                                 BTRFS_EXTENDED_PROFILE_MASK;
3345
3346         if (bargs->profiles & chunk_type)
3347                 return 0;
3348
3349         return 1;
3350 }
3351
3352 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3353                               struct btrfs_balance_args *bargs)
3354 {
3355         struct btrfs_block_group *cache;
3356         u64 chunk_used;
3357         u64 user_thresh_min;
3358         u64 user_thresh_max;
3359         int ret = 1;
3360
3361         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3362         chunk_used = cache->used;
3363
3364         if (bargs->usage_min == 0)
3365                 user_thresh_min = 0;
3366         else
3367                 user_thresh_min = div_factor_fine(cache->length,
3368                                                   bargs->usage_min);
3369
3370         if (bargs->usage_max == 0)
3371                 user_thresh_max = 1;
3372         else if (bargs->usage_max > 100)
3373                 user_thresh_max = cache->length;
3374         else
3375                 user_thresh_max = div_factor_fine(cache->length,
3376                                                   bargs->usage_max);
3377
3378         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3379                 ret = 0;
3380
3381         btrfs_put_block_group(cache);
3382         return ret;
3383 }
3384
3385 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3386                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3387 {
3388         struct btrfs_block_group *cache;
3389         u64 chunk_used, user_thresh;
3390         int ret = 1;
3391
3392         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3393         chunk_used = cache->used;
3394
3395         if (bargs->usage_min == 0)
3396                 user_thresh = 1;
3397         else if (bargs->usage > 100)
3398                 user_thresh = cache->length;
3399         else
3400                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3401
3402         if (chunk_used < user_thresh)
3403                 ret = 0;
3404
3405         btrfs_put_block_group(cache);
3406         return ret;
3407 }
3408
3409 static int chunk_devid_filter(struct extent_buffer *leaf,
3410                               struct btrfs_chunk *chunk,
3411                               struct btrfs_balance_args *bargs)
3412 {
3413         struct btrfs_stripe *stripe;
3414         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3415         int i;
3416
3417         for (i = 0; i < num_stripes; i++) {
3418                 stripe = btrfs_stripe_nr(chunk, i);
3419                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3420                         return 0;
3421         }
3422
3423         return 1;
3424 }
3425
3426 static u64 calc_data_stripes(u64 type, int num_stripes)
3427 {
3428         const int index = btrfs_bg_flags_to_raid_index(type);
3429         const int ncopies = btrfs_raid_array[index].ncopies;
3430         const int nparity = btrfs_raid_array[index].nparity;
3431
3432         if (nparity)
3433                 return num_stripes - nparity;
3434         else
3435                 return num_stripes / ncopies;
3436 }
3437
3438 /* [pstart, pend) */
3439 static int chunk_drange_filter(struct extent_buffer *leaf,
3440                                struct btrfs_chunk *chunk,
3441                                struct btrfs_balance_args *bargs)
3442 {
3443         struct btrfs_stripe *stripe;
3444         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3445         u64 stripe_offset;
3446         u64 stripe_length;
3447         u64 type;
3448         int factor;
3449         int i;
3450
3451         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3452                 return 0;
3453
3454         type = btrfs_chunk_type(leaf, chunk);
3455         factor = calc_data_stripes(type, num_stripes);
3456
3457         for (i = 0; i < num_stripes; i++) {
3458                 stripe = btrfs_stripe_nr(chunk, i);
3459                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3460                         continue;
3461
3462                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3463                 stripe_length = btrfs_chunk_length(leaf, chunk);
3464                 stripe_length = div_u64(stripe_length, factor);
3465
3466                 if (stripe_offset < bargs->pend &&
3467                     stripe_offset + stripe_length > bargs->pstart)
3468                         return 0;
3469         }
3470
3471         return 1;
3472 }
3473
3474 /* [vstart, vend) */
3475 static int chunk_vrange_filter(struct extent_buffer *leaf,
3476                                struct btrfs_chunk *chunk,
3477                                u64 chunk_offset,
3478                                struct btrfs_balance_args *bargs)
3479 {
3480         if (chunk_offset < bargs->vend &&
3481             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3482                 /* at least part of the chunk is inside this vrange */
3483                 return 0;
3484
3485         return 1;
3486 }
3487
3488 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3489                                struct btrfs_chunk *chunk,
3490                                struct btrfs_balance_args *bargs)
3491 {
3492         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3493
3494         if (bargs->stripes_min <= num_stripes
3495                         && num_stripes <= bargs->stripes_max)
3496                 return 0;
3497
3498         return 1;
3499 }
3500
3501 static int chunk_soft_convert_filter(u64 chunk_type,
3502                                      struct btrfs_balance_args *bargs)
3503 {
3504         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3505                 return 0;
3506
3507         chunk_type = chunk_to_extended(chunk_type) &
3508                                 BTRFS_EXTENDED_PROFILE_MASK;
3509
3510         if (bargs->target == chunk_type)
3511                 return 1;
3512
3513         return 0;
3514 }
3515
3516 static int should_balance_chunk(struct extent_buffer *leaf,
3517                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3518 {
3519         struct btrfs_fs_info *fs_info = leaf->fs_info;
3520         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3521         struct btrfs_balance_args *bargs = NULL;
3522         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3523
3524         /* type filter */
3525         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3526               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3527                 return 0;
3528         }
3529
3530         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3531                 bargs = &bctl->data;
3532         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3533                 bargs = &bctl->sys;
3534         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3535                 bargs = &bctl->meta;
3536
3537         /* profiles filter */
3538         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3539             chunk_profiles_filter(chunk_type, bargs)) {
3540                 return 0;
3541         }
3542
3543         /* usage filter */
3544         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3545             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3546                 return 0;
3547         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3548             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3549                 return 0;
3550         }
3551
3552         /* devid filter */
3553         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3554             chunk_devid_filter(leaf, chunk, bargs)) {
3555                 return 0;
3556         }
3557
3558         /* drange filter, makes sense only with devid filter */
3559         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3560             chunk_drange_filter(leaf, chunk, bargs)) {
3561                 return 0;
3562         }
3563
3564         /* vrange filter */
3565         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3566             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3567                 return 0;
3568         }
3569
3570         /* stripes filter */
3571         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3572             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3573                 return 0;
3574         }
3575
3576         /* soft profile changing mode */
3577         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3578             chunk_soft_convert_filter(chunk_type, bargs)) {
3579                 return 0;
3580         }
3581
3582         /*
3583          * limited by count, must be the last filter
3584          */
3585         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3586                 if (bargs->limit == 0)
3587                         return 0;
3588                 else
3589                         bargs->limit--;
3590         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3591                 /*
3592                  * Same logic as the 'limit' filter; the minimum cannot be
3593                  * determined here because we do not have the global information
3594                  * about the count of all chunks that satisfy the filters.
3595                  */
3596                 if (bargs->limit_max == 0)
3597                         return 0;
3598                 else
3599                         bargs->limit_max--;
3600         }
3601
3602         return 1;
3603 }
3604
3605 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3606 {
3607         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3608         struct btrfs_root *chunk_root = fs_info->chunk_root;
3609         u64 chunk_type;
3610         struct btrfs_chunk *chunk;
3611         struct btrfs_path *path = NULL;
3612         struct btrfs_key key;
3613         struct btrfs_key found_key;
3614         struct extent_buffer *leaf;
3615         int slot;
3616         int ret;
3617         int enospc_errors = 0;
3618         bool counting = true;
3619         /* The single value limit and min/max limits use the same bytes in the */
3620         u64 limit_data = bctl->data.limit;
3621         u64 limit_meta = bctl->meta.limit;
3622         u64 limit_sys = bctl->sys.limit;
3623         u32 count_data = 0;
3624         u32 count_meta = 0;
3625         u32 count_sys = 0;
3626         int chunk_reserved = 0;
3627
3628         path = btrfs_alloc_path();
3629         if (!path) {
3630                 ret = -ENOMEM;
3631                 goto error;
3632         }
3633
3634         /* zero out stat counters */
3635         spin_lock(&fs_info->balance_lock);
3636         memset(&bctl->stat, 0, sizeof(bctl->stat));
3637         spin_unlock(&fs_info->balance_lock);
3638 again:
3639         if (!counting) {
3640                 /*
3641                  * The single value limit and min/max limits use the same bytes
3642                  * in the
3643                  */
3644                 bctl->data.limit = limit_data;
3645                 bctl->meta.limit = limit_meta;
3646                 bctl->sys.limit = limit_sys;
3647         }
3648         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3649         key.offset = (u64)-1;
3650         key.type = BTRFS_CHUNK_ITEM_KEY;
3651
3652         while (1) {
3653                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3654                     atomic_read(&fs_info->balance_cancel_req)) {
3655                         ret = -ECANCELED;
3656                         goto error;
3657                 }
3658
3659                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3660                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3661                 if (ret < 0) {
3662                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3663                         goto error;
3664                 }
3665
3666                 /*
3667                  * this shouldn't happen, it means the last relocate
3668                  * failed
3669                  */
3670                 if (ret == 0)
3671                         BUG(); /* FIXME break ? */
3672
3673                 ret = btrfs_previous_item(chunk_root, path, 0,
3674                                           BTRFS_CHUNK_ITEM_KEY);
3675                 if (ret) {
3676                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3677                         ret = 0;
3678                         break;
3679                 }
3680
3681                 leaf = path->nodes[0];
3682                 slot = path->slots[0];
3683                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3684
3685                 if (found_key.objectid != key.objectid) {
3686                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3687                         break;
3688                 }
3689
3690                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3691                 chunk_type = btrfs_chunk_type(leaf, chunk);
3692
3693                 if (!counting) {
3694                         spin_lock(&fs_info->balance_lock);
3695                         bctl->stat.considered++;
3696                         spin_unlock(&fs_info->balance_lock);
3697                 }
3698
3699                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3700
3701                 btrfs_release_path(path);
3702                 if (!ret) {
3703                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3704                         goto loop;
3705                 }
3706
3707                 if (counting) {
3708                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3709                         spin_lock(&fs_info->balance_lock);
3710                         bctl->stat.expected++;
3711                         spin_unlock(&fs_info->balance_lock);
3712
3713                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3714                                 count_data++;
3715                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3716                                 count_sys++;
3717                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3718                                 count_meta++;
3719
3720                         goto loop;
3721                 }
3722
3723                 /*
3724                  * Apply limit_min filter, no need to check if the LIMITS
3725                  * filter is used, limit_min is 0 by default
3726                  */
3727                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3728                                         count_data < bctl->data.limit_min)
3729                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3730                                         count_meta < bctl->meta.limit_min)
3731                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3732                                         count_sys < bctl->sys.limit_min)) {
3733                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3734                         goto loop;
3735                 }
3736
3737                 if (!chunk_reserved) {
3738                         /*
3739                          * We may be relocating the only data chunk we have,
3740                          * which could potentially end up with losing data's
3741                          * raid profile, so lets allocate an empty one in
3742                          * advance.
3743                          */
3744                         ret = btrfs_may_alloc_data_chunk(fs_info,
3745                                                          found_key.offset);
3746                         if (ret < 0) {
3747                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3748                                 goto error;
3749                         } else if (ret == 1) {
3750                                 chunk_reserved = 1;
3751                         }
3752                 }
3753
3754                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3755                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3756                 if (ret == -ENOSPC) {
3757                         enospc_errors++;
3758                 } else if (ret == -ETXTBSY) {
3759                         btrfs_info(fs_info,
3760            "skipping relocation of block group %llu due to active swapfile",
3761                                    found_key.offset);
3762                         ret = 0;
3763                 } else if (ret) {
3764                         goto error;
3765                 } else {
3766                         spin_lock(&fs_info->balance_lock);
3767                         bctl->stat.completed++;
3768                         spin_unlock(&fs_info->balance_lock);
3769                 }
3770 loop:
3771                 if (found_key.offset == 0)
3772                         break;
3773                 key.offset = found_key.offset - 1;
3774         }
3775
3776         if (counting) {
3777                 btrfs_release_path(path);
3778                 counting = false;
3779                 goto again;
3780         }
3781 error:
3782         btrfs_free_path(path);
3783         if (enospc_errors) {
3784                 btrfs_info(fs_info, "%d enospc errors during balance",
3785                            enospc_errors);
3786                 if (!ret)
3787                         ret = -ENOSPC;
3788         }
3789
3790         return ret;
3791 }
3792
3793 /**
3794  * alloc_profile_is_valid - see if a given profile is valid and reduced
3795  * @flags: profile to validate
3796  * @extended: if true @flags is treated as an extended profile
3797  */
3798 static int alloc_profile_is_valid(u64 flags, int extended)
3799 {
3800         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3801                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3802
3803         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3804
3805         /* 1) check that all other bits are zeroed */
3806         if (flags & ~mask)
3807                 return 0;
3808
3809         /* 2) see if profile is reduced */
3810         if (flags == 0)
3811                 return !extended; /* "0" is valid for usual profiles */
3812
3813         return has_single_bit_set(flags);
3814 }
3815
3816 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3817 {
3818         /* cancel requested || normal exit path */
3819         return atomic_read(&fs_info->balance_cancel_req) ||
3820                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3821                  atomic_read(&fs_info->balance_cancel_req) == 0);
3822 }
3823
3824 /*
3825  * Validate target profile against allowed profiles and return true if it's OK.
3826  * Otherwise print the error message and return false.
3827  */
3828 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3829                 const struct btrfs_balance_args *bargs,
3830                 u64 allowed, const char *type)
3831 {
3832         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3833                 return true;
3834
3835         /* Profile is valid and does not have bits outside of the allowed set */
3836         if (alloc_profile_is_valid(bargs->target, 1) &&
3837             (bargs->target & ~allowed) == 0)
3838                 return true;
3839
3840         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3841                         type, btrfs_bg_type_to_raid_name(bargs->target));
3842         return false;
3843 }
3844
3845 /*
3846  * Fill @buf with textual description of balance filter flags @bargs, up to
3847  * @size_buf including the terminating null. The output may be trimmed if it
3848  * does not fit into the provided buffer.
3849  */
3850 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3851                                  u32 size_buf)
3852 {
3853         int ret;
3854         u32 size_bp = size_buf;
3855         char *bp = buf;
3856         u64 flags = bargs->flags;
3857         char tmp_buf[128] = {'\0'};
3858
3859         if (!flags)
3860                 return;
3861
3862 #define CHECK_APPEND_NOARG(a)                                           \
3863         do {                                                            \
3864                 ret = snprintf(bp, size_bp, (a));                       \
3865                 if (ret < 0 || ret >= size_bp)                          \
3866                         goto out_overflow;                              \
3867                 size_bp -= ret;                                         \
3868                 bp += ret;                                              \
3869         } while (0)
3870
3871 #define CHECK_APPEND_1ARG(a, v1)                                        \
3872         do {                                                            \
3873                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3874                 if (ret < 0 || ret >= size_bp)                          \
3875                         goto out_overflow;                              \
3876                 size_bp -= ret;                                         \
3877                 bp += ret;                                              \
3878         } while (0)
3879
3880 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3881         do {                                                            \
3882                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3883                 if (ret < 0 || ret >= size_bp)                          \
3884                         goto out_overflow;                              \
3885                 size_bp -= ret;                                         \
3886                 bp += ret;                                              \
3887         } while (0)
3888
3889         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3890                 CHECK_APPEND_1ARG("convert=%s,",
3891                                   btrfs_bg_type_to_raid_name(bargs->target));
3892
3893         if (flags & BTRFS_BALANCE_ARGS_SOFT)
3894                 CHECK_APPEND_NOARG("soft,");
3895
3896         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3897                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3898                                             sizeof(tmp_buf));
3899                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3900         }
3901
3902         if (flags & BTRFS_BALANCE_ARGS_USAGE)
3903                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3904
3905         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3906                 CHECK_APPEND_2ARG("usage=%u..%u,",
3907                                   bargs->usage_min, bargs->usage_max);
3908
3909         if (flags & BTRFS_BALANCE_ARGS_DEVID)
3910                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3911
3912         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3913                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
3914                                   bargs->pstart, bargs->pend);
3915
3916         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3917                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3918                                   bargs->vstart, bargs->vend);
3919
3920         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3921                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3922
3923         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3924                 CHECK_APPEND_2ARG("limit=%u..%u,",
3925                                 bargs->limit_min, bargs->limit_max);
3926
3927         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3928                 CHECK_APPEND_2ARG("stripes=%u..%u,",
3929                                   bargs->stripes_min, bargs->stripes_max);
3930
3931 #undef CHECK_APPEND_2ARG
3932 #undef CHECK_APPEND_1ARG
3933 #undef CHECK_APPEND_NOARG
3934
3935 out_overflow:
3936
3937         if (size_bp < size_buf)
3938                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3939         else
3940                 buf[0] = '\0';
3941 }
3942
3943 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3944 {
3945         u32 size_buf = 1024;
3946         char tmp_buf[192] = {'\0'};
3947         char *buf;
3948         char *bp;
3949         u32 size_bp = size_buf;
3950         int ret;
3951         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3952
3953         buf = kzalloc(size_buf, GFP_KERNEL);
3954         if (!buf)
3955                 return;
3956
3957         bp = buf;
3958
3959 #define CHECK_APPEND_1ARG(a, v1)                                        \
3960         do {                                                            \
3961                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3962                 if (ret < 0 || ret >= size_bp)                          \
3963                         goto out_overflow;                              \
3964                 size_bp -= ret;                                         \
3965                 bp += ret;                                              \
3966         } while (0)
3967
3968         if (bctl->flags & BTRFS_BALANCE_FORCE)
3969                 CHECK_APPEND_1ARG("%s", "-f ");
3970
3971         if (bctl->flags & BTRFS_BALANCE_DATA) {
3972                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
3973                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
3974         }
3975
3976         if (bctl->flags & BTRFS_BALANCE_METADATA) {
3977                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
3978                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
3979         }
3980
3981         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
3982                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
3983                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
3984         }
3985
3986 #undef CHECK_APPEND_1ARG
3987
3988 out_overflow:
3989
3990         if (size_bp < size_buf)
3991                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
3992         btrfs_info(fs_info, "balance: %s %s",
3993                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
3994                    "resume" : "start", buf);
3995
3996         kfree(buf);
3997 }
3998
3999 /*
4000  * Should be called with balance mutexe held
4001  */
4002 int btrfs_balance(struct btrfs_fs_info *fs_info,
4003                   struct btrfs_balance_control *bctl,
4004                   struct btrfs_ioctl_balance_args *bargs)
4005 {
4006         u64 meta_target, data_target;
4007         u64 allowed;
4008         int mixed = 0;
4009         int ret;
4010         u64 num_devices;
4011         unsigned seq;
4012         bool reducing_redundancy;
4013         int i;
4014
4015         if (btrfs_fs_closing(fs_info) ||
4016             atomic_read(&fs_info->balance_pause_req) ||
4017             btrfs_should_cancel_balance(fs_info)) {
4018                 ret = -EINVAL;
4019                 goto out;
4020         }
4021
4022         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4023         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4024                 mixed = 1;
4025
4026         /*
4027          * In case of mixed groups both data and meta should be picked,
4028          * and identical options should be given for both of them.
4029          */
4030         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4031         if (mixed && (bctl->flags & allowed)) {
4032                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4033                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4034                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4035                         btrfs_err(fs_info,
4036           "balance: mixed groups data and metadata options must be the same");
4037                         ret = -EINVAL;
4038                         goto out;
4039                 }
4040         }
4041
4042         /*
4043          * rw_devices will not change at the moment, device add/delete/replace
4044          * are exclusive
4045          */
4046         num_devices = fs_info->fs_devices->rw_devices;
4047
4048         /*
4049          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4050          * special bit for it, to make it easier to distinguish.  Thus we need
4051          * to set it manually, or balance would refuse the profile.
4052          */
4053         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4054         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4055                 if (num_devices >= btrfs_raid_array[i].devs_min)
4056                         allowed |= btrfs_raid_array[i].bg_flag;
4057
4058         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4059             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4060             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4061                 ret = -EINVAL;
4062                 goto out;
4063         }
4064
4065         /*
4066          * Allow to reduce metadata or system integrity only if force set for
4067          * profiles with redundancy (copies, parity)
4068          */
4069         allowed = 0;
4070         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4071                 if (btrfs_raid_array[i].ncopies >= 2 ||
4072                     btrfs_raid_array[i].tolerated_failures >= 1)
4073                         allowed |= btrfs_raid_array[i].bg_flag;
4074         }
4075         do {
4076                 seq = read_seqbegin(&fs_info->profiles_lock);
4077
4078                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4079                      (fs_info->avail_system_alloc_bits & allowed) &&
4080                      !(bctl->sys.target & allowed)) ||
4081                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4082                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4083                      !(bctl->meta.target & allowed)))
4084                         reducing_redundancy = true;
4085                 else
4086                         reducing_redundancy = false;
4087
4088                 /* if we're not converting, the target field is uninitialized */
4089                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4090                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4091                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4092                         bctl->data.target : fs_info->avail_data_alloc_bits;
4093         } while (read_seqretry(&fs_info->profiles_lock, seq));
4094
4095         if (reducing_redundancy) {
4096                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4097                         btrfs_info(fs_info,
4098                            "balance: force reducing metadata redundancy");
4099                 } else {
4100                         btrfs_err(fs_info,
4101         "balance: reduces metadata redundancy, use --force if you want this");
4102                         ret = -EINVAL;
4103                         goto out;
4104                 }
4105         }
4106
4107         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4108                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4109                 btrfs_warn(fs_info,
4110         "balance: metadata profile %s has lower redundancy than data profile %s",
4111                                 btrfs_bg_type_to_raid_name(meta_target),
4112                                 btrfs_bg_type_to_raid_name(data_target));
4113         }
4114
4115         if (fs_info->send_in_progress) {
4116                 btrfs_warn_rl(fs_info,
4117 "cannot run balance while send operations are in progress (%d in progress)",
4118                               fs_info->send_in_progress);
4119                 ret = -EAGAIN;
4120                 goto out;
4121         }
4122
4123         ret = insert_balance_item(fs_info, bctl);
4124         if (ret && ret != -EEXIST)
4125                 goto out;
4126
4127         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4128                 BUG_ON(ret == -EEXIST);
4129                 BUG_ON(fs_info->balance_ctl);
4130                 spin_lock(&fs_info->balance_lock);
4131                 fs_info->balance_ctl = bctl;
4132                 spin_unlock(&fs_info->balance_lock);
4133         } else {
4134                 BUG_ON(ret != -EEXIST);
4135                 spin_lock(&fs_info->balance_lock);
4136                 update_balance_args(bctl);
4137                 spin_unlock(&fs_info->balance_lock);
4138         }
4139
4140         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4141         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4142         describe_balance_start_or_resume(fs_info);
4143         mutex_unlock(&fs_info->balance_mutex);
4144
4145         ret = __btrfs_balance(fs_info);
4146
4147         mutex_lock(&fs_info->balance_mutex);
4148         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4149                 btrfs_info(fs_info, "balance: paused");
4150         /*
4151          * Balance can be canceled by:
4152          *
4153          * - Regular cancel request
4154          *   Then ret == -ECANCELED and balance_cancel_req > 0
4155          *
4156          * - Fatal signal to "btrfs" process
4157          *   Either the signal caught by wait_reserve_ticket() and callers
4158          *   got -EINTR, or caught by btrfs_should_cancel_balance() and
4159          *   got -ECANCELED.
4160          *   Either way, in this case balance_cancel_req = 0, and
4161          *   ret == -EINTR or ret == -ECANCELED.
4162          *
4163          * So here we only check the return value to catch canceled balance.
4164          */
4165         else if (ret == -ECANCELED || ret == -EINTR)
4166                 btrfs_info(fs_info, "balance: canceled");
4167         else
4168                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4169
4170         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4171
4172         if (bargs) {
4173                 memset(bargs, 0, sizeof(*bargs));
4174                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4175         }
4176
4177         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4178             balance_need_close(fs_info)) {
4179                 reset_balance_state(fs_info);
4180                 btrfs_exclop_finish(fs_info);
4181         }
4182
4183         wake_up(&fs_info->balance_wait_q);
4184
4185         return ret;
4186 out:
4187         if (bctl->flags & BTRFS_BALANCE_RESUME)
4188                 reset_balance_state(fs_info);
4189         else
4190                 kfree(bctl);
4191         btrfs_exclop_finish(fs_info);
4192
4193         return ret;
4194 }
4195
4196 static int balance_kthread(void *data)
4197 {
4198         struct btrfs_fs_info *fs_info = data;
4199         int ret = 0;
4200
4201         mutex_lock(&fs_info->balance_mutex);
4202         if (fs_info->balance_ctl)
4203                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4204         mutex_unlock(&fs_info->balance_mutex);
4205
4206         return ret;
4207 }
4208
4209 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4210 {
4211         struct task_struct *tsk;
4212
4213         mutex_lock(&fs_info->balance_mutex);
4214         if (!fs_info->balance_ctl) {
4215                 mutex_unlock(&fs_info->balance_mutex);
4216                 return 0;
4217         }
4218         mutex_unlock(&fs_info->balance_mutex);
4219
4220         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4221                 btrfs_info(fs_info, "balance: resume skipped");
4222                 return 0;
4223         }
4224
4225         /*
4226          * A ro->rw remount sequence should continue with the paused balance
4227          * regardless of who pauses it, system or the user as of now, so set
4228          * the resume flag.
4229          */
4230         spin_lock(&fs_info->balance_lock);
4231         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4232         spin_unlock(&fs_info->balance_lock);
4233
4234         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4235         return PTR_ERR_OR_ZERO(tsk);
4236 }
4237
4238 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4239 {
4240         struct btrfs_balance_control *bctl;
4241         struct btrfs_balance_item *item;
4242         struct btrfs_disk_balance_args disk_bargs;
4243         struct btrfs_path *path;
4244         struct extent_buffer *leaf;
4245         struct btrfs_key key;
4246         int ret;
4247
4248         path = btrfs_alloc_path();
4249         if (!path)
4250                 return -ENOMEM;
4251
4252         key.objectid = BTRFS_BALANCE_OBJECTID;
4253         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4254         key.offset = 0;
4255
4256         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4257         if (ret < 0)
4258                 goto out;
4259         if (ret > 0) { /* ret = -ENOENT; */
4260                 ret = 0;
4261                 goto out;
4262         }
4263
4264         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4265         if (!bctl) {
4266                 ret = -ENOMEM;
4267                 goto out;
4268         }
4269
4270         leaf = path->nodes[0];
4271         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4272
4273         bctl->flags = btrfs_balance_flags(leaf, item);
4274         bctl->flags |= BTRFS_BALANCE_RESUME;
4275
4276         btrfs_balance_data(leaf, item, &disk_bargs);
4277         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4278         btrfs_balance_meta(leaf, item, &disk_bargs);
4279         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4280         btrfs_balance_sys(leaf, item, &disk_bargs);
4281         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4282
4283         /*
4284          * This should never happen, as the paused balance state is recovered
4285          * during mount without any chance of other exclusive ops to collide.
4286          *
4287          * This gives the exclusive op status to balance and keeps in paused
4288          * state until user intervention (cancel or umount). If the ownership
4289          * cannot be assigned, show a message but do not fail. The balance
4290          * is in a paused state and must have fs_info::balance_ctl properly
4291          * set up.
4292          */
4293         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4294                 btrfs_warn(fs_info,
4295         "balance: cannot set exclusive op status, resume manually");
4296
4297         mutex_lock(&fs_info->balance_mutex);
4298         BUG_ON(fs_info->balance_ctl);
4299         spin_lock(&fs_info->balance_lock);
4300         fs_info->balance_ctl = bctl;
4301         spin_unlock(&fs_info->balance_lock);
4302         mutex_unlock(&fs_info->balance_mutex);
4303 out:
4304         btrfs_free_path(path);
4305         return ret;
4306 }
4307
4308 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4309 {
4310         int ret = 0;
4311
4312         mutex_lock(&fs_info->balance_mutex);
4313         if (!fs_info->balance_ctl) {
4314                 mutex_unlock(&fs_info->balance_mutex);
4315                 return -ENOTCONN;
4316         }
4317
4318         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4319                 atomic_inc(&fs_info->balance_pause_req);
4320                 mutex_unlock(&fs_info->balance_mutex);
4321
4322                 wait_event(fs_info->balance_wait_q,
4323                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4324
4325                 mutex_lock(&fs_info->balance_mutex);
4326                 /* we are good with balance_ctl ripped off from under us */
4327                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4328                 atomic_dec(&fs_info->balance_pause_req);
4329         } else {
4330                 ret = -ENOTCONN;
4331         }
4332
4333         mutex_unlock(&fs_info->balance_mutex);
4334         return ret;
4335 }
4336
4337 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4338 {
4339         mutex_lock(&fs_info->balance_mutex);
4340         if (!fs_info->balance_ctl) {
4341                 mutex_unlock(&fs_info->balance_mutex);
4342                 return -ENOTCONN;
4343         }
4344
4345         /*
4346          * A paused balance with the item stored on disk can be resumed at
4347          * mount time if the mount is read-write. Otherwise it's still paused
4348          * and we must not allow cancelling as it deletes the item.
4349          */
4350         if (sb_rdonly(fs_info->sb)) {
4351                 mutex_unlock(&fs_info->balance_mutex);
4352                 return -EROFS;
4353         }
4354
4355         atomic_inc(&fs_info->balance_cancel_req);
4356         /*
4357          * if we are running just wait and return, balance item is
4358          * deleted in btrfs_balance in this case
4359          */
4360         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4361                 mutex_unlock(&fs_info->balance_mutex);
4362                 wait_event(fs_info->balance_wait_q,
4363                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4364                 mutex_lock(&fs_info->balance_mutex);
4365         } else {
4366                 mutex_unlock(&fs_info->balance_mutex);
4367                 /*
4368                  * Lock released to allow other waiters to continue, we'll
4369                  * reexamine the status again.
4370                  */
4371                 mutex_lock(&fs_info->balance_mutex);
4372
4373                 if (fs_info->balance_ctl) {
4374                         reset_balance_state(fs_info);
4375                         btrfs_exclop_finish(fs_info);
4376                         btrfs_info(fs_info, "balance: canceled");
4377                 }
4378         }
4379
4380         BUG_ON(fs_info->balance_ctl ||
4381                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4382         atomic_dec(&fs_info->balance_cancel_req);
4383         mutex_unlock(&fs_info->balance_mutex);
4384         return 0;
4385 }
4386
4387 int btrfs_uuid_scan_kthread(void *data)
4388 {
4389         struct btrfs_fs_info *fs_info = data;
4390         struct btrfs_root *root = fs_info->tree_root;
4391         struct btrfs_key key;
4392         struct btrfs_path *path = NULL;
4393         int ret = 0;
4394         struct extent_buffer *eb;
4395         int slot;
4396         struct btrfs_root_item root_item;
4397         u32 item_size;
4398         struct btrfs_trans_handle *trans = NULL;
4399         bool closing = false;
4400
4401         path = btrfs_alloc_path();
4402         if (!path) {
4403                 ret = -ENOMEM;
4404                 goto out;
4405         }
4406
4407         key.objectid = 0;
4408         key.type = BTRFS_ROOT_ITEM_KEY;
4409         key.offset = 0;
4410
4411         while (1) {
4412                 if (btrfs_fs_closing(fs_info)) {
4413                         closing = true;
4414                         break;
4415                 }
4416                 ret = btrfs_search_forward(root, &key, path,
4417                                 BTRFS_OLDEST_GENERATION);
4418                 if (ret) {
4419                         if (ret > 0)
4420                                 ret = 0;
4421                         break;
4422                 }
4423
4424                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4425                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4426                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4427                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4428                         goto skip;
4429
4430                 eb = path->nodes[0];
4431                 slot = path->slots[0];
4432                 item_size = btrfs_item_size_nr(eb, slot);
4433                 if (item_size < sizeof(root_item))
4434                         goto skip;
4435
4436                 read_extent_buffer(eb, &root_item,
4437                                    btrfs_item_ptr_offset(eb, slot),
4438                                    (int)sizeof(root_item));
4439                 if (btrfs_root_refs(&root_item) == 0)
4440                         goto skip;
4441
4442                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4443                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4444                         if (trans)
4445                                 goto update_tree;
4446
4447                         btrfs_release_path(path);
4448                         /*
4449                          * 1 - subvol uuid item
4450                          * 1 - received_subvol uuid item
4451                          */
4452                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4453                         if (IS_ERR(trans)) {
4454                                 ret = PTR_ERR(trans);
4455                                 break;
4456                         }
4457                         continue;
4458                 } else {
4459                         goto skip;
4460                 }
4461 update_tree:
4462                 btrfs_release_path(path);
4463                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4464                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4465                                                   BTRFS_UUID_KEY_SUBVOL,
4466                                                   key.objectid);
4467                         if (ret < 0) {
4468                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4469                                         ret);
4470                                 break;
4471                         }
4472                 }
4473
4474                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4475                         ret = btrfs_uuid_tree_add(trans,
4476                                                   root_item.received_uuid,
4477                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4478                                                   key.objectid);
4479                         if (ret < 0) {
4480                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4481                                         ret);
4482                                 break;
4483                         }
4484                 }
4485
4486 skip:
4487                 btrfs_release_path(path);
4488                 if (trans) {
4489                         ret = btrfs_end_transaction(trans);
4490                         trans = NULL;
4491                         if (ret)
4492                                 break;
4493                 }
4494
4495                 if (key.offset < (u64)-1) {
4496                         key.offset++;
4497                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4498                         key.offset = 0;
4499                         key.type = BTRFS_ROOT_ITEM_KEY;
4500                 } else if (key.objectid < (u64)-1) {
4501                         key.offset = 0;
4502                         key.type = BTRFS_ROOT_ITEM_KEY;
4503                         key.objectid++;
4504                 } else {
4505                         break;
4506                 }
4507                 cond_resched();
4508         }
4509
4510 out:
4511         btrfs_free_path(path);
4512         if (trans && !IS_ERR(trans))
4513                 btrfs_end_transaction(trans);
4514         if (ret)
4515                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4516         else if (!closing)
4517                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4518         up(&fs_info->uuid_tree_rescan_sem);
4519         return 0;
4520 }
4521
4522 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4523 {
4524         struct btrfs_trans_handle *trans;
4525         struct btrfs_root *tree_root = fs_info->tree_root;
4526         struct btrfs_root *uuid_root;
4527         struct task_struct *task;
4528         int ret;
4529
4530         /*
4531          * 1 - root node
4532          * 1 - root item
4533          */
4534         trans = btrfs_start_transaction(tree_root, 2);
4535         if (IS_ERR(trans))
4536                 return PTR_ERR(trans);
4537
4538         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4539         if (IS_ERR(uuid_root)) {
4540                 ret = PTR_ERR(uuid_root);
4541                 btrfs_abort_transaction(trans, ret);
4542                 btrfs_end_transaction(trans);
4543                 return ret;
4544         }
4545
4546         fs_info->uuid_root = uuid_root;
4547
4548         ret = btrfs_commit_transaction(trans);
4549         if (ret)
4550                 return ret;
4551
4552         down(&fs_info->uuid_tree_rescan_sem);
4553         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4554         if (IS_ERR(task)) {
4555                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4556                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4557                 up(&fs_info->uuid_tree_rescan_sem);
4558                 return PTR_ERR(task);
4559         }
4560
4561         return 0;
4562 }
4563
4564 /*
4565  * shrinking a device means finding all of the device extents past
4566  * the new size, and then following the back refs to the chunks.
4567  * The chunk relocation code actually frees the device extent
4568  */
4569 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4570 {
4571         struct btrfs_fs_info *fs_info = device->fs_info;
4572         struct btrfs_root *root = fs_info->dev_root;
4573         struct btrfs_trans_handle *trans;
4574         struct btrfs_dev_extent *dev_extent = NULL;
4575         struct btrfs_path *path;
4576         u64 length;
4577         u64 chunk_offset;
4578         int ret;
4579         int slot;
4580         int failed = 0;
4581         bool retried = false;
4582         struct extent_buffer *l;
4583         struct btrfs_key key;
4584         struct btrfs_super_block *super_copy = fs_info->super_copy;
4585         u64 old_total = btrfs_super_total_bytes(super_copy);
4586         u64 old_size = btrfs_device_get_total_bytes(device);
4587         u64 diff;
4588         u64 start;
4589
4590         new_size = round_down(new_size, fs_info->sectorsize);
4591         start = new_size;
4592         diff = round_down(old_size - new_size, fs_info->sectorsize);
4593
4594         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4595                 return -EINVAL;
4596
4597         path = btrfs_alloc_path();
4598         if (!path)
4599                 return -ENOMEM;
4600
4601         path->reada = READA_BACK;
4602
4603         trans = btrfs_start_transaction(root, 0);
4604         if (IS_ERR(trans)) {
4605                 btrfs_free_path(path);
4606                 return PTR_ERR(trans);
4607         }
4608
4609         mutex_lock(&fs_info->chunk_mutex);
4610
4611         btrfs_device_set_total_bytes(device, new_size);
4612         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4613                 device->fs_devices->total_rw_bytes -= diff;
4614                 atomic64_sub(diff, &fs_info->free_chunk_space);
4615         }
4616
4617         /*
4618          * Once the device's size has been set to the new size, ensure all
4619          * in-memory chunks are synced to disk so that the loop below sees them
4620          * and relocates them accordingly.
4621          */
4622         if (contains_pending_extent(device, &start, diff)) {
4623                 mutex_unlock(&fs_info->chunk_mutex);
4624                 ret = btrfs_commit_transaction(trans);
4625                 if (ret)
4626                         goto done;
4627         } else {
4628                 mutex_unlock(&fs_info->chunk_mutex);
4629                 btrfs_end_transaction(trans);
4630         }
4631
4632 again:
4633         key.objectid = device->devid;
4634         key.offset = (u64)-1;
4635         key.type = BTRFS_DEV_EXTENT_KEY;
4636
4637         do {
4638                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4639                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4640                 if (ret < 0) {
4641                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4642                         goto done;
4643                 }
4644
4645                 ret = btrfs_previous_item(root, path, 0, key.type);
4646                 if (ret)
4647                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4648                 if (ret < 0)
4649                         goto done;
4650                 if (ret) {
4651                         ret = 0;
4652                         btrfs_release_path(path);
4653                         break;
4654                 }
4655
4656                 l = path->nodes[0];
4657                 slot = path->slots[0];
4658                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4659
4660                 if (key.objectid != device->devid) {
4661                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4662                         btrfs_release_path(path);
4663                         break;
4664                 }
4665
4666                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4667                 length = btrfs_dev_extent_length(l, dev_extent);
4668
4669                 if (key.offset + length <= new_size) {
4670                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4671                         btrfs_release_path(path);
4672                         break;
4673                 }
4674
4675                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4676                 btrfs_release_path(path);
4677
4678                 /*
4679                  * We may be relocating the only data chunk we have,
4680                  * which could potentially end up with losing data's
4681                  * raid profile, so lets allocate an empty one in
4682                  * advance.
4683                  */
4684                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4685                 if (ret < 0) {
4686                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4687                         goto done;
4688                 }
4689
4690                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4691                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4692                 if (ret == -ENOSPC) {
4693                         failed++;
4694                 } else if (ret) {
4695                         if (ret == -ETXTBSY) {
4696                                 btrfs_warn(fs_info,
4697                    "could not shrink block group %llu due to active swapfile",
4698                                            chunk_offset);
4699                         }
4700                         goto done;
4701                 }
4702         } while (key.offset-- > 0);
4703
4704         if (failed && !retried) {
4705                 failed = 0;
4706                 retried = true;
4707                 goto again;
4708         } else if (failed && retried) {
4709                 ret = -ENOSPC;
4710                 goto done;
4711         }
4712
4713         /* Shrinking succeeded, else we would be at "done". */
4714         trans = btrfs_start_transaction(root, 0);
4715         if (IS_ERR(trans)) {
4716                 ret = PTR_ERR(trans);
4717                 goto done;
4718         }
4719
4720         mutex_lock(&fs_info->chunk_mutex);
4721         /* Clear all state bits beyond the shrunk device size */
4722         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4723                           CHUNK_STATE_MASK);
4724
4725         btrfs_device_set_disk_total_bytes(device, new_size);
4726         if (list_empty(&device->post_commit_list))
4727                 list_add_tail(&device->post_commit_list,
4728                               &trans->transaction->dev_update_list);
4729
4730         WARN_ON(diff > old_total);
4731         btrfs_set_super_total_bytes(super_copy,
4732                         round_down(old_total - diff, fs_info->sectorsize));
4733         mutex_unlock(&fs_info->chunk_mutex);
4734
4735         /* Now btrfs_update_device() will change the on-disk size. */
4736         ret = btrfs_update_device(trans, device);
4737         if (ret < 0) {
4738                 btrfs_abort_transaction(trans, ret);
4739                 btrfs_end_transaction(trans);
4740         } else {
4741                 ret = btrfs_commit_transaction(trans);
4742         }
4743 done:
4744         btrfs_free_path(path);
4745         if (ret) {
4746                 mutex_lock(&fs_info->chunk_mutex);
4747                 btrfs_device_set_total_bytes(device, old_size);
4748                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4749                         device->fs_devices->total_rw_bytes += diff;
4750                 atomic64_add(diff, &fs_info->free_chunk_space);
4751                 mutex_unlock(&fs_info->chunk_mutex);
4752         }
4753         return ret;
4754 }
4755
/*
 * Append one entry (disk key immediately followed by the chunk item) to the
 * sys_chunk_array of the superblock copy and bump the recorded array size.
 *
 * The size check and the update are both done under chunk_mutex.
 *
 * Returns 0 on success or -EFBIG when the entry does not fit into
 * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
{
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        struct btrfs_disk_key disk_key;
        u32 used;
        u8 *dst;
        int ret = 0;

        mutex_lock(&fs_info->chunk_mutex);

        used = btrfs_super_sys_array_size(super_copy);
        if (used + item_size + sizeof(disk_key) >
            BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
                ret = -EFBIG;
                goto unlock;
        }

        /* Key first, chunk item right behind it */
        dst = super_copy->sys_chunk_array + used;
        btrfs_cpu_key_to_disk(&disk_key, key);
        memcpy(dst, &disk_key, sizeof(disk_key));
        memcpy(dst + sizeof(disk_key), chunk, item_size);

        item_size += sizeof(disk_key);
        btrfs_set_super_sys_array_size(super_copy, used + item_size);

unlock:
        mutex_unlock(&fs_info->chunk_mutex);
        return ret;
}
4784
/*
 * sort() comparator for struct btrfs_device_info: larger max_avail first,
 * ties broken by larger total_avail first.
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
        const struct btrfs_device_info *lhs = a;
        const struct btrfs_device_info *rhs = b;

        if (lhs->max_avail != rhs->max_avail)
                return lhs->max_avail > rhs->max_avail ? -1 : 1;

        if (lhs->total_avail != rhs->total_avail)
                return lhs->total_avail > rhs->total_avail ? -1 : 1;

        return 0;
}
4803
/* Set the RAID56 incompat flag when @type has any RAID5/6 profile bit. */
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
        if (type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                btrfs_set_fs_incompat(info, RAID56);
}
4811
/* Set the RAID1C34 incompat flag when @type is RAID1C3 or RAID1C4. */
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
        const u64 raid1c34 = BTRFS_BLOCK_GROUP_RAID1C3 |
                             BTRFS_BLOCK_GROUP_RAID1C4;

        if (type & raid1c34)
                btrfs_set_fs_incompat(info, RAID1C34);
}
4819
/*
 * Parameters and intermediate results shared by the helpers of
 * btrfs_alloc_chunk(): init_alloc_chunk_ctl(), gather_device_info(),
 * decide_stripe_size() and create_chunk().
 */
struct alloc_chunk_ctl {
        /* Start offset of the new chunk, as returned by find_next_chunk() */
        u64 start;
        /* Block group type and profile flags of the chunk */
        u64 type;
        /* Total number of stripes to allocate */
        int num_stripes;
        /* sub_stripes info for map */
        int sub_stripes;
        /* Stripes per device */
        int dev_stripes;
        /* Maximum number of devices to use */
        int devs_max;
        /* Minimum number of devices to use */
        int devs_min;
        /* ndevs has to be a multiple of this */
        int devs_increment;
        /* Number of copies */
        int ncopies;
        /* Number of stripes worth of bytes to store parity information */
        int nparity;
        /* Largest stripe the allocator asks a device for */
        u64 max_stripe_size;
        /* Chunk size above which stripe_size gets reduced */
        u64 max_chunk_size;
        /* Devices offering less free space than this are skipped */
        u64 dev_extent_min;
        /* Size of each stripe, set by decide_stripe_size() */
        u64 stripe_size;
        /* stripe_size multiplied by the number of data stripes */
        u64 chunk_size;
        /* Number of devices usable for (and later selected for) the chunk */
        int ndevs;
};
4850
/*
 * Fill in the size limits of @ctl for the regular allocation policy.
 *
 * max_stripe_size and max_chunk_size depend on whether the chunk is data,
 * metadata or system; system chunks additionally get devs_max clamped to
 * BTRFS_MAX_DEVS_SYS_CHUNK. Any other type is a BUG().
 */
static void init_alloc_chunk_ctl_policy_regular(
                                struct btrfs_fs_devices *fs_devices,
                                struct alloc_chunk_ctl *ctl)
{
        const u64 type = ctl->type;
        u64 ten_percent;

        ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;

        if (type & BTRFS_BLOCK_GROUP_DATA) {
                ctl->max_stripe_size = SZ_1G;
                ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
                /* Filesystems above 50G get the larger metadata chunks */
                ctl->max_stripe_size =
                        fs_devices->total_rw_bytes > 50ULL * SZ_1G ?
                        SZ_1G : SZ_256M;
                ctl->max_chunk_size = ctl->max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ctl->max_stripe_size = SZ_32M;
                ctl->max_chunk_size = 2 * ctl->max_stripe_size;
                ctl->devs_max = min_t(int, ctl->devs_max,
                                      BTRFS_MAX_DEVS_SYS_CHUNK);
        } else {
                BUG();
        }

        /* Never let a chunk exceed 10% of the writable space */
        ten_percent = div_factor(fs_devices->total_rw_bytes, 1);
        if (ctl->max_chunk_size > ten_percent)
                ctl->max_chunk_size = ten_percent;
}
4881
/*
 * Seed @ctl from the btrfs_raid_array entry matching ctl->type, then let the
 * active chunk allocation policy fill in the size limits.
 *
 * A devs_max of 0 in the raid table is replaced by BTRFS_MAX_DEVS().
 */
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
                                 struct alloc_chunk_ctl *ctl)
{
        const struct btrfs_raid_attr *attr =
                &btrfs_raid_array[btrfs_bg_flags_to_raid_index(ctl->type)];

        ctl->ndevs = 0;
        ctl->sub_stripes = attr->sub_stripes;
        ctl->dev_stripes = attr->dev_stripes;
        ctl->devs_min = attr->devs_min;
        ctl->devs_increment = attr->devs_increment;
        ctl->ncopies = attr->ncopies;
        ctl->nparity = attr->nparity;

        ctl->devs_max = attr->devs_max;
        if (!ctl->devs_max)
                ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);

        switch (fs_devices->chunk_alloc_policy) {
        case BTRFS_CHUNK_ALLOC_REGULAR:
                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
                break;
        default:
                BUG();
        }
}
4906
/*
 * Collect the devices of the alloc list that can host a stripe of the new
 * chunk into @devices_info and sort them with btrfs_cmp_device_info().
 *
 * For each candidate the offset and size of the hole reported by
 * find_free_dev_extent() are recorded. The number of entries stored is
 * written to ctl->ndevs.
 *
 * Returns 0 on success or the error from find_free_dev_extent() (other than
 * -ENOSPC, which only means that the full wanted size was not available).
 */
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
                              struct alloc_chunk_ctl *ctl,
                              struct btrfs_device_info *devices_info)
{
        struct btrfs_fs_info *info = fs_devices->fs_info;
        const u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
        struct btrfs_device *device;
        struct btrfs_device_info *slot;
        u64 total_avail;
        u64 max_avail;
        u64 dev_offset;
        int nr_found = 0;
        int ret;

        /* Walk the alloc list and record the usable hole of each device */
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        WARN(1, KERN_ERR
                               "BTRFS: read-only device in alloc_list\n");
                        continue;
                }

                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                              &device->dev_state))
                        continue;
                if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                        continue;

                total_avail = 0;
                if (device->total_bytes > device->bytes_used)
                        total_avail = device->total_bytes - device->bytes_used;

                /* Not even the minimum extent fits, skip the device */
                if (total_avail < ctl->dev_extent_min)
                        continue;

                ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
                                           &max_avail);
                if (ret && ret != -ENOSPC)
                        return ret;

                /* Success means the full wanted size is available */
                if (!ret)
                        max_avail = dev_extent_want;

                if (max_avail < ctl->dev_extent_min) {
                        if (btrfs_test_opt(info, ENOSPC_DEBUG))
                                btrfs_debug(info,
                        "%s: devid %llu has no free space, have=%llu want=%llu",
                                            __func__, device->devid, max_avail,
                                            ctl->dev_extent_min);
                        continue;
                }

                if (nr_found == fs_devices->rw_devices) {
                        WARN(1, "%s: found more than %llu devices\n",
                             __func__, fs_devices->rw_devices);
                        break;
                }

                slot = &devices_info[nr_found++];
                slot->dev_offset = dev_offset;
                slot->max_avail = max_avail;
                slot->total_avail = total_avail;
                slot->dev = device;
        }
        ctl->ndevs = nr_found;

        /* Largest hole / most available space first */
        sort(devices_info, nr_found, sizeof(struct btrfs_device_info),
             btrfs_cmp_device_info, NULL);

        return 0;
}
4983
/*
 * Regular policy: derive num_stripes, stripe_size and chunk_size of @ctl
 * from the sorted @devices_info and ctl->ndevs.
 *
 * Always returns 0.
 */
static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
                                      struct btrfs_device_info *devices_info)
{
        /* Number of stripes that count for block group size */
        int data_stripes;
        u64 stripe_size;

        /*
         * Using as many devices as possible has priority over using maximum
         * sized stripes, so the stripe size is dictated by the last (i.e.
         * smallest) selected device. max_avail covers all stripes placed on
         * one device (DUP has more than one), hence the division.
         */
        stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
                              ctl->dev_stripes);
        ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

        /* This will have to be fixed for RAID1 and RAID10 over more drives */
        data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

        /*
         * The data stripes determine how much logical address space the
         * chunk covers. If that exceeds max_chunk_size, shrink the stripe:
         * divide the limit among the data stripes, round up to 16MB, and
         * take that unless it is larger than what we already had.
         */
        if (stripe_size * data_stripes > ctl->max_chunk_size) {
                u64 capped = round_up(div_u64(ctl->max_chunk_size,
                                              data_stripes), SZ_16M);

                if (capped < stripe_size)
                        stripe_size = capped;
        }

        /* Align to BTRFS_STRIPE_LEN */
        stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);

        ctl->stripe_size = stripe_size;
        ctl->chunk_size = stripe_size * data_stripes;

        return 0;
}
5027
/*
 * Settle the number of devices to use for the chunk and hand over to the
 * policy specific helper that computes the stripe and chunk sizes.
 *
 * Returns -ENOSPC when fewer than ctl->devs_min devices are usable, otherwise
 * the result of the policy helper.
 */
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
                              struct alloc_chunk_ctl *ctl,
                              struct btrfs_device_info *devices_info)
{
        struct btrfs_fs_info *info = fs_devices->fs_info;

        /*
         * devs_increment is not necessarily a power of 2, so rounddown() is
         * used here; round_down() would only be correct for powers of 2.
         */
        ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);

        if (ctl->ndevs < ctl->devs_min) {
                if (btrfs_test_opt(info, ENOSPC_DEBUG))
                        btrfs_debug(info,
        "%s: not enough devices with free space: have=%d minimum required=%d",
                                    __func__, ctl->ndevs, ctl->devs_min);
                return -ENOSPC;
        }

        if (ctl->ndevs > ctl->devs_max)
                ctl->ndevs = ctl->devs_max;

        switch (fs_devices->chunk_alloc_policy) {
        case BTRFS_CHUNK_ALLOC_REGULAR:
                return decide_stripe_size_regular(ctl, devices_info);
        default:
                BUG();
        }
}
5059
/*
 * Build the in-memory mapping for a new chunk and create its block group.
 *
 * Fills a map_lookup from the first ctl->ndevs entries of @devices_info,
 * inserts an extent map covering ctl->chunk_size bytes at ctl->start into the
 * fs mapping tree, calls btrfs_make_block_group() and then accounts
 * ctl->stripe_size as used on the device of every stripe.
 *
 * Returns 0 on success, -ENOMEM when the map or the extent map cannot be
 * allocated, or the error returned by add_extent_mapping() or
 * btrfs_make_block_group().
 */
static int create_chunk(struct btrfs_trans_handle *trans,
                        struct alloc_chunk_ctl *ctl,
                        struct btrfs_device_info *devices_info)
{
        struct btrfs_fs_info *info = trans->fs_info;
        struct map_lookup *map = NULL;
        struct extent_map_tree *em_tree;
        struct extent_map *em;
        u64 start = ctl->start;
        u64 type = ctl->type;
        int ret;
        int i;
        int j;

        map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
        if (!map)
                return -ENOMEM;
        map->num_stripes = ctl->num_stripes;

        /*
         * Each selected device contributes dev_stripes consecutive stripes,
         * laid out back to back starting at the hole found for that device.
         */
        for (i = 0; i < ctl->ndevs; ++i) {
                for (j = 0; j < ctl->dev_stripes; ++j) {
                        int s = i * ctl->dev_stripes + j;
                        map->stripes[s].dev = devices_info[i].dev;
                        map->stripes[s].physical = devices_info[i].dev_offset +
                                                   j * ctl->stripe_size;
                }
        }
        map->stripe_len = BTRFS_STRIPE_LEN;
        map->io_align = BTRFS_STRIPE_LEN;
        map->io_width = BTRFS_STRIPE_LEN;
        map->type = type;
        map->sub_stripes = ctl->sub_stripes;

        trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

        em = alloc_extent_map();
        if (!em) {
                kfree(map);
                return -ENOMEM;
        }
        /*
         * NOTE(review): from here on the error paths only drop em and never
         * kfree(map) directly; presumably free_extent_map() releases
         * map_lookup for EXTENT_FLAG_FS_MAPPING extent maps - confirm.
         */
        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
        em->map_lookup = map;
        em->start = start;
        em->len = ctl->chunk_size;
        em->block_start = 0;
        em->block_len = em->len;
        /* btrfs_finish_chunk_alloc() reads the stripe size back from here */
        em->orig_block_len = ctl->stripe_size;

        em_tree = &info->mapping_tree;
        write_lock(&em_tree->lock);
        ret = add_extent_mapping(em_tree, em, 0);
        if (ret) {
                write_unlock(&em_tree->lock);
                free_extent_map(em);
                return ret;
        }
        write_unlock(&em_tree->lock);

        ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
        if (ret)
                goto error_del_extent;

        /*
         * Account the new stripe on each device and queue the device on the
         * transaction's dev_update_list unless it is already on a list.
         */
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *dev = map->stripes[i].dev;

                btrfs_device_set_bytes_used(dev,
                                            dev->bytes_used + ctl->stripe_size);
                if (list_empty(&dev->post_commit_list))
                        list_add_tail(&dev->post_commit_list,
                                      &trans->transaction->dev_update_list);
        }

        atomic64_sub(ctl->stripe_size * map->num_stripes,
                     &info->free_chunk_space);

        /* Drop our reference, the mapping tree keeps its own */
        free_extent_map(em);
        check_raid56_incompat_flag(info, type);
        check_raid1c34_incompat_flag(info, type);

        return 0;

error_del_extent:
        write_lock(&em_tree->lock);
        remove_extent_mapping(em_tree, em);
        write_unlock(&em_tree->lock);

        /* One for our allocation */
        free_extent_map(em);
        /* One for the tree reference */
        free_extent_map(em);

        return ret;
}
5153
/*
 * Allocate a new chunk of the given block group @type.
 *
 * Must be called with chunk_mutex held. Validates @type, picks the start of
 * the chunk with find_next_chunk(), gathers the usable devices, decides the
 * stripe layout and finally creates the chunk mapping and block group.
 *
 * Returns 0 on success, -EINVAL for an invalid @type, -ENOSPC when there is
 * no writable device, -ENOMEM on allocation failure, or the error of one of
 * the helper steps.
 */
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
{
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_fs_devices *fs_devices = info->fs_devices;
        struct btrfs_device_info *devices_info;
        struct alloc_chunk_ctl ctl;
        int ret;

        lockdep_assert_held(&info->chunk_mutex);

        if (!alloc_profile_is_valid(type, 0)) {
                ASSERT(0);
                return -EINVAL;
        }

        if (list_empty(&fs_devices->alloc_list)) {
                if (btrfs_test_opt(info, ENOSPC_DEBUG))
                        btrfs_debug(info, "%s: no writable device", __func__);
                return -ENOSPC;
        }

        if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
                btrfs_err(info, "invalid chunk type 0x%llx requested", type);
                ASSERT(0);
                return -EINVAL;
        }

        ctl.start = find_next_chunk(info);
        ctl.type = type;
        init_alloc_chunk_ctl(fs_devices, &ctl);

        /* One slot per rw device is the most gather_device_info() fills */
        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
                               GFP_NOFS);
        if (!devices_info)
                return -ENOMEM;

        /* Each step only runs when the previous one did not fail */
        ret = gather_device_info(fs_devices, &ctl, devices_info);
        if (ret >= 0)
                ret = decide_stripe_size(fs_devices, &ctl, devices_info);
        if (ret >= 0)
                ret = create_chunk(trans, &ctl, devices_info);

        kfree(devices_info);
        return ret;
}
5204
/*
 * Chunk allocation falls into two parts. The first part does work
 * that makes the new allocated chunk usable, but does not do any operation
 * that modifies the chunk tree. The second part does the work that
 * requires modifying the chunk tree. This division is important for the
 * bootstrap process of adding storage to a seed btrfs.
 *
 * This function is the second part: for the chunk mapped at @chunk_offset
 * (of length @chunk_size) it updates the device item and inserts a dev
 * extent for every stripe, then inserts the chunk item into the chunk tree.
 * For SYSTEM chunks the item is also added to the superblock's system chunk
 * array.
 *
 * Returns 0 on success or a negative errno from the chunk map lookup, the
 * chunk allocation (-ENOMEM) or any of the tree updates.
 */
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                             u64 chunk_offset, u64 chunk_size)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_root *chunk_root = fs_info->chunk_root;
        struct btrfs_key key;
        struct btrfs_device *device;
        struct btrfs_chunk *chunk;
        struct btrfs_stripe *stripe;
        struct extent_map *em;
        struct map_lookup *map;
        size_t item_size;
        u64 dev_offset;
        u64 stripe_size;
        int i = 0;
        int ret = 0;

        em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
        if (IS_ERR(em))
                return PTR_ERR(em);

        map = em->map_lookup;
        item_size = btrfs_chunk_item_size(map->num_stripes);
        /* create_chunk() stored the per-stripe size in orig_block_len */
        stripe_size = em->orig_block_len;

        chunk = kzalloc(item_size, GFP_NOFS);
        if (!chunk) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * Take the device list mutex to prevent races with the final phase of
         * a device replace operation that replaces the device object associated
         * with the map's stripes, because the device object's id can change
         * at any time during that final phase of the device replace operation
         * (dev-replace.c:btrfs_dev_replace_finishing()).
         */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        /* First pass: update each device item and insert its dev extent */
        for (i = 0; i < map->num_stripes; i++) {
                device = map->stripes[i].dev;
                dev_offset = map->stripes[i].physical;

                ret = btrfs_update_device(trans, device);
                if (ret)
                        break;
                ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
                                             dev_offset, stripe_size);
                if (ret)
                        break;
        }
        if (ret) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                goto out;
        }

        /*
         * Second pass: fill in the stripe array of the chunk item, still
         * under the device list mutex as devid and uuid are read here.
         */
        stripe = &chunk->stripe;
        for (i = 0; i < map->num_stripes; i++) {
                device = map->stripes[i].dev;
                dev_offset = map->stripes[i].physical;

                btrfs_set_stack_stripe_devid(stripe, device->devid);
                btrfs_set_stack_stripe_offset(stripe, dev_offset);
                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
                stripe++;
        }
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        btrfs_set_stack_chunk_length(chunk, chunk_size);
        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
        btrfs_set_stack_chunk_type(chunk, map->type);
        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
        /*
         * io_align and io_width are taken from stripe_len; create_chunk()
         * sets all three map fields to BTRFS_STRIPE_LEN.
         */
        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
        btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.type = BTRFS_CHUNK_ITEM_KEY;
        key.offset = chunk_offset;

        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                /*
                 * TODO: Cleanup of inserted chunk root in case of
                 * failure.
                 */
                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
        }

out:
        /* kfree(NULL) is fine when the chunk allocation itself failed */
        kfree(chunk);
        free_extent_map(em);
        return ret;
}
5309
/*
 * Allocate one metadata chunk followed by one system chunk, each with the
 * filesystem's current allocation profile for that type.
 *
 * Returns 0 on success or the error of the first failing allocation.
 */
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;

        ret = btrfs_alloc_chunk(trans, btrfs_metadata_alloc_profile(fs_info));
        if (ret)
                return ret;

        return btrfs_alloc_chunk(trans, btrfs_system_alloc_profile(fs_info));
}
5325
/* Number of device failures the raid profile of @map tolerates. */
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
        const struct btrfs_raid_attr *attr =
                &btrfs_raid_array[btrfs_bg_flags_to_raid_index(map->type)];

        return attr->tolerated_failures;
}
5332
/*
 * Tell whether the chunk at @chunk_offset has to be treated as read-only.
 *
 * Returns 1 when the chunk map cannot be looked up, when any non-missing
 * stripe device is not writeable, or when more devices are missing than the
 * profile tolerates; 0 otherwise.
 */
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
        struct extent_map *em;
        struct map_lookup *map;
        int nr_missing = 0;
        int readonly = 0;
        int i;

        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
        if (IS_ERR(em))
                return 1;

        map = em->map_lookup;
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *dev = map->stripes[i].dev;

                if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
                        nr_missing++;
                } else if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
                                     &dev->dev_state)) {
                        readonly = 1;
                        break;
                }
        }

        /*
         * With more missing devices than the profile can tolerate, writes to
         * this chunk cannot succeed, so report it read-only as well.
         */
        if (!readonly && nr_missing > btrfs_chunk_max_errors(map))
                readonly = 1;

        free_extent_map(em);
        return readonly;
}
5370
/*
 * Empty @tree: repeatedly look up any extent map, remove it under the tree's
 * write lock and drop both the lookup and the tree reference.
 */
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
{
        struct extent_map *em;

        for (;;) {
                write_lock(&tree->lock);
                em = lookup_extent_mapping(tree, 0, (u64)-1);
                if (em)
                        remove_extent_mapping(tree, em);
                write_unlock(&tree->lock);

                if (!em)
                        return;

                /* Reference taken by the lookup above */
                free_extent_map(em);
                /* Reference that was held by the tree */
                free_extent_map(em);
        }
}
5389
/*
 * Return the number of mirrors a reader may try for the range at @logical.
 *
 * The count depends on the profile of the chunk containing the range and is
 * incremented by one while a device replace with a target device is ongoing.
 * Returns 1 when the chunk map cannot be found.
 */
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
        struct extent_map *em;
        struct map_lookup *map;
        int copies = 1;

        em = btrfs_get_chunk_map(fs_info, logical, len);
        if (IS_ERR(em)) {
                /*
                 * Returning an error here would only make the callers bail
                 * out in an uglier way; reporting a single copy keeps them
                 * from trying any other mirror.
                 */
                return 1;
        }

        map = em->map_lookup;
        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) {
                copies = map->num_stripes;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                copies = map->sub_stripes;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID5) {
                copies = 2;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
                /*
                 * Two data stripes may be corrupted, so the caller has to
                 * retry in a loop to rebuild the correct data, failing one
                 * stripe at a time on every retry except the stripe under
                 * reconstruction.
                 */
                copies = map->num_stripes;
        }
        free_extent_map(em);

        down_read(&fs_info->dev_replace.rwsem);
        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
            fs_info->dev_replace.tgtdev)
                copies++;
        up_read(&fs_info->dev_replace.rwsem);

        return copies;
}
5434
/*
 * Return the full stripe length of the chunk containing @logical.
 *
 * For RAID5/6 chunks this is stripe_len times the number of data stripes.
 * For every other profile, and when the chunk map lookup fails (which also
 * triggers a warning), the sector size is returned.
 */
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
                                    u64 logical)
{
        unsigned long len = fs_info->sectorsize;
        struct extent_map *em;
        struct map_lookup *map;

        em = btrfs_get_chunk_map(fs_info, logical, len);
        if (WARN_ON(IS_ERR(em)))
                return len;

        map = em->map_lookup;
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                len = map->stripe_len * nr_data_stripes(map);
        free_extent_map(em);

        return len;
}
5452
5453 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5454 {
5455         struct extent_map *em;
5456         struct map_lookup *map;
5457         int ret = 0;
5458
5459         em = btrfs_get_chunk_map(fs_info, logical, len);
5460
5461         if(!WARN_ON(IS_ERR(em))) {
5462                 map = em->map_lookup;
5463                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5464                         ret = 1;
5465                 free_extent_map(em);
5466         }
5467         return ret;
5468 }
5469
/*
 * Choose the stripe index to read from among the mirrors of a RAID1-family
 * or RAID10 chunk.
 *
 * @fs_info:                the filesystem (consulted for dev-replace state)
 * @map:                    chunk mapping whose stripes are inspected
 * @first:                  index of the first candidate stripe
 * @dev_replace_is_ongoing: nonzero while a device replace is running
 *
 * The starting candidate is derived from the pid of the current task.  Two
 * passes are made over the candidates: the first skips the dev-replace source
 * device when the replace mode asks to avoid reading from it, the second
 * accepts any device that has a bdev.  If no candidate has a bdev at all the
 * pid-derived index is returned anyway and the I/O error path has to cope.
 */
static int find_live_mirror(struct btrfs_fs_info *fs_info,
                            struct map_lookup *map, int first,
                            int dev_replace_is_ongoing)
{
        struct btrfs_device *avoid_dev = NULL;
        int nr_mirrors;
        int preferred;
        int pass;
        int idx;

        ASSERT((map->type &
                 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

        /* RAID10 mirrors within a sub-stripe group, the others across all */
        nr_mirrors = (map->type & BTRFS_BLOCK_GROUP_RAID10) ?
                     map->sub_stripes : map->num_stripes;

        preferred = first + current->pid % nr_mirrors;

        if (dev_replace_is_ongoing &&
            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
                avoid_dev = fs_info->dev_replace.srcdev;

        /*
         * Pass 0 refuses the replace source device, pass 1 tolerates it so
         * that it is only picked when no other non-missing mirror exists.
         */
        for (pass = 0; pass < 2; pass++) {
                const bool allow_srcdev = (pass != 0);
                struct btrfs_device *dev = map->stripes[preferred].dev;

                if (dev->bdev && (allow_srcdev || dev != avoid_dev))
                        return preferred;

                for (idx = first; idx < first + nr_mirrors; idx++) {
                        dev = map->stripes[idx].dev;
                        if (dev->bdev && (allow_srcdev || dev != avoid_dev))
                                return idx;
                }
        }

        /*
         * Nothing usable was found.  Hand back the preferred index and let
         * the io error handling code clean up eventually.
         */
        return preferred;
}
5518
/*
 * Bubble-sort the first @num_stripes entries of @bbio by ascending raid_map
 * value, moving stripes[] and raid_map[] together so they stay paired.
 *
 * The intent is to put the parity/syndrome stripes last; presumably their
 * raid_map markers compare greater than any data address (confirm against the
 * RAID5_P_STRIPE/RAID6_Q_STRIPE definitions).
 */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
        bool swapped;
        int pos;

        do {
                swapped = false;
                for (pos = 0; pos < num_stripes - 1; pos++) {
                        /* Neighbours already in order, nothing to do */
                        if (bbio->raid_map[pos] <= bbio->raid_map[pos + 1])
                                continue;

                        swap(bbio->stripes[pos], bbio->stripes[pos + 1]);
                        swap(bbio->raid_map[pos], bbio->raid_map[pos + 1]);
                        swapped = true;
                }
        } while (swapped);
}
5537
/*
 * Allocate a zeroed btrfs_bio together with its trailing arrays.
 *
 * @total_stripes: number of entries for stripes[] and for raid_map
 * @real_stripes:  number of entries for tgtdev_map
 *
 * The single allocation is laid out as the btrfs_bio itself, followed by
 * stripes[total_stripes], then tgtdev_map[real_stripes], then
 * raid_map[total_stripes].  The allocation is requested with __GFP_NOFAIL and
 * the result is used without a NULL check.  The returned bbio has its error
 * counter at 0 and a reference count of 1.
 */
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
        struct btrfs_bio *bbio;
        size_t alloc_size;

        /* the btrfs_bio itself */
        alloc_size = sizeof(struct btrfs_bio);
        /* plus the variable array for the stripes */
        alloc_size += sizeof(struct btrfs_bio_stripe) * total_stripes;
        /* plus the variable array for the tgt dev */
        alloc_size += sizeof(int) * real_stripes;
        /* plus the raid_map, which covers both the tgt dev and the stripes */
        alloc_size += sizeof(u64) * total_stripes;

        bbio = kzalloc(alloc_size, GFP_NOFS | __GFP_NOFAIL);

        atomic_set(&bbio->error, 0);
        refcount_set(&bbio->refs, 1);

        /* Carve the two trailing arrays out of the space after stripes[] */
        bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
        bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);

        return bbio;
}
5562
/*
 * Take an additional reference on @bbio.  Warns if the reference count reads
 * as zero beforehand.
 */
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
        WARN_ON(refcount_read(&bbio->refs) == 0);
        refcount_inc(&bbio->refs);
}
5568
/*
 * Drop one reference on @bbio and free it once the last reference is gone.
 * A NULL @bbio is accepted and ignored.
 */
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
        if (bbio && refcount_dec_and_test(&bbio->refs))
                kfree(bbio);
}
5576
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
/*
 * Map a logical range to the per-stripe physical ranges that should be
 * discarded.
 *
 * Please note that, discard won't be sent to target device of device
 * replace.
 *
 * @fs_info:    the filesystem
 * @logical:    logical start address of the range
 * @length_ret: in: requested length; out: the length that was mapped, clamped
 *              to the end of the chunk map found for @logical
 * @bbio_ret:   on success, receives the newly allocated bbio holding one
 *              (dev, physical, length) entry per stripe
 *
 * Returns 0 on success, the error of the chunk map lookup, -EOPNOTSUPP for
 * RAID5/6 chunks, or -ENOMEM if the bbio allocation fails.
 */
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                                         u64 logical, u64 *length_ret,
                                         struct btrfs_bio **bbio_ret)
{
        struct extent_map *em;
        struct map_lookup *map;
        struct btrfs_bio *bbio;
        u64 length = *length_ret;
        u64 offset;
        u64 stripe_nr;
        u64 stripe_nr_end;
        u64 stripe_end_offset;
        u64 stripe_cnt;
        u64 stripe_len;
        u64 stripe_offset;
        u64 num_stripes;
        u32 stripe_index;
        u32 factor = 0;
        u32 sub_stripes = 0;
        u64 stripes_per_dev = 0;
        u32 remaining_stripes = 0;
        u32 last_stripe = 0;
        int ret = 0;
        int i;

        /* discard always return a bbio */
        ASSERT(bbio_ret);

        em = btrfs_get_chunk_map(fs_info, logical, length);
        if (IS_ERR(em))
                return PTR_ERR(em);

        map = em->map_lookup;
        /* we don't discard raid56 yet */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* Offset into the chunk; never map past the end of the chunk */
        offset = logical - em->start;
        length = min_t(u64, em->start + em->len - logical, length);
        *length_ret = length;

        stripe_len = map->stripe_len;
        /*
         * stripe_nr counts the total number of stripes we have to stride
         * to get to this block
         */
        stripe_nr = div64_u64(offset, stripe_len);

        /* stripe_offset is the offset of this block in its stripe */
        stripe_offset = offset - stripe_nr * stripe_len;

        /*
         * stripe_nr_end is the number of the first stripe past the range,
         * stripe_cnt the number of stripes the range touches, and
         * stripe_end_offset the unused tail of the last touched stripe.
         */
        stripe_nr_end = round_up(offset + length, map->stripe_len);
        stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
        stripe_cnt = stripe_nr_end - stripe_nr;
        stripe_end_offset = stripe_nr_end * map->stripe_len -
                            (offset + length);
        /*
         * after this, stripe_nr is the number of stripes on this
         * device we have to walk to find the data, and stripe_index is
         * the number of our device in the stripe array
         */
        num_stripes = 1;
        stripe_index = 0;
        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                         BTRFS_BLOCK_GROUP_RAID10)) {
                /* RAID0 is handled like RAID10 with a single sub stripe */
                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                        sub_stripes = 1;
                else
                        sub_stripes = map->sub_stripes;

                /* factor is the number of sub-stripe groups in the chunk */
                factor = map->num_stripes / sub_stripes;
                num_stripes = min_t(u64, map->num_stripes,
                                    sub_stripes * stripe_cnt);
                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
                stripe_index *= sub_stripes;
                /*
                 * Every group gets stripes_per_dev stripes, and
                 * remaining_stripes groups get one more.
                 */
                stripes_per_dev = div_u64_rem(stripe_cnt, factor,
                                              &remaining_stripes);
                /* last_stripe: first map index of the group holding the end */
                div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
                last_stripe *= sub_stripes;
        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
                                BTRFS_BLOCK_GROUP_DUP)) {
                /* Mirrored profiles: one entry per stripe of the chunk */
                num_stripes = map->num_stripes;
        } else {
                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
                                        &stripe_index);
        }

        bbio = alloc_btrfs_bio(num_stripes, 0);
        if (!bbio) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < num_stripes; i++) {
                bbio->stripes[i].physical =
                        map->stripes[stripe_index].physical +
                        stripe_offset + stripe_nr * map->stripe_len;
                bbio->stripes[i].dev = map->stripes[stripe_index].dev;

                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                                 BTRFS_BLOCK_GROUP_RAID10)) {
                        bbio->stripes[i].length = stripes_per_dev *
                                map->stripe_len;

                        /* The first remaining_stripes groups get one extra */
                        if (i / sub_stripes < remaining_stripes)
                                bbio->stripes[i].length +=
                                        map->stripe_len;

                        /*
                         * Special for the first stripe and
                         * the last stripe:
                         *
                         * |-------|...|-------|
                         *     |----------|
                         *    off     end_off
                         */
                        if (i < sub_stripes)
                                bbio->stripes[i].length -=
                                        stripe_offset;

                        if (stripe_index >= last_stripe &&
                            stripe_index <= (last_stripe +
                                             sub_stripes - 1))
                                bbio->stripes[i].length -=
                                        stripe_end_offset;

                        /*
                         * Once the first group is done, later entries start
                         * at the beginning of their stripe.
                         */
                        if (i == sub_stripes - 1)
                                stripe_offset = 0;
                } else {
                        bbio->stripes[i].length = length;
                }

                /* Advance to the next stripe, wrapping to the next row */
                stripe_index++;
                if (stripe_index == map->num_stripes) {
                        stripe_index = 0;
                        stripe_nr++;
                }
        }

        *bbio_ret = bbio;
        bbio->map_type = map->type;
        bbio->num_stripes = num_stripes;
out:
        /* Drop the reference taken by the chunk map lookup */
        free_extent_map(em);
        return ret;
}
5730
/*
 * In dev-replace case, for repair case (that's the only case where the mirror
 * is selected explicitly when calling btrfs_map_block), blocks left of the
 * left cursor can also be read from the target drive.
 *
 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
 * array of stripes.
 * For READ, it also needs to be supported using the same mirror number.
 *
 * This helper maps the range with BTRFS_MAP_GET_READ_MIRRORS and looks up the
 * stripe that lives on the replace source device (@srcdev_devid).  On success
 * *@mirror_num is set to the mirror number of that source stripe and
 * *@physical to its physical address; when several stripes sit on the source
 * device (DUP) the one with the lowest physical address wins.
 *
 * If the requested block is not left of the left cursor, EIO is returned. This
 * can happen because btrfs_num_copies() returns one more in the dev-replace
 * case.  EIO is also returned when no stripe on the source device is found;
 * a mapping failure is passed through unchanged.
 */
static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
                                         u64 logical, u64 length,
                                         u64 srcdev_devid, int *mirror_num,
                                         u64 *physical)
{
        struct btrfs_bio *bbio = NULL;
        u64 lowest_physical = 0;
        int src_index = 0;
        int found = 0;
        int nr_stripes;
        int idx;
        int ret;

        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
                                logical, &length, &bbio, 0, 0);
        if (ret) {
                ASSERT(bbio == NULL);
                return ret;
        }

        nr_stripes = bbio->num_stripes;
        if (*mirror_num > nr_stripes) {
                /*
                 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
                 * that means that the requested area is not left of the left
                 * cursor
                 */
                btrfs_put_bbio(bbio);
                return -EIO;
        }

        /*
         * The rest of the mapping is done with the mirror_num of the source
         * drive, so find it here.  The caller patches the device pointer to
         * the target drive afterwards.
         */
        for (idx = 0; idx < nr_stripes; idx++) {
                const struct btrfs_bio_stripe *stripe = &bbio->stripes[idx];

                if (stripe->dev->devid != srcdev_devid)
                        continue;

                /* DUP: keep only the copy with the lowest physical address */
                if (found && lowest_physical <= stripe->physical)
                        continue;

                src_index = idx;
                found = 1;
                lowest_physical = stripe->physical;
        }

        btrfs_put_bbio(bbio);

        ASSERT(found);
        if (!found)
                return -EIO;

        *mirror_num = src_index + 1;
        *physical = lowest_physical;
        return 0;
}
5807
/*
 * Extend an already mapped bbio with stripes on the dev-replace target.
 *
 * @op:             BTRFS_MAP_WRITE or BTRFS_MAP_GET_READ_MIRRORS; any other
 *                  value leaves the stripes untouched
 * @bbio_ret:       the bbio to extend; its stripes[] and tgtdev_map[] must
 *                  have room for the entries added here
 * @dev_replace:    replace state providing the source and target devices
 * @num_stripes_ret: in: number of valid stripes; out: count including the
 *                  stripes appended for the target device
 * @max_errors_ret: in/out: tolerated errors, raised by one per duplicated
 *                  write stripe
 *
 * bbio->num_tgtdevs is set to the number of stripes that were appended.
 */
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
                                      struct btrfs_bio **bbio_ret,
                                      struct btrfs_dev_replace *dev_replace,
                                      int *num_stripes_ret, int *max_errors_ret)
{
        struct btrfs_bio *bbio = *bbio_ret;
        const u64 srcdev_devid = dev_replace->srcdev->devid;
        int nr_stripes = *num_stripes_ret;
        int nr_errors = *max_errors_ret;
        int nr_tgtdev = 0;
        int i;

        if (op == BTRFS_MAP_WRITE) {
                int next_free = nr_stripes;

                /*
                 * duplicate the write operations while the dev replace
                 * procedure is running. Since the copying of the old disk to
                 * the new disk takes place at run time while the filesystem is
                 * mounted writable, the regular write operations to the old
                 * disk have to be duplicated to go to the new disk as well.
                 *
                 * Note that device->missing is handled by the caller, and that
                 * the write to the old disk is already set up in the stripes
                 * array.
                 */
                for (i = 0; i < nr_stripes; i++) {
                        struct btrfs_bio_stripe *src = &bbio->stripes[i];
                        struct btrfs_bio_stripe *dup;

                        if (src->dev->devid != srcdev_devid)
                                continue;

                        /* write to new disk, too */
                        dup = &bbio->stripes[next_free];
                        dup->physical = src->physical;
                        dup->length = src->length;
                        dup->dev = dev_replace->tgtdev;
                        bbio->tgtdev_map[i] = next_free;

                        next_free++;
                        nr_errors++;
                        nr_tgtdev++;
                }
                nr_stripes = next_free;
        } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
                u64 lowest_physical = 0;
                int src_index = 0;
                int found = 0;

                /*
                 * During the dev-replace procedure, the target drive can also
                 * be used to read data in case it is needed to repair a corrupt
                 * block elsewhere. This is possible if the requested area is
                 * left of the left cursor. In this area, the target drive is a
                 * full copy of the source drive.
                 */
                for (i = 0; i < nr_stripes; i++) {
                        const struct btrfs_bio_stripe *cur = &bbio->stripes[i];

                        if (cur->dev->devid != srcdev_devid)
                                continue;

                        /*
                         * In case of DUP, in order to keep it simple, only add
                         * the mirror with the lowest physical address
                         */
                        if (found && lowest_physical <= cur->physical)
                                continue;

                        src_index = i;
                        found = 1;
                        lowest_physical = cur->physical;
                }

                if (found) {
                        struct btrfs_bio_stripe *tgt =
                                &bbio->stripes[nr_stripes];

                        tgt->physical = lowest_physical;
                        tgt->length = bbio->stripes[src_index].length;
                        tgt->dev = dev_replace->tgtdev;
                        bbio->tgtdev_map[src_index] = nr_stripes;

                        nr_tgtdev++;
                        nr_stripes++;
                }
        }

        *num_stripes_ret = nr_stripes;
        *max_errors_ret = nr_errors;
        bbio->num_tgtdevs = nr_tgtdev;
        *bbio_ret = bbio;
}
5901
/*
 * Tell whether @op is one of the operations that is mapped to all stripes:
 * BTRFS_MAP_WRITE or BTRFS_MAP_GET_READ_MIRRORS.
 */
static bool need_full_stripe(enum btrfs_map_op op)
{
        switch (op) {
        case BTRFS_MAP_WRITE:
        case BTRFS_MAP_GET_READ_MIRRORS:
                return true;
        default:
                return false;
        }
}
5906
/*
 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
 *                     tuple. This information is used to calculate how big a
 *                     particular bio can get before it straddles a stripe.
 *
 * @fs_info - the filesystem
 * @logical - address that we want to figure out the geometry of
 * @len     - the length of IO we are going to perform, starting at @logical
 * @op      - type of operation - write or read, must not be BTRFS_MAP_DISCARD
 * @io_geom - pointer used to return values; only written on success
 *
 * Returns < 0 in case a chunk for the given logical address cannot be found,
 * usually shouldn't happen unless @logical is corrupted, or -EINVAL when the
 * stripe arithmetic yields an inconsistent result, 0 otherwise.
 */
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                        u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
{
        struct extent_map *em;
        struct map_lookup *map;
        u64 chunk_offset;
        u64 in_stripe_offset;
        u64 stripe_start;
        u64 stripe_nr;
        u64 stripe_len;
        u64 full_stripe_start = (u64)-1;
        int data_stripes;
        int ret = 0;

        ASSERT(op != BTRFS_MAP_DISCARD);

        em = btrfs_get_chunk_map(fs_info, logical, len);
        if (IS_ERR(em))
                return PTR_ERR(em);

        map = em->map_lookup;
        /* Length of one stripe in this chunk */
        stripe_len = map->stripe_len;
        /* Where @logical sits relative to the start of the chunk */
        chunk_offset = logical - em->start;
        /* Number of the stripe this block falls in */
        stripe_nr = div64_u64(chunk_offset, stripe_len);
        /* Chunk-relative offset at which that stripe begins */
        stripe_start = stripe_nr * stripe_len;
        if (chunk_offset < stripe_start) {
                btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
                        stripe_start, chunk_offset, em->start, logical,
                        stripe_len);
                ret = -EINVAL;
                goto out;
        }

        /* Offset of this block inside its stripe */
        in_stripe_offset = chunk_offset - stripe_start;
        data_stripes = nr_data_stripes(map);

        if (!(map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)) {
                /* No profile bit set: everything up to the end of the chunk */
                len = em->len - chunk_offset;
        } else {
                /* By default an IO may not cross the end of its stripe */
                u64 max_len = stripe_len - in_stripe_offset;

                /*
                 * In case of raid56, we need to know the stripe aligned start
                 */
                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                        unsigned long full_stripe_len = stripe_len * data_stripes;

                        /*
                         * Allow a write of a full stripe, but make sure we
                         * don't allow straddling of stripes
                         */
                        full_stripe_start = div64_u64(chunk_offset,
                                                      full_stripe_len);
                        full_stripe_start *= full_stripe_len;

                        /*
                         * For writes to RAID[56], allow a full stripeset across
                         * all disks. For other RAID types and for RAID[56]
                         * reads, just allow a single stripe (on a single disk).
                         */
                        if (op == BTRFS_MAP_WRITE) {
                                max_len = stripe_len * data_stripes -
                                          (chunk_offset - full_stripe_start);
                        }
                }
                len = min_t(u64, em->len - chunk_offset, max_len);
        }

        io_geom->len = len;
        io_geom->offset = chunk_offset;
        io_geom->stripe_len = stripe_len;
        io_geom->stripe_nr = stripe_nr;
        io_geom->stripe_offset = in_stripe_offset;
        io_geom->raid56_stripe_offset = full_stripe_start;

out:
        /* once for us */
        free_extent_map(em);
        return ret;
}
6006
/*
 * Map a logical address to the stripe(s) backing it and return them in a
 * newly allocated bbio.
 *
 * @fs_info:       the filesystem
 * @op:            mapping operation, must not be BTRFS_MAP_DISCARD
 * @logical:       logical address to map
 * @length:        in: requested length; out: the length covered by the
 *                 returned mapping
 * @bbio_ret:      on success, receives the allocated bbio; not touched on
 *                 failure
 * @mirror_num:    explicitly requested mirror, or 0 to let this function
 *                 choose one
 * @need_raid_map: nonzero to get all stripes plus a raid_map for RAID5/6
 *                 writes and recovery reads
 *
 * While a device replace is ongoing, dev_replace->rwsem is held for read for
 * the duration of the mapping and released before returning.
 *
 * Returns 0 on success or a negative errno.
 */
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map)
{
        struct extent_map *em;
        struct map_lookup *map;
        u64 stripe_offset;
        u64 stripe_nr;
        u64 stripe_len;
        u32 stripe_index;
        int data_stripes;
        int i;
        int ret = 0;
        int num_stripes;
        int max_errors = 0;
        int tgtdev_indexes = 0;
        struct btrfs_bio *bbio = NULL;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        int dev_replace_is_ongoing = 0;
        int num_alloc_stripes;
        int patch_the_first_stripe_for_dev_replace = 0;
        u64 physical_to_patch_in_first_stripe = 0;
        u64 raid56_full_stripe_start = (u64)-1;
        struct btrfs_io_geometry geom;

        ASSERT(bbio_ret);
        ASSERT(op != BTRFS_MAP_DISCARD);

        ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
        if (ret < 0)
                return ret;

        /*
         * The geometry helper dropped its own reference, so look the chunk
         * map up again.  ASSERT() may be compiled out, therefore a failed
         * lookup must be handled explicitly instead of dereferencing an
         * error pointer below.  Nothing is locked or allocated yet.
         */
        em = btrfs_get_chunk_map(fs_info, logical, *length);
        if (IS_ERR(em))
                return PTR_ERR(em);
        map = em->map_lookup;

        *length = geom.len;
        stripe_len = geom.stripe_len;
        stripe_nr = geom.stripe_nr;
        stripe_offset = geom.stripe_offset;
        raid56_full_stripe_start = geom.raid56_stripe_offset;
        data_stripes = nr_data_stripes(map);

        down_read(&dev_replace->rwsem);
        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
        /*
         * Hold the semaphore for read during the whole operation, write is
         * requested at commit time but must wait.
         */
        if (!dev_replace_is_ongoing)
                up_read(&dev_replace->rwsem);

        /*
         * A read of mirror "num_stripes + 1" during a replace addresses the
         * replace target: map it via the source device and patch the result
         * at the end.  Any other out-of-range mirror falls back to 0.
         */
        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
            !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
                ret = get_extra_mirror_from_replace(fs_info, logical, *length,
                                                    dev_replace->srcdev->devid,
                                                    &mirror_num,
                                            &physical_to_patch_in_first_stripe);
                if (ret)
                        goto out;
                else
                        patch_the_first_stripe_for_dev_replace = 1;
        } else if (mirror_num > map->num_stripes) {
                mirror_num = 0;
        }

        num_stripes = 1;
        stripe_index = 0;
        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
                                &stripe_index);
                if (!need_full_stripe(op))
                        mirror_num = 1;
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
                if (need_full_stripe(op))
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
                else {
                        stripe_index = find_live_mirror(fs_info, map, 0,
                                            dev_replace_is_ongoing);
                        mirror_num = stripe_index + 1;
                }

        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
                if (need_full_stripe(op)) {
                        num_stripes = map->num_stripes;
                } else if (mirror_num) {
                        stripe_index = mirror_num - 1;
                } else {
                        mirror_num = 1;
                }

        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                u32 factor = map->num_stripes / map->sub_stripes;

                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
                stripe_index *= map->sub_stripes;

                if (need_full_stripe(op))
                        num_stripes = map->sub_stripes;
                else if (mirror_num)
                        stripe_index += mirror_num - 1;
                else {
                        int old_stripe_index = stripe_index;
                        stripe_index = find_live_mirror(fs_info, map,
                                              stripe_index,
                                              dev_replace_is_ongoing);
                        mirror_num = stripe_index - old_stripe_index + 1;
                }

        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
                        /* push stripe_nr back to the start of the full stripe */
                        stripe_nr = div64_u64(raid56_full_stripe_start,
                                        stripe_len * data_stripes);

                        /* RAID[56] write or recovery. Return all stripes */
                        num_stripes = map->num_stripes;
                        max_errors = nr_parity_stripes(map);

                        *length = map->stripe_len;
                        stripe_index = 0;
                        stripe_offset = 0;
                } else {
                        /*
                         * Mirror #0 or #1 means the original data block.
                         * Mirror #2 is RAID5 parity block.
                         * Mirror #3 is RAID6 Q block.
                         */
                        stripe_nr = div_u64_rem(stripe_nr,
                                        data_stripes, &stripe_index);
                        if (mirror_num > 1)
                                stripe_index = data_stripes + mirror_num - 2;

                        /* We distribute the parity blocks across stripes */
                        div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
                                        &stripe_index);
                        if (!need_full_stripe(op) && mirror_num <= 1)
                                mirror_num = 1;
                }
        } else {
                /*
                 * after this, stripe_nr is the number of stripes on this
                 * device we have to walk to find the data, and stripe_index is
                 * the number of our device in the stripe array
                 */
                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
                                &stripe_index);
                mirror_num = stripe_index + 1;
        }
        if (stripe_index >= map->num_stripes) {
                btrfs_crit(fs_info,
                           "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
                           stripe_index, map->num_stripes);
                ret = -EINVAL;
                goto out;
        }

        /*
         * Reserve room for the stripes handle_ops_on_dev_replace() may append:
         * one duplicate per stripe for writes, a single extra stripe for
         * GET_READ_MIRRORS.
         */
        num_alloc_stripes = num_stripes;
        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
                if (op == BTRFS_MAP_WRITE)
                        num_alloc_stripes <<= 1;
                if (op == BTRFS_MAP_GET_READ_MIRRORS)
                        num_alloc_stripes++;
                tgtdev_indexes = num_stripes;
        }

        bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
        if (!bbio) {
                ret = -ENOMEM;
                goto out;
        }

        /* Fill in consecutive map stripes starting at stripe_index */
        for (i = 0; i < num_stripes; i++) {
                bbio->stripes[i].physical = map->stripes[stripe_index].physical +
                        stripe_offset + stripe_nr * map->stripe_len;
                bbio->stripes[i].dev = map->stripes[stripe_index].dev;
                stripe_index++;
        }

        /* build raid_map */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
            (need_full_stripe(op) || mirror_num > 1)) {
                u64 tmp;
                unsigned rot;

                /* Work out the disk rotation on this stripe-set */
                div_u64_rem(stripe_nr, num_stripes, &rot);

                /* Fill in the logical address of each stripe */
                tmp = stripe_nr * data_stripes;
                for (i = 0; i < data_stripes; i++)
                        bbio->raid_map[(i+rot) % num_stripes] =
                                em->start + (tmp + i) * map->stripe_len;

                /* i == data_stripes here: parity follows the data stripes */
                bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
                        bbio->raid_map[(i+rot+1) % num_stripes] =
                                RAID6_Q_STRIPE;

                sort_parity_stripes(bbio, num_stripes);
        }

        if (need_full_stripe(op))
                max_errors = btrfs_chunk_max_errors(map);

        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
            need_full_stripe(op)) {
                handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
                                          &max_errors);
        }

        *bbio_ret = bbio;
        bbio->map_type = map->type;
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;

        /*
         * this is the case that REQ_READ && dev_replace_is_ongoing &&
         * mirror_num == num_stripes + 1 && dev_replace target drive is
         * available as a mirror
         */
        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
                WARN_ON(num_stripes > 1);
                bbio->stripes[0].dev = dev_replace->tgtdev;
                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
                bbio->mirror_num = map->num_stripes + 1;
        }
out:
        if (dev_replace_is_ongoing) {
                lockdep_assert_held(&dev_replace->rwsem);
                /* Unlock and let waiting writers proceed */
                up_read(&dev_replace->rwsem);
        }
        free_extent_map(em);
        return ret;
}
6248
6249 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6250                       u64 logical, u64 *length,
6251                       struct btrfs_bio **bbio_ret, int mirror_num)
6252 {
6253         if (op == BTRFS_MAP_DISCARD)
6254                 return __btrfs_map_block_for_discard(fs_info, logical,
6255                                                      length, bbio_ret);
6256
6257         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6258                                  mirror_num, 0);
6259 }
6260
6261 /* For Scrub/replace */
6262 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6263                      u64 logical, u64 *length,
6264                      struct btrfs_bio **bbio_ret)
6265 {
6266         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6267 }
6268
6269 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6270 {
6271         bio->bi_private = bbio->private;
6272         bio->bi_end_io = bbio->end_io;
6273         bio_endio(bio);
6274
6275         btrfs_put_bbio(bbio);
6276 }
6277
6278 static void btrfs_end_bio(struct bio *bio)
6279 {
6280         struct btrfs_bio *bbio = bio->bi_private;
6281         int is_orig_bio = 0;
6282
6283         if (bio->bi_status) {
6284                 atomic_inc(&bbio->error);
6285                 if (bio->bi_status == BLK_STS_IOERR ||
6286                     bio->bi_status == BLK_STS_TARGET) {
6287                         struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6288
6289                         ASSERT(dev->bdev);
6290                         if (bio_op(bio) == REQ_OP_WRITE)
6291                                 btrfs_dev_stat_inc_and_print(dev,
6292                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6293                         else if (!(bio->bi_opf & REQ_RAHEAD))
6294                                 btrfs_dev_stat_inc_and_print(dev,
6295                                                 BTRFS_DEV_STAT_READ_ERRS);
6296                         if (bio->bi_opf & REQ_PREFLUSH)
6297                                 btrfs_dev_stat_inc_and_print(dev,
6298                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6299                 }
6300         }
6301
6302         if (bio == bbio->orig_bio)
6303                 is_orig_bio = 1;
6304
6305         btrfs_bio_counter_dec(bbio->fs_info);
6306
6307         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6308                 if (!is_orig_bio) {
6309                         bio_put(bio);
6310                         bio = bbio->orig_bio;
6311                 }
6312
6313                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6314                 /* only send an error to the higher layers if it is
6315                  * beyond the tolerance of the btrfs bio
6316                  */
6317                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6318                         bio->bi_status = BLK_STS_IOERR;
6319                 } else {
6320                         /*
6321                          * this bio is actually up to date, we didn't
6322                          * go over the max number of errors
6323                          */
6324                         bio->bi_status = BLK_STS_OK;
6325                 }
6326
6327                 btrfs_end_bbio(bbio, bio);
6328         } else if (!is_orig_bio) {
6329                 bio_put(bio);
6330         }
6331 }
6332
6333 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6334                               u64 physical, struct btrfs_device *dev)
6335 {
6336         struct btrfs_fs_info *fs_info = bbio->fs_info;
6337
6338         bio->bi_private = bbio;
6339         btrfs_io_bio(bio)->device = dev;
6340         bio->bi_end_io = btrfs_end_bio;
6341         bio->bi_iter.bi_sector = physical >> 9;
6342         btrfs_debug_in_rcu(fs_info,
6343         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6344                 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6345                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6346                 dev->devid, bio->bi_iter.bi_size);
6347         bio_set_dev(bio, dev->bdev);
6348
6349         btrfs_bio_counter_inc_noblocked(fs_info);
6350
6351         btrfsic_submit_bio(bio);
6352 }
6353
6354 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6355 {
6356         atomic_inc(&bbio->error);
6357         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6358                 /* Should be the original bio. */
6359                 WARN_ON(bio != bbio->orig_bio);
6360
6361                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6362                 bio->bi_iter.bi_sector = logical >> 9;
6363                 if (atomic_read(&bbio->error) > bbio->max_errors)
6364                         bio->bi_status = BLK_STS_IOERR;
6365                 else
6366                         bio->bi_status = BLK_STS_OK;
6367                 btrfs_end_bbio(bbio, bio);
6368         }
6369 }
6370
6371 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6372                            int mirror_num)
6373 {
6374         struct btrfs_device *dev;
6375         struct bio *first_bio = bio;
6376         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6377         u64 length = 0;
6378         u64 map_length;
6379         int ret;
6380         int dev_nr;
6381         int total_devs;
6382         struct btrfs_bio *bbio = NULL;
6383
6384         length = bio->bi_iter.bi_size;
6385         map_length = length;
6386
6387         btrfs_bio_counter_inc_blocked(fs_info);
6388         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6389                                 &map_length, &bbio, mirror_num, 1);
6390         if (ret) {
6391                 btrfs_bio_counter_dec(fs_info);
6392                 return errno_to_blk_status(ret);
6393         }
6394
6395         total_devs = bbio->num_stripes;
6396         bbio->orig_bio = first_bio;
6397         bbio->private = first_bio->bi_private;
6398         bbio->end_io = first_bio->bi_end_io;
6399         bbio->fs_info = fs_info;
6400         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6401
6402         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6403             ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6404                 /* In this case, map_length has been set to the length of
6405                    a single stripe; not the whole write */
6406                 if (bio_op(bio) == REQ_OP_WRITE) {
6407                         ret = raid56_parity_write(fs_info, bio, bbio,
6408                                                   map_length);
6409                 } else {
6410                         ret = raid56_parity_recover(fs_info, bio, bbio,
6411                                                     map_length, mirror_num, 1);
6412                 }
6413
6414                 btrfs_bio_counter_dec(fs_info);
6415                 return errno_to_blk_status(ret);
6416         }
6417
6418         if (map_length < length) {
6419                 btrfs_crit(fs_info,
6420                            "mapping failed logical %llu bio len %llu len %llu",
6421                            logical, length, map_length);
6422                 BUG();
6423         }
6424
6425         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6426                 dev = bbio->stripes[dev_nr].dev;
6427                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6428                                                    &dev->dev_state) ||
6429                     (bio_op(first_bio) == REQ_OP_WRITE &&
6430                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6431                         bbio_error(bbio, first_bio, logical);
6432                         continue;
6433                 }
6434
6435                 if (dev_nr < total_devs - 1)
6436                         bio = btrfs_bio_clone(first_bio);
6437                 else
6438                         bio = first_bio;
6439
6440                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6441         }
6442         btrfs_bio_counter_dec(fs_info);
6443         return BLK_STS_OK;
6444 }
6445
6446 /*
6447  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6448  * return NULL.
6449  *
6450  * If devid and uuid are both specified, the match must be exact, otherwise
6451  * only devid is used.
6452  *
6453  * If @seed is true, traverse through the seed devices.
6454  */
6455 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6456                                        u64 devid, u8 *uuid, u8 *fsid,
6457                                        bool seed)
6458 {
6459         struct btrfs_device *device;
6460         struct btrfs_fs_devices *seed_devs;
6461
6462         if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6463                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6464                         if (device->devid == devid &&
6465                             (!uuid || memcmp(device->uuid, uuid,
6466                                              BTRFS_UUID_SIZE) == 0))
6467                                 return device;
6468                 }
6469         }
6470
6471         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6472                 if (!fsid ||
6473                     !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6474                         list_for_each_entry(device, &seed_devs->devices,
6475                                             dev_list) {
6476                                 if (device->devid == devid &&
6477                                     (!uuid || memcmp(device->uuid, uuid,
6478                                                      BTRFS_UUID_SIZE) == 0))
6479                                         return device;
6480                         }
6481                 }
6482         }
6483
6484         return NULL;
6485 }
6486
6487 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6488                                             u64 devid, u8 *dev_uuid)
6489 {
6490         struct btrfs_device *device;
6491         unsigned int nofs_flag;
6492
6493         /*
6494          * We call this under the chunk_mutex, so we want to use NOFS for this
6495          * allocation, however we don't want to change btrfs_alloc_device() to
6496          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6497          * places.
6498          */
6499         nofs_flag = memalloc_nofs_save();
6500         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6501         memalloc_nofs_restore(nofs_flag);
6502         if (IS_ERR(device))
6503                 return device;
6504
6505         list_add(&device->dev_list, &fs_devices->devices);
6506         device->fs_devices = fs_devices;
6507         fs_devices->num_devices++;
6508
6509         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6510         fs_devices->missing_devices++;
6511
6512         return device;
6513 }
6514
6515 /**
6516  * btrfs_alloc_device - allocate struct btrfs_device
6517  * @fs_info:    used only for generating a new devid, can be NULL if
6518  *              devid is provided (i.e. @devid != NULL).
6519  * @devid:      a pointer to devid for this device.  If NULL a new devid
6520  *              is generated.
6521  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6522  *              is generated.
6523  *
6524  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6525  * on error.  Returned struct is not linked onto any lists and must be
6526  * destroyed with btrfs_free_device.
6527  */
6528 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6529                                         const u64 *devid,
6530                                         const u8 *uuid)
6531 {
6532         struct btrfs_device *dev;
6533         u64 tmp;
6534
6535         if (WARN_ON(!devid && !fs_info))
6536                 return ERR_PTR(-EINVAL);
6537
6538         dev = __alloc_device(fs_info);
6539         if (IS_ERR(dev))
6540                 return dev;
6541
6542         if (devid)
6543                 tmp = *devid;
6544         else {
6545                 int ret;
6546
6547                 ret = find_next_devid(fs_info, &tmp);
6548                 if (ret) {
6549                         btrfs_free_device(dev);
6550                         return ERR_PTR(ret);
6551                 }
6552         }
6553         dev->devid = tmp;
6554
6555         if (uuid)
6556                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6557         else
6558                 generate_random_uuid(dev->uuid);
6559
6560         return dev;
6561 }
6562
6563 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6564                                         u64 devid, u8 *uuid, bool error)
6565 {
6566         if (error)
6567                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6568                               devid, uuid);
6569         else
6570                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6571                               devid, uuid);
6572 }
6573
6574 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6575 {
6576         int index = btrfs_bg_flags_to_raid_index(type);
6577         int ncopies = btrfs_raid_array[index].ncopies;
6578         const int nparity = btrfs_raid_array[index].nparity;
6579         int data_stripes;
6580
6581         if (nparity)
6582                 data_stripes = num_stripes - nparity;
6583         else
6584                 data_stripes = num_stripes / ncopies;
6585
6586         return div_u64(chunk_len, data_stripes);
6587 }
6588
6589 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6590                           struct btrfs_chunk *chunk)
6591 {
6592         struct btrfs_fs_info *fs_info = leaf->fs_info;
6593         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6594         struct map_lookup *map;
6595         struct extent_map *em;
6596         u64 logical;
6597         u64 length;
6598         u64 devid;
6599         u8 uuid[BTRFS_UUID_SIZE];
6600         int num_stripes;
6601         int ret;
6602         int i;
6603
6604         logical = key->offset;
6605         length = btrfs_chunk_length(leaf, chunk);
6606         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6607
6608         /*
6609          * Only need to verify chunk item if we're reading from sys chunk array,
6610          * as chunk item in tree block is already verified by tree-checker.
6611          */
6612         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6613                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6614                 if (ret)
6615                         return ret;
6616         }
6617
6618         read_lock(&map_tree->lock);
6619         em = lookup_extent_mapping(map_tree, logical, 1);
6620         read_unlock(&map_tree->lock);
6621
6622         /* already mapped? */
6623         if (em && em->start <= logical && em->start + em->len > logical) {
6624                 free_extent_map(em);
6625                 return 0;
6626         } else if (em) {
6627                 free_extent_map(em);
6628         }
6629
6630         em = alloc_extent_map();
6631         if (!em)
6632                 return -ENOMEM;
6633         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6634         if (!map) {
6635                 free_extent_map(em);
6636                 return -ENOMEM;
6637         }
6638
6639         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6640         em->map_lookup = map;
6641         em->start = logical;
6642         em->len = length;
6643         em->orig_start = 0;
6644         em->block_start = 0;
6645         em->block_len = em->len;
6646
6647         map->num_stripes = num_stripes;
6648         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6649         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6650         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6651         map->type = btrfs_chunk_type(leaf, chunk);
6652         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6653         map->verified_stripes = 0;
6654         em->orig_block_len = calc_stripe_length(map->type, em->len,
6655                                                 map->num_stripes);
6656         for (i = 0; i < num_stripes; i++) {
6657                 map->stripes[i].physical =
6658                         btrfs_stripe_offset_nr(leaf, chunk, i);
6659                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6660                 read_extent_buffer(leaf, uuid, (unsigned long)
6661                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6662                                    BTRFS_UUID_SIZE);
6663                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6664                                                         devid, uuid, NULL, true);
6665                 if (!map->stripes[i].dev &&
6666                     !btrfs_test_opt(fs_info, DEGRADED)) {
6667                         free_extent_map(em);
6668                         btrfs_report_missing_device(fs_info, devid, uuid, true);
6669                         return -ENOENT;
6670                 }
6671                 if (!map->stripes[i].dev) {
6672                         map->stripes[i].dev =
6673                                 add_missing_dev(fs_info->fs_devices, devid,
6674                                                 uuid);
6675                         if (IS_ERR(map->stripes[i].dev)) {
6676                                 free_extent_map(em);
6677                                 btrfs_err(fs_info,
6678                                         "failed to init missing dev %llu: %ld",
6679                                         devid, PTR_ERR(map->stripes[i].dev));
6680                                 return PTR_ERR(map->stripes[i].dev);
6681                         }
6682                         btrfs_report_missing_device(fs_info, devid, uuid, false);
6683                 }
6684                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6685                                 &(map->stripes[i].dev->dev_state));
6686
6687         }
6688
6689         write_lock(&map_tree->lock);
6690         ret = add_extent_mapping(map_tree, em, 0);
6691         write_unlock(&map_tree->lock);
6692         if (ret < 0) {
6693                 btrfs_err(fs_info,
6694                           "failed to add chunk map, start=%llu len=%llu: %d",
6695                           em->start, em->len, ret);
6696         }
6697         free_extent_map(em);
6698
6699         return ret;
6700 }
6701
6702 static void fill_device_from_item(struct extent_buffer *leaf,
6703                                  struct btrfs_dev_item *dev_item,
6704                                  struct btrfs_device *device)
6705 {
6706         unsigned long ptr;
6707
6708         device->devid = btrfs_device_id(leaf, dev_item);
6709         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6710         device->total_bytes = device->disk_total_bytes;
6711         device->commit_total_bytes = device->disk_total_bytes;
6712         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6713         device->commit_bytes_used = device->bytes_used;
6714         device->type = btrfs_device_type(leaf, dev_item);
6715         device->io_align = btrfs_device_io_align(leaf, dev_item);
6716         device->io_width = btrfs_device_io_width(leaf, dev_item);
6717         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6718         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6719         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6720
6721         ptr = btrfs_device_uuid(dev_item);
6722         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6723 }
6724
6725 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6726                                                   u8 *fsid)
6727 {
6728         struct btrfs_fs_devices *fs_devices;
6729         int ret;
6730
6731         lockdep_assert_held(&uuid_mutex);
6732         ASSERT(fsid);
6733
6734         /* This will match only for multi-device seed fs */
6735         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6736                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6737                         return fs_devices;
6738
6739
6740         fs_devices = find_fsid(fsid, NULL);
6741         if (!fs_devices) {
6742                 if (!btrfs_test_opt(fs_info, DEGRADED))
6743                         return ERR_PTR(-ENOENT);
6744
6745                 fs_devices = alloc_fs_devices(fsid, NULL);
6746                 if (IS_ERR(fs_devices))
6747                         return fs_devices;
6748
6749                 fs_devices->seeding = true;
6750                 fs_devices->opened = 1;
6751                 return fs_devices;
6752         }
6753
6754         /*
6755          * Upon first call for a seed fs fsid, just create a private copy of the
6756          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6757          */
6758         fs_devices = clone_fs_devices(fs_devices);
6759         if (IS_ERR(fs_devices))
6760                 return fs_devices;
6761
6762         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6763         if (ret) {
6764                 free_fs_devices(fs_devices);
6765                 return ERR_PTR(ret);
6766         }
6767
6768         if (!fs_devices->seeding) {
6769                 close_fs_devices(fs_devices);
6770                 free_fs_devices(fs_devices);
6771                 return ERR_PTR(-EINVAL);
6772         }
6773
6774         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6775
6776         return fs_devices;
6777 }
6778
6779 static int read_one_dev(struct extent_buffer *leaf,
6780                         struct btrfs_dev_item *dev_item)
6781 {
6782         struct btrfs_fs_info *fs_info = leaf->fs_info;
6783         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6784         struct btrfs_device *device;
6785         u64 devid;
6786         int ret;
6787         u8 fs_uuid[BTRFS_FSID_SIZE];
6788         u8 dev_uuid[BTRFS_UUID_SIZE];
6789
6790         devid = btrfs_device_id(leaf, dev_item);
6791         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6792                            BTRFS_UUID_SIZE);
6793         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6794                            BTRFS_FSID_SIZE);
6795
6796         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6797                 fs_devices = open_seed_devices(fs_info, fs_uuid);
6798                 if (IS_ERR(fs_devices))
6799                         return PTR_ERR(fs_devices);
6800         }
6801
6802         device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6803                                    fs_uuid, true);
6804         if (!device) {
6805                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6806                         btrfs_report_missing_device(fs_info, devid,
6807                                                         dev_uuid, true);
6808                         return -ENOENT;
6809                 }
6810
6811                 device = add_missing_dev(fs_devices, devid, dev_uuid);
6812                 if (IS_ERR(device)) {
6813                         btrfs_err(fs_info,
6814                                 "failed to add missing dev %llu: %ld",
6815                                 devid, PTR_ERR(device));
6816                         return PTR_ERR(device);
6817                 }
6818                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6819         } else {
6820                 if (!device->bdev) {
6821                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
6822                                 btrfs_report_missing_device(fs_info,
6823                                                 devid, dev_uuid, true);
6824                                 return -ENOENT;
6825                         }
6826                         btrfs_report_missing_device(fs_info, devid,
6827                                                         dev_uuid, false);
6828                 }
6829
6830                 if (!device->bdev &&
6831                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6832                         /*
6833                          * this happens when a device that was properly setup
6834                          * in the device info lists suddenly goes bad.
6835                          * device->bdev is NULL, and so we have to set
6836                          * device->missing to one here
6837                          */
6838                         device->fs_devices->missing_devices++;
6839                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6840                 }
6841
6842                 /* Move the device to its own fs_devices */
6843                 if (device->fs_devices != fs_devices) {
6844                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6845                                                         &device->dev_state));
6846
6847                         list_move(&device->dev_list, &fs_devices->devices);
6848                         device->fs_devices->num_devices--;
6849                         fs_devices->num_devices++;
6850
6851                         device->fs_devices->missing_devices--;
6852                         fs_devices->missing_devices++;
6853
6854                         device->fs_devices = fs_devices;
6855                 }
6856         }
6857
6858         if (device->fs_devices != fs_info->fs_devices) {
6859                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6860                 if (device->generation !=
6861                     btrfs_device_generation(leaf, dev_item))
6862                         return -EINVAL;
6863         }
6864
6865         fill_device_from_item(leaf, dev_item, device);
6866         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6867         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6868            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6869                 device->fs_devices->total_rw_bytes += device->total_bytes;
6870                 atomic64_add(device->total_bytes - device->bytes_used,
6871                                 &fs_info->free_chunk_space);
6872         }
6873         ret = 0;
6874         return ret;
6875 }
6876
6877 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6878 {
6879         struct btrfs_root *root = fs_info->tree_root;
6880         struct btrfs_super_block *super_copy = fs_info->super_copy;
6881         struct extent_buffer *sb;
6882         struct btrfs_disk_key *disk_key;
6883         struct btrfs_chunk *chunk;
6884         u8 *array_ptr;
6885         unsigned long sb_array_offset;
6886         int ret = 0;
6887         u32 num_stripes;
6888         u32 array_size;
6889         u32 len = 0;
6890         u32 cur_offset;
6891         u64 type;
6892         struct btrfs_key key;
6893
6894         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6895         /*
6896          * This will create extent buffer of nodesize, superblock size is
6897          * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6898          * overallocate but we can keep it as-is, only the first page is used.
6899          */
6900         sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6901         if (IS_ERR(sb))
6902                 return PTR_ERR(sb);
6903         set_extent_buffer_uptodate(sb);
6904         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6905         /*
6906          * The sb extent buffer is artificial and just used to read the system array.
6907          * set_extent_buffer_uptodate() call does not properly mark all it's
6908          * pages up-to-date when the page is larger: extent does not cover the
6909          * whole page and consequently check_page_uptodate does not find all
6910          * the page's extents up-to-date (the hole beyond sb),
6911          * write_extent_buffer then triggers a WARN_ON.
6912          *
6913          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6914          * but sb spans only this function. Add an explicit SetPageUptodate call
6915          * to silence the warning eg. on PowerPC 64.
6916          */
6917         if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6918                 SetPageUptodate(sb->pages[0]);
6919
6920         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6921         array_size = btrfs_super_sys_array_size(super_copy);
6922
6923         array_ptr = super_copy->sys_chunk_array;
6924         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6925         cur_offset = 0;
6926
6927         while (cur_offset < array_size) {
6928                 disk_key = (struct btrfs_disk_key *)array_ptr;
6929                 len = sizeof(*disk_key);
6930                 if (cur_offset + len > array_size)
6931                         goto out_short_read;
6932
6933                 btrfs_disk_key_to_cpu(&key, disk_key);
6934
6935                 array_ptr += len;
6936                 sb_array_offset += len;
6937                 cur_offset += len;
6938
6939                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
6940                         btrfs_err(fs_info,
6941                             "unexpected item type %u in sys_array at offset %u",
6942                                   (u32)key.type, cur_offset);
6943                         ret = -EIO;
6944                         break;
6945                 }
6946
6947                 chunk = (struct btrfs_chunk *)sb_array_offset;
6948                 /*
6949                  * At least one btrfs_chunk with one stripe must be present,
6950                  * exact stripe count check comes afterwards
6951                  */
6952                 len = btrfs_chunk_item_size(1);
6953                 if (cur_offset + len > array_size)
6954                         goto out_short_read;
6955
6956                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6957                 if (!num_stripes) {
6958                         btrfs_err(fs_info,
6959                         "invalid number of stripes %u in sys_array at offset %u",
6960                                   num_stripes, cur_offset);
6961                         ret = -EIO;
6962                         break;
6963                 }
6964
6965                 type = btrfs_chunk_type(sb, chunk);
6966                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6967                         btrfs_err(fs_info,
6968                         "invalid chunk type %llu in sys_array at offset %u",
6969                                   type, cur_offset);
6970                         ret = -EIO;
6971                         break;
6972                 }
6973
6974                 len = btrfs_chunk_item_size(num_stripes);
6975                 if (cur_offset + len > array_size)
6976                         goto out_short_read;
6977
6978                 ret = read_one_chunk(&key, sb, chunk);
6979                 if (ret)
6980                         break;
6981
6982                 array_ptr += len;
6983                 sb_array_offset += len;
6984                 cur_offset += len;
6985         }
6986         clear_extent_buffer_uptodate(sb);
6987         free_extent_buffer_stale(sb);
6988         return ret;
6989
6990 out_short_read:
6991         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
6992                         len, cur_offset);
6993         clear_extent_buffer_uptodate(sb);
6994         free_extent_buffer_stale(sb);
6995         return -EIO;
6996 }
6997
6998 /*
6999  * Check if all chunks in the fs are OK for read-write degraded mount
7000  *
7001  * If the @failing_dev is specified, it's accounted as missing.
7002  *
7003  * Return true if all chunks meet the minimal RW mount requirements.
7004  * Return false if any chunk doesn't meet the minimal RW mount requirements.
7005  */
7006 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7007                                         struct btrfs_device *failing_dev)
7008 {
7009         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7010         struct extent_map *em;
7011         u64 next_start = 0;
7012         bool ret = true;
7013
7014         read_lock(&map_tree->lock);
7015         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7016         read_unlock(&map_tree->lock);
7017         /* No chunk at all? Return false anyway */
7018         if (!em) {
7019                 ret = false;
7020                 goto out;
7021         }
7022         while (em) {
7023                 struct map_lookup *map;
7024                 int missing = 0;
7025                 int max_tolerated;
7026                 int i;
7027
7028                 map = em->map_lookup;
7029                 max_tolerated =
7030                         btrfs_get_num_tolerated_disk_barrier_failures(
7031                                         map->type);
7032                 for (i = 0; i < map->num_stripes; i++) {
7033                         struct btrfs_device *dev = map->stripes[i].dev;
7034
7035                         if (!dev || !dev->bdev ||
7036                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7037                             dev->last_flush_error)
7038                                 missing++;
7039                         else if (failing_dev && failing_dev == dev)
7040                                 missing++;
7041                 }
7042                 if (missing > max_tolerated) {
7043                         if (!failing_dev)
7044                                 btrfs_warn(fs_info,
7045         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7046                                    em->start, missing, max_tolerated);
7047                         free_extent_map(em);
7048                         ret = false;
7049                         goto out;
7050                 }
7051                 next_start = extent_map_end(em);
7052                 free_extent_map(em);
7053
7054                 read_lock(&map_tree->lock);
7055                 em = lookup_extent_mapping(map_tree, next_start,
7056                                            (u64)(-1) - next_start);
7057                 read_unlock(&map_tree->lock);
7058         }
7059 out:
7060         return ret;
7061 }
7062
7063 static void readahead_tree_node_children(struct extent_buffer *node)
7064 {
7065         int i;
7066         const int nr_items = btrfs_header_nritems(node);
7067
7068         for (i = 0; i < nr_items; i++) {
7069                 u64 start;
7070
7071                 start = btrfs_node_blockptr(node, i);
7072                 readahead_tree_block(node->fs_info, start);
7073         }
7074 }
7075
7076 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7077 {
7078         struct btrfs_root *root = fs_info->chunk_root;
7079         struct btrfs_path *path;
7080         struct extent_buffer *leaf;
7081         struct btrfs_key key;
7082         struct btrfs_key found_key;
7083         int ret;
7084         int slot;
7085         u64 total_dev = 0;
7086         u64 last_ra_node = 0;
7087
7088         path = btrfs_alloc_path();
7089         if (!path)
7090                 return -ENOMEM;
7091
7092         /*
7093          * uuid_mutex is needed only if we are mounting a sprout FS
7094          * otherwise we don't need it.
7095          */
7096         mutex_lock(&uuid_mutex);
7097
7098         /*
7099          * It is possible for mount and umount to race in such a way that
7100          * we execute this code path, but open_fs_devices failed to clear
7101          * total_rw_bytes. We certainly want it cleared before reading the
7102          * device items, so clear it here.
7103          */
7104         fs_info->fs_devices->total_rw_bytes = 0;
7105
7106         /*
7107          * Read all device items, and then all the chunk items. All
7108          * device items are found before any chunk item (their object id
7109          * is smaller than the lowest possible object id for a chunk
7110          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7111          */
7112         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7113         key.offset = 0;
7114         key.type = 0;
7115         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7116         if (ret < 0)
7117                 goto error;
7118         while (1) {
7119                 struct extent_buffer *node;
7120
7121                 leaf = path->nodes[0];
7122                 slot = path->slots[0];
7123                 if (slot >= btrfs_header_nritems(leaf)) {
7124                         ret = btrfs_next_leaf(root, path);
7125                         if (ret == 0)
7126                                 continue;
7127                         if (ret < 0)
7128                                 goto error;
7129                         break;
7130                 }
7131                 /*
7132                  * The nodes on level 1 are not locked but we don't need to do
7133                  * that during mount time as nothing else can access the tree
7134                  */
7135                 node = path->nodes[1];
7136                 if (node) {
7137                         if (last_ra_node != node->start) {
7138                                 readahead_tree_node_children(node);
7139                                 last_ra_node = node->start;
7140                         }
7141                 }
7142                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7143                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7144                         struct btrfs_dev_item *dev_item;
7145                         dev_item = btrfs_item_ptr(leaf, slot,
7146                                                   struct btrfs_dev_item);
7147                         ret = read_one_dev(leaf, dev_item);
7148                         if (ret)
7149                                 goto error;
7150                         total_dev++;
7151                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7152                         struct btrfs_chunk *chunk;
7153                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7154                         mutex_lock(&fs_info->chunk_mutex);
7155                         ret = read_one_chunk(&found_key, leaf, chunk);
7156                         mutex_unlock(&fs_info->chunk_mutex);
7157                         if (ret)
7158                                 goto error;
7159                 }
7160                 path->slots[0]++;
7161         }
7162
7163         /*
7164          * After loading chunk tree, we've got all device information,
7165          * do another round of validation checks.
7166          */
7167         if (total_dev != fs_info->fs_devices->total_devices) {
7168                 btrfs_err(fs_info,
7169            "super_num_devices %llu mismatch with num_devices %llu found here",
7170                           btrfs_super_num_devices(fs_info->super_copy),
7171                           total_dev);
7172                 ret = -EINVAL;
7173                 goto error;
7174         }
7175         if (btrfs_super_total_bytes(fs_info->super_copy) <
7176             fs_info->fs_devices->total_rw_bytes) {
7177                 btrfs_err(fs_info,
7178         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7179                           btrfs_super_total_bytes(fs_info->super_copy),
7180                           fs_info->fs_devices->total_rw_bytes);
7181                 ret = -EINVAL;
7182                 goto error;
7183         }
7184         ret = 0;
7185 error:
7186         mutex_unlock(&uuid_mutex);
7187
7188         btrfs_free_path(path);
7189         return ret;
7190 }
7191
7192 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7193 {
7194         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7195         struct btrfs_device *device;
7196
7197         fs_devices->fs_info = fs_info;
7198
7199         mutex_lock(&fs_devices->device_list_mutex);
7200         list_for_each_entry(device, &fs_devices->devices, dev_list)
7201                 device->fs_info = fs_info;
7202
7203         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7204                 list_for_each_entry(device, &seed_devs->devices, dev_list)
7205                         device->fs_info = fs_info;
7206
7207                 seed_devs->fs_info = fs_info;
7208         }
7209         mutex_unlock(&fs_devices->device_list_mutex);
7210 }
7211
7212 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7213                                  const struct btrfs_dev_stats_item *ptr,
7214                                  int index)
7215 {
7216         u64 val;
7217
7218         read_extent_buffer(eb, &val,
7219                            offsetof(struct btrfs_dev_stats_item, values) +
7220                             ((unsigned long)ptr) + (index * sizeof(u64)),
7221                            sizeof(val));
7222         return val;
7223 }
7224
7225 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7226                                       struct btrfs_dev_stats_item *ptr,
7227                                       int index, u64 val)
7228 {
7229         write_extent_buffer(eb, &val,
7230                             offsetof(struct btrfs_dev_stats_item, values) +
7231                              ((unsigned long)ptr) + (index * sizeof(u64)),
7232                             sizeof(val));
7233 }
7234
7235 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7236                                        struct btrfs_path *path)
7237 {
7238         struct btrfs_dev_stats_item *ptr;
7239         struct extent_buffer *eb;
7240         struct btrfs_key key;
7241         int item_size;
7242         int i, ret, slot;
7243
7244         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7245         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7246         key.offset = device->devid;
7247         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7248         if (ret) {
7249                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7250                         btrfs_dev_stat_set(device, i, 0);
7251                 device->dev_stats_valid = 1;
7252                 btrfs_release_path(path);
7253                 return ret < 0 ? ret : 0;
7254         }
7255         slot = path->slots[0];
7256         eb = path->nodes[0];
7257         item_size = btrfs_item_size_nr(eb, slot);
7258
7259         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7260
7261         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7262                 if (item_size >= (1 + i) * sizeof(__le64))
7263                         btrfs_dev_stat_set(device, i,
7264                                            btrfs_dev_stats_value(eb, ptr, i));
7265                 else
7266                         btrfs_dev_stat_set(device, i, 0);
7267         }
7268
7269         device->dev_stats_valid = 1;
7270         btrfs_dev_stat_print_on_load(device);
7271         btrfs_release_path(path);
7272
7273         return 0;
7274 }
7275
7276 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7277 {
7278         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7279         struct btrfs_device *device;
7280         struct btrfs_path *path = NULL;
7281         int ret = 0;
7282
7283         path = btrfs_alloc_path();
7284         if (!path)
7285                 return -ENOMEM;
7286
7287         mutex_lock(&fs_devices->device_list_mutex);
7288         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7289                 ret = btrfs_device_init_dev_stats(device, path);
7290                 if (ret)
7291                         goto out;
7292         }
7293         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7294                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7295                         ret = btrfs_device_init_dev_stats(device, path);
7296                         if (ret)
7297                                 goto out;
7298                 }
7299         }
7300 out:
7301         mutex_unlock(&fs_devices->device_list_mutex);
7302
7303         btrfs_free_path(path);
7304         return ret;
7305 }
7306
7307 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7308                                 struct btrfs_device *device)
7309 {
7310         struct btrfs_fs_info *fs_info = trans->fs_info;
7311         struct btrfs_root *dev_root = fs_info->dev_root;
7312         struct btrfs_path *path;
7313         struct btrfs_key key;
7314         struct extent_buffer *eb;
7315         struct btrfs_dev_stats_item *ptr;
7316         int ret;
7317         int i;
7318
7319         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7320         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7321         key.offset = device->devid;
7322
7323         path = btrfs_alloc_path();
7324         if (!path)
7325                 return -ENOMEM;
7326         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7327         if (ret < 0) {
7328                 btrfs_warn_in_rcu(fs_info,
7329                         "error %d while searching for dev_stats item for device %s",
7330                               ret, rcu_str_deref(device->name));
7331                 goto out;
7332         }
7333
7334         if (ret == 0 &&
7335             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7336                 /* need to delete old one and insert a new one */
7337                 ret = btrfs_del_item(trans, dev_root, path);
7338                 if (ret != 0) {
7339                         btrfs_warn_in_rcu(fs_info,
7340                                 "delete too small dev_stats item for device %s failed %d",
7341                                       rcu_str_deref(device->name), ret);
7342                         goto out;
7343                 }
7344                 ret = 1;
7345         }
7346
7347         if (ret == 1) {
7348                 /* need to insert a new item */
7349                 btrfs_release_path(path);
7350                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7351                                               &key, sizeof(*ptr));
7352                 if (ret < 0) {
7353                         btrfs_warn_in_rcu(fs_info,
7354                                 "insert dev_stats item for device %s failed %d",
7355                                 rcu_str_deref(device->name), ret);
7356                         goto out;
7357                 }
7358         }
7359
7360         eb = path->nodes[0];
7361         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7362         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7363                 btrfs_set_dev_stats_value(eb, ptr, i,
7364                                           btrfs_dev_stat_read(device, i));
7365         btrfs_mark_buffer_dirty(eb);
7366
7367 out:
7368         btrfs_free_path(path);
7369         return ret;
7370 }
7371
7372 /*
7373  * called from commit_transaction. Writes all changed device stats to disk.
7374  */
7375 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7376 {
7377         struct btrfs_fs_info *fs_info = trans->fs_info;
7378         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7379         struct btrfs_device *device;
7380         int stats_cnt;
7381         int ret = 0;
7382
7383         mutex_lock(&fs_devices->device_list_mutex);
7384         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7385                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7386                 if (!device->dev_stats_valid || stats_cnt == 0)
7387                         continue;
7388
7389
7390                 /*
7391                  * There is a LOAD-LOAD control dependency between the value of
7392                  * dev_stats_ccnt and updating the on-disk values which requires
7393                  * reading the in-memory counters. Such control dependencies
7394                  * require explicit read memory barriers.
7395                  *
7396                  * This memory barriers pairs with smp_mb__before_atomic in
7397                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7398                  * barrier implied by atomic_xchg in
7399                  * btrfs_dev_stats_read_and_reset
7400                  */
7401                 smp_rmb();
7402
7403                 ret = update_dev_stat_item(trans, device);
7404                 if (!ret)
7405                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7406         }
7407         mutex_unlock(&fs_devices->device_list_mutex);
7408
7409         return ret;
7410 }
7411
/*
 * Call btrfs_dev_stat_inc() for statistic @index of @dev, then report the
 * device's error counters through btrfs_dev_stat_print_on_error().
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);

        /* Prints nothing while the device's stats are not yet valid */
        btrfs_dev_stat_print_on_error(dev);
}
7417
7418 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7419 {
7420         if (!dev->dev_stats_valid)
7421                 return;
7422         btrfs_err_rl_in_rcu(dev->fs_info,
7423                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7424                            rcu_str_deref(dev->name),
7425                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7426                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7427                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7428                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7429                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7430 }
7431
7432 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7433 {
7434         int i;
7435
7436         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7437                 if (btrfs_dev_stat_read(dev, i) != 0)
7438                         break;
7439         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7440                 return; /* all values == 0, suppress message */
7441
7442         btrfs_info_in_rcu(dev->fs_info,
7443                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7444                rcu_str_deref(dev->name),
7445                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7446                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7447                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7448                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7449                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7450 }
7451
7452 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7453                         struct btrfs_ioctl_get_dev_stats *stats)
7454 {
7455         struct btrfs_device *dev;
7456         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7457         int i;
7458
7459         mutex_lock(&fs_devices->device_list_mutex);
7460         dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7461                                 true);
7462         mutex_unlock(&fs_devices->device_list_mutex);
7463
7464         if (!dev) {
7465                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7466                 return -ENODEV;
7467         } else if (!dev->dev_stats_valid) {
7468                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7469                 return -ENODEV;
7470         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7471                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7472                         if (stats->nr_items > i)
7473                                 stats->values[i] =
7474                                         btrfs_dev_stat_read_and_reset(dev, i);
7475                         else
7476                                 btrfs_dev_stat_set(dev, i, 0);
7477                 }
7478                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7479                            current->comm, task_pid_nr(current));
7480         } else {
7481                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7482                         if (stats->nr_items > i)
7483                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7484         }
7485         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7486                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7487         return 0;
7488 }
7489
7490 /*
7491  * Update the size and bytes used for each device where it changed.  This is
7492  * delayed since we would otherwise get errors while writing out the
7493  * superblocks.
7494  *
7495  * Must be invoked during transaction commit.
7496  */
7497 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7498 {
7499         struct btrfs_device *curr, *next;
7500
7501         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7502
7503         if (list_empty(&trans->dev_update_list))
7504                 return;
7505
7506         /*
7507          * We don't need the device_list_mutex here.  This list is owned by the
7508          * transaction and the transaction must complete before the device is
7509          * released.
7510          */
7511         mutex_lock(&trans->fs_info->chunk_mutex);
7512         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7513                                  post_commit_list) {
7514                 list_del_init(&curr->post_commit_list);
7515                 curr->commit_total_bytes = curr->disk_total_bytes;
7516                 curr->commit_bytes_used = curr->bytes_used;
7517         }
7518         mutex_unlock(&trans->fs_info->chunk_mutex);
7519 }
7520
7521 /*
7522  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7523  */
7524 int btrfs_bg_type_to_factor(u64 flags)
7525 {
7526         const int index = btrfs_bg_flags_to_raid_index(flags);
7527
7528         return btrfs_raid_array[index].ncopies;
7529 }
7530
7531
7532
7533 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7534                                  u64 chunk_offset, u64 devid,
7535                                  u64 physical_offset, u64 physical_len)
7536 {
7537         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7538         struct extent_map *em;
7539         struct map_lookup *map;
7540         struct btrfs_device *dev;
7541         u64 stripe_len;
7542         bool found = false;
7543         int ret = 0;
7544         int i;
7545
7546         read_lock(&em_tree->lock);
7547         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7548         read_unlock(&em_tree->lock);
7549
7550         if (!em) {
7551                 btrfs_err(fs_info,
7552 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7553                           physical_offset, devid);
7554                 ret = -EUCLEAN;
7555                 goto out;
7556         }
7557
7558         map = em->map_lookup;
7559         stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7560         if (physical_len != stripe_len) {
7561                 btrfs_err(fs_info,
7562 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7563                           physical_offset, devid, em->start, physical_len,
7564                           stripe_len);
7565                 ret = -EUCLEAN;
7566                 goto out;
7567         }
7568
7569         for (i = 0; i < map->num_stripes; i++) {
7570                 if (map->stripes[i].dev->devid == devid &&
7571                     map->stripes[i].physical == physical_offset) {
7572                         found = true;
7573                         if (map->verified_stripes >= map->num_stripes) {
7574                                 btrfs_err(fs_info,
7575                                 "too many dev extents for chunk %llu found",
7576                                           em->start);
7577                                 ret = -EUCLEAN;
7578                                 goto out;
7579                         }
7580                         map->verified_stripes++;
7581                         break;
7582                 }
7583         }
7584         if (!found) {
7585                 btrfs_err(fs_info,
7586         "dev extent physical offset %llu devid %llu has no corresponding chunk",
7587                         physical_offset, devid);
7588                 ret = -EUCLEAN;
7589         }
7590
7591         /* Make sure no dev extent is beyond device bondary */
7592         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7593         if (!dev) {
7594                 btrfs_err(fs_info, "failed to find devid %llu", devid);
7595                 ret = -EUCLEAN;
7596                 goto out;
7597         }
7598
7599         /* It's possible this device is a dummy for seed device */
7600         if (dev->disk_total_bytes == 0) {
7601                 struct btrfs_fs_devices *devs;
7602
7603                 devs = list_first_entry(&fs_info->fs_devices->seed_list,
7604                                         struct btrfs_fs_devices, seed_list);
7605                 dev = btrfs_find_device(devs, devid, NULL, NULL, false);
7606                 if (!dev) {
7607                         btrfs_err(fs_info, "failed to find seed devid %llu",
7608                                   devid);
7609                         ret = -EUCLEAN;
7610                         goto out;
7611                 }
7612         }
7613
7614         if (physical_offset + physical_len > dev->disk_total_bytes) {
7615                 btrfs_err(fs_info,
7616 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7617                           devid, physical_offset, physical_len,
7618                           dev->disk_total_bytes);
7619                 ret = -EUCLEAN;
7620                 goto out;
7621         }
7622 out:
7623         free_extent_map(em);
7624         return ret;
7625 }
7626
7627 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7628 {
7629         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7630         struct extent_map *em;
7631         struct rb_node *node;
7632         int ret = 0;
7633
7634         read_lock(&em_tree->lock);
7635         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7636                 em = rb_entry(node, struct extent_map, rb_node);
7637                 if (em->map_lookup->num_stripes !=
7638                     em->map_lookup->verified_stripes) {
7639                         btrfs_err(fs_info,
7640                         "chunk %llu has missing dev extent, have %d expect %d",
7641                                   em->start, em->map_lookup->verified_stripes,
7642                                   em->map_lookup->num_stripes);
7643                         ret = -EUCLEAN;
7644                         goto out;
7645                 }
7646         }
7647 out:
7648         read_unlock(&em_tree->lock);
7649         return ret;
7650 }
7651
7652 /*
7653  * Ensure that all dev extents are mapped to correct chunk, otherwise
7654  * later chunk allocation/free would cause unexpected behavior.
7655  *
7656  * NOTE: This will iterate through the whole device tree, which should be of
7657  * the same size level as the chunk tree.  This slightly increases mount time.
7658  */
7659 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7660 {
7661         struct btrfs_path *path;
7662         struct btrfs_root *root = fs_info->dev_root;
7663         struct btrfs_key key;
7664         u64 prev_devid = 0;
7665         u64 prev_dev_ext_end = 0;
7666         int ret = 0;
7667
7668         key.objectid = 1;
7669         key.type = BTRFS_DEV_EXTENT_KEY;
7670         key.offset = 0;
7671
7672         path = btrfs_alloc_path();
7673         if (!path)
7674                 return -ENOMEM;
7675
7676         path->reada = READA_FORWARD;
7677         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7678         if (ret < 0)
7679                 goto out;
7680
7681         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7682                 ret = btrfs_next_item(root, path);
7683                 if (ret < 0)
7684                         goto out;
7685                 /* No dev extents at all? Not good */
7686                 if (ret > 0) {
7687                         ret = -EUCLEAN;
7688                         goto out;
7689                 }
7690         }
7691         while (1) {
7692                 struct extent_buffer *leaf = path->nodes[0];
7693                 struct btrfs_dev_extent *dext;
7694                 int slot = path->slots[0];
7695                 u64 chunk_offset;
7696                 u64 physical_offset;
7697                 u64 physical_len;
7698                 u64 devid;
7699
7700                 btrfs_item_key_to_cpu(leaf, &key, slot);
7701                 if (key.type != BTRFS_DEV_EXTENT_KEY)
7702                         break;
7703                 devid = key.objectid;
7704                 physical_offset = key.offset;
7705
7706                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7707                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7708                 physical_len = btrfs_dev_extent_length(leaf, dext);
7709
7710                 /* Check if this dev extent overlaps with the previous one */
7711                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7712                         btrfs_err(fs_info,
7713 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7714                                   devid, physical_offset, prev_dev_ext_end);
7715                         ret = -EUCLEAN;
7716                         goto out;
7717                 }
7718
7719                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7720                                             physical_offset, physical_len);
7721                 if (ret < 0)
7722                         goto out;
7723                 prev_devid = devid;
7724                 prev_dev_ext_end = physical_offset + physical_len;
7725
7726                 ret = btrfs_next_item(root, path);
7727                 if (ret < 0)
7728                         goto out;
7729                 if (ret > 0) {
7730                         ret = 0;
7731                         break;
7732                 }
7733         }
7734
7735         /* Ensure all chunks have corresponding dev extents */
7736         ret = verify_chunk_dev_extent_mapping(fs_info);
7737 out:
7738         btrfs_free_path(path);
7739         return ret;
7740 }
7741
7742 /*
7743  * Check whether the given block group or device is pinned by any inode being
7744  * used as a swapfile.
7745  */
7746 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7747 {
7748         struct btrfs_swapfile_pin *sp;
7749         struct rb_node *node;
7750
7751         spin_lock(&fs_info->swapfile_pins_lock);
7752         node = fs_info->swapfile_pins.rb_node;
7753         while (node) {
7754                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7755                 if (ptr < sp->ptr)
7756                         node = node->rb_left;
7757                 else if (ptr > sp->ptr)
7758                         node = node->rb_right;
7759                 else
7760                         break;
7761         }
7762         spin_unlock(&fs_info->swapfile_pins_lock);
7763         return node != NULL;
7764 }