git.proxmox.com Git - mirror_ubuntu-kernels.git/commitdiff
bcachefs: Use KEY_TYPE_deleted whiteouts for extents
author Kent Overstreet <kent.overstreet@gmail.com>
Tue, 26 Nov 2019 22:26:04 +0000 (17:26 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:33 +0000 (17:08 -0400)
Previously, partial overwrites of existing extents were handled
implicitly by the btree code; when reading in a btree node, we'd do a
mergesort of the different bsets and detect and fix partially
overlapping extents during that mergesort.

That approach won't work with snapshots: this changes extents to work
like regular keys as far as the btree code is concerned, where a 0 size
KEY_TYPE_deleted whiteout will completely overwrite an existing extent.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/bkey_sort.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/extent_update.c
fs/bcachefs/recovery.c

index 535ba27883159c08710e10d326e279ab8409009d..0a623ed3caa644ebaab61662add71af8449567a8 100644 (file)
@@ -1286,6 +1286,7 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3],  0, 16);
  * reflink:                    gates KEY_TYPE_reflink
  * inline_data:                        gates KEY_TYPE_inline_data
  * new_siphash:                        gates BCH_STR_HASH_SIPHASH
+ * new_extent_overwrite:       gates BTREE_NODE_NEW_EXTENT_OVERWRITE
  */
 #define BCH_SB_FEATURES()                      \
        x(lz4,                          0)      \
@@ -1296,7 +1297,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3],  0, 16);
        x(journal_seq_blacklist_v3,     5)      \
        x(reflink,                      6)      \
        x(new_siphash,                  7)      \
-       x(inline_data,                  8)
+       x(inline_data,                  8)      \
+       x(new_extent_overwrite,         9)
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1620,7 +1622,9 @@ struct btree_node {
 
 LE64_BITMASK(BTREE_NODE_ID,    struct btree_node, flags,  0,  4);
 LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags,  4,  8);
-/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+                               struct btree_node, flags,  8,  9);
+/* 9-32 unused */
 LE64_BITMASK(BTREE_NODE_SEQ,   struct btree_node, flags, 32, 64);
 
 struct btree_node_entry {
index 23b51ef57303e91cdf56ca1f39bb3092d50f49e7..18f842012f05ab1afe57a630c5c2c898c281669a 100644 (file)
@@ -130,24 +130,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        return nr;
 }
 
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
-                                                 struct bkey_packed *l,
-                                                 struct bkey_packed *r)
-{
-       struct bkey ul = bkey_unpack_key(b, l);
-       struct bkey ur = bkey_unpack_key(b, r);
-
-       return bkey_cmp(bkey_start_pos(&ul),
-                       bkey_start_pos(&ur)) ?:
-               cmp_int((unsigned long) r, (unsigned long) l);
-}
-
 static void extent_sort_advance_prev(struct bkey_format *f,
                                     struct btree_nr_keys *nr,
                                     struct bkey_packed *start,
@@ -188,102 +170,6 @@ static void extent_sort_append(struct bch_fs *c,
        bkey_reassemble((void *) *prev, k.s_c);
 }
 
-struct btree_nr_keys
-bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
-                                struct sort_iter *iter)
-{
-       struct btree *b = iter->b;
-       struct bkey_format *f = &b->format;
-       struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
-       struct bkey_packed *prev = NULL;
-       struct bkey l_unpacked, r_unpacked;
-       struct bkey_s l, r;
-       struct btree_nr_keys nr;
-       struct bkey_on_stack split;
-
-       memset(&nr, 0, sizeof(nr));
-       bkey_on_stack_init(&split);
-
-       sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
-
-       while (!sort_iter_end(iter)) {
-               l = __bkey_disassemble(b, _l->k, &l_unpacked);
-
-               if (iter->used == 1) {
-                       extent_sort_append(c, f, &nr, dst->start, &prev, l);
-                       sort_iter_advance(iter,
-                                         extent_sort_fix_overlapping_cmp);
-                       continue;
-               }
-
-               r = __bkey_disassemble(b, _r->k, &r_unpacked);
-
-               /* If current key and next key don't overlap, just append */
-               if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
-                       extent_sort_append(c, f, &nr, dst->start, &prev, l);
-                       sort_iter_advance(iter,
-                                         extent_sort_fix_overlapping_cmp);
-                       continue;
-               }
-
-               /* Skip 0 size keys */
-               if (!r.k->size) {
-                       __sort_iter_advance(iter, 1,
-                                           extent_sort_fix_overlapping_cmp);
-                       continue;
-               }
-
-               /*
-                * overlap: keep the newer key and trim the older key so they
-                * don't overlap. comparing pointers tells us which one is
-                * newer, since the bsets are appended one after the other.
-                */
-
-               /* can't happen because of comparison func */
-               BUG_ON(_l->k < _r->k &&
-                      !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
-               if (_l->k > _r->k) {
-                       /* l wins, trim r */
-                       if (bkey_cmp(l.k->p, r.k->p) >= 0) {
-                               __sort_iter_advance(iter, 1,
-                                        extent_sort_fix_overlapping_cmp);
-                       } else {
-                               bch2_cut_front_s(l.k->p, r);
-                               extent_save(b, _r->k, r.k);
-                               __sort_iter_sift(iter, 1,
-                                        extent_sort_fix_overlapping_cmp);
-                       }
-               } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-
-                       /*
-                        * r wins, but it overlaps in the middle of l - split l:
-                        */
-                       bkey_on_stack_reassemble(&split, c, l.s_c);
-                       bch2_cut_back(bkey_start_pos(r.k), split.k);
-
-                       bch2_cut_front_s(r.k->p, l);
-                       extent_save(b, _l->k, l.k);
-
-                       __sort_iter_sift(iter, 0,
-                                        extent_sort_fix_overlapping_cmp);
-
-                       extent_sort_append(c, f, &nr, dst->start,
-                                          &prev, bkey_i_to_s(split.k));
-               } else {
-                       bch2_cut_back_s(bkey_start_pos(r.k), l);
-                       extent_save(b, _l->k, l.k);
-               }
-       }
-
-       extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
-       dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
-
-       bkey_on_stack_exit(&split, c);
-       return nr;
-}
-
 /* Sort + repack in a new format: */
 struct btree_nr_keys
 bch2_sort_repack(struct bset *dst, struct btree *src,
@@ -354,7 +240,7 @@ static inline int sort_keys_cmp(struct btree *b,
                                struct bkey_packed *r)
 {
        return bkey_cmp_packed(b, l, r) ?:
-               (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+               (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
                (int) l->needs_whiteout - (int) r->needs_whiteout;
 }
 
@@ -399,6 +285,122 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
        return (u64 *) out - (u64 *) dst;
 }
 
+/* Compat code for btree_node_old_extent_overwrite: */
+
+/*
+ * If keys compare equal, compare by pointer order:
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ */
+static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
+                                                 struct bkey_packed *l,
+                                                 struct bkey_packed *r)
+{
+       struct bkey ul = bkey_unpack_key(b, l);
+       struct bkey ur = bkey_unpack_key(b, r);
+
+       return bkey_cmp(bkey_start_pos(&ul),
+                       bkey_start_pos(&ur)) ?:
+               cmp_int((unsigned long) r, (unsigned long) l);
+}
+
+struct btree_nr_keys
+bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+                                struct sort_iter *iter)
+{
+       struct btree *b = iter->b;
+       struct bkey_format *f = &b->format;
+       struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
+       struct bkey_packed *prev = NULL;
+       struct bkey l_unpacked, r_unpacked;
+       struct bkey_s l, r;
+       struct btree_nr_keys nr;
+       struct bkey_on_stack split;
+
+       memset(&nr, 0, sizeof(nr));
+       bkey_on_stack_init(&split);
+
+       sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
+
+       while (!sort_iter_end(iter)) {
+               l = __bkey_disassemble(b, _l->k, &l_unpacked);
+
+               if (iter->used == 1) {
+                       extent_sort_append(c, f, &nr, dst->start, &prev, l);
+                       sort_iter_advance(iter,
+                                         extent_sort_fix_overlapping_cmp);
+                       continue;
+               }
+
+               r = __bkey_disassemble(b, _r->k, &r_unpacked);
+
+               /* If current key and next key don't overlap, just append */
+               if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+                       extent_sort_append(c, f, &nr, dst->start, &prev, l);
+                       sort_iter_advance(iter,
+                                         extent_sort_fix_overlapping_cmp);
+                       continue;
+               }
+
+               /* Skip 0 size keys */
+               if (!r.k->size) {
+                       __sort_iter_advance(iter, 1,
+                                           extent_sort_fix_overlapping_cmp);
+                       continue;
+               }
+
+               /*
+                * overlap: keep the newer key and trim the older key so they
+                * don't overlap. comparing pointers tells us which one is
+                * newer, since the bsets are appended one after the other.
+                */
+
+               /* can't happen because of comparison func */
+               BUG_ON(_l->k < _r->k &&
+                      !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+               if (_l->k > _r->k) {
+                       /* l wins, trim r */
+                       if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+                               __sort_iter_advance(iter, 1,
+                                        extent_sort_fix_overlapping_cmp);
+                       } else {
+                               bch2_cut_front_s(l.k->p, r);
+                               extent_save(b, _r->k, r.k);
+                               __sort_iter_sift(iter, 1,
+                                        extent_sort_fix_overlapping_cmp);
+                       }
+               } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+
+                       /*
+                        * r wins, but it overlaps in the middle of l - split l:
+                        */
+                       bkey_on_stack_reassemble(&split, c, l.s_c);
+                       bch2_cut_back(bkey_start_pos(r.k), split.k);
+
+                       bch2_cut_front_s(r.k->p, l);
+                       extent_save(b, _l->k, l.k);
+
+                       __sort_iter_sift(iter, 0,
+                                        extent_sort_fix_overlapping_cmp);
+
+                       extent_sort_append(c, f, &nr, dst->start,
+                                          &prev, bkey_i_to_s(split.k));
+               } else {
+                       bch2_cut_back_s(bkey_start_pos(r.k), l);
+                       extent_save(b, _l->k, l.k);
+               }
+       }
+
+       extent_sort_advance_prev(f, &nr, dst->start, &prev);
+
+       dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+
+       bkey_on_stack_exit(&split, c);
+       return nr;
+}
+
 static inline int sort_extents_cmp(struct btree *b,
                                   struct bkey_packed *l,
                                   struct bkey_packed *r)
index 209e20fbcd7069ed1ba9f378f32667e283d68c7f..c5b5143ada05ffa2971417cd909f1fe7ca9e159f 100644 (file)
@@ -22,7 +22,8 @@
 
 static void verify_no_dups(struct btree *b,
                           struct bkey_packed *start,
-                          struct bkey_packed *end)
+                          struct bkey_packed *end,
+                          bool extents)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct bkey_packed *k, *p;
@@ -36,7 +37,7 @@ static void verify_no_dups(struct btree *b,
                struct bkey l = bkey_unpack_key(b, p);
                struct bkey r = bkey_unpack_key(b, k);
 
-               BUG_ON(btree_node_is_extents(b)
+               BUG_ON(extents
                       ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
                       : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
                //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
@@ -147,7 +148,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
        }
 
        verify_no_dups(b, new_whiteouts,
-                      (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
+                      (void *) ((u64 *) new_whiteouts + b->whiteout_u64s),
+                      btree_node_old_extent_overwrite(b));
 
        memcpy_u64s(unwritten_whiteouts_start(c, b),
                    new_whiteouts, b->whiteout_u64s);
@@ -297,7 +299,8 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
 
        verify_no_dups(b,
                       unwritten_whiteouts_start(c, b),
-                      unwritten_whiteouts_end(c, b));
+                      unwritten_whiteouts_end(c, b),
+                      true);
 
        btree_bounce_free(c, order, used_mempool, whiteouts);
 
@@ -377,7 +380,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
 bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
                            enum compact_mode mode)
 {
-       return !btree_node_is_extents(b)
+       return !btree_node_old_extent_overwrite(b)
                ? bch2_drop_whiteouts(b, mode)
                : bch2_compact_extent_whiteouts(c, b, mode);
 }
@@ -417,10 +420,10 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 
        start_time = local_clock();
 
-       if (btree_node_is_extents(b))
+       if (btree_node_old_extent_overwrite(b))
                filter_whiteouts = bset_written(b, start_bset);
 
-       u64s = (btree_node_is_extents(b)
+       u64s = (btree_node_old_extent_overwrite(b)
                ? bch2_sort_extents
                : bch2_sort_keys)(out->keys.start,
                                  &sort_iter,
@@ -706,7 +709,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                         bool have_retry)
 {
        struct bkey_packed *k, *prev = NULL;
-       struct bpos prev_pos = POS_MIN;
+       struct bpos prev_pos    = POS_MIN;
+       struct bpos prev_data   = POS_MIN;
        bool seen_non_whiteout = false;
        unsigned version;
        const char *err;
@@ -839,7 +843,8 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                     (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
                        *whiteout_u64s = k->_data - i->_data;
                        seen_non_whiteout = true;
-               } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+               } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
+                          bkey_cmp(prev_pos, u.k->p) > 0) {
                        btree_err(BTREE_ERR_FATAL, c, b, i,
                                  "keys out of order: %llu:%llu > %llu:%llu",
                                  prev_pos.inode,
@@ -849,7 +854,10 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                        /* XXX: repair this */
                }
 
+               if (!bkey_deleted(u.k))
+                       prev_data = u.k->p;
                prev_pos = u.k->p;
+
                prev = k;
                k = bkey_next_skip_noops(k, vstruct_last(i));
        }
@@ -908,6 +916,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
                        bset_encrypt(c, i, b->written << 9);
 
+                       if (btree_node_is_extents(b) &&
+                           !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+                               set_btree_node_old_extent_overwrite(b);
+
                        sectors = vstruct_sectors(b->data, c->block_bits);
 
                        btree_node_set_format(b, b->data->format);
@@ -971,7 +983,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
        set_btree_bset(b, b->set, &b->data->keys);
 
-       b->nr = (btree_node_is_extents(b)
+       b->nr = (btree_node_old_extent_overwrite(b)
                 ? bch2_extent_sort_fix_overlapping
                 : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter);
 
@@ -1486,7 +1498,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        i->journal_seq  = cpu_to_le64(seq);
        i->u64s         = 0;
 
-       if (!btree_node_is_extents(b)) {
+       if (!btree_node_old_extent_overwrite(b)) {
                sort_iter_add(&sort_iter,
                              unwritten_whiteouts_start(c, b),
                              unwritten_whiteouts_end(c, b));
@@ -1501,7 +1513,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        b->whiteout_u64s = 0;
 
-       u64s = btree_node_is_extents(b)
+       u64s = btree_node_old_extent_overwrite(b)
                ? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
                : bch2_sort_keys(i->start, &sort_iter, false);
        le16_add_cpu(&i->u64s, u64s);
index 98451b3dd1a5a3e6b1acbf2ed0e7c43ff009769e..cc04cdbaf4326abf122fcec6763f658c65387cf3 100644 (file)
@@ -311,6 +311,7 @@ enum btree_flags {
        BTREE_NODE_just_written,
        BTREE_NODE_dying,
        BTREE_NODE_fake,
+       BTREE_NODE_old_extent_overwrite,
 };
 
 BTREE_FLAG(read_in_flight);
@@ -324,6 +325,7 @@ BTREE_FLAG(write_in_flight);
 BTREE_FLAG(just_written);
 BTREE_FLAG(dying);
 BTREE_FLAG(fake);
+BTREE_FLAG(old_extent_overwrite);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
index c9be0d110c64631a843863de26d43dc9729503d4..870eb0938c2249bbfde25afa546ed1c51a949c6e 100644 (file)
@@ -374,6 +374,13 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
        SET_BTREE_NODE_LEVEL(b->data, level);
        b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
 
+       if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
+               SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
+
+       if (btree_node_is_extents(b) &&
+           !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+               set_btree_node_old_extent_overwrite(b);
+
        bch2_btree_build_aux_trees(b);
 
        btree_node_will_make_reachable(as, b);
index 09f5cd6493f440defe7b4a3f3e903247426ad048..78f5674394dc7ce0d9c5603b363df135931f62f7 100644 (file)
@@ -267,6 +267,8 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
+       insert->k->k.needs_whiteout = false;
+
        if (!btree_node_is_extents(b))
                bch2_insert_fixup_key(trans, insert);
        else
index e021e1623a91e8579f8d92a65b3abf79ee3edc1d..d2f1414f28e21a4211585a6d56f2bae38ee367da 100644 (file)
@@ -186,11 +186,26 @@ bch2_extent_can_insert(struct btree_trans *trans,
 
                overlap = bch2_extent_overlap(&insert->k->k, k.k);
 
+               /*
+                * If we're overwriting an existing extent, we may need to emit
+                * a whiteout - unless we're inserting a new extent at the same
+                * position:
+                */
+               if (k.k->needs_whiteout &&
+                   (!bkey_whiteout(&insert->k->k) ||
+                    bkey_cmp(k.k->p, insert->k->k.p)))
+                       *u64s += BKEY_U64s;
+
+               /*
+                * If we're partially overwriting an existing extent which has
+                * been written out to disk, we'll need to emit a new version of
+                * that extent:
+                */
                if (bkey_written(l->b, _k) &&
                    overlap != BCH_EXTENT_OVERLAP_ALL)
                        *u64s += _k->u64s;
 
-               /* account for having to split existing extent: */
+               /* And we may be splitting an existing extent: */
                if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
                        *u64s += _k->u64s;
 
@@ -286,6 +301,23 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
        bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
 }
 
+static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
+                              struct bpos pos)
+{
+       struct bkey_packed k;
+
+       if (!bkey_pack_pos(&k, pos, b)) {
+               struct bkey_i tmp;
+
+               bkey_init(&tmp.k);
+               tmp.k.p = pos;
+               bkey_copy(&k, &tmp);
+       }
+
+       k.needs_whiteout = true;
+       push_whiteout(c, b, &k);
+}
+
 static void
 extent_drop(struct bch_fs *c, struct btree_iter *iter,
            struct bkey_packed *_k, struct bkey_s k)
@@ -297,7 +329,12 @@ extent_drop(struct bch_fs *c, struct btree_iter *iter,
 
        k.k->size = 0;
        k.k->type = KEY_TYPE_deleted;
-       k.k->needs_whiteout = false;
+
+       if (!btree_node_old_extent_overwrite(l->b) &&
+           k.k->needs_whiteout) {
+               pack_push_whiteout(c, l->b, k.k->p);
+               k.k->needs_whiteout = false;
+       }
 
        if (_k >= btree_bset_last(l->b)->start) {
                unsigned u64s = _k->u64s;
@@ -322,12 +359,29 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
        bkey_on_stack_init(&tmp);
        bkey_on_stack_init(&split);
 
+       if (!btree_node_old_extent_overwrite(l->b)) {
+               if (!bkey_whiteout(&insert->k) &&
+                   !bkey_cmp(k.k->p, insert->k.p)) {
+                       insert->k.needs_whiteout = k.k->needs_whiteout;
+                       k.k->needs_whiteout = false;
+               }
+       } else {
+               insert->k.needs_whiteout |= k.k->needs_whiteout;
+       }
+
        switch (overlap) {
        case BCH_EXTENT_OVERLAP_FRONT:
                if (bkey_written(l->b, _k)) {
                        bkey_on_stack_reassemble(&tmp, c, k.s_c);
                        bch2_cut_front(insert->k.p, tmp.k);
 
+                       /*
+                        * needs_whiteout was propagated to new version of @k,
+                        * @tmp:
+                        */
+                       if (!btree_node_old_extent_overwrite(l->b))
+                               k.k->needs_whiteout = false;
+
                        extent_drop(c, iter, _k, k);
                        extent_bset_insert(c, iter, tmp.k);
                } else {
@@ -348,9 +402,26 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
                        bkey_on_stack_reassemble(&tmp, c, k.s_c);
                        bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
 
+                       /*
+                        * @tmp has different position than @k, needs_whiteout
+                        * should not be propagated:
+                        */
+                       if (!btree_node_old_extent_overwrite(l->b))
+                               tmp.k->k.needs_whiteout = false;
+
                        extent_drop(c, iter, _k, k);
                        extent_bset_insert(c, iter, tmp.k);
                } else {
+                       /*
+                        * position of @k is changing, emit a whiteout if
+                        * needs_whiteout is set:
+                        */
+                       if (!btree_node_old_extent_overwrite(l->b) &&
+                           k.k->needs_whiteout) {
+                               pack_push_whiteout(c, l->b, k.k->p);
+                               k.k->needs_whiteout = false;
+                       }
+
                        btree_keys_account_val_delta(l->b, _k,
                                bch2_cut_back_s(bkey_start_pos(&insert->k), k));
                        extent_save(l->b, _k, k.k);
@@ -367,10 +438,17 @@ extent_squash(struct bch_fs *c, struct btree_iter *iter,
                bkey_on_stack_reassemble(&split, c, k.s_c);
                bch2_cut_back(bkey_start_pos(&insert->k), split.k);
 
+               if (!btree_node_old_extent_overwrite(l->b))
+                       split.k->k.needs_whiteout = false;
+
+               /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
                if (bkey_written(l->b, _k)) {
                        bkey_on_stack_reassemble(&tmp, c, k.s_c);
                        bch2_cut_front(insert->k.p, tmp.k);
 
+                       if (!btree_node_old_extent_overwrite(l->b))
+                               k.k->needs_whiteout = false;
+
                        extent_drop(c, iter, _k, k);
                        extent_bset_insert(c, iter, tmp.k);
                } else {
@@ -462,7 +540,6 @@ void bch2_insert_fixup_extent(struct btree_trans *trans,
                        bch2_cut_front(cur_end, insert);
                        bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
                } else {
-                       insert->k.needs_whiteout |= k.k->needs_whiteout;
                        extent_squash(c, iter, insert, _k, k, overlap);
                }
 
@@ -480,7 +557,10 @@ void bch2_insert_fixup_extent(struct btree_trans *trans,
                if (insert->k.type == KEY_TYPE_deleted)
                        insert->k.type = KEY_TYPE_discard;
 
-               extent_bset_insert(c, iter, insert);
+               if (!bkey_whiteout(&insert->k) ||
+                   btree_node_old_extent_overwrite(l->b))
+                       extent_bset_insert(c, iter, insert);
+
                bch2_btree_journal_key(trans, iter, insert);
        }
 
index 97b367252e821ca2dbcce0529c8728c9abc40027..c7367a679b22fb2af87f32e0cd9303da306da6a4 100644 (file)
@@ -908,6 +908,7 @@ int bch2_fs_recovery(struct bch_fs *c)
                                le16_to_cpu(bcachefs_metadata_version_min);
                c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
                c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
+               c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
                write_sb = true;
        }
 
@@ -1027,6 +1028,7 @@ int bch2_fs_initialize(struct bch_fs *c)
                le16_to_cpu(bcachefs_metadata_version_current);
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
+       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
 
        SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);