* reflink: gates KEY_TYPE_reflink
* inline_data: gates KEY_TYPE_inline_data
* new_siphash: gates BCH_STR_HASH_SIPHASH
+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
*/
#define BCH_SB_FEATURES() \
x(lz4, 0) \
x(journal_seq_blacklist_v3, 5) \
x(reflink, 6) \
x(new_siphash, 7) \
- x(inline_data, 8)
+ x(inline_data, 8) \
+ x(new_extent_overwrite, 9)
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
-/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
+ struct btree_node, flags, 8, 9);
+/* 9-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
struct btree_node_entry {
return nr;
}
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
- struct bkey_packed *l,
- struct bkey_packed *r)
-{
- struct bkey ul = bkey_unpack_key(b, l);
- struct bkey ur = bkey_unpack_key(b, r);
-
- return bkey_cmp(bkey_start_pos(&ul),
- bkey_start_pos(&ur)) ?:
- cmp_int((unsigned long) r, (unsigned long) l);
-}
-
static void extent_sort_advance_prev(struct bkey_format *f,
struct btree_nr_keys *nr,
struct bkey_packed *start,
bkey_reassemble((void *) *prev, k.s_c);
}
-struct btree_nr_keys
-bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
- struct sort_iter *iter)
-{
- struct btree *b = iter->b;
- struct bkey_format *f = &b->format;
- struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
- struct bkey_packed *prev = NULL;
- struct bkey l_unpacked, r_unpacked;
- struct bkey_s l, r;
- struct btree_nr_keys nr;
- struct bkey_on_stack split;
-
- memset(&nr, 0, sizeof(nr));
- bkey_on_stack_init(&split);
-
- sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
-
- while (!sort_iter_end(iter)) {
- l = __bkey_disassemble(b, _l->k, &l_unpacked);
-
- if (iter->used == 1) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
- sort_iter_advance(iter,
- extent_sort_fix_overlapping_cmp);
- continue;
- }
-
- r = __bkey_disassemble(b, _r->k, &r_unpacked);
-
- /* If current key and next key don't overlap, just append */
- if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
- sort_iter_advance(iter,
- extent_sort_fix_overlapping_cmp);
- continue;
- }
-
- /* Skip 0 size keys */
- if (!r.k->size) {
- __sort_iter_advance(iter, 1,
- extent_sort_fix_overlapping_cmp);
- continue;
- }
-
- /*
- * overlap: keep the newer key and trim the older key so they
- * don't overlap. comparing pointers tells us which one is
- * newer, since the bsets are appended one after the other.
- */
-
- /* can't happen because of comparison func */
- BUG_ON(_l->k < _r->k &&
- !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
- if (_l->k > _r->k) {
- /* l wins, trim r */
- if (bkey_cmp(l.k->p, r.k->p) >= 0) {
- __sort_iter_advance(iter, 1,
- extent_sort_fix_overlapping_cmp);
- } else {
- bch2_cut_front_s(l.k->p, r);
- extent_save(b, _r->k, r.k);
- __sort_iter_sift(iter, 1,
- extent_sort_fix_overlapping_cmp);
- }
- } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-
- /*
- * r wins, but it overlaps in the middle of l - split l:
- */
- bkey_on_stack_reassemble(&split, c, l.s_c);
- bch2_cut_back(bkey_start_pos(r.k), split.k);
-
- bch2_cut_front_s(r.k->p, l);
- extent_save(b, _l->k, l.k);
-
- __sort_iter_sift(iter, 0,
- extent_sort_fix_overlapping_cmp);
-
- extent_sort_append(c, f, &nr, dst->start,
- &prev, bkey_i_to_s(split.k));
- } else {
- bch2_cut_back_s(bkey_start_pos(r.k), l);
- extent_save(b, _l->k, l.k);
- }
- }
-
- extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
-
- bkey_on_stack_exit(&split, c);
- return nr;
-}
-
/* Sort + repack in a new format: */
struct btree_nr_keys
bch2_sort_repack(struct bset *dst, struct btree *src,
struct bkey_packed *r)
{
return bkey_cmp_packed(b, l, r) ?:
- (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
return (u64 *) out - (u64 *) dst;
}
+/*
+ * Compat code for btree_node_old_extent_overwrite:
+ *
+ * Btree nodes written before BCH_FEATURE_new_extent_overwrite may contain
+ * overlapping extents, which have to be fixed up when their bsets are sorted
+ * together:
+ */
+
+/*
+ * If keys compare equal, compare by pointer order:
+ *
+ * Necessary for sort_fix_overlapping() - if there are multiple keys that
+ * compare equal in different sets, we have to process them newest to oldest.
+ */
+static inline int extent_sort_fix_overlapping_cmp(struct btree *b,
+ struct bkey_packed *l,
+ struct bkey_packed *r)
+{
+ struct bkey ul = bkey_unpack_key(b, l);
+ struct bkey ur = bkey_unpack_key(b, r);
+
+ return bkey_cmp(bkey_start_pos(&ul),
+ bkey_start_pos(&ur)) ?:
+ cmp_int((unsigned long) r, (unsigned long) l);
+}
+
+struct btree_nr_keys
+bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
+ struct sort_iter *iter)
+{
+ struct btree *b = iter->b;
+ struct bkey_format *f = &b->format;
+ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
+ struct bkey_packed *prev = NULL;
+ struct bkey l_unpacked, r_unpacked;
+ struct bkey_s l, r;
+ struct btree_nr_keys nr;
+ struct bkey_on_stack split;
+
+ memset(&nr, 0, sizeof(nr));
+ bkey_on_stack_init(&split);
+
+ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
+
+ while (!sort_iter_end(iter)) {
+ l = __bkey_disassemble(b, _l->k, &l_unpacked);
+
+ if (iter->used == 1) {
+ extent_sort_append(c, f, &nr, dst->start, &prev, l);
+ sort_iter_advance(iter,
+ extent_sort_fix_overlapping_cmp);
+ continue;
+ }
+
+ r = __bkey_disassemble(b, _r->k, &r_unpacked);
+
+ /* If current key and next key don't overlap, just append */
+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
+ extent_sort_append(c, f, &nr, dst->start, &prev, l);
+ sort_iter_advance(iter,
+ extent_sort_fix_overlapping_cmp);
+ continue;
+ }
+
+ /* Skip 0 size keys */
+ if (!r.k->size) {
+ __sort_iter_advance(iter, 1,
+ extent_sort_fix_overlapping_cmp);
+ continue;
+ }
+
+ /*
+ * overlap: keep the newer key and trim the older key so they
+ * don't overlap. comparing pointers tells us which one is
+ * newer, since the bsets are appended one after the other.
+ */
+
+ /* can't happen because of comparison func */
+ BUG_ON(_l->k < _r->k &&
+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
+
+ if (_l->k > _r->k) {
+ /* l wins, trim r */
+ if (bkey_cmp(l.k->p, r.k->p) >= 0) {
+ __sort_iter_advance(iter, 1,
+ extent_sort_fix_overlapping_cmp);
+ } else {
+ bch2_cut_front_s(l.k->p, r);
+ extent_save(b, _r->k, r.k);
+ __sort_iter_sift(iter, 1,
+ extent_sort_fix_overlapping_cmp);
+ }
+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
+
+ /*
+ * r wins, but it overlaps in the middle of l - split l:
+ */
+ bkey_on_stack_reassemble(&split, c, l.s_c);
+ bch2_cut_back(bkey_start_pos(r.k), split.k);
+
+ bch2_cut_front_s(r.k->p, l);
+ extent_save(b, _l->k, l.k);
+
+ __sort_iter_sift(iter, 0,
+ extent_sort_fix_overlapping_cmp);
+
+ extent_sort_append(c, f, &nr, dst->start,
+ &prev, bkey_i_to_s(split.k));
+ } else {
+ bch2_cut_back_s(bkey_start_pos(r.k), l);
+ extent_save(b, _l->k, l.k);
+ }
+ }
+
+ extent_sort_advance_prev(f, &nr, dst->start, &prev);
+
+ dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+
+ bkey_on_stack_exit(&split, c);
+ return nr;
+}
+
static inline int sort_extents_cmp(struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
- struct bkey_packed *end)
+ struct bkey_packed *end,
+ bool extents)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct bkey_packed *k, *p;
struct bkey l = bkey_unpack_key(b, p);
struct bkey r = bkey_unpack_key(b, k);
- BUG_ON(btree_node_is_extents(b)
+ BUG_ON(extents
? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
: bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
//BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
}
verify_no_dups(b, new_whiteouts,
- (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s),
+ btree_node_old_extent_overwrite(b));
memcpy_u64s(unwritten_whiteouts_start(c, b),
new_whiteouts, b->whiteout_u64s);
verify_no_dups(b,
unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_end(c, b),
+ true);
btree_bounce_free(c, order, used_mempool, whiteouts);
bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
enum compact_mode mode)
{
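+	/*
+	 * Only nodes still using the old extent overwrite scheme need extent
+	 * whiteout compaction:
+	 */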
- return !btree_node_is_extents(b)
+ return !btree_node_old_extent_overwrite(b)
? bch2_drop_whiteouts(b, mode)
: bch2_compact_extent_whiteouts(c, b, mode);
}
start_time = local_clock();
- if (btree_node_is_extents(b))
+ if (btree_node_old_extent_overwrite(b))
filter_whiteouts = bset_written(b, start_bset);
- u64s = (btree_node_is_extents(b)
+ u64s = (btree_node_old_extent_overwrite(b)
? bch2_sort_extents
: bch2_sort_keys)(out->keys.start,
&sort_iter,
bool have_retry)
{
struct bkey_packed *k, *prev = NULL;
- struct bpos prev_pos = POS_MIN;
+ struct bpos prev_pos = POS_MIN;
+ struct bpos prev_data = POS_MIN;
bool seen_non_whiteout = false;
unsigned version;
const char *err;
(bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
- } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
+ } else if (bkey_cmp(prev_data, bkey_start_pos(u.k)) > 0 ||
+ bkey_cmp(prev_pos, u.k->p) > 0) {
btree_err(BTREE_ERR_FATAL, c, b, i,
"keys out of order: %llu:%llu > %llu:%llu",
prev_pos.inode,
/* XXX: repair this */
}
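+		/*
+		 * Non deleted keys must not overlap; whiteouts may overlap
+		 * the live extents that follow them, so track the last live
+		 * key's position separately:
+		 */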
+ if (!bkey_deleted(u.k))
+ prev_data = u.k->p;
prev_pos = u.k->p;
+
prev = k;
k = bkey_next_skip_noops(k, vstruct_last(i));
}
bset_encrypt(c, i, b->written << 9);
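+	/*
+	 * Btree nodes written without BCH_FEATURE_new_extent_overwrite may
+	 * contain overlapping extents - flag them so the compat read and
+	 * write paths get used:
+	 */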
+ if (btree_node_is_extents(b) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+ set_btree_node_old_extent_overwrite(b);
+
sectors = vstruct_sectors(b->data, c->block_bits);
btree_node_set_format(b, b->data->format);
set_btree_bset(b, b->set, &b->data->keys);
- b->nr = (btree_node_is_extents(b)
+ b->nr = (btree_node_old_extent_overwrite(b)
? bch2_extent_sort_fix_overlapping
: bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter);
i->journal_seq = cpu_to_le64(seq);
i->u64s = 0;
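+	/*
+	 * With the new scheme, extent nodes keep their whiteouts in the
+	 * unwritten whiteouts area like non-extent nodes, so sort them into
+	 * the bset being written:
+	 */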
- if (!btree_node_is_extents(b)) {
+ if (!btree_node_old_extent_overwrite(b)) {
sort_iter_add(&sort_iter,
unwritten_whiteouts_start(c, b),
unwritten_whiteouts_end(c, b));
b->whiteout_u64s = 0;
- u64s = btree_node_is_extents(b)
+ u64s = btree_node_old_extent_overwrite(b)
? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
: bch2_sort_keys(i->start, &sort_iter, false);
le16_add_cpu(&i->u64s, u64s);
BTREE_NODE_just_written,
BTREE_NODE_dying,
BTREE_NODE_fake,
+ BTREE_NODE_old_extent_overwrite,
};
BTREE_FLAG(read_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
+BTREE_FLAG(old_extent_overwrite);
static inline struct btree_write *btree_current_write(struct btree *b)
{
SET_BTREE_NODE_LEVEL(b->data, level);
b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
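+	/*
+	 * Use the new extent overwrite scheme for new nodes only once the
+	 * superblock feature bit is set:
+	 */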
+ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
+
+ if (btree_node_is_extents(b) &&
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+ set_btree_node_old_extent_overwrite(b);
+
bch2_btree_build_aux_trees(b);
btree_node_will_make_reachable(as, b);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
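+	/*
+	 * needs_whiteout on the key being inserted is recalculated by the
+	 * insert paths below, based on the keys it overwrites:
+	 */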
+ insert->k->k.needs_whiteout = false;
+
if (!btree_node_is_extents(b))
bch2_insert_fixup_key(trans, insert);
else
overlap = bch2_extent_overlap(&insert->k->k, k.k);
+		/*
+		 * If we're overwriting an existing extent, we may need to emit
+		 * a whiteout - unless the key we're inserting is itself a
+		 * whiteout at the same position:
+		 */
+ if (k.k->needs_whiteout &&
+ (!bkey_whiteout(&insert->k->k) ||
+ bkey_cmp(k.k->p, insert->k->k.p)))
+ *u64s += BKEY_U64s;
+
+ /*
+ * If we're partially overwriting an existing extent which has
+ * been written out to disk, we'll need to emit a new version of
+ * that extent:
+ */
if (bkey_written(l->b, _k) &&
overlap != BCH_EXTENT_OVERLAP_ALL)
*u64s += _k->u64s;
- /* account for having to split existing extent: */
+ /* And we may be splitting an existing extent: */
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
*u64s += _k->u64s;
bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
}
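+/*
+ * Emit a whiteout at @pos: if @pos can't be packed in the node's key format,
+ * fall back to an unpacked (deleted) key:
+ */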
+static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
+ struct bpos pos)
+{
+ struct bkey_packed k;
+
+ if (!bkey_pack_pos(&k, pos, b)) {
+ struct bkey_i tmp;
+
+ bkey_init(&tmp.k);
+ tmp.k.p = pos;
+ bkey_copy(&k, &tmp);
+ }
+
+ k.needs_whiteout = true;
+ push_whiteout(c, b, &k);
+}
+
static void
extent_drop(struct bch_fs *c, struct btree_iter *iter,
struct bkey_packed *_k, struct bkey_s k)
k.k->size = 0;
k.k->type = KEY_TYPE_deleted;
- k.k->needs_whiteout = false;
+
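+	/*
+	 * If the key we're dropping still needs a whiteout (it has been
+	 * written out to disk), emit one at its position:
+	 */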
+ if (!btree_node_old_extent_overwrite(l->b) &&
+ k.k->needs_whiteout) {
+ pack_push_whiteout(c, l->b, k.k->p);
+ k.k->needs_whiteout = false;
+ }
if (_k >= btree_bset_last(l->b)->start) {
unsigned u64s = _k->u64s;
bkey_on_stack_init(&tmp);
bkey_on_stack_init(&split);
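+	/*
+	 * With the new scheme, needs_whiteout is only transferred to @insert
+	 * when it replaces @k at the same position; otherwise whiteouts are
+	 * emitted explicitly. With the old scheme, needs_whiteout accumulates
+	 * onto @insert:
+	 */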
+ if (!btree_node_old_extent_overwrite(l->b)) {
+ if (!bkey_whiteout(&insert->k) &&
+ !bkey_cmp(k.k->p, insert->k.p)) {
+ insert->k.needs_whiteout = k.k->needs_whiteout;
+ k.k->needs_whiteout = false;
+ }
+ } else {
+ insert->k.needs_whiteout |= k.k->needs_whiteout;
+ }
+
switch (overlap) {
case BCH_EXTENT_OVERLAP_FRONT:
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_front(insert->k.p, tmp.k);
+			/*
+			 * needs_whiteout was propagated to the new version of
+			 * @k, @tmp:
+			 */
+ if (!btree_node_old_extent_overwrite(l->b))
+ k.k->needs_whiteout = false;
+
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);
+			/*
+			 * @tmp is at a different position than @k, so
+			 * needs_whiteout should not be propagated:
+			 */
+ if (!btree_node_old_extent_overwrite(l->b))
+ tmp.k->k.needs_whiteout = false;
+
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
+			/*
+			 * The position of @k is changing - if needs_whiteout
+			 * is set, emit a whiteout at its old position:
+			 */
+ if (!btree_node_old_extent_overwrite(l->b) &&
+ k.k->needs_whiteout) {
+ pack_push_whiteout(c, l->b, k.k->p);
+ k.k->needs_whiteout = false;
+ }
+
btree_keys_account_val_delta(l->b, _k,
bch2_cut_back_s(bkey_start_pos(&insert->k), k));
extent_save(l->b, _k, k.k);
bkey_on_stack_reassemble(&split, c, k.s_c);
bch2_cut_back(bkey_start_pos(&insert->k), split.k);
+ if (!btree_node_old_extent_overwrite(l->b))
+ split.k->k.needs_whiteout = false;
+
+ /* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
if (bkey_written(l->b, _k)) {
bkey_on_stack_reassemble(&tmp, c, k.s_c);
bch2_cut_front(insert->k.p, tmp.k);
+ if (!btree_node_old_extent_overwrite(l->b))
+ k.k->needs_whiteout = false;
+
extent_drop(c, iter, _k, k);
extent_bset_insert(c, iter, tmp.k);
} else {
bch2_cut_front(cur_end, insert);
bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
} else {
- insert->k.needs_whiteout |= k.k->needs_whiteout;
extent_squash(c, iter, insert, _k, k, overlap);
}
if (insert->k.type == KEY_TYPE_deleted)
insert->k.type = KEY_TYPE_discard;
- extent_bset_insert(c, iter, insert);
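+	/*
+	 * With the new scheme, whiteouts aren't inserted into the bset - they
+	 * are only journalled:
+	 */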
+ if (!bkey_whiteout(&insert->k) ||
+ btree_node_old_extent_overwrite(l->b))
+ extent_bset_insert(c, iter, insert);
+
bch2_btree_journal_key(trans, iter, insert);
}
le16_to_cpu(bcachefs_metadata_version_min);
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
write_sb = true;
}
le16_to_cpu(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);