/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the SSD.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>
/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Check for bad keys in replay
 *
 * Refcount journal entries in journal_replay
 *
 * Finish incremental gc
 * Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Make sure all allocations get charged to the root cgroup
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Also lookup by cgroup in get_open_bucket()
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */
enum {
	BTREE_INSERT_STATUS_INSERT,
	BTREE_INSERT_STATUS_BACK_MERGE,
	BTREE_INSERT_STATUS_OVERWROTE,
	BTREE_INSERT_STATUS_FRONT_MERGE,
};
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
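
/*
 * Illustrative sketch (not part of the original driver): PTR_HASH() keys the
 * in-memory node hash table by the bucket portion of a key's first pointer
 * plus its generation, so a reused bucket with a bumped gen hashes and
 * compares differently.  The helper below only restates the slot lookup that
 * mca_hash() performs later in this file; hash_32() and BUCKET_HASH_BITS come
 * from headers this file already uses.
 */
static inline unsigned bch_example_hash_slot(struct cache_set *c,
					     struct bkey *k)
{
	return hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS);
}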
static struct workqueue_struct *btree_io_wq;
static inline bool should_split(struct btree *b)
{
	struct bset *i = write_block(b);
	return b->written >= btree_blocks(b) ||
	       (b->written + __set_blocks(i, i->keys + 15, b->c)
		> btree_blocks(b));
}

#define insert_lock(s, b)	((b)->level <= (s)->lock)
/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
	if (!IS_ERR(_child)) {						\
		_child->parent = (b);					\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})
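
/*
 * Illustrative sketch (assumption, not in the original file): what a user of
 * the btree() macro looks like.  The recursion helper walks the good keys in
 * an interior node and descends into each child; bch_btree_map_nodes_recurse()
 * further down is the real in-tree user of this pattern.
 */
static int bch_btree_example_count_nodes(struct btree *b, struct btree_op *op)
{
	int ret = 1;	/* count this node */

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(b, &iter, NULL);

		while ((k = bch_btree_iter_next_filter(&iter, b,
						       bch_ptr_bad))) {
			int r = btree(example_count_nodes, k, b, op);
			if (r < 0)
				return r;
			ret += r;
		}
	}

	return ret;
}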
/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @op:		pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_b->parent = NULL;				\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);				\
		if (_r == -ENOSPC) {					\
			wait_event((c)->try_wait,			\
				   !(c)->try_harder);			\
			_r = -EINTR;					\
		}							\
	} while (_r == -EINTR);						\
									\
	_r;								\
})
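
/*
 * Illustrative sketch (assumption, not in the original file): the
 * op->lock / -EINTR retry convention described in the comment above these
 * macros.  A callee holding only a read lock that finds it must modify the
 * node raises op->lock and returns -EINTR, and btree_root() re-walks from the
 * root with write locks; bch_btree_insert_node() below does exactly this
 * before splitting.
 */
static int bch_btree_example_modify(struct btree *b, struct btree_op *op)
{
	if (b->level > op->lock) {
		/* read lock only: ask for write locks and retry from root */
		op->lock = b->c->root->level + 1;
		return -EINTR;
	}

	/* insert_lock() gave us a write lock here; safe to modify b */
	return 0;
}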
/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}
static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}
static void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter;

	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

#ifdef CONFIG_BCACHE_DEBUG
	iter->b = b;
#endif

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(&b->c->sb))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:
	mempool_free(iter, b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}
static void btree_node_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}
void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_rw	= REQ_META|READ_SYNC;
	bio->bi_size	= KEY_SIZE(&b->key) << 9;
	bio->bi_end_io	= btree_node_read_endio;
	bio->bi_private	= &cl;

	bch_bio_map(bio, b->sets[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);
	bch_time_stats_update(&b->c->btree_read_time, start_time);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %zu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}
static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked	= 0;
	w->journal	= NULL;
}
static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return(cl);
}
static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bio_vec *bv;
	int n;

	__bio_for_each_segment(bv, b->bio, n, 0)
		__free_page(bv->bv_page);

	__btree_node_write_done(cl);
}
static void btree_node_write_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io.cl);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
	closure_put(cl);
}
static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_node_write_endio;
	b->bio->bi_private	= cl;
	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		__btree_node_write_done(cl);
	}
}
void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = b->sets[b->nsets].data;

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(b->sets->data->seq != i->seq);
	bch_check_keys(b, "writing");

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	closure_lock(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	b->written += set_blocks(i, b->c);
	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	bch_btree_sort_lazy(b);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
}
static void bch_btree_node_write_sync(struct btree *b)
{
	struct closure cl;

	closure_init_stack(&cl);
	bch_btree_node_write(b, &cl);
	closure_sync(&cl);
}
static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	rw_lock(true, b, b->level);

	if (btree_node_dirty(b))
		bch_btree_node_write(b, NULL);
	rw_unlock(true, b);
}
static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
	struct bset *i = b->sets[b->nsets].data;
	struct btree_write *w = btree_current_write(b);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);

	set_btree_node_dirty(b);

	if (journal_ref) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = journal_ref;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

static void mca_reinit(struct btree *b)
{
	unsigned i;

	b->flags	= 0;
	b->written	= 0;
	b->nsets	= 0;

	for (i = 0; i < MAX_BSETS; i++)
		b->sets[i].size = 0;
	/*
	 * Second loop starts at 1 because b->sets[0]->data is the memory we
	 * were allocated
	 */
	for (i = 1; i < MAX_BSETS; i++)
		b->sets[i].data = NULL;
}
#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
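
/*
 * Illustrative note (assumption, not in the original file): what the reserve
 * works out to.  With the root at level N the cache pins at least N * 8 + 16
 * nodes (or 1 * 8 + 16 = 24 when the tree is a single leaf), which
 * bch_mca_scan() below refuses to free so that node allocation for splits and
 * inserts can always make forward progress.
 */
static inline unsigned bch_example_mca_reserve_for_level(unsigned root_level)
{
	return (root_level ?: 1) * 8 + 16;
}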
static void mca_data_free(struct btree *b)
{
	struct bset_tree *t = b->sets;

	BUG_ON(!closure_is_unlocked(&b->io.cl));

	if (bset_prev_bytes(b) < PAGE_SIZE)
		kfree(t->prev);
	else
		free_pages((unsigned long) t->prev,
			   get_order(bset_prev_bytes(b)));

	if (bset_tree_bytes(b) < PAGE_SIZE)
		kfree(t->tree);
	else
		free_pages((unsigned long) t->tree,
			   get_order(bset_tree_bytes(b)));

	free_pages((unsigned long) t->data, b->page_order);

	list_move(&b->list, &b->c->btree_cache_freed);
	b->c->bucket_cache_used--;
}
static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}
static unsigned btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}
static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	struct bset_tree *t = b->sets;

	BUG_ON(t->data);

	b->page_order = max_t(unsigned,
			      ilog2(b->c->btree_pages),
			      btree_order(k));

	t->data = (void *) __get_free_pages(gfp, b->page_order);
	if (!t->data)
		goto err;

	t->tree = bset_tree_bytes(b) < PAGE_SIZE
		  ? kmalloc(bset_tree_bytes(b), gfp)
		  : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
	if (!t->tree)
		goto err;

	t->prev = bset_prev_bytes(b) < PAGE_SIZE
		  ? kmalloc(bset_prev_bytes(b), gfp)
		  : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
	if (!t->prev)
		goto err;

	list_move(&b->list, &b->c->btree_cache);
	b->c->bucket_cache_used++;
	return;
err:
	mca_data_free(b);
}
static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	closure_init_unlocked(&b->io);

	mca_data_alloc(b, k, gfp);
	return b;
}
static int mca_reap(struct btree *b, unsigned min_order, bool flush)
{
	struct closure cl;

	closure_init_stack(&cl);
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);

	if (b->page_order < min_order ||
	    (!flush &&
	     (btree_node_dirty(b) ||
	      atomic_read(&b->io.cl.remaining) != -1))) {
		rw_unlock(true, b);
		return -ENOMEM;
	}

	if (btree_node_dirty(b))
		bch_btree_node_write_sync(b);

	/* wait for any in flight btree write */
	closure_wait_event(&b->io.wait, &cl,
			   atomic_read(&b->io.cl.remaining) == -1);

	return 0;
}
static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;
	unsigned long freed = 0;

	if (c->shrinker_disabled)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !mca_reap(b, 0, false)) {
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
	}

	/*
	 * Can happen right when we first start up, before we've read in any
	 * btree nodes
	 */
	if (list_empty(&c->btree_cache))
		goto out;

	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, 0, false)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
	}
out:
	mutex_unlock(&c->bucket_lock);
	return freed;
}
static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);

	if (c->shrinker_disabled)
		return 0;

	if (c->try_harder)
		return 0;

	return mca_can_free(c) * c->btree_pages;
}
void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}
int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->sets[0].data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}
/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}
static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}
static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	trace_bcache_btree_cache_cannibalize(c);

	if (!c->try_harder) {
		c->try_harder = current;
		c->try_harder_start = local_clock();
	} else if (c->try_harder != current)
		return ERR_PTR(-ENOSPC);

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), false))
			return b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), true))
			return b;

	return ERR_PTR(-ENOMEM);
}
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
static void bch_cannibalize_unlock(struct cache_set *c)
{
	if (c->try_harder == current) {
		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
		c->try_harder = NULL;
		wake_up(&c->try_wait);
	}
}
static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
{
	struct btree *b;

	BUG_ON(current->bio_list);

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, btree_order(k), false))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, 0, false)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->sets[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->sets->data)
		goto err;
out:
	BUG_ON(!closure_is_unlocked(&b->io.cl));

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->level	= level;
	b->parent	= (void *) ~0UL;

	mca_reinit(b);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, k);
	if (!IS_ERR(b))
		goto out;

	return b;
}
/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
				 int level, bool write)
{
	int i = 0;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, k, level);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_node_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	b->accessed = 1;

	for (; i <= b->nsets && b->sets[i].size; i++) {
		prefetch(b->sets[i].tree);
		prefetch(b->sets[i].data);
	}

	for (; i <= b->nsets; i++)
		prefetch(b->sets[i].data);

	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

	return b;
}
*c
, struct bkey
*k
, int level
)
1019 mutex_lock(&c
->bucket_lock
);
1020 b
= mca_alloc(c
, k
, level
);
1021 mutex_unlock(&c
->bucket_lock
);
1023 if (!IS_ERR_OR_NULL(b
)) {
1024 bch_btree_node_read(b
);
1031 static void btree_node_free(struct btree
*b
)
1035 trace_bcache_btree_node_free(b
);
1037 BUG_ON(b
== b
->c
->root
);
1039 if (btree_node_dirty(b
))
1040 btree_complete_write(b
, btree_current_write(b
));
1041 clear_bit(BTREE_NODE_dirty
, &b
->flags
);
1043 cancel_delayed_work(&b
->work
);
1045 mutex_lock(&b
->c
->bucket_lock
);
1047 for (i
= 0; i
< KEY_PTRS(&b
->key
); i
++) {
1048 BUG_ON(atomic_read(&PTR_BUCKET(b
->c
, &b
->key
, i
)->pin
));
1050 bch_inc_gen(PTR_CACHE(b
->c
, &b
->key
, i
),
1051 PTR_BUCKET(b
->c
, &b
->key
, i
));
1054 bch_bucket_free(b
->c
, &b
->key
);
1056 mutex_unlock(&b
->c
->bucket_lock
);
1059 struct btree
*bch_btree_node_alloc(struct cache_set
*c
, int level
, bool wait
)
1062 struct btree
*b
= ERR_PTR(-EAGAIN
);
1064 mutex_lock(&c
->bucket_lock
);
1066 if (__bch_bucket_alloc_set(c
, WATERMARK_METADATA
, &k
.key
, 1, wait
))
1069 bkey_put(c
, &k
.key
);
1070 SET_KEY_SIZE(&k
.key
, c
->btree_pages
* PAGE_SECTORS
);
1072 b
= mca_alloc(c
, &k
.key
, level
);
1078 "Tried to allocate bucket that was in btree cache");
1083 bch_bset_init_next(b
);
1085 mutex_unlock(&c
->bucket_lock
);
1087 trace_bcache_btree_node_alloc(b
);
1090 bch_bucket_free(c
, &k
.key
);
1092 mutex_unlock(&c
->bucket_lock
);
1094 trace_bcache_btree_node_alloc_fail(b
);
static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
{
	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
	if (!IS_ERR_OR_NULL(n))
		bch_btree_sort_into(b, n);

	return n;
}
static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{
	unsigned i;

	bkey_copy(k, &b->key);
	bkey_copy_key(k, &ZERO_KEY);

	for (i = 0; i < KEY_PTRS(k); i++) {
		uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;

		SET_PTR_GEN(k, i, g);
	}

	atomic_inc(&b->c->prio_blocked);
}
/* Garbage collection */

uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	uint8_t stale = 0;
	unsigned i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
			g->gc_gen = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     (1 << 14) - 1));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
	uint8_t stale = 0;
	unsigned keys = 0, good_keys = 0;
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		stale = max(stale, btree_mark_key(b, k));
		keys++;

		if (bch_ptr_bad(b, k))
			continue;

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;
		good_keys++;

		gc->data += KEY_SIZE(k);
	}

	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
		btree_bug_on(t->size &&
			     bset_written(b, t) &&
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

	if (b->c->gc_always_rewrite)
		return true;

	if (stale > 10)
		return true;

	if ((keys - good_keys) * 2 > keys)
		return true;

	return false;
}

#define GC_MERGE_NODES	4U
struct gc_merge_info {
	struct btree	*b;
	unsigned	keys;
};

static int bch_btree_insert_node(struct btree *, struct btree_op *,
				 struct keylist *, atomic_t *, struct bkey *);
static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
			     struct keylist *keylist, struct gc_stat *gc,
			     struct gc_merge_info *r)
{
	unsigned i, nodes = 0, keys = 0, blocks;
	struct btree *new_nodes[GC_MERGE_NODES];
	struct closure cl;
	struct bkey *k;

	memset(new_nodes, 0, sizeof(new_nodes));
	closure_init_stack(&cl);

	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
		keys += r[nodes++].keys;

	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
		return 0;

	for (i = 0; i < nodes; i++) {
		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
		if (IS_ERR_OR_NULL(new_nodes[i]))
			goto out_nocoalesce;
	}

	for (i = nodes - 1; i > 0; --i) {
		struct bset *n1 = new_nodes[i]->sets->data;
		struct bset *n2 = new_nodes[i - 1]->sets->data;
		struct bkey *k, *last = NULL;

		keys = 0;

		if (i > 1) {
			for (k = n2->start;
			     k < end(n2);
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
						 bkey_u64s(k), b->c) > blocks)
					break;

				last = k;
				keys += bkey_u64s(k);
			}
		} else {
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + n2->keys,
					 b->c) > btree_blocks(new_nodes[i]))
				goto out_nocoalesce;

			keys = n2->keys;
			/* Take the key of the node we're getting rid of */
			last = &r->b->key;
		}

		BUG_ON(__set_blocks(n1, n1->keys + keys,
				    b->c) > btree_blocks(new_nodes[i]));

		if (last)
			bkey_copy_key(&new_nodes[i]->key, last);

		memcpy(end(n1),
		       n2->start,
		       (void *) node(n2, keys) - (void *) n2->start);

		n1->keys += keys;
		r[i].keys = n1->keys;

		memmove(n2->start,
			node(n2, keys),
			(void *) end(n2) - (void *) node(n2, keys));

		n2->keys -= keys;

		if (bch_keylist_realloc(keylist,
					KEY_PTRS(&new_nodes[i]->key), b->c))
			goto out_nocoalesce;

		bch_btree_node_write(new_nodes[i], &cl);
		bch_keylist_add(keylist, &new_nodes[i]->key);
	}

	for (i = 0; i < nodes; i++) {
		if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
			goto out_nocoalesce;

		make_btree_freeing_key(r[i].b, keylist->top);
		bch_keylist_push(keylist);
	}

	/* We emptied out this node */
	BUG_ON(new_nodes[0]->sets->data->keys);
	btree_node_free(new_nodes[0]);
	rw_unlock(true, new_nodes[0]);

	closure_sync(&cl);

	for (i = 0; i < nodes; i++) {
		btree_node_free(r[i].b);
		rw_unlock(true, r[i].b);

		r[i].b = new_nodes[i];
	}

	bch_btree_insert_node(b, op, keylist, NULL, NULL);
	BUG_ON(!bch_keylist_empty(keylist));

	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
	r[nodes - 1].b = ERR_PTR(-EINTR);

	trace_bcache_btree_gc_coalesce(nodes);

	/* Invalidated our iterator */
	return -EINTR;

out_nocoalesce:
	closure_sync(&cl);

	while ((k = bch_keylist_pop(keylist)))
		if (!bkey_cmp(k, &ZERO_KEY))
			atomic_dec(&b->c->prio_blocked);

	for (i = 0; i < nodes; i++)
		if (!IS_ERR_OR_NULL(new_nodes[i])) {
			btree_node_free(new_nodes[i]);
			rw_unlock(true, new_nodes[i]);
		}

	return 0;
}
static unsigned btree_gc_count_keys(struct btree *b)
{
	struct bkey *k;
	struct btree_iter iter;
	unsigned ret = 0;

	for_each_key_filter(b, k, &iter, bch_ptr_bad)
		ret += bkey_u64s(k);

	return ret;
}
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	unsigned i;
	int ret = 0;
	bool should_rewrite;
	struct btree *n;
	struct bkey *k;
	struct keylist keys;
	struct btree_iter iter;
	struct gc_merge_info r[GC_MERGE_NODES];
	struct gc_merge_info *last = r + GC_MERGE_NODES - 1;

	bch_keylist_init(&keys);
	bch_btree_iter_init(b, &iter, &b->c->gc_done);

	for (i = 0; i < GC_MERGE_NODES; i++)
		r[i].b = ERR_PTR(-EINTR);

	while (1) {
		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
		if (k) {
			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
			if (IS_ERR(r->b)) {
				ret = PTR_ERR(r->b);
				break;
			}

			r->keys = btree_gc_count_keys(r->b);

			ret = btree_gc_coalesce(b, op, &keys, gc, r);
			if (ret)
				break;
		}

		if (!last->b)
			break;

		if (!IS_ERR(last->b)) {
			should_rewrite = btree_gc_mark_node(last->b, gc);
			if (should_rewrite) {
				n = btree_node_alloc_replacement(last->b,
								 false);

				if (!IS_ERR_OR_NULL(n)) {
					bch_btree_node_write_sync(n);
					bch_keylist_add(&keys, &n->key);

					make_btree_freeing_key(last->b,
							       keys.top);
					bch_keylist_push(&keys);

					btree_node_free(last->b);

					bch_btree_insert_node(b, op, &keys,
							      NULL, NULL);
					BUG_ON(!bch_keylist_empty(&keys));

					rw_unlock(true, last->b);
					last->b = n;

					/* Invalidated our iterator */
					ret = -EINTR;
					break;
				}
			}

			if (last->b->level) {
				ret = btree_gc_recurse(last->b, op, writes, gc);
				if (ret)
					break;
			}

			bkey_copy_key(&b->c->gc_done, &last->b->key);

			/*
			 * Must flush leaf nodes before gc ends, since replace
			 * operations aren't journalled
			 */
			if (btree_node_dirty(last->b))
				bch_btree_node_write(last->b, writes);
			rw_unlock(true, last->b);
		}

		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
		r->b = NULL;

		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
	}

	for (i = 0; i < GC_MERGE_NODES; i++)
		if (!IS_ERR_OR_NULL(r[i].b)) {
			if (btree_node_dirty(r[i].b))
				bch_btree_node_write(r[i].b, writes);
			rw_unlock(true, r[i].b);
		}

	bch_keylist_free(&keys);

	return ret;
}
*b
, struct btree_op
*op
,
1485 struct closure
*writes
, struct gc_stat
*gc
)
1487 struct btree
*n
= NULL
;
1489 bool should_rewrite
;
1491 should_rewrite
= btree_gc_mark_node(b
, gc
);
1492 if (should_rewrite
) {
1493 n
= btree_node_alloc_replacement(b
, false);
1495 if (!IS_ERR_OR_NULL(n
)) {
1496 bch_btree_node_write_sync(n
);
1497 bch_btree_set_root(n
);
1506 ret
= btree_gc_recurse(b
, op
, writes
, gc
);
1511 bkey_copy_key(&b
->c
->gc_done
, &b
->key
);
1516 static void btree_gc_start(struct cache_set
*c
)
1522 if (!c
->gc_mark_valid
)
1525 mutex_lock(&c
->bucket_lock
);
1527 c
->gc_mark_valid
= 0;
1528 c
->gc_done
= ZERO_KEY
;
1530 for_each_cache(ca
, c
, i
)
1531 for_each_bucket(b
, ca
) {
1533 if (!atomic_read(&b
->pin
)) {
1534 SET_GC_MARK(b
, GC_MARK_RECLAIMABLE
);
1535 SET_GC_SECTORS_USED(b
, 0);
1539 mutex_unlock(&c
->bucket_lock
);
1542 size_t bch_btree_gc_finish(struct cache_set
*c
)
1544 size_t available
= 0;
1549 mutex_lock(&c
->bucket_lock
);
1552 c
->gc_mark_valid
= 1;
1556 for (i
= 0; i
< KEY_PTRS(&c
->root
->key
); i
++)
1557 SET_GC_MARK(PTR_BUCKET(c
, &c
->root
->key
, i
),
1560 for (i
= 0; i
< KEY_PTRS(&c
->uuid_bucket
); i
++)
1561 SET_GC_MARK(PTR_BUCKET(c
, &c
->uuid_bucket
, i
),
1564 for_each_cache(ca
, c
, i
) {
1567 ca
->invalidate_needs_gc
= 0;
1569 for (i
= ca
->sb
.d
; i
< ca
->sb
.d
+ ca
->sb
.keys
; i
++)
1570 SET_GC_MARK(ca
->buckets
+ *i
, GC_MARK_METADATA
);
1572 for (i
= ca
->prio_buckets
;
1573 i
< ca
->prio_buckets
+ prio_buckets(ca
) * 2; i
++)
1574 SET_GC_MARK(ca
->buckets
+ *i
, GC_MARK_METADATA
);
1576 for_each_bucket(b
, ca
) {
1577 b
->last_gc
= b
->gc_gen
;
1578 c
->need_gc
= max(c
->need_gc
, bucket_gc_gen(b
));
1580 if (!atomic_read(&b
->pin
) &&
1581 GC_MARK(b
) == GC_MARK_RECLAIMABLE
) {
1583 if (!GC_SECTORS_USED(b
))
1584 bch_bucket_add_unused(ca
, b
);
1589 mutex_unlock(&c
->bucket_lock
);
static void bch_btree_gc(struct cache_set *c)
{
	int ret;
	unsigned long available;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;
	uint64_t start_time = local_clock();

	trace_bcache_gc_start(c);

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init(&op, SHRT_MAX);

	btree_gc_start(c);

	ret = btree_root(gc_root, c, &op, &writes, &stats);
	closure_sync(&writes);

	if (ret && ret != -EAGAIN)
		pr_warn("gc failed!");

	available = bch_btree_gc_finish(c);
	wake_up_allocators(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.data	<<= 9;
	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);

	bch_moving_gc(c);
}
static int bch_gc_thread(void *arg)
{
	struct cache_set *c = arg;
	struct cache *ca;
	unsigned i;

	while (1) {
again:
		bch_btree_gc(c);

		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;

		mutex_lock(&c->bucket_lock);

		for_each_cache(ca, c, i)
			if (ca->invalidate_needs_gc) {
				mutex_unlock(&c->bucket_lock);
				set_current_state(TASK_RUNNING);
				goto again;
			}

		mutex_unlock(&c->bucket_lock);

		try_to_freeze();
		schedule();
	}

	return 0;
}
int bch_gc_thread_start(struct cache_set *c)
{
	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
	if (IS_ERR(c->gc_thread))
		return PTR_ERR(c->gc_thread);

	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
	return 0;
}
/* Initial partial gc */

static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
				   unsigned long **seen)
{
	int ret = 0;
	unsigned i;
	struct bkey *k, *p = NULL;
	struct bucket *g;
	struct btree_iter iter;

	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
		for (i = 0; i < KEY_PTRS(k); i++) {
			if (!ptr_available(b->c, k, i))
				continue;

			g = PTR_BUCKET(b->c, k, i);

			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
						seen[PTR_DEV(k, i)]) ||
			    !ptr_stale(b->c, k, i)) {
				g->gen = PTR_GEN(k, i);

				if (b->level)
					g->prio = BTREE_PRIO;
				else if (g->prio == BTREE_PRIO)
					g->prio = INITIAL_PRIO;
			}
		}

		btree_mark_key(b, k);
	}

	if (b->level) {
		bch_btree_iter_init(b, &iter, NULL);

		do {
			k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
			if (k)
				btree_node_prefetch(b->c, k, b->level - 1);

			if (p)
				ret = btree(check_recurse, p, b, op, seen);

			p = k;
		} while (p && !ret);
	}

	return ret;
}
int bch_btree_check(struct cache_set *c)
{
	int ret = -ENOMEM;
	unsigned i;
	unsigned long *seen[MAX_CACHES_PER_SET];
	struct btree_op op;

	memset(seen, 0, sizeof(seen));
	bch_btree_op_init(&op, SHRT_MAX);

	for (i = 0; c->cache[i]; i++) {
		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
		seen[i] = kmalloc(n, GFP_KERNEL);
		if (!seen[i])
			goto err;

		/* Disables the seen array until prio_read() uses it too */
		memset(seen[i], 0xFF, n);
	}

	ret = btree_root(check_recurse, c, &op, seen);
err:
	for (i = 0; i < MAX_CACHES_PER_SET; i++)
		kfree(seen[i]);
	return ret;
}
1755 static void shift_keys(struct btree
*b
, struct bkey
*where
, struct bkey
*insert
)
1757 struct bset
*i
= b
->sets
[b
->nsets
].data
;
1759 memmove((uint64_t *) where
+ bkey_u64s(insert
),
1761 (void *) end(i
) - (void *) where
);
1763 i
->keys
+= bkey_u64s(insert
);
1764 bkey_copy(where
, insert
);
1765 bch_bset_fix_lookup_table(b
, where
);
static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
				    struct btree_iter *iter,
				    struct bkey *replace_key)
{
	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
	{
		if (KEY_DIRTY(k))
			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
						     offset, -sectors);
	}

	uint64_t old_offset;
	unsigned old_size, sectors_found = 0;

	while (1) {
		struct bkey *k = bch_btree_iter_next(iter);
		if (!k ||
		    bkey_cmp(&START_KEY(k), insert) >= 0)
			break;

		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
			continue;

		old_offset = KEY_START(k);
		old_size = KEY_SIZE(k);

		/*
		 * We might overlap with 0 size extents; we can't skip these
		 * because if they're in the set we're inserting to we have to
		 * adjust them so they don't overlap with the key we're
		 * inserting. But we don't want to check them for replace
		 * operations.
		 */

		if (replace_key && KEY_SIZE(k)) {
			/*
			 * k might have been split since we inserted/found the
			 * key we're replacing
			 */
			unsigned i;
			uint64_t offset = KEY_START(k) -
				KEY_START(replace_key);

			/* But it must be a subset of the replace key */
			if (KEY_START(k) < KEY_START(replace_key) ||
			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
				goto check_failed;

			/* We didn't find a key that we were supposed to */
			if (KEY_START(k) > KEY_START(insert) + sectors_found)
				goto check_failed;

			if (KEY_PTRS(replace_key) != KEY_PTRS(k))
				goto check_failed;

			/* skip past gen */
			offset <<= 8;

			BUG_ON(!KEY_PTRS(replace_key));

			for (i = 0; i < KEY_PTRS(replace_key); i++)
				if (k->ptr[i] != replace_key->ptr[i] + offset)
					goto check_failed;

			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
		}

		if (bkey_cmp(insert, k) < 0 &&
		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
			/*
			 * We overlapped in the middle of an existing key: that
			 * means we have to split the old key. But we have to do
			 * slightly different things depending on whether the
			 * old key has been written out yet.
			 */

			struct bkey *top;

			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));

			if (bkey_written(b, k)) {
				/*
				 * We insert a new key to cover the top of the
				 * old key, and the old key is modified in place
				 * to represent the bottom split.
				 *
				 * It's completely arbitrary whether the new key
				 * is the top or the bottom, but it has to match
				 * up with what btree_sort_fixup() does - it
				 * doesn't check for this kind of overlap, it
				 * depends on us inserting a new key for the top
				 * here.
				 */
				top = bch_bset_search(b, &b->sets[b->nsets],
						      insert);
				shift_keys(b, top, k);
			} else {
				BKEY_PADDED(key) temp;
				bkey_copy(&temp.key, k);
				shift_keys(b, k, &temp.key);
				top = bkey_next(k);
			}

			bch_cut_front(insert, top);
			bch_cut_back(&START_KEY(insert), k);
			bch_bset_fix_invalidated_key(b, k);
			return false;
		}

		if (bkey_cmp(insert, k) < 0) {
			bch_cut_front(insert, k);
		} else {
			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
				old_offset = KEY_START(insert);

			if (bkey_written(b, k) &&
			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
				/*
				 * Completely overwrote, so we don't have to
				 * invalidate the binary search tree
				 */
				bch_cut_front(k, k);
			} else {
				__bch_cut_back(&START_KEY(insert), k);
				bch_bset_fix_invalidated_key(b, k);
			}
		}

		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
	}

check_failed:
	if (replace_key) {
		if (!sectors_found) {
			return true;
		} else if (sectors_found < KEY_SIZE(insert)) {
			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
				       (KEY_SIZE(insert) - sectors_found));
			SET_KEY_SIZE(insert, sectors_found);
		}
	}

	return false;
}
static bool btree_insert_key(struct btree *b, struct btree_op *op,
			     struct bkey *k, struct bkey *replace_key)
{
	struct bset *i = b->sets[b->nsets].data;
	struct bkey *m, *prev;
	unsigned status = BTREE_INSERT_STATUS_INSERT;

	BUG_ON(bkey_cmp(k, &b->key) > 0);
	BUG_ON(b->level && !KEY_PTRS(k));
	BUG_ON(!b->level && !KEY_OFFSET(k));

	if (!b->level) {
		struct btree_iter iter;

		/*
		 * bset_search() returns the first key that is strictly greater
		 * than the search key - but for back merging, we want to find
		 * the previous key.
		 */
		prev = NULL;
		m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));

		if (fix_overlapping_extents(b, k, &iter, replace_key)) {
			op->insert_collision = true;
			return false;
		}

		if (KEY_DIRTY(k))
			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
						     KEY_START(k), KEY_SIZE(k));

		while (m != end(i) &&
		       bkey_cmp(k, &START_KEY(m)) > 0)
			prev = m, m = bkey_next(m);

		if (key_merging_disabled(b->c))
			goto insert;

		/* prev is in the tree, if we merge we're done */
		status = BTREE_INSERT_STATUS_BACK_MERGE;
		if (prev &&
		    bch_bkey_try_merge(b, prev, k))
			goto merged;

		status = BTREE_INSERT_STATUS_OVERWROTE;
		if (m != end(i) &&
		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
			goto copy;

		status = BTREE_INSERT_STATUS_FRONT_MERGE;
		if (m != end(i) &&
		    bch_bkey_try_merge(b, k, m))
			goto copy;
	} else {
		BUG_ON(replace_key);
		m = bch_bset_search(b, &b->sets[b->nsets], k);
	}

insert:	shift_keys(b, m, k);
copy:	bkey_copy(m, k);
merged:
	bch_check_keys(b, "%u for %s", status,
		       replace_key ? "replace" : "insert");

	if (b->level && !KEY_OFFSET(k))
		btree_current_write(b)->prio_blocked++;

	trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);

	return true;
}
static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{
	bool ret = false;
	int oldsize = bch_count_data(b);

	while (!bch_keylist_empty(insert_keys)) {
		struct bset *i = write_block(b);
		struct bkey *k = insert_keys->keys;

		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
		    > btree_blocks(b))
			break;

		if (bkey_cmp(k, &b->key) <= 0) {
			if (!b->level)
				bkey_put(b->c, k);

			ret |= btree_insert_key(b, op, k, replace_key);
			bch_keylist_pop_front(insert_keys);
		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
			BKEY_PADDED(key) temp;
			bkey_copy(&temp.key, insert_keys->keys);

			bch_cut_back(&b->key, &temp.key);
			bch_cut_front(&b->key, insert_keys->keys);

			ret |= btree_insert_key(b, op, &temp.key, replace_key);
			break;
		} else {
			break;
		}
	}

	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);

	BUG_ON(bch_count_data(b) < oldsize);
	return ret;
}
static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{
	bool split;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();
	struct closure cl;
	struct keylist parent_keys;

	closure_init_stack(&cl);
	bch_keylist_init(&parent_keys);

	n1 = btree_node_alloc_replacement(b, true);
	if (IS_ERR(n1))
		goto err;

	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;

	if (split) {
		unsigned keys = 0;

		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);

		n2 = bch_btree_node_alloc(b->c, b->level, true);
		if (IS_ERR(n2))
			goto err_free1;

		if (!b->parent) {
			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
			if (IS_ERR(n3))
				goto err_free2;
		}

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);

		/*
		 * Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

		while (keys < (n1->sets[0].data->keys * 3) / 5)
			keys += bkey_u64s(node(n1->sets[0].data, keys));

		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
		keys += bkey_u64s(node(n1->sets[0].data, keys));

		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
		n1->sets[0].data->keys = keys;

		memcpy(n2->sets[0].data->start,
		       end(n1->sets[0].data),
		       n2->sets[0].data->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&parent_keys, &n2->key);
		bch_btree_node_write(n2, &cl);
		rw_unlock(true, n2);
	} else {
		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
	}

	bch_keylist_add(&parent_keys, &n1->key);
	bch_btree_node_write(n1, &cl);

	if (n3) {
		/* Depth increases, make a new root */
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
		bch_btree_node_write(n3, &cl);

		closure_sync(&cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);

		btree_node_free(b);
	} else if (!b->parent) {
		/* Root filled up but didn't need to be split */
		closure_sync(&cl);
		bch_btree_set_root(n1);

		btree_node_free(b);
	} else {
		/* Split a non root node */
		closure_sync(&cl);
		make_btree_freeing_key(b, parent_keys.top);
		bch_keylist_push(&parent_keys);

		btree_node_free(b);

		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
		BUG_ON(!bch_keylist_empty(&parent_keys));
	}

	rw_unlock(true, n1);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	btree_node_free(n2);
	rw_unlock(true, n2);
err_free1:
	btree_node_free(n1);
	rw_unlock(true, n1);
err:
	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	pr_warn("couldn't split");
	return -ENOMEM;
}
static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{
	BUG_ON(b->level && replace_key);

	if (should_split(b)) {
		if (current->bio_list) {
			op->lock = b->c->root->level + 1;
			return -EAGAIN;
		} else if (op->lock <= b->c->root->level) {
			op->lock = b->c->root->level + 1;
			return -EINTR;
		} else {
			/* Invalidated all iterators */
			return btree_split(b, op, insert_keys, replace_key) ?:
			       -EINTR;
		}
	} else {
		BUG_ON(write_block(b) != b->sets[b->nsets].data);

		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
			if (!b->level)
				bch_btree_leaf_dirty(b, journal_ref);
			else
				bch_btree_node_write_sync(b);
		}

		return 0;
	}
}
int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{
	int ret = -EINTR;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	struct keylist insert;
	bool upgrade = op->lock == -1;

	bch_keylist_init(&insert);

	if (upgrade) {
		rw_unlock(false, b);
		rw_lock(true, b, b->level);

		if (b->key.ptr[0] != btree_ptr ||
		    b->seq != seq + 1)
			goto out;
	}

	SET_KEY_PTRS(check_key, 1);
	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);

	bch_keylist_add(&insert, check_key);

	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);

	BUG_ON(!ret && !bch_keylist_empty(&insert));
out:
	if (upgrade)
		downgrade_write(&b->lock);
	return ret;
}
struct btree_insert_op {
	struct btree_op	op;
	struct keylist	*keys;
	atomic_t	*journal_ref;
	struct bkey	*replace_key;
};
static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
	struct btree_insert_op *op = container_of(b_op,
					struct btree_insert_op, op);

	int ret = bch_btree_insert_node(b, &op->op, op->keys,
					op->journal_ref, op->replace_key);
	if (ret && !bch_keylist_empty(op->keys))
		return ret;
	else
		return MAP_DONE;
}
int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{
	struct btree_insert_op op;
	int ret = 0;

	BUG_ON(current->bio_list);
	BUG_ON(bch_keylist_empty(keys));

	bch_btree_op_init(&op.op, 0);
	op.keys		= keys;
	op.journal_ref	= journal_ref;
	op.replace_key	= replace_key;

	while (!ret && !bch_keylist_empty(keys)) {
		op.op.lock = 0;
		ret = bch_btree_map_leaf_nodes(&op.op, c,
					       &START_KEY(keys->keys),
					       btree_insert_fn);
	}

	if (ret) {
		struct bkey *k;

		pr_err("error %i", ret);

		while ((k = bch_keylist_pop(keys)))
			bkey_put(c, k);
	} else if (op.op.insert_collision)
		ret = -ESRCH;

	return ret;
}
void bch_btree_set_root(struct btree *b)
{
	unsigned i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}
/* Map across nodes or keys */

static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(b, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, b,
						       bch_ptr_bad))) {
			ret = btree(map_nodes_recurse, k, b,
				    op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}
int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
}
static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}
int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}
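
/*
 * Illustrative sketch (assumption, not in the original file): the shape of a
 * bch_btree_map_keys() callback.  The function name is made up;
 * refill_keybuf_fn() below is the real callback this file registers.
 * Returning MAP_CONTINUE keeps the walk going, MAP_DONE stops it.
 */
static int bch_example_count_keys_fn(struct btree_op *op, struct btree *b,
				     struct bkey *k)
{
	/* a real callback would inspect or collect @k here */
	return MAP_CONTINUE;
}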
static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}
struct refill {
	struct btree_op	op;
	unsigned	nr_found;
	struct keybuf	*buf;
	struct bkey	*end;
	keybuf_pred_fn	*pred;
};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) >= 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found	= 0;
	refill.buf	= buf;
	refill.end	= end;
	refill.pred	= pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;
		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}
static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}
bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;
	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}
*bch_keybuf_next(struct keybuf
*buf
)
2513 struct keybuf_key
*w
;
2514 spin_lock(&buf
->lock
);
2516 w
= RB_FIRST(&buf
->keys
, struct keybuf_key
, node
);
2518 while (w
&& w
->private)
2519 w
= RB_NEXT(w
, node
);
2522 w
->private = ERR_PTR(-EINTR
);
2524 spin_unlock(&buf
->lock
);
2528 struct keybuf_key
*bch_keybuf_next_rescan(struct cache_set
*c
,
2531 keybuf_pred_fn
*pred
)
2533 struct keybuf_key
*ret
;
2536 ret
= bch_keybuf_next(buf
);
2540 if (bkey_cmp(&buf
->last_scanned
, end
) >= 0) {
2541 pr_debug("scan finished");
2545 bch_refill_keybuf(c
, buf
, end
, pred
);
void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}
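
/*
 * Illustrative sketch (assumption, not in the original file): how a keybuf
 * consumer such as the writeback or moving-gc code drives this API.  The
 * predicate and drain function are hypothetical, and keybuf_pred_fn is
 * assumed to take (struct keybuf *, struct bkey *); a real user would issue
 * IO for each key before deleting it.
 */
static bool bch_example_keybuf_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);	/* e.g. only buffer dirty extents */
}

static void bch_example_drain_keybuf(struct cache_set *c, struct keybuf *buf)
{
	struct keybuf_key *w;

	bch_keybuf_init(buf);

	while ((w = bch_keybuf_next_rescan(c, buf, &MAX_KEY,
					   bch_example_keybuf_pred)))
		bch_keybuf_del(buf, w);
}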
void bch_btree_exit(void)
{
	if (btree_io_wq)
		destroy_workqueue(btree_io_wq);
}

int __init bch_btree_init(void)
{
	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
	if (!btree_io_wq)
		return -ENOMEM;

	return 0;
}