/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are
 * important, but the code is named as if it's the priorities) are written in
 * an arbitrary list of buckets on disk, with a pointer to them in the journal
 * header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is
 * a smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one or more buckets from different caches
 * out of a cache set.
 *
 * bch_allocator_thread() drives all the processes described above; it's woken
 * from bch_bucket_alloc() and a few other places that need to make sure free
 * buckets are ready.
 *
 * invalidate_buckets_(lru|fifo|random)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in lru, fifo, or random order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/blkdev.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

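/*
 * Illustrative sketch (not part of the original file): the validity rule
 * described in the header comment - a pointer is only usable while the gen
 * embedded in it matches the bucket's current gen. bcache implements this
 * with ptr_stale(); modulo 8 bit wraparound handling, the check amounts to:
 */
static inline bool example_ptr_gen_matches(struct cache_set *c,
					   const struct bkey *k,
					   unsigned i)
{
	/* Bucket reused (gen bumped) since this pointer was created? */
	return PTR_BUCKET(c, k, i)->gen == PTR_GEN(k, i);
}
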
/* Bucket heap / gen */

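/*
 * Increment a bucket's gen, invalidating any live pointers into it. Also
 * track how far in-memory gens have diverged from the gens last seen by
 * garbage collection (need_gc) and last written to disk (need_save_prio),
 * so both can be forced to happen before a gen wraps around.
 */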
uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
{
	uint8_t ret = ++b->gen;

	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);

	if (CACHE_SYNC(&ca->set->sb)) {
		ca->need_save_prio = max(ca->need_save_prio,
					 bucket_disk_gen(b));
		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
	}

	return ret;
}

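/*
 * Age cached data: once per roughly 1/1024th of the cache's total capacity
 * written, decrement the prio of every unpinned data bucket, so that
 * recently written buckets sort as hotter in the heap that
 * invalidate_buckets_lru() builds.
 */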
void bch_rescale_priorities(struct cache_set *c, int sectors)
{
	struct cache *ca;
	struct bucket *b;
	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
	unsigned i;
	int r;

	atomic_sub(sectors, &c->rescale);

	do {
		r = atomic_read(&c->rescale);

		if (r >= 0)
			return;
	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);

	mutex_lock(&c->bucket_lock);

	c->min_prio = USHRT_MAX;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca)
			if (b->prio &&
			    b->prio != BTREE_PRIO &&
			    !atomic_read(&b->pin)) {
				b->prio--;
				c->min_prio = min(c->min_prio, b->prio);
			}

	mutex_unlock(&c->bucket_lock);
}

/* Allocation */

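/*
 * A bucket may only be invalidated if bumping its gen wouldn't push it past
 * the wraparound guards described in the header comment, relative to either
 * the last garbage collection or the last prio write.
 */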
static inline bool can_inc_bucket_gen(struct bucket *b)
{
	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
}

bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
{
	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));

	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
		return false;

	b->prio = 0;

	if (can_inc_bucket_gen(b) &&
	    fifo_push(&ca->unused, b - ca->buckets)) {
		atomic_inc(&b->pin);
		return true;
	}

	return false;
}

static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
		!atomic_read(&b->pin) &&
		can_inc_bucket_gen(b);
}

static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	bch_inc_gen(ca, b);
	b->prio = INITIAL_PRIO;
	atomic_inc(&b->pin);
	fifo_push(&ca->free_inc, b - ca->buckets);
}

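/*
 * The lru heap is ordered by (prio relative to the oldest prio in the cache
 * set) scaled by the number of live sectors: a cold and mostly empty bucket
 * is the cheapest one to invalidate.
 */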
#define bucket_prio(b)							\
	(((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))

#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))

static void invalidate_buckets_lru(struct cache *ca)
{
	struct bucket *b;
	ssize_t i;

	ca->heap.used = 0;

	for_each_bucket(b, ca) {
		/*
		 * If we fill up the unused list, then if we return before
		 * adding anything to the free_inc list we'll skip writing
		 * prios/gens and just go back to allocating from the unused
		 * list:
		 */
		if (fifo_full(&ca->unused))
			return;

		if (!can_invalidate_bucket(ca, b))
			continue;

		if (!GC_SECTORS_USED(b) &&
		    bch_bucket_add_unused(ca, b))
			continue;

		if (!heap_full(&ca->heap))
			heap_add(&ca->heap, b, bucket_max_cmp);
		else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
			ca->heap.data[0] = b;
			heap_sift(&ca->heap, 0, bucket_max_cmp);
		}
	}

	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
		heap_sift(&ca->heap, i, bucket_min_cmp);

	while (!fifo_full(&ca->free_inc)) {
		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
			/*
			 * We don't want to be calling invalidate_buckets()
			 * multiple times when it can't do anything
			 */
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}

		invalidate_one_bucket(ca, b);
	}
}

static void invalidate_buckets_fifo(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		if (ca->fifo_last_bucket < ca->sb.first_bucket ||
		    ca->fifo_last_bucket >= ca->sb.nbuckets)
			ca->fifo_last_bucket = ca->sb.first_bucket;

		b = ca->buckets + ca->fifo_last_bucket++;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets_random(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		size_t n;
		get_random_bytes(&n, sizeof(n));

		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
		n += ca->sb.first_bucket;

		b = ca->buckets + n;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets / 2) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets(struct cache *ca)
{
	if (ca->invalidate_needs_gc)
		return;

	switch (CACHE_REPLACEMENT(&ca->sb)) {
	case CACHE_REPLACEMENT_LRU:
		invalidate_buckets_lru(ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		invalidate_buckets_fifo(ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		invalidate_buckets_random(ca);
		break;
	}

	trace_bcache_alloc_invalidate(ca);
}

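/*
 * allocator_wait() is wait_event() specialized for the allocator thread: it
 * drops bucket_lock around schedule(), and it has to be a macro because it
 * returns out of bch_allocator_thread() when the thread is told to stop.
 */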
#define allocator_wait(ca, cond)					\
do {									\
	while (1) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (cond)						\
			break;						\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (kthread_should_stop())				\
			return 0;					\
									\
		try_to_freeze();					\
		schedule();						\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
	__set_current_state(TASK_RUNNING);				\
} while (0)

static int bch_allocator_thread(void *arg)
{
	struct cache *ca = arg;

	mutex_lock(&ca->set->bucket_lock);

	while (1) {
		/*
		 * First, we pull buckets off of the unused and free_inc lists,
		 * possibly issue discards to them, then we add the bucket to
		 * the free list:
		 */
		while (1) {
			long bucket;

			if ((!atomic_read(&ca->set->prio_blocked) ||
			     !CACHE_SYNC(&ca->set->sb)) &&
			    !fifo_empty(&ca->unused))
				fifo_pop(&ca->unused, bucket);
			else if (!fifo_empty(&ca->free_inc))
				fifo_pop(&ca->free_inc, bucket);
			else
				break;

			if (ca->discard) {
				mutex_unlock(&ca->set->bucket_lock);
				blkdev_issue_discard(ca->bdev,
					bucket_to_sector(ca->set, bucket),
					ca->sb.block_size, GFP_KERNEL, 0);
				mutex_lock(&ca->set->bucket_lock);
			}

			allocator_wait(ca, !fifo_full(&ca->free));

			fifo_push(&ca->free, bucket);
			wake_up(&ca->set->bucket_wait);
		}

		/*
		 * We've run out of free buckets, we need to find some buckets
		 * we can invalidate. First, invalidate them in memory and add
		 * them to the free_inc list:
		 */

		allocator_wait(ca, ca->set->gc_mark_valid &&
			       (ca->need_save_prio > 64 ||
				!ca->invalidate_needs_gc));
		invalidate_buckets(ca);

		/*
		 * Now, we write their new gens to disk so we can start writing
		 * new stuff to them:
		 */
		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
		if (CACHE_SYNC(&ca->set->sb) &&
		    (!fifo_empty(&ca->free_inc) ||
		     ca->need_save_prio > 64))
			bch_prio_write(ca);
	}
}

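/*
 * Pop a bucket off ca->free, waking the allocator thread to refill the
 * list. If the freelist is at or below @watermark, either sleep until a
 * bucket shows up or, if @wait is false, fail with -1.
 */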
long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
{
	DEFINE_WAIT(w);
	struct bucket *b;
	long r;

	/* fastpath */
	if (fifo_used(&ca->free) > ca->watermark[watermark]) {
		fifo_pop(&ca->free, r);
		goto out;
	}

	if (!wait)
		return -1;

	while (1) {
		if (fifo_used(&ca->free) > ca->watermark[watermark]) {
			fifo_pop(&ca->free, r);
			break;
		}

		prepare_to_wait(&ca->set->bucket_wait, &w,
				TASK_UNINTERRUPTIBLE);

		mutex_unlock(&ca->set->bucket_lock);
		schedule();
		mutex_lock(&ca->set->bucket_lock);
	}

	finish_wait(&ca->set->bucket_wait, &w);
out:
	wake_up_process(ca->alloc_thread);

	if (expensive_debug_checks(ca->set)) {
		size_t iter;
		long i;

		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);

		fifo_for_each(i, &ca->free, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->free_inc, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->unused, iter)
			BUG_ON(i == r);
	}

	b = ca->buckets + r;

	BUG_ON(atomic_read(&b->pin) != 1);

	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);

	if (watermark <= WATERMARK_METADATA) {
		SET_GC_MARK(b, GC_MARK_METADATA);
		SET_GC_MOVE(b, 0);
		b->prio = BTREE_PRIO;
	} else {
		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		SET_GC_MOVE(b, 0);
		b->prio = INITIAL_PRIO;
	}

	return r;
}

void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bucket *b = PTR_BUCKET(c, k, i);

		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		SET_GC_SECTORS_USED(b, 0);
		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
	}
}

int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			   struct bkey *k, int n, bool wait)
{
	int i;

	lockdep_assert_held(&c->bucket_lock);
	BUG_ON(!n || n > c->caches_loaded || n > 8);

	bkey_init(k);

	/* sort by free space/prio of oldest data in caches */

	for (i = 0; i < n; i++) {
		struct cache *ca = c->cache_by_alloc[i];
		long b = bch_bucket_alloc(ca, watermark, wait);

		if (b == -1)
			goto err;

		k->ptr[i] = PTR(ca->buckets[b].gen,
				bucket_to_sector(c, b),
				ca->sb.nr_this_dev);

		SET_KEY_PTRS(k, i + 1);
	}

	return 0;
err:
	bch_bucket_free(c, k);
	bkey_put(c, k);
	return -1;
}

int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			 struct bkey *k, int n, bool wait)
{
	int ret;
	mutex_lock(&c->bucket_lock);
	ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
	mutex_unlock(&c->bucket_lock);
	return ret;
}

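/*
 * Hedged usage sketch (not part of the original file; the helper name and
 * error code are hypothetical): roughly how a caller such as btree node
 * allocation obtains a fresh bucket and a key pointing at it.
 */
static int __maybe_unused example_alloc_meta_bucket(struct cache_set *c,
						    struct bkey *k)
{
	/*
	 * Allocate one bucket at the metadata watermark, sleeping until one
	 * is available; on success @k holds a single pointer whose embedded
	 * gen matches the bucket's current gen.
	 */
	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, k, 1, true))
		return -ENOSPC;

	/*
	 * The caller now owns one refcount on the bucket (b->pin); it's
	 * dropped with bkey_put() once @k is inserted into the btree or
	 * abandoned.
	 */
	return 0;
}
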
/* Sector allocator */

struct open_bucket {
	struct list_head list;
	unsigned last_write_point;
	unsigned sectors_free;
	BKEY_PADDED(key);
};

/*
 * We keep multiple buckets open for writes, and try to segregate different
 * write streams for better cache utilization: first we look for a bucket where
 * the last write to it was sequential with the current write, and failing that
 * we look for a bucket that was last used by the same task.
 *
 * The idea is that if you've got multiple tasks pulling data into the cache at
 * the same time, you'll get better cache utilization if you try to segregate
 * their data and preserve locality.
 *
 * For example, say you're starting Firefox at the same time you're copying a
 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 * cache awhile, but the data you copied might not be; if you wrote all that
 * data to the same buckets it'd get invalidated at the same time.
 *
 * Both of those tasks will be doing fairly random IO so we can't rely on
 * detecting sequential IO to segregate their data, but going off of the task
 * should be a sane heuristic.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    unsigned write_point,
					    struct bkey *alloc)
{
	struct open_bucket *ret, *ret_task = NULL;

	list_for_each_entry_reverse(ret, &c->data_buckets, list)
		if (!bkey_cmp(&ret->key, search))
			goto found;
		else if (ret->last_write_point == write_point)
			ret_task = ret;

	ret = ret_task ?: list_first_entry(&c->data_buckets,
					   struct open_bucket, list);
found:
	if (!ret->sectors_free && KEY_PTRS(alloc)) {
		ret->sectors_free = c->sb.bucket_size;
		bkey_copy(&ret->key, alloc);
		bkey_init(alloc);
	}

	if (!ret->sectors_free)
		ret = NULL;

	return ret;
}

/*
 * Allocates some space in the cache to write to, sets k to point to the newly
 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 * end of the newly allocated space).
 *
 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
 * sectors were actually allocated.
 *
 * If @wait is true, will not fail.
 */
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
		       unsigned write_point, unsigned write_prio, bool wait)
{
	struct open_bucket *b;
	BKEY_PADDED(key) alloc;
	unsigned i;

	/*
	 * We might have to allocate a new bucket, which we can't do with a
	 * spinlock held. So if we have to allocate, we drop the lock, allocate
	 * and then retry. KEY_PTRS() indicates whether alloc points to
	 * allocated bucket(s).
	 */

	bkey_init(&alloc.key);
	spin_lock(&c->data_bucket_lock);

	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
		unsigned watermark = write_prio
			? WATERMARK_MOVINGGC
			: WATERMARK_NONE;

		spin_unlock(&c->data_bucket_lock);

		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
			return false;

		spin_lock(&c->data_bucket_lock);
	}

	/*
	 * If we had to allocate, we might race and not need to allocate the
	 * second time we call pick_data_bucket(). If we allocated a bucket but
	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
	 */
	if (KEY_PTRS(&alloc.key))
		bkey_put(c, &alloc.key);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		EBUG_ON(ptr_stale(c, &b->key, i));

	/* Set up the pointer to the space we're allocating: */

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		k->ptr[i] = b->key.ptr[i];

	sectors = min(sectors, b->sectors_free);

	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
	SET_KEY_SIZE(k, sectors);
	SET_KEY_PTRS(k, KEY_PTRS(&b->key));

	/*
	 * Move b to the end of the lru, and keep track of what this bucket was
	 * last used for:
	 */
	list_move_tail(&b->list, &c->data_buckets);
	bkey_copy_key(&b->key, k);
	b->last_write_point = write_point;

	b->sectors_free -= sectors;

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);

		atomic_long_add(sectors,
				&PTR_CACHE(c, &b->key, i)->sectors_written);
	}

	if (b->sectors_free < c->sb.block_size)
		b->sectors_free = 0;

	/*
	 * k takes refcounts on the buckets it points to until it's inserted
	 * into the btree, but if we're done with this bucket we just transfer
	 * bch_bucket_alloc()'s refcount.
	 */
	if (b->sectors_free)
		for (i = 0; i < KEY_PTRS(&b->key); i++)
			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);

	spin_unlock(&c->data_bucket_lock);
	return true;
}

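/*
 * Hedged usage sketch (not part of the original file; the helper name and
 * parameter choices are illustrative): the write path fills in a key with
 * the starting sector of the write, then reads back how much it got.
 * Passing write_prio == 0 selects the normal-data watermark.
 */
static bool __maybe_unused example_write_alloc(struct cache_set *c,
					       struct bkey *k,
					       unsigned sectors)
{
	/*
	 * On entry KEY_OFFSET(k) is the first sector to be written; on
	 * success it has been advanced past the allocated space and
	 * KEY_SIZE(k) says how many of @sectors we were actually given.
	 */
	return bch_alloc_sectors(c, k, sectors, 0, 0, true);
}
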
/* Init */

void bch_open_buckets_free(struct cache_set *c)
{
	struct open_bucket *b;

	while (!list_empty(&c->data_buckets)) {
		b = list_first_entry(&c->data_buckets,
				     struct open_bucket, list);
		list_del(&b->list);
		kfree(b);
	}
}

int bch_open_buckets_alloc(struct cache_set *c)
{
	int i;

	spin_lock_init(&c->data_bucket_lock);

	for (i = 0; i < 6; i++) {
		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
		if (!b)
			return -ENOMEM;

		list_add(&b->list, &c->data_buckets);
	}

	return 0;
}

int bch_cache_allocator_start(struct cache *ca)
{
	struct task_struct *k = kthread_run(bch_allocator_thread,
					    ca, "bcache_allocator");
	if (IS_ERR(k))
		return PTR_ERR(k);

	ca->alloc_thread = k;
	return 0;
}

int bch_cache_allocator_init(struct cache *ca)
{
	/*
	 * Reserve:
	 * Prio/gen writes first
	 * Then 8 for btree allocations
	 * Then half for the moving garbage collector
	 */

	ca->watermark[WATERMARK_PRIO] = 0;

	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);

	ca->watermark[WATERMARK_MOVINGGC] = 8 +
		ca->watermark[WATERMARK_METADATA];

	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
		ca->watermark[WATERMARK_MOVINGGC];

	return 0;
}