[mirror_ubuntu-artful-kernel.git] / drivers / md / bcache / alloc.c

/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens are important but
 * the code is named as if it's the priorities) are written in an arbitrary list
 * of buckets on disk, with a pointer to them in the journal header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is a
 * smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyways - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one or more buckets from different caches
 * out of a cache set.
 *
 * bch_allocator_thread() drives all the processes described above. It's woken
 * from bch_bucket_alloc() and from discard_finish(), which are the places that
 * need to make sure free buckets are ready.
 *
 * invalidate_buckets_(lru|fifo|random)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in lru, fifo or random order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

#define MAX_IN_FLIGHT_DISCARDS		8U

/* Bucket heap / gen */

uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
{
	uint8_t ret = ++b->gen;

	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);

	if (CACHE_SYNC(&ca->set->sb)) {
		ca->need_save_prio = max(ca->need_save_prio,
					 bucket_disk_gen(b));
		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
	}

	return ret;
}

/*
 * Charge @sectors against the cache set's rescale counter.  Once the
 * counter has gone negative, exactly one caller (the one whose cmpxchg
 * succeeds) refills it and ages every bucket: each bucket that has a
 * non-zero, non-btree prio and is not pinned gets its prio decremented,
 * and c->min_prio is recomputed from the buckets that were aged.
 */
void bch_rescale_priorities(struct cache_set *c, int sectors)
{
	struct cache *ca;
	struct bucket *b;
	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
	unsigned i;
	int r;

	atomic_sub(sectors, &c->rescale);

	/* Bail out while budget remains; otherwise race to refill it. */
	for (;;) {
		r = atomic_read(&c->rescale);
		if (r >= 0)
			return;

		if (atomic_cmpxchg(&c->rescale, r, r + next) == r)
			break;
	}

	mutex_lock(&c->bucket_lock);

	c->min_prio = USHRT_MAX;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			if (!b->prio ||
			    b->prio == BTREE_PRIO ||
			    atomic_read(&b->pin))
				continue;

			b->prio--;
			c->min_prio = min(c->min_prio, b->prio);
		}

	mutex_unlock(&c->bucket_lock);
}

/* Discard/TRIM */

/*
 * State for a single bucket discard request.  Instances are allocated up
 * front by bch_cache_allocator_init(), parked on ca->discards while idle,
 * taken off that list by do_discard() and put back by discard_finish().
 */
struct discard {
	struct list_head	list;	/* link on ca->discards while idle */
	struct work_struct	work;	/* runs discard_finish() */
	struct cache		*ca;	/* owning cache */
	long			bucket;	/* bucket currently being discarded */

	struct bio		bio;	/* the discard request itself */
	/* NOTE(review): presumably backs bio.bi_inline_vecs - confirm */
	struct bio_vec		bv;
};

static void discard_finish(struct work_struct *w)
{
	struct discard *d = container_of(w, struct discard, work);
	struct cache *ca = d->ca;
	char buf[BDEVNAME_SIZE];

	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
		pr_notice("discard error on %s, disabling",
			 bdevname(ca->bdev, buf));
		d->ca->discard = 0;
	}

	mutex_lock(&ca->set->bucket_lock);

	fifo_push(&ca->free, d->bucket);
	list_add(&d->list, &ca->discards);
	atomic_dec(&ca->discards_in_flight);

	mutex_unlock(&ca->set->bucket_lock);

	closure_wake_up(&ca->set->bucket_wait);
	wake_up_process(ca->alloc_thread);

	closure_put(&ca->set->cl);
}

/*
 * bio completion callback for discards: hand the rest of the work to
 * discard_finish() via the descriptor's work item.  @error is not looked
 * at here.
 */
static void discard_endio(struct bio *bio, int error)
{
	schedule_work(&container_of(bio, struct discard, bio)->work);
}

static void do_discard(struct cache *ca, long bucket)
{
	struct discard *d = list_first_entry(&ca->discards,
					     struct discard, list);

	list_del(&d->list);
	d->bucket = bucket;

	atomic_inc(&ca->discards_in_flight);
	closure_get(&ca->set->cl);

	bio_init(&d->bio);

	d->bio.bi_sector	= bucket_to_sector(ca->set, d->bucket);
	d->bio.bi_bdev		= ca->bdev;
	d->bio.bi_rw		= REQ_WRITE|REQ_DISCARD;
	d->bio.bi_max_vecs	= 1;
	d->bio.bi_io_vec	= d->bio.bi_inline_vecs;
	d->bio.bi_size		= bucket_bytes(ca);
	d->bio.bi_end_io	= discard_endio;
	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	submit_bio(0, &d->bio);
}

/* Allocation */

static inline bool can_inc_bucket_gen(struct bucket *b)
{
	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
}

/*
 * Try to place bucket @b of cache @ca on the unused list.
 *
 * @b must have a zero GC mark and no sectors in use.  With FIFO
 * replacement the bucket is refused while the free list is already above
 * the moving-gc watermark.  Otherwise the prio is reset to 0 and, if the
 * gen can still be incremented and the unused fifo has room, the bucket
 * is queued and pinned.
 *
 * Returns true if the bucket was queued, false otherwise.
 */
bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
{
	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));

	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
		return false;

	b->prio = 0;

	if (!can_inc_bucket_gen(b))
		return false;

	if (!fifo_push(&ca->unused, b - ca->buckets))
		return false;

	atomic_inc(&b->pin);
	return true;
}

static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
		!atomic_read(&b->pin) &&
		can_inc_bucket_gen(b);
}

/*
 * Invalidate @b: advance its gen, reset its prio to INITIAL_PRIO, pin it
 * and queue its index on ca->free_inc.
 */
static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	const long idx = b - ca->buckets;

	bch_inc_gen(ca, b);
	b->prio = INITIAL_PRIO;
	atomic_inc(&b->pin);
	fifo_push(&ca->free_inc, idx);
}

/*
 * bucket_prio() - weight used to order buckets for LRU invalidation:
 * the bucket's prio relative to the cache set's min_prio, multiplied by
 * the number of sectors in use.
 *
 * NOTE: this macro picks up a variable named `ca' from the enclosing
 * scope, and evaluates @b twice - pass only side-effect-free expressions.
 *
 * bucket_max_cmp()/bucket_min_cmp() are the "less than"/"greater than"
 * comparisons on that weight, in the form the heap helpers take.
 */
#define bucket_prio(b)				\
	(((unsigned) ((b)->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))

#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))

/*
 * LRU replacement: pick the invalidatable buckets with the lowest
 * bucket_prio() and move them onto free_inc.
 *
 * Pass 1 walks every bucket of @ca.  Buckets with no sectors in use are
 * offered to the unused list instead; the rest are collected in ca->heap
 * using bucket_max_cmp, and once the heap is full a candidate only
 * replaces the root when its bucket_prio() is lower than the root's.
 *
 * Pass 2 re-sifts the heap with bucket_min_cmp and pops buckets from it,
 * invalidating each one, until free_inc is full.  If the heap runs dry
 * first, invalidate_needs_gc is set and a gc is queued.
 */
static void invalidate_buckets_lru(struct cache *ca)
{
	struct bucket *b;
	ssize_t i;

	/* Start from an empty heap on every call. */
	ca->heap.used = 0;

	for_each_bucket(b, ca) {
		/*
		 * If we fill up the unused list, if we then return before
		 * adding anything to the free_inc list we'll skip writing
		 * prios/gens and just go back to allocating from the unused
		 * list:
		 */
		if (fifo_full(&ca->unused))
			return;

		if (!can_invalidate_bucket(ca, b))
			continue;

		/* Empty buckets go to the unused list when it accepts them. */
		if (!GC_SECTORS_USED(b) &&
		    bch_bucket_add_unused(ca, b))
			continue;

		if (!heap_full(&ca->heap))
			heap_add(&ca->heap, b, bucket_max_cmp);
		else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
			/* Lower weight than the current root: replace it. */
			ca->heap.data[0] = b;
			heap_sift(&ca->heap, 0, bucket_max_cmp);
		}
	}

	/* Re-order the collected buckets under the opposite comparison. */
	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
		heap_sift(&ca->heap, i, bucket_min_cmp);

	while (!fifo_full(&ca->free_inc)) {
		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
			/*
			 * We don't want to be calling invalidate_buckets()
			 * multiple times when it can't do anything
			 */
			ca->invalidate_needs_gc = 1;
			bch_queue_gc(ca->set);
			return;
		}

		invalidate_one_bucket(ca, b);
	}
}

/*
 * FIFO replacement: resume from ca->fifo_last_bucket and walk the bucket
 * array in order (wrapping back to first_bucket), invalidating every
 * bucket that qualifies, until free_inc is full.  If sb.nbuckets buckets
 * have been examined without filling free_inc, give up, flag that gc is
 * needed and queue one.
 */
static void invalidate_buckets_fifo(struct cache *ca)
{
	struct bucket *b;
	size_t scanned = 0;

	for (;;) {
		if (fifo_full(&ca->free_inc))
			return;

		/* Keep the cursor inside [first_bucket, nbuckets). */
		if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
		    ca->fifo_last_bucket >= ca->sb.nbuckets)
			ca->fifo_last_bucket = ca->sb.first_bucket;

		b = ca->buckets + ca->fifo_last_bucket++;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++scanned < ca->sb.nbuckets)
			continue;

		ca->invalidate_needs_gc = 1;
		bch_queue_gc(ca->set);
		return;
	}
}

/*
 * Random replacement: probe randomly chosen buckets in
 * [first_bucket, nbuckets) and invalidate those that qualify, until
 * free_inc is full.  After sb.nbuckets / 2 probes without filling
 * free_inc, give up, flag that gc is needed and queue one.
 */
static void invalidate_buckets_random(struct cache *ca)
{
	struct bucket *b;
	size_t probes = 0;

	for (;;) {
		size_t idx;

		if (fifo_full(&ca->free_inc))
			return;

		get_random_bytes(&idx, sizeof(idx));

		idx %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
		idx += ca->sb.first_bucket;

		b = ca->buckets + idx;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++probes < ca->sb.nbuckets / 2)
			continue;

		ca->invalidate_needs_gc = 1;
		bch_queue_gc(ca->set);
		return;
	}
}

/*
 * Run the invalidation pass selected by the cache's replacement policy,
 * then emit the alloc_invalidate tracepoint.  Does nothing at all while
 * invalidate_needs_gc is set.
 */
static void invalidate_buckets(struct cache *ca)
{
	if (!ca->invalidate_needs_gc) {
		switch (CACHE_REPLACEMENT(&ca->sb)) {
		case CACHE_REPLACEMENT_LRU:
			invalidate_buckets_lru(ca);
			break;
		case CACHE_REPLACEMENT_FIFO:
			invalidate_buckets_fifo(ca);
			break;
		case CACHE_REPLACEMENT_RANDOM:
			invalidate_buckets_random(ca);
			break;
		default:
			/* Any other policy value: nothing is invalidated. */
			break;
		}

		trace_bcache_alloc_invalidate(ca);
	}
}

/*
 * allocator_wait() - sleep until @cond becomes true.
 *
 * Must be entered with (ca)->set->bucket_lock held.  The lock is dropped
 * around schedule() and re-taken before @cond is evaluated again, so
 * @cond is always evaluated with the lock held.  On normal exit the lock
 * is held and the task is TASK_RUNNING.
 *
 * WARNING: hidden control flow.  If CACHE_SET_STOPPING_2 is set while
 * @cond is false, the macro drops the cache-set closure reference and
 * executes `return 0' in the *calling function*, with bucket_lock
 * released.  It is therefore only usable directly inside the allocator
 * thread function.
 *
 * The task state is put back to TASK_RUNNING before that early return so
 * that closure_put() is not called in TASK_INTERRUPTIBLE state.
 */
#define allocator_wait(ca, cond)					\
do {									\
	while (1) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (cond)						\
			break;						\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (test_bit(CACHE_SET_STOPPING_2,			\
			     &(ca)->set->flags)) {			\
			__set_current_state(TASK_RUNNING);		\
			closure_put(&(ca)->set->cl);			\
			return 0;					\
		}							\
									\
		schedule();						\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
	__set_current_state(TASK_RUNNING);				\
} while (0)

/*
 * Main loop of the per-cache allocator kthread (@arg is the struct cache).
 *
 * Runs with the cache set's bucket_lock held except while sleeping inside
 * allocator_wait().  Each outer iteration:
 *   1. drains the unused and free_inc lists into the free list, issuing a
 *      discard first when ca->discard is set;
 *   2. invalidates more buckets onto free_inc;
 *   3. in sync mode, writes prios/gens if free_inc is non-empty or
 *      need_save_prio has grown past 64.
 *
 * There is no explicit exit from the loop: every allocator_wait() may
 * return 0 from this function when CACHE_SET_STOPPING_2 is set.
 */
static int bch_allocator_thread(void *arg)
{
	struct cache *ca = arg;

	mutex_lock(&ca->set->bucket_lock);

	while (1) {
		/*
		 * First, we pull buckets off of the unused and free_inc lists,
		 * possibly issue discards to them, then we add the bucket to
		 * the free list:
		 */
		while (1) {
			long bucket;

			/*
			 * The unused list is only used while prio writes are
			 * not blocked, or when the cache set is not in sync
			 * mode; otherwise fall back to free_inc.
			 */
			if ((!atomic_read(&ca->set->prio_blocked) ||
			     !CACHE_SYNC(&ca->set->sb)) &&
			    !fifo_empty(&ca->unused))
				fifo_pop(&ca->unused, bucket);
			else if (!fifo_empty(&ca->free_inc))
				fifo_pop(&ca->free_inc, bucket);
			else
				break;

			/*
			 * Wait until the free list has more room than there
			 * are discards in flight (each of which pushes onto
			 * the free list when it completes).
			 */
			allocator_wait(ca, (int) fifo_free(&ca->free) >
				       atomic_read(&ca->discards_in_flight));

			if (ca->discard) {
				/* Need an idle discard descriptor first. */
				allocator_wait(ca, !list_empty(&ca->discards));
				do_discard(ca, bucket);
			} else {
				fifo_push(&ca->free, bucket);
				closure_wake_up(&ca->set->bucket_wait);
			}
		}

		/*
		 * We've run out of free buckets, we need to find some buckets
		 * we can invalidate. First, invalidate them in memory and add
		 * them to the free_inc list:
		 */

		allocator_wait(ca, ca->set->gc_mark_valid &&
			       (ca->need_save_prio > 64 ||
				!ca->invalidate_needs_gc));
		invalidate_buckets(ca);

		/*
		 * Now, we write their new gens to disk so we can start writing
		 * new stuff to them:
		 */
		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
		if (CACHE_SYNC(&ca->set->sb) &&
		    (!fifo_empty(&ca->free_inc) ||
		     ca->need_save_prio > 64))
			bch_prio_write(ca);
	}
}

/*
 * bch_bucket_alloc() - allocate a single bucket from cache @ca.
 *
 * @watermark: index into ca->watermark[]; a bucket is only handed out
 *	while the free list holds more than that many entries.
 * @cl: optional closure.  When non-NULL and the allocation fails, it is
 *	added to the cache set's bucket_wait list.  If it is additionally
 *	a blocking closure, bucket_lock is dropped, the closure is synced
 *	and the allocation is retried.
 *
 * The allocator thread is woken on every attempt.
 *
 * Expects the cache set's bucket_lock to be held by the caller (it is
 * released and re-acquired around closure_sync()).
 *
 * Returns the bucket index, or -1 if no bucket could be allocated.
 */
long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
{
	long r = -1;
again:
	wake_up_process(ca->alloc_thread);

	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
	    fifo_pop(&ca->free, r)) {
		struct bucket *b = ca->buckets + r;
#ifdef CONFIG_BCACHE_EDEBUG
		size_t iter;
		long i;

		/*
		 * Debug only: the bucket must not be one of the prio buckets
		 * and must not still be on any of the freelists.
		 */
		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);

		fifo_for_each(i, &ca->free, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->free_inc, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->unused, iter)
			BUG_ON(i == r);
#endif
		/* Exactly the one pin taken when it was put on a freelist. */
		BUG_ON(atomic_read(&b->pin) != 1);

		SET_GC_SECTORS_USED(b, ca->sb.bucket_size);

		/* Watermarks up to METADATA mark the bucket as metadata. */
		if (watermark <= WATERMARK_METADATA) {
			SET_GC_MARK(b, GC_MARK_METADATA);
			b->prio = BTREE_PRIO;
		} else {
			SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
			b->prio = INITIAL_PRIO;
		}

		return r;
	}

	trace_bcache_alloc_fail(ca);

	if (cl) {
		closure_wait(&ca->set->bucket_wait, cl);

		if (closure_blocking(cl)) {
			mutex_unlock(&ca->set->bucket_lock);
			closure_sync(cl);
			mutex_lock(&ca->set->bucket_lock);
			goto again;
		}
	}

	return -1;
}

/*
 * Release every bucket that key @k points to: each one is marked
 * reclaimable with zero sectors used and then offered to its cache's
 * unused list.  The result of that offer is not checked.
 */
void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned ptr;

	for (ptr = 0; ptr < KEY_PTRS(k); ptr++) {
		struct bucket *bucket = PTR_BUCKET(c, k, ptr);

		SET_GC_MARK(bucket, GC_MARK_RECLAIMABLE);
		SET_GC_SECTORS_USED(bucket, 0);

		bch_bucket_add_unused(PTR_CACHE(c, k, ptr), bucket);
	}
}

/*
 * Allocate @n buckets, one from each of the first @n caches in
 * c->cache_by_alloc[], and record them as pointers in key @k.
 *
 * The caller must hold c->bucket_lock.  @n must be non-zero and no larger
 * than either c->caches_loaded or 8.  @watermark and @cl are passed
 * straight through to bch_bucket_alloc().
 *
 * Returns 0 on success.  If any single allocation fails, the buckets
 * already recorded in @k are freed again, the key is put, and -1 is
 * returned.
 */
int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			   struct bkey *k, int n, struct closure *cl)
{
	int idx;

	lockdep_assert_held(&c->bucket_lock);
	BUG_ON(!n || n > c->caches_loaded || n > 8);

	bkey_init(k);

	/* sort by free space/prio of oldest data in caches */

	for (idx = 0; idx < n; idx++) {
		struct cache *ca = c->cache_by_alloc[idx];
		long bucket = bch_bucket_alloc(ca, watermark, cl);

		if (bucket == -1) {
			/* Undo the pointers added so far. */
			bch_bucket_free(c, k);
			__bkey_put(c, k);
			return -1;
		}

		k->ptr[idx] = PTR(ca->buckets[bucket].gen,
				  bucket_to_sector(c, bucket),
				  ca->sb.nr_this_dev);

		SET_KEY_PTRS(k, idx + 1);
	}

	return 0;
}

/*
 * Locked wrapper around __bch_bucket_alloc_set(): takes c->bucket_lock
 * for the duration of the call and hands back its return value.
 */
int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			 struct bkey *k, int n, struct closure *cl)
{
	int err;

	mutex_lock(&c->bucket_lock);
	err = __bch_bucket_alloc_set(c, watermark, k, n, cl);
	mutex_unlock(&c->bucket_lock);

	return err;
}

/* Init */

/*
 * Create and start the allocator thread for @ca.
 *
 * The result of kthread_create() is stored in ca->alloc_thread in every
 * case.  On success a reference on the cache set's closure is taken on
 * behalf of the thread before it is woken.
 *
 * Returns 0 on success or the error encoded by kthread_create().
 */
int bch_cache_allocator_start(struct cache *ca)
{
	struct task_struct *task;

	task = kthread_create(bch_allocator_thread, ca, "bcache_allocator");
	ca->alloc_thread = task;

	if (IS_ERR(task))
		return PTR_ERR(task);

	closure_get(&ca->set->cl);
	wake_up_process(task);

	return 0;
}

/*
 * Tear down the discard descriptors parked on ca->discards: for each one,
 * cancel its work item (waiting for it if it is running), unlink it and
 * free it.
 */
void bch_cache_allocator_exit(struct cache *ca)
{
	struct list_head *idle = &ca->discards;

	while (!list_empty(idle)) {
		struct discard *d = list_first_entry(idle,
						     struct discard, list);

		cancel_work_sync(&d->work);
		list_del(&d->list);
		kfree(d);
	}
}

/*
 * Set up the allocation watermarks for @ca and pre-allocate its pool of
 * MAX_IN_FLIGHT_DISCARDS discard descriptors.
 *
 * Watermarks are cumulative reservations on the free list:
 *   PRIO      - nothing held back (prio/gen writes come first)
 *   METADATA  - prio_buckets(ca)
 *   MOVINGGC  - a further 8 (for btree allocations)
 *   NONE      - a further half of the free list (for moving gc)
 *
 * Returns 0 on success, -ENOMEM if a descriptor cannot be allocated.
 * Descriptors allocated before the failure stay on ca->discards.
 */
int bch_cache_allocator_init(struct cache *ca)
{
	unsigned remaining;

	ca->watermark[WATERMARK_PRIO] = 0;
	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
	ca->watermark[WATERMARK_MOVINGGC] =
		ca->watermark[WATERMARK_METADATA] + 8;
	ca->watermark[WATERMARK_NONE] =
		ca->watermark[WATERMARK_MOVINGGC] + ca->free.size / 2;

	for (remaining = MAX_IN_FLIGHT_DISCARDS; remaining; remaining--) {
		struct discard *d = kzalloc(sizeof(struct discard),
					    GFP_KERNEL);

		if (!d)
			return -ENOMEM;

		INIT_WORK(&d->work, discard_finish);
		d->ca = ca;
		list_add(&d->list, &ca->discards);
	}

	return 0;
}
Commit	Line	Data
cafe5635 KO	1	/*
	2	* Primary bucket allocation code
	3	*
	4	* Copyright 2012 Google, Inc.
	5	*
	6	* Allocation in bcache is done in terms of buckets:
	7	*
	8	* Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
	9	* btree pointers - they must match for the pointer to be considered valid.
	10	*
	11	* Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
	12	* bucket simply by incrementing its gen.
	13	*
	14	* The gens (along with the priorities; it's really the gens are important but
	15	* the code is named as if it's the priorities) are written in an arbitrary list
	16	* of buckets on disk, with a pointer to them in the journal header.
	17	*
	18	* When we invalidate a bucket, we have to write its new gen to disk and wait
	19	* for that write to complete before we use it - otherwise after a crash we
	20	* could have pointers that appeared to be good but pointed to data that had
	21	* been overwritten.
	22	*
	23	* Since the gens and priorities are all stored contiguously on disk, we can
	24	* batch this up: We fill up the free_inc list with freshly invalidated buckets,
	25	* call prio_write(), and when prio_write() finishes we pull buckets off the
	26	* free_inc list and optionally discard them.
	27	*
	28	* free_inc isn't the only freelist - if it was, we'd often to sleep while
	29	* priorities and gens were being written before we could allocate. c->free is a
	30	* smaller freelist, and buckets on that list are always ready to be used.
	31	*
	32	* If we've got discards enabled, that happens when a bucket moves from the
	33	* free_inc list to the free list.
	34	*
	35	* There is another freelist, because sometimes we have buckets that we know
	36	* have nothing pointing into them - these we can reuse without waiting for
	37	* priorities to be rewritten. These come from freed btree nodes and buckets
	38	* that garbage collection discovered no longer had valid keys pointing into
	39	* them (because they were overwritten). That's the unused list - buckets on the
	40	* unused list move to the free list, optionally being discarded in the process.
	41	*
	42	* It's also important to ensure that gens don't wrap around - with respect to
	43	* either the oldest gen in the btree or the gen on disk. This is quite
	44	* difficult to do in practice, but we explicitly guard against it anyways - if
	45	* a bucket is in danger of wrapping around we simply skip invalidating it that
	46	* time around, and we garbage collect or rewrite the priorities sooner than we
	47	* would have otherwise.
	48	*
	49	* bch_bucket_alloc() allocates a single bucket from a specific cache.
	50	*
	51	* bch_bucket_alloc_set() allocates one or more buckets from different caches
	52	* out of a cache set.
	53	*
	54	* free_some_buckets() drives all the processes described above. It's called
	55	* from bch_bucket_alloc() and a few other places that need to make sure free
	56	* buckets are ready.
	57	*
	58	* invalidate_buckets_(lru\|fifo)() find buckets that are available to be
	59	* invalidated, and then invalidate them and stick them on the free_inc list -
	60	* in either lru or fifo order.
	61	*/
	62
	63	#include "bcache.h"
	64	#include "btree.h"
65
119ba0f8	66	#include <linux/kthread.h>
cafe5635	67	#include <linux/random.h>
c37511b8	68	#include <trace/events/bcache.h>
cafe5635 KO	69
	70	#define MAX_IN_FLIGHT_DISCARDS 8U
	71
	72	/* Bucket heap / gen */
	73
	74	uint8_t bch_inc_gen(struct cache ca, struct bucket b)
	75	{
	76	uint8_t ret = ++b->gen;
	77
	78	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	79	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
	80
	81	if (CACHE_SYNC(&ca->set->sb)) {
	82	ca->need_save_prio = max(ca->need_save_prio,
	83	bucket_disk_gen(b));
	84	WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
	85	}
	86
	87	return ret;
	88	}
	89
	90	void bch_rescale_priorities(struct cache_set *c, int sectors)
	91	{
	92	struct cache *ca;
	93	struct bucket *b;
	94	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
	95	unsigned i;
	96	int r;
	97
	98	atomic_sub(sectors, &c->rescale);
	99
	100	do {
	101	r = atomic_read(&c->rescale);
	102
	103	if (r >= 0)
	104	return;
	105	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);
	106
	107	mutex_lock(&c->bucket_lock);
	108
	109	c->min_prio = USHRT_MAX;
	110
	111	for_each_cache(ca, c, i)
	112	for_each_bucket(b, ca)
	113	if (b->prio &&
	114	b->prio != BTREE_PRIO &&
	115	!atomic_read(&b->pin)) {
	116	b->prio--;
	117	c->min_prio = min(c->min_prio, b->prio);
	118	}
	119
	120	mutex_unlock(&c->bucket_lock);
	121	}
	122
	123	/* Discard/TRIM */
	124
	125	struct discard {
	126	struct list_head list;
	127	struct work_struct work;
	128	struct cache *ca;
	129	long bucket;
	130
	131	struct bio bio;
	132	struct bio_vec bv;
133	};
134
135	static void discard_finish(struct work_struct *w)
136	{
137	struct discard *d = container_of(w, struct discard, work);
138	struct cache *ca = d->ca;
139	char buf[BDEVNAME_SIZE];
140
141	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
142	pr_notice("discard error on %s, disabling",
143	bdevname(ca->bdev, buf));
144	d->ca->discard = 0;
145	}
146
147	mutex_lock(&ca->set->bucket_lock);
148
149	fifo_push(&ca->free, d->bucket);
150	list_add(&d->list, &ca->discards);
151	atomic_dec(&ca->discards_in_flight);
152
153	mutex_unlock(&ca->set->bucket_lock);
154
155	closure_wake_up(&ca->set->bucket_wait);
119ba0f8	156	wake_up_process(ca->alloc_thread);
cafe5635 KO	157
	158	closure_put(&ca->set->cl);
	159	}
	160
	161	static void discard_endio(struct bio *bio, int error)
	162	{
	163	struct discard *d = container_of(bio, struct discard, bio);
	164	schedule_work(&d->work);
	165	}
	166
	167	static void do_discard(struct cache *ca, long bucket)
	168	{
	169	struct discard *d = list_first_entry(&ca->discards,
	170	struct discard, list);
	171
	172	list_del(&d->list);
	173	d->bucket = bucket;
	174
	175	atomic_inc(&ca->discards_in_flight);
	176	closure_get(&ca->set->cl);
	177
	178	bio_init(&d->bio);
	179
	180	d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket);
	181	d->bio.bi_bdev = ca->bdev;
	182	d->bio.bi_rw = REQ_WRITE\|REQ_DISCARD;
	183	d->bio.bi_max_vecs = 1;
	184	d->bio.bi_io_vec = d->bio.bi_inline_vecs;
	185	d->bio.bi_size = bucket_bytes(ca);
	186	d->bio.bi_end_io = discard_endio;
	187	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	188
	189	submit_bio(0, &d->bio);
	190	}
	191
	192	/* Allocation */
	193
	194	static inline bool can_inc_bucket_gen(struct bucket *b)
	195	{
	196	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
	197	bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
	198	}
	199
	200	bool bch_bucket_add_unused(struct cache ca, struct bucket b)
	201	{
	202	BUG_ON(GC_MARK(b) \|\| GC_SECTORS_USED(b));
	203
	204	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
	205	CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
	206	return false;
	207
	208	b->prio = 0;
	209
	210	if (can_inc_bucket_gen(b) &&
	211	fifo_push(&ca->unused, b - ca->buckets)) {
	212	atomic_inc(&b->pin);
	213	return true;
	214	}
	215
	216	return false;
	217	}
	218
	219	static bool can_invalidate_bucket(struct cache ca, struct bucket b)
	220	{
221	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
222	!atomic_read(&b->pin) &&
223	can_inc_bucket_gen(b);
224	}
225
226	static void invalidate_one_bucket(struct cache ca, struct bucket b)
227	{
228	bch_inc_gen(ca, b);
229	b->prio = INITIAL_PRIO;
230	atomic_inc(&b->pin);
231	fifo_push(&ca->free_inc, b - ca->buckets);
232	}
233
b1a67b0f KO	234	#define bucket_prio(b) \
b1a67b0f KO	235	(((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
cafe5635	236
b1a67b0f KO	237	#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
b1a67b0f KO	238	#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
cafe5635	239
b1a67b0f KO	240	static void invalidate_buckets_lru(struct cache *ca)
b1a67b0f KO	241	{
cafe5635 KO	242	struct bucket *b;
	243	ssize_t i;
	244
	245	ca->heap.used = 0;
	246
	247	for_each_bucket(b, ca) {
86b26b82 KO	248	/*
	249	* If we fill up the unused list, if we then return before
	250	* adding anything to the free_inc list we'll skip writing
	251	* prios/gens and just go back to allocating from the unused
	252	* list:
	253	*/
	254	if (fifo_full(&ca->unused))
	255	return;
	256
cafe5635 KO	257	if (!can_invalidate_bucket(ca, b))
	258	continue;
	259
86b26b82 KO	260	if (!GC_SECTORS_USED(b) &&
	261	bch_bucket_add_unused(ca, b))
	262	continue;
	263
	264	if (!heap_full(&ca->heap))
	265	heap_add(&ca->heap, b, bucket_max_cmp);
	266	else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
	267	ca->heap.data[0] = b;
	268	heap_sift(&ca->heap, 0, bucket_max_cmp);
cafe5635 KO	269	}
	270	}
	271
cafe5635 KO	272	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
	273	heap_sift(&ca->heap, i, bucket_min_cmp);
	274
	275	while (!fifo_full(&ca->free_inc)) {
	276	if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
86b26b82 KO	277	/*
86b26b82 KO	278	* We don't want to be calling invalidate_buckets()
cafe5635 KO	279	* multiple times when it can't do anything
	280	*/
	281	ca->invalidate_needs_gc = 1;
	282	bch_queue_gc(ca->set);
	283	return;
	284	}
	285
	286	invalidate_one_bucket(ca, b);
	287	}
	288	}
	289
	290	static void invalidate_buckets_fifo(struct cache *ca)
	291	{
	292	struct bucket *b;
	293	size_t checked = 0;
	294
	295	while (!fifo_full(&ca->free_inc)) {
	296	if (ca->fifo_last_bucket < ca->sb.first_bucket \|\|
	297	ca->fifo_last_bucket >= ca->sb.nbuckets)
	298	ca->fifo_last_bucket = ca->sb.first_bucket;
	299
	300	b = ca->buckets + ca->fifo_last_bucket++;
	301
	302	if (can_invalidate_bucket(ca, b))
	303	invalidate_one_bucket(ca, b);
	304
	305	if (++checked >= ca->sb.nbuckets) {
	306	ca->invalidate_needs_gc = 1;
	307	bch_queue_gc(ca->set);
	308	return;
	309	}
	310	}
	311	}
	312
	313	static void invalidate_buckets_random(struct cache *ca)
	314	{
	315	struct bucket *b;
	316	size_t checked = 0;
	317
	318	while (!fifo_full(&ca->free_inc)) {
	319	size_t n;
	320	get_random_bytes(&n, sizeof(n));
	321
	322	n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
	323	n += ca->sb.first_bucket;
	324
	325	b = ca->buckets + n;
	326
	327	if (can_invalidate_bucket(ca, b))
	328	invalidate_one_bucket(ca, b);
	329
	330	if (++checked >= ca->sb.nbuckets / 2) {
	331	ca->invalidate_needs_gc = 1;
	332	bch_queue_gc(ca->set);
	333	return;
	334	}
	335	}
	336	}
	337
	338	static void invalidate_buckets(struct cache *ca)
	339	{
	340	if (ca->invalidate_needs_gc)
	341	return;
	342
343	switch (CACHE_REPLACEMENT(&ca->sb)) {
344	case CACHE_REPLACEMENT_LRU:
345	invalidate_buckets_lru(ca);
346	break;
347	case CACHE_REPLACEMENT_FIFO:
348	invalidate_buckets_fifo(ca);
349	break;
350	case CACHE_REPLACEMENT_RANDOM:
351	invalidate_buckets_random(ca);
352	break;
353	}
86b26b82	354
c37511b8	355	trace_bcache_alloc_invalidate(ca);
cafe5635 KO	356	}
	357
	358	#define allocator_wait(ca, cond) \
	359	do { \
86b26b82	360	while (1) { \
119ba0f8	361	set_current_state(TASK_INTERRUPTIBLE); \
86b26b82 KO	362	if (cond) \
86b26b82 KO	363	break; \
cafe5635 KO	364	\
	365	mutex_unlock(&(ca)->set->bucket_lock); \
	366	if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
119ba0f8 KO	367	closure_put(&ca->set->cl); \
119ba0f8 KO	368	return 0; \
cafe5635 KO	369	} \
	370	\
	371	schedule(); \
cafe5635 KO	372	mutex_lock(&(ca)->set->bucket_lock); \
cafe5635 KO	373	} \
119ba0f8	374	__set_current_state(TASK_RUNNING); \
cafe5635 KO	375	} while (0)
cafe5635 KO	376
119ba0f8	377	static int bch_allocator_thread(void *arg)
cafe5635	378	{
119ba0f8	379	struct cache *ca = arg;
cafe5635 KO	380
	381	mutex_lock(&ca->set->bucket_lock);
	382
	383	while (1) {
86b26b82 KO	384	/*
	385	* First, we pull buckets off of the unused and free_inc lists,
	386	* possibly issue discards to them, then we add the bucket to
	387	* the free list:
	388	*/
cafe5635 KO	389	while (1) {
	390	long bucket;
	391
	392	if ((!atomic_read(&ca->set->prio_blocked) \|\|
	393	!CACHE_SYNC(&ca->set->sb)) &&
	394	!fifo_empty(&ca->unused))
	395	fifo_pop(&ca->unused, bucket);
	396	else if (!fifo_empty(&ca->free_inc))
	397	fifo_pop(&ca->free_inc, bucket);
	398	else
	399	break;
	400
	401	allocator_wait(ca, (int) fifo_free(&ca->free) >
	402	atomic_read(&ca->discards_in_flight));
	403
	404	if (ca->discard) {
	405	allocator_wait(ca, !list_empty(&ca->discards));
	406	do_discard(ca, bucket);
	407	} else {
	408	fifo_push(&ca->free, bucket);
	409	closure_wake_up(&ca->set->bucket_wait);
	410	}
	411	}
	412
86b26b82 KO	413	/*
	414	* We've run out of free buckets, we need to find some buckets
	415	* we can invalidate. First, invalidate them in memory and add
	416	* them to the free_inc list:
	417	*/
cafe5635	418
86b26b82 KO	419	allocator_wait(ca, ca->set->gc_mark_valid &&
	420	(ca->need_save_prio > 64 \|\|
	421	!ca->invalidate_needs_gc));
	422	invalidate_buckets(ca);
cafe5635	423
86b26b82 KO	424	/*
	425	* Now, we write their new gens to disk so we can start writing
	426	* new stuff to them:
	427	*/
	428	allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
cafe5635 KO	429	if (CACHE_SYNC(&ca->set->sb) &&
cafe5635 KO	430	(!fifo_empty(&ca->free_inc) \|\|
86b26b82	431	ca->need_save_prio > 64))
cafe5635	432	bch_prio_write(ca);
cafe5635 KO	433	}
	434	}
	435
	436	long bch_bucket_alloc(struct cache ca, unsigned watermark, struct closure cl)
	437	{
	438	long r = -1;
	439	again:
119ba0f8	440	wake_up_process(ca->alloc_thread);
cafe5635 KO	441
	442	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
	443	fifo_pop(&ca->free, r)) {
	444	struct bucket *b = ca->buckets + r;
	445	#ifdef CONFIG_BCACHE_EDEBUG
	446	size_t iter;
	447	long i;
	448
	449	for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
	450	BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
	451
	452	fifo_for_each(i, &ca->free, iter)
	453	BUG_ON(i == r);
	454	fifo_for_each(i, &ca->free_inc, iter)
	455	BUG_ON(i == r);
	456	fifo_for_each(i, &ca->unused, iter)
	457	BUG_ON(i == r);
	458	#endif
	459	BUG_ON(atomic_read(&b->pin) != 1);
	460
	461	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
	462
	463	if (watermark <= WATERMARK_METADATA) {
	464	SET_GC_MARK(b, GC_MARK_METADATA);
	465	b->prio = BTREE_PRIO;
	466	} else {
	467	SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
	468	b->prio = INITIAL_PRIO;
	469	}
	470
	471	return r;
	472	}
	473
c37511b8	474	trace_bcache_alloc_fail(ca);
cafe5635 KO	475
	476	if (cl) {
	477	closure_wait(&ca->set->bucket_wait, cl);
	478
	479	if (closure_blocking(cl)) {
	480	mutex_unlock(&ca->set->bucket_lock);
	481	closure_sync(cl);
	482	mutex_lock(&ca->set->bucket_lock);
	483	goto again;
	484	}
	485	}
	486
	487	return -1;
	488	}
	489
	490	void bch_bucket_free(struct cache_set c, struct bkey k)
	491	{
	492	unsigned i;
	493
	494	for (i = 0; i < KEY_PTRS(k); i++) {
	495	struct bucket *b = PTR_BUCKET(c, k, i);
	496
86b26b82	497	SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
cafe5635 KO	498	SET_GC_SECTORS_USED(b, 0);
	499	bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
	500	}
	501	}
	502
	503	int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
	504	struct bkey k, int n, struct closure cl)
	505	{
	506	int i;
	507
	508	lockdep_assert_held(&c->bucket_lock);
	509	BUG_ON(!n \|\| n > c->caches_loaded \|\| n > 8);
	510
	511	bkey_init(k);
	512
	513	/* sort by free space/prio of oldest data in caches */
	514
	515	for (i = 0; i < n; i++) {
	516	struct cache *ca = c->cache_by_alloc[i];
	517	long b = bch_bucket_alloc(ca, watermark, cl);
	518
	519	if (b == -1)
	520	goto err;
	521
	522	k->ptr[i] = PTR(ca->buckets[b].gen,
	523	bucket_to_sector(c, b),
	524	ca->sb.nr_this_dev);
	525
	526	SET_KEY_PTRS(k, i + 1);
	527	}
	528
	529	return 0;
	530	err:
	531	bch_bucket_free(c, k);
	532	__bkey_put(c, k);
	533	return -1;
	534	}
	535
	536	int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
	537	struct bkey k, int n, struct closure cl)
	538	{
	539	int ret;
	540	mutex_lock(&c->bucket_lock);
	541	ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
	542	mutex_unlock(&c->bucket_lock);
	543	return ret;
	544	}
	545
	546	/* Init */
	547
119ba0f8 KO	548	int bch_cache_allocator_start(struct cache *ca)
	549	{
	550	ca->alloc_thread = kthread_create(bch_allocator_thread,
	551	ca, "bcache_allocator");
	552	if (IS_ERR(ca->alloc_thread))
	553	return PTR_ERR(ca->alloc_thread);
	554
	555	closure_get(&ca->set->cl);
	556	wake_up_process(ca->alloc_thread);
	557
	558	return 0;
	559	}
	560
cafe5635 KO	561	void bch_cache_allocator_exit(struct cache *ca)
	562	{
	563	struct discard *d;
	564
	565	while (!list_empty(&ca->discards)) {
	566	d = list_first_entry(&ca->discards, struct discard, list);
	567	cancel_work_sync(&d->work);
	568	list_del(&d->list);
	569	kfree(d);
	570	}
	571	}
	572
	573	int bch_cache_allocator_init(struct cache *ca)
	574	{
	575	unsigned i;
	576
	577	/*
	578	* Reserve:
	579	* Prio/gen writes first
	580	* Then 8 for btree allocations
	581	* Then half for the moving garbage collector
	582	*/
	583
	584	ca->watermark[WATERMARK_PRIO] = 0;
	585
	586	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
	587
	588	ca->watermark[WATERMARK_MOVINGGC] = 8 +
	589	ca->watermark[WATERMARK_METADATA];
	590
	591	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
	592	ca->watermark[WATERMARK_MOVINGGC];
	593
	594	for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
	595	struct discard d = kzalloc(sizeof(d), GFP_KERNEL);
	596	if (!d)
	597	return -ENOMEM;
	598
	599	d->ca = ca;
	600	INIT_WORK(&d->work, discard_finish);
	601	list_add(&d->list, &ca->discards);
	602	}
	603
	604	return 0;
	605	}