/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are
 * important, but the code is named as if it's the priorities) are written in
 * an arbitrary list of buckets on disk, with a pointer to them in the journal
 * header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is
 * a smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one or more buckets from different caches
 * out of a cache set.
 *
 * bch_allocator_thread() drives all the processes described above; it's woken
 * from bch_bucket_alloc() and a few other places that need to make sure free
 * buckets are ready.
 *
 * invalidate_buckets_(lru|fifo|random)() find buckets that are available to be
 * invalidated, and then invalidate them and stick them on the free_inc list -
 * in lru, fifo, or random order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/blkdev.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

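/*
 * Illustrative sketch (not part of the original file): the validity rule
 * described in the header comment - a pointer is only usable while the gen
 * embedded in it matches the bucket's current gen. bcache implements this
 * with ptr_stale(); modulo 8 bit wraparound handling, the check amounts to:
 */
static inline bool example_ptr_gen_matches(struct cache_set *c,
					   const struct bkey *k,
					   unsigned i)
{
	/* Bucket reused (gen bumped) since this pointer was created? */
	return PTR_BUCKET(c, k, i)->gen == PTR_GEN(k, i);
}
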
/* Bucket heap / gen */

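/*
 * Increment a bucket's gen, invalidating any live pointers into it. Also
 * track how far in-memory gens have diverged from the gens last seen by
 * garbage collection (need_gc) and last written to disk (need_save_prio),
 * so both can be forced to happen before a gen wraps around.
 */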
uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
{
	uint8_t ret = ++b->gen;

	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);

	if (CACHE_SYNC(&ca->set->sb)) {
		ca->need_save_prio = max(ca->need_save_prio,
					 bucket_disk_gen(b));
		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
	}

	return ret;
}

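/*
 * Age cached data: once per roughly 1/1024th of the cache's total capacity
 * written, decrement the prio of every unpinned data bucket, so that
 * recently written buckets sort as hotter in the heap that
 * invalidate_buckets_lru() builds.
 */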
void bch_rescale_priorities(struct cache_set *c, int sectors)
{
	struct cache *ca;
	struct bucket *b;
	unsigned next = c->nbuckets * c->sb.bucket_size / 1024;
	unsigned i;
	int r;

	atomic_sub(sectors, &c->rescale);

	do {
		r = atomic_read(&c->rescale);

		if (r >= 0)
			return;
	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);

	mutex_lock(&c->bucket_lock);

	c->min_prio = USHRT_MAX;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca)
			if (b->prio &&
			    b->prio != BTREE_PRIO &&
			    !atomic_read(&b->pin)) {
				b->prio--;
				c->min_prio = min(c->min_prio, b->prio);
			}

	mutex_unlock(&c->bucket_lock);
}

/* Allocation */

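/*
 * A bucket may only be invalidated if bumping its gen wouldn't push it past
 * the wraparound guards described in the header comment, relative to either
 * the last garbage collection or the last prio write.
 */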
static inline bool can_inc_bucket_gen(struct bucket *b)
{
	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
}

bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
{
	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));

	if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
	    CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
		return false;

	b->prio = 0;

	if (can_inc_bucket_gen(b) &&
	    fifo_push(&ca->unused, b - ca->buckets)) {
		atomic_inc(&b->pin);
		return true;
	}

	return false;
}

static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
		!atomic_read(&b->pin) &&
		can_inc_bucket_gen(b);
}

static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	bch_inc_gen(ca, b);
	b->prio = INITIAL_PRIO;
	atomic_inc(&b->pin);
	fifo_push(&ca->free_inc, b - ca->buckets);
}

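/*
 * The lru heap is ordered by (prio relative to the oldest prio in the cache
 * set) scaled by the number of live sectors: a cold and mostly empty bucket
 * is the cheapest one to invalidate.
 */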
#define bucket_prio(b)							\
	(((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))

#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))

static void invalidate_buckets_lru(struct cache *ca)
{
	struct bucket *b;
	ssize_t i;

	ca->heap.used = 0;

	for_each_bucket(b, ca) {
		/*
		 * If we fill up the unused list, then if we return before
		 * adding anything to the free_inc list we'll skip writing
		 * prios/gens and just go back to allocating from the unused
		 * list:
		 */
		if (fifo_full(&ca->unused))
			return;

		if (!can_invalidate_bucket(ca, b))
			continue;

		if (!GC_SECTORS_USED(b) &&
		    bch_bucket_add_unused(ca, b))
			continue;

		if (!heap_full(&ca->heap))
			heap_add(&ca->heap, b, bucket_max_cmp);
		else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
			ca->heap.data[0] = b;
			heap_sift(&ca->heap, 0, bucket_max_cmp);
		}
	}

	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
		heap_sift(&ca->heap, i, bucket_min_cmp);

	while (!fifo_full(&ca->free_inc)) {
		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
			/*
			 * We don't want to be calling invalidate_buckets()
			 * multiple times when it can't do anything
			 */
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}

		invalidate_one_bucket(ca, b);
	}
}

static void invalidate_buckets_fifo(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		if (ca->fifo_last_bucket < ca->sb.first_bucket ||
		    ca->fifo_last_bucket >= ca->sb.nbuckets)
			ca->fifo_last_bucket = ca->sb.first_bucket;

		b = ca->buckets + ca->fifo_last_bucket++;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets_random(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		size_t n;
		get_random_bytes(&n, sizeof(n));

		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
		n += ca->sb.first_bucket;

		b = ca->buckets + n;

		if (can_invalidate_bucket(ca, b))
			invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets / 2) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets(struct cache *ca)
{
	if (ca->invalidate_needs_gc)
		return;

	switch (CACHE_REPLACEMENT(&ca->sb)) {
	case CACHE_REPLACEMENT_LRU:
		invalidate_buckets_lru(ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		invalidate_buckets_fifo(ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		invalidate_buckets_random(ca);
		break;
	}

	trace_bcache_alloc_invalidate(ca);
}

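/*
 * allocator_wait() is wait_event() specialized for the allocator thread: it
 * drops bucket_lock around schedule(), and it has to be a macro because it
 * returns out of bch_allocator_thread() when the thread is told to stop.
 */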
#define allocator_wait(ca, cond)					\
do {									\
	while (1) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (cond)						\
			break;						\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (kthread_should_stop())				\
			return 0;					\
									\
		try_to_freeze();					\
		schedule();						\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
	__set_current_state(TASK_RUNNING);				\
} while (0)

static int bch_allocator_thread(void *arg)
{
	struct cache *ca = arg;

	mutex_lock(&ca->set->bucket_lock);

	while (1) {
		/*
		 * First, we pull buckets off of the unused and free_inc lists,
		 * possibly issue discards to them, then we add the bucket to
		 * the free list:
		 */
		while (1) {
			long bucket;

			if ((!atomic_read(&ca->set->prio_blocked) ||
			     !CACHE_SYNC(&ca->set->sb)) &&
			    !fifo_empty(&ca->unused))
				fifo_pop(&ca->unused, bucket);
			else if (!fifo_empty(&ca->free_inc))
				fifo_pop(&ca->free_inc, bucket);
			else
				break;

			if (ca->discard) {
				mutex_unlock(&ca->set->bucket_lock);
				blkdev_issue_discard(ca->bdev,
					bucket_to_sector(ca->set, bucket),
					ca->sb.block_size, GFP_KERNEL, 0);
				mutex_lock(&ca->set->bucket_lock);
			}

			allocator_wait(ca, !fifo_full(&ca->free));

			fifo_push(&ca->free, bucket);
			wake_up(&ca->set->bucket_wait);
		}

		/*
		 * We've run out of free buckets, we need to find some buckets
		 * we can invalidate. First, invalidate them in memory and add
		 * them to the free_inc list:
		 */

		allocator_wait(ca, ca->set->gc_mark_valid &&
			       (ca->need_save_prio > 64 ||
				!ca->invalidate_needs_gc));
		invalidate_buckets(ca);

		/*
		 * Now, we write their new gens to disk so we can start writing
		 * new stuff to them:
		 */
		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
		if (CACHE_SYNC(&ca->set->sb) &&
		    (!fifo_empty(&ca->free_inc) ||
		     ca->need_save_prio > 64))
			bch_prio_write(ca);
	}
}

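/*
 * Pop a bucket off ca->free, waking the allocator thread to refill the
 * list. If the freelist is at or below @watermark, either sleep until a
 * bucket shows up or, if @wait is false, fail with -1.
 */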
long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
{
	DEFINE_WAIT(w);
	struct bucket *b;
	long r;

	/* fastpath */
	if (fifo_used(&ca->free) > ca->watermark[watermark]) {
		fifo_pop(&ca->free, r);
		goto out;
	}

	if (!wait)
		return -1;

	while (1) {
		if (fifo_used(&ca->free) > ca->watermark[watermark]) {
			fifo_pop(&ca->free, r);
			break;
		}

		prepare_to_wait(&ca->set->bucket_wait, &w,
				TASK_UNINTERRUPTIBLE);

		mutex_unlock(&ca->set->bucket_lock);
		schedule();
		mutex_lock(&ca->set->bucket_lock);
	}

	finish_wait(&ca->set->bucket_wait, &w);
out:
	wake_up_process(ca->alloc_thread);

	if (expensive_debug_checks(ca->set)) {
		size_t iter;
		long i;

		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);

		fifo_for_each(i, &ca->free, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->free_inc, iter)
			BUG_ON(i == r);
		fifo_for_each(i, &ca->unused, iter)
			BUG_ON(i == r);
	}

	b = ca->buckets + r;

	BUG_ON(atomic_read(&b->pin) != 1);

	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);

	if (watermark <= WATERMARK_METADATA) {
		SET_GC_MARK(b, GC_MARK_METADATA);
		SET_GC_MOVE(b, 0);
		b->prio = BTREE_PRIO;
	} else {
		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		SET_GC_MOVE(b, 0);
		b->prio = INITIAL_PRIO;
	}

	return r;
}

void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bucket *b = PTR_BUCKET(c, k, i);

		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		SET_GC_SECTORS_USED(b, 0);
		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
	}
}

int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			   struct bkey *k, int n, bool wait)
{
	int i;

	lockdep_assert_held(&c->bucket_lock);
	BUG_ON(!n || n > c->caches_loaded || n > 8);

	bkey_init(k);

	/* sort by free space/prio of oldest data in caches */

	for (i = 0; i < n; i++) {
		struct cache *ca = c->cache_by_alloc[i];
		long b = bch_bucket_alloc(ca, watermark, wait);

		if (b == -1)
			goto err;

		k->ptr[i] = PTR(ca->buckets[b].gen,
				bucket_to_sector(c, b),
				ca->sb.nr_this_dev);

		SET_KEY_PTRS(k, i + 1);
	}

	return 0;
err:
	bch_bucket_free(c, k);
	bkey_put(c, k);
	return -1;
}

int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
			 struct bkey *k, int n, bool wait)
{
	int ret;
	mutex_lock(&c->bucket_lock);
	ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
	mutex_unlock(&c->bucket_lock);
	return ret;
}

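/*
 * Hedged usage sketch (not part of the original file; the helper name and
 * error code are hypothetical): roughly how a caller such as btree node
 * allocation obtains a fresh bucket and a key pointing at it.
 */
static int __maybe_unused example_alloc_meta_bucket(struct cache_set *c,
						    struct bkey *k)
{
	/*
	 * Allocate one bucket at the metadata watermark, sleeping until one
	 * is available; on success @k holds a single pointer whose embedded
	 * gen matches the bucket's current gen.
	 */
	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, k, 1, true))
		return -ENOSPC;

	/*
	 * The caller now owns one refcount on the bucket (b->pin); it's
	 * dropped with bkey_put() once @k is inserted into the btree or
	 * abandoned.
	 */
	return 0;
}
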
/* Sector allocator */

struct open_bucket {
	struct list_head list;
	unsigned last_write_point;
	unsigned sectors_free;
	BKEY_PADDED(key);
};

/*
 * We keep multiple buckets open for writes, and try to segregate different
 * write streams for better cache utilization: first we look for a bucket where
 * the last write to it was sequential with the current write, and failing that
 * we look for a bucket that was last used by the same task.
 *
 * The idea is that if you've got multiple tasks pulling data into the cache at
 * the same time, you'll get better cache utilization if you try to segregate
 * their data and preserve locality.
 *
 * For example, say you're starting Firefox at the same time you're copying a
 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 * cache awhile, but the data you copied might not be; if you wrote all that
 * data to the same buckets it'd get invalidated at the same time.
 *
 * Both of those tasks will be doing fairly random IO so we can't rely on
 * detecting sequential IO to segregate their data, but going off of the task
 * should be a sane heuristic.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    unsigned write_point,
					    struct bkey *alloc)
{
	struct open_bucket *ret, *ret_task = NULL;

	list_for_each_entry_reverse(ret, &c->data_buckets, list)
		if (!bkey_cmp(&ret->key, search))
			goto found;
		else if (ret->last_write_point == write_point)
			ret_task = ret;

	ret = ret_task ?: list_first_entry(&c->data_buckets,
					   struct open_bucket, list);
found:
	if (!ret->sectors_free && KEY_PTRS(alloc)) {
		ret->sectors_free = c->sb.bucket_size;
		bkey_copy(&ret->key, alloc);
		bkey_init(alloc);
	}

	if (!ret->sectors_free)
		ret = NULL;

	return ret;
}

/*
 * Allocates some space in the cache to write to, sets k to point to the newly
 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 * end of the newly allocated space).
 *
 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
 * sectors were actually allocated.
 *
 * If @wait is true, will not fail.
 */
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
		       unsigned write_point, unsigned write_prio, bool wait)
{
	struct open_bucket *b;
	BKEY_PADDED(key) alloc;
	unsigned i;

	/*
	 * We might have to allocate a new bucket, which we can't do with a
	 * spinlock held. So if we have to allocate, we drop the lock, allocate
	 * and then retry. KEY_PTRS() indicates whether alloc points to
	 * allocated bucket(s).
	 */

	bkey_init(&alloc.key);
	spin_lock(&c->data_bucket_lock);

	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
		unsigned watermark = write_prio
			? WATERMARK_MOVINGGC
			: WATERMARK_NONE;

		spin_unlock(&c->data_bucket_lock);

		if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
			return false;

		spin_lock(&c->data_bucket_lock);
	}

	/*
	 * If we had to allocate, we might race and not need to allocate the
	 * second time we call pick_data_bucket(). If we allocated a bucket but
	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
	 */
	if (KEY_PTRS(&alloc.key))
		bkey_put(c, &alloc.key);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		EBUG_ON(ptr_stale(c, &b->key, i));

	/* Set up the pointer to the space we're allocating: */

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		k->ptr[i] = b->key.ptr[i];

	sectors = min(sectors, b->sectors_free);

	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
	SET_KEY_SIZE(k, sectors);
	SET_KEY_PTRS(k, KEY_PTRS(&b->key));

	/*
	 * Move b to the end of the lru, and keep track of what this bucket was
	 * last used for:
	 */
	list_move_tail(&b->list, &c->data_buckets);
	bkey_copy_key(&b->key, k);
	b->last_write_point = write_point;

	b->sectors_free -= sectors;

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);

		atomic_long_add(sectors,
				&PTR_CACHE(c, &b->key, i)->sectors_written);
	}

	if (b->sectors_free < c->sb.block_size)
		b->sectors_free = 0;

	/*
	 * k takes refcounts on the buckets it points to until it's inserted
	 * into the btree, but if we're done with this bucket we just transfer
	 * bch_bucket_alloc()'s refcount.
	 */
	if (b->sectors_free)
		for (i = 0; i < KEY_PTRS(&b->key); i++)
			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);

	spin_unlock(&c->data_bucket_lock);
	return true;
}

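/*
 * Hedged usage sketch (not part of the original file; the helper name and
 * parameter choices are illustrative): the write path fills in a key with
 * the starting sector of the write, then reads back how much it got.
 * Passing write_prio == 0 selects the normal-data watermark.
 */
static bool __maybe_unused example_write_alloc(struct cache_set *c,
					       struct bkey *k,
					       unsigned sectors)
{
	/*
	 * On entry KEY_OFFSET(k) is the first sector to be written; on
	 * success it has been advanced past the allocated space and
	 * KEY_SIZE(k) says how many of @sectors we were actually given.
	 */
	return bch_alloc_sectors(c, k, sectors, 0, 0, true);
}
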
/* Init */

void bch_open_buckets_free(struct cache_set *c)
{
	struct open_bucket *b;

	while (!list_empty(&c->data_buckets)) {
		b = list_first_entry(&c->data_buckets,
				     struct open_bucket, list);
		list_del(&b->list);
		kfree(b);
	}
}

int bch_open_buckets_alloc(struct cache_set *c)
{
	int i;

	spin_lock_init(&c->data_bucket_lock);

	for (i = 0; i < 6; i++) {
		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
		if (!b)
			return -ENOMEM;

		list_add(&b->list, &c->data_buckets);
	}

	return 0;
}

int bch_cache_allocator_start(struct cache *ca)
{
	struct task_struct *k = kthread_run(bch_allocator_thread,
					    ca, "bcache_allocator");
	if (IS_ERR(k))
		return PTR_ERR(k);

	ca->alloc_thread = k;
	return 0;
}

int bch_cache_allocator_init(struct cache *ca)
{
	/*
	 * Reserve:
	 * Prio/gen writes first
	 * Then 8 for btree allocations
	 * Then half for the moving garbage collector
	 */

	ca->watermark[WATERMARK_PRIO] = 0;

	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);

	ca->watermark[WATERMARK_MOVINGGC] = 8 +
		ca->watermark[WATERMARK_METADATA];

	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
		ca->watermark[WATERMARK_MOVINGGC];

	return 0;
}