// SPDX-License-Identifier: GPL-2.0
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

unsigned int bch_cutoff_writeback;
unsigned int bch_cutoff_writeback_sync;

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major;
static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
struct workqueue_struct *bch_journal_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS		128
/* limitation of bcache devices number on single system */
#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS)/BCACHE_MINORS)
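/*
 * Worked example (assuming the kernel's usual MINORBITS of 20): each bcache
 * device reserves BCACHE_MINORS = 128 minor numbers, so at most
 * (1U << 20) / 128 = 8192 bcache devices can exist on one system.
 */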

/* Superblock */

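/*
 * read_super() reads the superblock (block 1, SB_SIZE bytes) with __bread(),
 * byte-swaps it into *sb and sanity-checks it: magic, checksum, block and
 * bucket sizes, and the journal bucket layout. It returns NULL on success or
 * a static error string on failure; on success it also returns a reference
 * to the page holding the raw superblock through *res.
 */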
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned int i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) <
		    sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = (u32)ktime_get_real_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

	closure_put(&dc->sb_write);
}

static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio_first_page_all(bio));
	unsigned int i;

	bio->bi_iter.bi_sector = SB_SECTOR;
	bio->bi_iter.bi_size = SB_SIZE;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(bio);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;
	struct bio *bio = &dc->sb_bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	bio_reset(bio);
	bio_set_dev(bio, dc->bdev);
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	/* I/O request sent to backing device */
	__write_super(&dc->sb, bio);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

static void write_super_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	/* is_read = 0 */
	bch_count_io_errors(ca, bio->bi_status, 0,
			    "writing superblock");
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write;
	struct cache *ca;
	unsigned int i;

	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio_set_dev(bio, ca->bdev);
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return_with_destructor(cl, bcache_write_super_unlock);
}

/* UUID io */

static void uuid_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	cache_set_err_on(bio->bi_status, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

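/*
 * uuid_io() reads or writes the uuid_entry array at the bucket(s) pointed to
 * by @k: writes go to every pointer in the key (one copy per cache device),
 * reads stop after the first copy. The operation is serialised by
 * c->uuid_write_mutex and completes through the c->uuid_write closure.
 */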
static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write;
	struct uuid_entry *u;
	unsigned int i;
	char buf[80];

	BUG_ON(!parent);
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (op != REQ_OP_WRITE)
			break;
	}

	bch_extent_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return_with_destructor(cl, uuid_io_unlock);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_btree_ptr_invalid(c, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, REQ_OP_READ, 0, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */

		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	struct cache *ca;

	closure_init_stack(&cl);
	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
	closure_sync(&cl);

	/* Only one bucket used for uuid write */
	ca = PTR_CACHE(c, &k.key, 0);
	atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);

	bkey_copy(&c->uuid_bucket, &k.key);
	bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";

	return uuid_find(c, zero_uuid);
}

/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *   8 bit gen
 *  16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets are required
 * to fit them all. The buckets we use to store them form a list; the journal
 * header points to the first bucket, the first bucket points to the second
 * bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */

static void prio_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, int op,
		    unsigned long op_flags)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
	bio_set_dev(bio, ca->bdev);
	bio->bi_iter.bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(ca->set, bio, &ca->prio);
	closure_sync(cl);
}

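/*
 * bch_prio_write() packs every bucket's { prio, gen } pair into
 * ca->disk_buckets, one prio bucket's worth at a time. Each chunk is written
 * to a freshly allocated bucket (from RESERVE_PRIO) whose contents are,
 * roughly (struct prio_set in bcache.h is authoritative): a 64 bit checksum
 * over everything after the csum field, a per-set magic number, the number
 * of the next bucket in the list, and prios_per_bucket(ca) packed entries.
 * The update is then journalled, and only after that are the buckets holding
 * the previous generation of priorities freed.
 */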
void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
	//	 fifo_used(&ca->free_inc), fifo_used(&ca->unused));

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(&ca->sb);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_OP_WRITE, 0);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	/*
	 * Don't want the old priorities to get garbage collected until after we
	 * finish writing the new ones, and they're journalled
	 */
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

		ca->prio_last_buckets[i] = ca->prio_buckets[i];
	}
}

static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned int bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, REQ_OP_READ, 0);

			if (p->csum !=
			    bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(&ca->sb))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->last_gc = d->gen;
	}
}

/* Bcache device */

static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;

	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;

	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;

	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		unsigned int i;
		struct cache *ca;

		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned int i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	atomic_dec(&d->c->attached_dev_nr);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
		bch_uuid_write(d->c);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned int id)
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

	if (id >= c->devices_max_used)
		c->devices_max_used = id + 1;

	closure_get(&c->caching);
}

static inline int first_minor_to_idx(int first_minor)
{
	return (first_minor/BCACHE_MINORS);
}

static inline int idx_to_first_minor(int idx)
{
	return (idx * BCACHE_MINORS);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bcache_device_idx,
				  first_minor_to_idx(d->disk->first_minor));
		put_disk(d->disk);
	}

	bioset_exit(&d->bio_split);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

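/*
 * bcache_device_init() sets up the generic bcache_device: the dirty-stripe
 * accounting arrays, a device index from bcache_device_idx (which determines
 * the minor number range), the per-device bio_split bioset, and the gendisk
 * plus its request queue with bcache's limits (discard enabled, writeback
 * cache and FUA advertised).
 */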
static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	const size_t max_stripes = min_t(size_t, INT_MAX,
					 SIZE_MAX / sizeof(atomic_t));
	size_t n;
	int idx;

	if (!d->stripe_size)
		d->stripe_size = 1 << 31;

	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes || d->nr_stripes > max_stripes) {
		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
			(unsigned int)d->nr_stripes);
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
	if (!d->full_dirty_stripes)
		return -ENOMEM;

	idx = ida_simple_get(&bcache_device_idx, 0,
				BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
	if (idx < 0)
		return idx;

	if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
		goto err;

	d->disk = alloc_disk(BCACHE_MINORS);
	if (!d->disk)
		goto err;

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);

	d->disk->major = bcache_major;
	d->disk->first_minor = idx_to_first_minor(idx);
	d->disk->fops = &bcache_ops;
	d->disk->private_data = d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	blk_queue_make_request(q, NULL);
	d->disk->queue = q;
	q->queuedata = d;
	q->backing_dev_info->congested_data = d;
	q->limits.max_hw_sectors = UINT_MAX;
	q->limits.max_sectors = UINT_MAX;
	q->limits.max_segment_size = UINT_MAX;
	q->limits.max_segments = BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity = 512;
	q->limits.io_min = block_size;
	q->limits.logical_block_size = block_size;
	q->limits.physical_block_size = block_size;
	blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);

	blk_queue_write_cache(q, true, true);

	return 0;

err:
	ida_simple_remove(&bcache_device_idx, idx);
	return -ENOMEM;
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

#define BACKING_DEV_OFFLINE_TIMEOUT 5
static int cached_dev_status_update(void *arg)
{
	struct cached_dev *dc = arg;
	struct request_queue *q;

	/*
	 * If this delayed worker is stopping outside, directly quit here.
	 * dc->io_disable might be set via sysfs interface, so check it
	 * here too.
	 */
	while (!kthread_should_stop() && !dc->io_disable) {
		q = bdev_get_queue(dc->bdev);
		if (blk_queue_dying(q))
			dc->offline_seconds++;
		else
			dc->offline_seconds = 0;

		if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
			pr_err("%s: device offline for %d seconds",
			       dc->backing_dev_name,
			       BACKING_DEV_OFFLINE_TIMEOUT);
			pr_err("%s: disable I/O request due to backing "
			       "device offline", dc->disk.name);
			dc->io_disable = true;
			/* let others know earlier that io_disable is true */
			smp_mb();
			bcache_device_stop(&dc->disk);
			break;
		}
		schedule_timeout_interruptible(HZ);
	}

	wait_for_kthread_stop();
	return 0;
}

void bch_cached_dev_emit_change(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	/* won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent */
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;

	if (atomic_xchg(&dc->running, 1))
		return;

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;

		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);

	/* emit change event */
	bch_cached_dev_emit_change(dc);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");

	dc->status_update_thread = kthread_run(cached_dev_status_update,
					       dc, "bcache_status_update");
	if (IS_ERR(dc->status_update_thread)) {
		pr_warn("failed to create bcache_status_update kthread, "
			"continue to run without monitoring backing "
			"device status");
	}
}

/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
 * work dc->writeback_rate_update is running. Wait until the routine
 * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
 * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
 * seconds, give up waiting here and continue to cancel it too.
 */
static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
{
	int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;

	do {
		if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
			      &dc->disk.flags))
			break;
		time_out--;
		schedule_timeout_interruptible(1);
	} while (time_out > 0);

	if (time_out == 0)
		pr_warn("give up waiting for dc->writeback_write_update to quit");

	cancel_delayed_work_sync(&dc->writeback_rate_update);
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	struct closure cl;

	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(refcount_read(&dc->count));

	mutex_lock(&bch_register_lock);

	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
		cancel_writeback_rate_update_dwork(dc);

	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
		kthread_stop(dc->writeback_thread);
		dc->writeback_thread = NULL;
	}

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	calc_cached_dev_sectors(dc->disk.c);
	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", dc->backing_dev_name);

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);

	cached_dev_put(dc);
}

int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
			  uint8_t *set_uuid)
{
	uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
	struct uuid_entry *u;
	struct cached_dev *exist_dc, *t;

	if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
	    (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached",
		       dc->backing_dev_name);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down",
		       dc->backing_dev_name);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       dc->backing_dev_name);
		return -EINVAL;
	}

	/* Check whether already attached */
	list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
		if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
			pr_err("Tried to attach %s but duplicate UUID already attached",
				dc->backing_dev_name);

			return -EINVAL;
		}
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set",
			       dc->backing_dev_name);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID",
			       dc->backing_dev_name);
			return -EINVAL;
		}
	}

	/*
	 * Deadlocks since we're called via sysfs...
	 * sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */

	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;

		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	smp_wmb();
	refcount_set(&dc->count, 1);

	/* Block writeback thread, but spawn it */
	down_write(&dc->writeback_lock);
	if (bch_cached_dev_writeback_start(dc)) {
		up_write(&dc->writeback_lock);
		return -ENOMEM;
	}

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		atomic_set(&dc->has_dirty, 1);
		bch_writeback_queue(dc);
	}

	bch_sectors_dirty_init(&dc->disk);

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");
	atomic_inc(&c->attached_dev_nr);

	/* Allow the writeback thread to proceed */
	up_write(&dc->writeback_lock);

	pr_info("Caching %s as %s on set %pU",
		dc->backing_dev_name,
		dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	mutex_lock(&bch_register_lock);

	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
		cancel_writeback_rate_update_dwork(dc);

	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);
	if (dc->writeback_write_wq)
		destroy_workqueue(dc->writeback_write_wq);
	if (!IS_ERR_OR_NULL(dc->status_update_thread))
		kthread_stop(dc->status_update_thread);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev))
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	dc->disk.disk->queue->backing_dev_info->ra_pages =
		max(dc->disk.disk->queue->backing_dev_info->ra_pages,
		    q->backing_dev_info->ra_pages);

	atomic_set(&dc->io_errors, 0);
	dc->io_disable = false;
	dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
	/* default to auto */
	dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

/* Cached device - bcache superblock */

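/*
 * register_bdev() takes ownership of an opened backing device: it copies in
 * the superblock read by read_super(), initialises the cached_dev, registers
 * its kobjects, and tries to attach it to every known cache set. A device
 * whose state is "none" or "stale" is started right away; one left dirty is
 * only started once it has been attached to its cache set.
 */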
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
				 struct block_device *bdev,
				 struct cached_dev *dc)
{
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	bdevname(bdev, dc->backing_dev_name);
	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
	bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", dc->backing_dev_name);

	list_add(&dc->list, &uncached_devices);
	/* attach to a matched cache set if it exists */
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c, NULL);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error %s: %s", dc->backing_dev_name, err);
	bcache_device_stop(&dc->disk);
}

/* Flash only volumes */

void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	atomic_long_sub(bcache_dev_sectors_dirty(d),
			&d->c->flash_dev_dirty_sectors);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_sectors_dirty_init(d);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

bool bch_cached_dev_error(struct cached_dev *dc)
{
	if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return false;

	dc->io_disable = true;
	/* make others know io_disable is true earlier */
	smp_mb();

	pr_err("stop %s: too many IO errors on backing device %s\n",
	       dc->disk.disk->disk_name, dc->backing_dev_name);

	bcache_device_stop(&dc->disk);
	return true;
}

/* Cache set */

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
		pr_info("CACHE_SET_IO_DISABLE already set");

	/*
	 * XXX: we can be called from atomic context
	 * acquire_console_sem();
	 */

	pr_err("bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	pr_err(", disabling caching\n");

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);

	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned int i;

	debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	mutex_lock(&bch_register_lock);
	for_each_cache(ca, c, i)
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
			kobject_put(&ca->kobj);
		}

	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	bioset_exit(&c->bio_split);
	mempool_exit(&c->fill_iter);
	mempool_exit(&c->bio_meta);
	mempool_exit(&c->search);
	kfree(c->devices);

	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned int i;

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (c->gc_thread)
		kthread_stop(c->gc_thread);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Should skip this if we're unregistering because of an error */
	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
		if (btree_node_dirty(b))
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

/*
 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * cache set is unregistering due to too many I/O errors. In this condition,
 * the bcache device might be stopped, it depends on stop_when_cache_set_failed
 * value and whether the broken cache has dirty data:
 *
 * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
 *  BCH_CACHED_STOP_AUTO               0               NO
 *  BCH_CACHED_STOP_AUTO               1               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
 *
 * The expected behavior is, if stop_when_cache_set_failed is configured to
 * "auto" via sysfs interface, the bcache device will not be stopped if the
 * backing device is clean on the broken cache device.
 */
static void conditional_stop_bcache_device(struct cache_set *c,
					   struct bcache_device *d,
					   struct cached_dev *dc)
{
	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
			d->disk->disk_name, c->sb.set_uuid);
		bcache_device_stop(d);
	} else if (atomic_read(&dc->has_dirty)) {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
		 * and dc->has_dirty == 1
		 */
		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
			d->disk->disk_name);
		/*
		 * There might be a small time gap that cache set is
		 * released but bcache device is not. Inside this time
		 * gap, regular I/O requests will directly go into
		 * backing device as no cache set attached to. This
		 * behavior may also introduce potential inconsistence
		 * data in writeback mode while cache is dirty.
		 * Therefore before calling bcache_device_stop() due
		 * to a broken cache device, dc->io_disable should be
		 * explicitly set to true.
		 */
		dc->io_disable = true;
		/* make others know io_disable is true earlier */
		smp_mb();
		bcache_device_stop(d);
	} else {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
		 * and dc->has_dirty == 0
		 */
		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
			d->disk->disk_name);
	}
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	struct bcache_device *d;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->devices_max_used; i++) {
		d = c->devices[i];
		if (!d)
			continue;

		if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
			dc = container_of(d, struct cached_dev, disk);
			bch_cached_dev_detach(dc);
			if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
				conditional_stop_bcache_device(c, d, dc);
		} else {
			bcache_device_stop(d);
		}
	}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);

	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continue_at_noreturn() and use it here? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
	c->devices_max_used = 0;
	atomic_set(&c->attached_dev_nr, 0);
	c->btree_pages = bucket_pages(c);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	init_waitqueue_head(&c->bucket_wait);
	init_waitqueue_head(&c->gc_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
	    mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
	    mempool_init_kmalloc_pool(&c->bio_meta, 2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c)) ||
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
						WQ_MEM_RECLAIM, 0)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = DEFAULT_IO_ERROR_LIMIT;
	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

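/*
 * run_cache_set() brings an assembled cache set online. If the set was shut
 * down cleanly (CACHE_SYNC), it reads the journal, priorities, uuids and
 * btree root and replays the journal; otherwise it invalidates the existing
 * data and writes out fresh priorities, a new uuid bucket and a new btree
 * root before marking the set CACHE_SYNC.
 */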
317d836b 1766static int run_cache_set(struct cache_set *c)
cafe5635
KO
1767{
1768 const char *err = "cannot allocate memory";
1769 struct cached_dev *dc, *t;
1770 struct cache *ca;
c18536a7 1771 struct closure cl;
6f10f7d1 1772 unsigned int i;
19d761a5
SW
1773 LIST_HEAD(journal);
1774 struct journal_replay *l;
cafe5635 1775
c18536a7 1776 closure_init_stack(&cl);
cafe5635
KO
1777
1778 for_each_cache(ca, c, i)
1779 c->nbuckets += ca->sb.nbuckets;
be628be0 1780 set_gc_sectors(c);
cafe5635
KO
1781
1782 if (CACHE_SYNC(&c->sb)) {
1783 LIST_HEAD(journal);
1784 struct bkey *k;
1785 struct jset *j;
1786
1787 err = "cannot allocate memory for journal";
c18536a7 1788 if (bch_journal_read(c, &journal))
cafe5635
KO
1789 goto err;
1790
1791 pr_debug("btree_journal_read() done");
1792
1793 err = "no journal entries found";
1794 if (list_empty(&journal))
1795 goto err;
1796
1797 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1798
1799 err = "IO error reading priorities";
1800 for_each_cache(ca, c, i)
1801 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1802
1803 /*
1804 * If prio_read() fails it'll call cache_set_error and we'll
 1805 * tear everything down right away, but if we checked for the
 1806 * error sooner we could avoid the journal replay.
1807 */
1808
1809 k = &j->btree_root;
1810
1811 err = "bad btree root";
65d45231 1812 if (__bch_btree_ptr_invalid(c, k))
1813 goto err;
1814
1815 err = "error reading btree root";
1816 c->root = bch_btree_node_get(c, NULL, k,
1817 j->btree_level,
1818 true, NULL);
1819 if (IS_ERR_OR_NULL(c->root))
1820 goto err;
1821
1822 list_del_init(&c->root->list);
1823 rw_unlock(true, c->root);
1824
c18536a7 1825 err = uuid_read(c, j, &cl);
1826 if (err)
1827 goto err;
1828
1829 err = "error in recovery";
c18536a7 1830 if (bch_btree_check(c))
1831 goto err;
1832
1833 bch_journal_mark(c, &journal);
2531d9ee 1834 bch_initial_gc_finish(c);
1835 pr_debug("btree_check() done");
1836
1837 /*
1838 * bcache_journal_next() can't happen sooner, or
1839 * btree_gc_finish() will give spurious errors about last_gc >
1840 * gc_gen - this is a hack but oh well.
1841 */
1842 bch_journal_next(&c->journal);
1843
119ba0f8 1844 err = "error starting allocator thread";
cafe5635 1845 for_each_cache(ca, c, i)
1846 if (bch_cache_allocator_start(ca))
1847 goto err;
1848
1849 /*
1850 * First place it's safe to allocate: btree_check() and
1851 * btree_gc_finish() have to run before we have buckets to
1852 * allocate, and bch_bucket_alloc_set() might cause a journal
1853 * entry to be written so bcache_journal_next() has to be called
1854 * first.
1855 *
1856 * If the uuids were in the old format we have to rewrite them
1857 * before the next journal entry is written:
1858 */
1859 if (j->version < BCACHE_JSET_VERSION_UUID)
1860 __uuid_write(c);
1861
1862 err = "bcache: replay journal failed";
1863 if (bch_journal_replay(c, &journal))
1864 goto err;
1865 } else {
1866 pr_notice("invalidating existing data");
1867
1868 for_each_cache(ca, c, i) {
6f10f7d1 1869 unsigned int j;
1870
1871 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1872 2, SB_JOURNAL_BUCKETS);
1873
1874 for (j = 0; j < ca->sb.keys; j++)
1875 ca->sb.d[j] = ca->sb.first_bucket + j;
1876 }
1877
2531d9ee 1878 bch_initial_gc_finish(c);
cafe5635 1879
119ba0f8 1880 err = "error starting allocator thread";
cafe5635 1881 for_each_cache(ca, c, i)
1882 if (bch_cache_allocator_start(ca))
1883 goto err;
1884
1885 mutex_lock(&c->bucket_lock);
1886 for_each_cache(ca, c, i)
1887 bch_prio_write(ca);
1888 mutex_unlock(&c->bucket_lock);
1889
1890 err = "cannot allocate new UUID bucket";
1891 if (__uuid_write(c))
72a44517 1892 goto err;
1893
1894 err = "cannot allocate new btree root";
2452cc89 1895 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
cafe5635 1896 if (IS_ERR_OR_NULL(c->root))
72a44517 1897 goto err;
cafe5635 1898
2a285686 1899 mutex_lock(&c->root->write_lock);
cafe5635 1900 bkey_copy_key(&c->root->key, &MAX_KEY);
c18536a7 1901 bch_btree_node_write(c->root, &cl);
2a285686 1902 mutex_unlock(&c->root->write_lock);
1903
1904 bch_btree_set_root(c->root);
1905 rw_unlock(true, c->root);
1906
1907 /*
1908 * We don't want to write the first journal entry until
1909 * everything is set up - fortunately journal entries won't be
1910 * written until the SET_CACHE_SYNC() here:
1911 */
1912 SET_CACHE_SYNC(&c->sb, true);
1913
1914 bch_journal_next(&c->journal);
c18536a7 1915 bch_journal_meta(c, &cl);
1916 }
1917
1918 err = "error starting gc thread";
1919 if (bch_gc_thread_start(c))
1920 goto err;
1921
c18536a7 1922 closure_sync(&cl);
75cbb3f1 1923 c->sb.last_mount = (u32)ktime_get_real_seconds();
1924 bcache_write_super(c);
1925
1926 list_for_each_entry_safe(dc, t, &uncached_devices, list)
73ac105b 1927 bch_cached_dev_attach(dc, c, NULL);
1928
1929 flash_devs_run(c);
1930
bf0c55c9 1931 set_bit(CACHE_SET_RUNNING, &c->flags);
317d836b 1932 return 0;
cafe5635 1933err:
1934 while (!list_empty(&journal)) {
1935 l = list_first_entry(&journal, struct journal_replay, list);
1936 list_del(&l->list);
1937 kfree(l);
1938 }
1939
c18536a7 1940 closure_sync(&cl);
cafe5635 1941 /* XXX: test this, it's broken */
c8694948 1942 bch_cache_set_error(c, "%s", err);
1943
1944 return -EIO;
1945}
1946
1947static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1948{
1949 return ca->sb.block_size == c->sb.block_size &&
9eb8ebeb 1950 ca->sb.bucket_size == c->sb.bucket_size &&
1951 ca->sb.nr_in_set == c->sb.nr_in_set;
1952}
1953
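/*
 * Attach a cache device to the cache set identified by its set_uuid,
 * allocating the set if this is the first member seen.  Set-wide
 * superblock fields follow the member with the highest seq; once
 * caches_loaded reaches nr_in_set the whole set is started by
 * run_cache_set().
 */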
1954static const char *register_cache_set(struct cache *ca)
1955{
1956 char buf[12];
1957 const char *err = "cannot allocate memory";
1958 struct cache_set *c;
1959
1960 list_for_each_entry(c, &bch_cache_sets, list)
1961 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1962 if (c->cache[ca->sb.nr_this_dev])
1963 return "duplicate cache set member";
1964
1965 if (!can_attach_cache(ca, c))
1966 return "cache sb does not match set";
1967
1968 if (!CACHE_SYNC(&ca->sb))
1969 SET_CACHE_SYNC(&c->sb, false);
1970
1971 goto found;
1972 }
1973
1974 c = bch_cache_set_alloc(&ca->sb);
1975 if (!c)
1976 return err;
1977
1978 err = "error creating kobject";
1979 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1980 kobject_add(&c->internal, &c->kobj, "internal"))
1981 goto err;
1982
1983 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1984 goto err;
1985
1986 bch_debug_init_cache_set(c);
1987
1988 list_add(&c->list, &bch_cache_sets);
1989found:
1990 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
1991 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
1992 sysfs_create_link(&c->kobj, &ca->kobj, buf))
1993 goto err;
1994
1995 if (ca->sb.seq > c->sb.seq) {
1996 c->sb.version = ca->sb.version;
1997 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
1998 c->sb.flags = ca->sb.flags;
1999 c->sb.seq = ca->sb.seq;
2000 pr_debug("set version = %llu", c->sb.version);
2001 }
2002
d83353b3 2003 kobject_get(&ca->kobj);
2004 ca->set = c;
2005 ca->set->cache[ca->sb.nr_this_dev] = ca;
2006 c->cache_by_alloc[c->caches_loaded++] = ca;
2007
2008 if (c->caches_loaded == c->sb.nr_in_set) {
2009 err = "failed to run cache set";
2010 if (run_cache_set(c) < 0)
2011 goto err;
2012 }
2013
2014 return NULL;
2015err:
2016 bch_cache_set_unregister(c);
2017 return err;
2018}
2019
2020/* Cache device */
2021
2022void bch_cache_release(struct kobject *kobj)
2023{
2024 struct cache *ca = container_of(kobj, struct cache, kobj);
6f10f7d1 2025 unsigned int i;
cafe5635 2026
2027 if (ca->set) {
2028 BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
cafe5635 2029 ca->set->cache[ca->sb.nr_this_dev] = NULL;
c9a78332 2030 }
cafe5635 2031
2032 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
2033 kfree(ca->prio_buckets);
2034 vfree(ca->buckets);
2035
2036 free_heap(&ca->heap);
cafe5635 2037 free_fifo(&ca->free_inc);
2038
2039 for (i = 0; i < RESERVE_NR; i++)
2040 free_fifo(&ca->free[i]);
2041
2042 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
263663cd 2043 put_page(bio_first_page_all(&ca->sb_bio));
cafe5635 2044
0781c874 2045 if (!IS_ERR_OR_NULL(ca->bdev))
cafe5635 2046 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2047
2048 kfree(ca);
2049 module_put(THIS_MODULE);
2050}
2051
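/*
 * Allocate the in-memory state for one cache device: the per-reserve free
 * bucket FIFOs, free_inc, the bucket heap and bucket array, and the
 * prio/disk bucket buffers.  On failure the err_* labels unwind whatever
 * was allocated, in reverse order.
 */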
c50d4d5d 2052static int cache_alloc(struct cache *ca)
2053{
2054 size_t free;
682811b3 2055 size_t btree_buckets;
cafe5635 2056 struct bucket *b;
2057 int ret = -ENOMEM;
2058 const char *err = NULL;
cafe5635 2059
2060 __module_get(THIS_MODULE);
2061 kobject_init(&ca->kobj, &bch_cache_ktype);
2062
3a83f467 2063 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
cafe5635 2064
2065 /*
2066 * when ca->sb.njournal_buckets is not zero, journal exists,
2067 * and in bch_journal_replay(), tree node may split,
2068 * so bucket of RESERVE_BTREE type is needed,
2069 * the worst situation is all journal buckets are valid journal,
2070 * and all the keys need to replay,
2071 * so the number of RESERVE_BTREE type buckets should be as much
2072 * as journal buckets
2073 */
2074 btree_buckets = ca->sb.njournal_buckets ?: 8;
78365411 2075 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2076 if (!free) {
2077 ret = -EPERM;
2078 err = "ca->sb.nbuckets is too small";
2079 goto err_free;
2080 }
cafe5635 2081
2082 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2083 GFP_KERNEL)) {
2084 err = "ca->free[RESERVE_BTREE] alloc failed";
2085 goto err_btree_alloc;
2086 }
2087
2088 if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2089 GFP_KERNEL)) {
2090 err = "ca->free[RESERVE_PRIO] alloc failed";
2091 goto err_prio_alloc;
2092 }
2093
2094 if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2095 err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2096 goto err_movinggc_alloc;
2097 }
2098
2099 if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2100 err = "ca->free[RESERVE_NONE] alloc failed";
2101 goto err_none_alloc;
2102 }
2103
2104 if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2105 err = "ca->free_inc alloc failed";
2106 goto err_free_inc_alloc;
2107 }
2108
2109 if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2110 err = "ca->heap alloc failed";
2111 goto err_heap_alloc;
2112 }
2113
2114 ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2115 ca->sb.nbuckets));
2116 if (!ca->buckets) {
2117 err = "ca->buckets alloc failed";
2118 goto err_buckets_alloc;
2119 }
2120
2121 ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2122 prio_buckets(ca), 2),
2123 GFP_KERNEL);
2124 if (!ca->prio_buckets) {
2125 err = "ca->prio_buckets alloc failed";
2126 goto err_prio_buckets_alloc;
2127 }
2128
2129 ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
2130 if (!ca->disk_buckets) {
2131 err = "ca->disk_buckets alloc failed";
2132 goto err_disk_buckets_alloc;
2133 }
2134
2135 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2136
2137 for_each_bucket(b, ca)
2138 atomic_set(&b->pin, 0);
cafe5635 2139 return 0;
2140
2141err_disk_buckets_alloc:
2142 kfree(ca->prio_buckets);
2143err_prio_buckets_alloc:
2144 vfree(ca->buckets);
2145err_buckets_alloc:
2146 free_heap(&ca->heap);
2147err_heap_alloc:
2148 free_fifo(&ca->free_inc);
2149err_free_inc_alloc:
2150 free_fifo(&ca->free[RESERVE_NONE]);
2151err_none_alloc:
2152 free_fifo(&ca->free[RESERVE_MOVINGGC]);
2153err_movinggc_alloc:
2154 free_fifo(&ca->free[RESERVE_PRIO]);
2155err_prio_alloc:
2156 free_fifo(&ca->free[RESERVE_BTREE]);
2157err_btree_alloc:
3a646fd7 2158err_free:
2159 module_put(THIS_MODULE);
2160 if (err)
2161 pr_notice("error %s: %s", ca->cache_dev_name, err);
2162 return ret;
2163}
2164
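/*
 * Set up a single cache device: copy the on-disk superblock, take
 * ownership of the block device, allocate in-memory state with
 * cache_alloc(), expose the kobject under the bdev, and finally join or
 * create its cache set under bch_register_lock.
 */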
9b299728 2165static int register_cache(struct cache_sb *sb, struct page *sb_page,
c9a78332 2166 struct block_device *bdev, struct cache *ca)
cafe5635 2167{
d9dc1702 2168 const char *err = NULL; /* must be set for any error case */
9b299728 2169 int ret = 0;
cafe5635 2170
6e916a7e 2171 bdevname(bdev, ca->cache_dev_name);
f59fce84 2172 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2173 ca->bdev = bdev;
2174 ca->bdev->bd_holder = ca;
2175
3a83f467 2176 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
263663cd 2177 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
2178 get_page(sb_page);
2179
cc40daf9 2180 if (blk_queue_discard(bdev_get_queue(bdev)))
2181 ca->discard = CACHE_DISCARD(&ca->sb);
2182
c50d4d5d 2183 ret = cache_alloc(ca);
d9dc1702 2184 if (ret != 0) {
cc40daf9 2185 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2186 if (ret == -ENOMEM)
2187 err = "cache_alloc(): -ENOMEM";
2188 else if (ret == -EPERM)
2189 err = "cache_alloc(): cache device is too small";
2190 else
2191 err = "cache_alloc(): unknown error";
f59fce84 2192 goto err;
d9dc1702 2193 }
f59fce84 2194
2195 if (kobject_add(&ca->kobj,
2196 &part_to_dev(bdev->bd_part)->kobj,
2197 "bcache")) {
2198 err = "error calling kobject_add";
2199 ret = -ENOMEM;
2200 goto out;
2201 }
cafe5635 2202
4fa03402 2203 mutex_lock(&bch_register_lock);
cafe5635 2204 err = register_cache_set(ca);
2205 mutex_unlock(&bch_register_lock);
2206
2207 if (err) {
2208 ret = -ENODEV;
2209 goto out;
2210 }
cafe5635 2211
6e916a7e 2212 pr_info("registered cache device %s", ca->cache_dev_name);
9b299728 2213
2214out:
2215 kobject_put(&ca->kobj);
9b299728 2216
cafe5635 2217err:
9b299728 2218 if (err)
6e916a7e 2219 pr_notice("error %s: %s", ca->cache_dev_name, err);
2220
2221 return ret;
2222}
2223
2224/* Global interfaces/init */
2225
2226static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2227 const char *buffer, size_t size);
2228
2229kobj_attribute_write(register, register_bcache);
2230kobj_attribute_write(register_quiet, register_bcache);
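/*
 * Both attributes end up under the bcache kobject created in
 * bcache_init(), i.e. /sys/fs/bcache/.  Registering a formatted device is
 * a write of its path, for example:
 *
 *   echo /dev/sdc > /sys/fs/bcache/register
 *
 * register_quiet behaves the same except that it does not return an error
 * when the device is busy or already registered.
 */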
2231
2232static bool bch_is_open_backing(struct block_device *bdev)
2233{
2234 struct cache_set *c, *tc;
2235 struct cached_dev *dc, *t;
2236
2237 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2238 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2239 if (dc->bdev == bdev)
2240 return true;
2241 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2242 if (dc->bdev == bdev)
2243 return true;
2244 return false;
2245}
2246
2247static struct cached_dev *bch_find_cached_dev(struct block_device *bdev) {
2248 struct cache_set *c, *tc;
2249 struct cached_dev *dc, *t;
2250
2251 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2252 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2253 if (dc->bdev == bdev)
2254 return dc;
2255 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2256 if (dc->bdev == bdev)
2257 return dc;
2258
2259 return NULL;
2260}
2261
2262static bool bch_is_open_cache(struct block_device *bdev)
2263{
2264 struct cache_set *c, *tc;
2265 struct cache *ca;
6f10f7d1 2266 unsigned int i;
2267
2268 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2269 for_each_cache(ca, c, i)
2270 if (ca->bdev == bdev)
2271 return true;
2272 return false;
2273}
2274
2275static bool bch_is_open(struct block_device *bdev)
2276{
2277 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
2278}
2279
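/*
 * sysfs store handler for both register attributes: open the named device
 * exclusively, read and validate its bcache superblock, then register it
 * either as a backing device (register_bdev) or as a cache device
 * (register_cache) depending on SB_IS_BDEV().
 */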
2280static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2281 const char *buffer, size_t size)
2282{
2283 ssize_t ret = size;
2284 const char *err = "cannot allocate memory";
2285 char *path = NULL;
2286 struct cache_sb *sb = NULL;
2287 struct block_device *bdev = NULL;
2288 struct page *sb_page = NULL;
7e9c273f 2289 struct cached_dev *dc = NULL;
2290
2291 if (!try_module_get(THIS_MODULE))
2292 return -EBUSY;
2293
2294 path = kstrndup(buffer, size, GFP_KERNEL);
2295 if (!path)
2296 goto err;
2297
2298 sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2299 if (!sb)
2300 goto err;
2301
2302 err = "failed to open device";
2303 bdev = blkdev_get_by_path(strim(path),
2304 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2305 sb);
f59fce84 2306 if (IS_ERR(bdev)) {
a9dd53ad 2307 if (bdev == ERR_PTR(-EBUSY)) {
92bea716 2308 bdev = lookup_bdev(strim(path), 0);
789d21db 2309 mutex_lock(&bch_register_lock);
7e9c273f 2310 if (!IS_ERR(bdev) && bch_is_open(bdev)) {
a9dd53ad 2311 err = "device already registered";
2312 /* emit CHANGE event for backing devices to export
2313 * CACHED_{UUID/LABEL} values to udev */
2314 if (bch_is_open_backing(bdev)) {
2315 dc = bch_find_cached_dev(bdev);
2316 if (dc) {
2317 bch_cached_dev_emit_change(dc);
2318 err = "device already registered (emitting change event)";
2319 }
2320 }
2321 } else {
a9dd53ad 2322 err = "device busy";
7e9c273f 2323 }
789d21db 2324 mutex_unlock(&bch_register_lock);
2325 if (!IS_ERR(bdev))
2326 bdput(bdev);
2327 if (attr == &ksysfs_register_quiet)
2328 goto out;
a9dd53ad 2329 }
cafe5635 2330 goto err;
2331 }
2332
2333 err = "failed to set blocksize";
2334 if (set_blocksize(bdev, 4096))
2335 goto err_close;
2336
2337 err = read_super(sb, bdev, &sb_page);
2338 if (err)
2339 goto err_close;
2340
cc40daf9 2341 err = "failed to register device";
2903381f 2342 if (SB_IS_BDEV(sb)) {
cafe5635 2343 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
1fae7cf0 2344
2345 if (!dc)
2346 goto err_close;
cafe5635 2347
4fa03402 2348 mutex_lock(&bch_register_lock);
f59fce84 2349 register_bdev(sb, sb_page, bdev, dc);
4fa03402 2350 mutex_unlock(&bch_register_lock);
2351 } else {
2352 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1fae7cf0 2353
2354 if (!ca)
2355 goto err_close;
cafe5635 2356
9b299728 2357 if (register_cache(sb, sb_page, bdev, ca) != 0)
cc40daf9 2358 goto err;
cafe5635 2359 }
2360out:
2361 if (sb_page)
cafe5635 2362 put_page(sb_page);
2363 kfree(sb);
2364 kfree(path);
2365 module_put(THIS_MODULE);
2366 return ret;
2367
2368err_close:
2369 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2370err:
cc40daf9 2371 pr_info("error %s: %s", path, err);
2372 ret = -EINVAL;
2373 goto out;
2374}
2375
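/*
 * Reboot notifier: on shutdown, halt or power-off, stop every cache set
 * and every uncached backing device, then wait on unregister_wait (for up
 * to about ten seconds) for the teardown to complete before letting the
 * system go down.
 */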
2376static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2377{
2378 if (code == SYS_DOWN ||
2379 code == SYS_HALT ||
2380 code == SYS_POWER_OFF) {
2381 DEFINE_WAIT(wait);
2382 unsigned long start = jiffies;
2383 bool stopped = false;
2384
2385 struct cache_set *c, *tc;
2386 struct cached_dev *dc, *tdc;
2387
2388 mutex_lock(&bch_register_lock);
2389
2390 if (list_empty(&bch_cache_sets) &&
2391 list_empty(&uncached_devices))
2392 goto out;
2393
2394 pr_info("Stopping all devices:");
2395
2396 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2397 bch_cache_set_stop(c);
2398
2399 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2400 bcache_device_stop(&dc->disk);
2401
2402 mutex_unlock(&bch_register_lock);
2403
2404 /*
2405 * Give an early chance for other kthreads and
2406 * kworkers to stop themselves
2407 */
2408 schedule();
2409
2410 /* What's a condition variable? */
2411 while (1) {
47408e09 2412 long timeout = start + 10 * HZ - jiffies;
cafe5635 2413
47408e09 2414 mutex_lock(&bch_register_lock);
2415 stopped = list_empty(&bch_cache_sets) &&
2416 list_empty(&uncached_devices);
2417
2418 if (timeout < 0 || stopped)
2419 break;
2420
2421 prepare_to_wait(&unregister_wait, &wait,
2422 TASK_UNINTERRUPTIBLE);
2423
2424 mutex_unlock(&bch_register_lock);
2425 schedule_timeout(timeout);
2426 }
2427
2428 finish_wait(&unregister_wait, &wait);
2429
2430 if (stopped)
2431 pr_info("All devices stopped");
2432 else
2433 pr_notice("Timeout waiting for devices to be closed");
2434out:
2435 mutex_unlock(&bch_register_lock);
2436 }
2437
2438 return NOTIFY_DONE;
2439}
2440
2441static struct notifier_block reboot = {
2442 .notifier_call = bcache_reboot,
2443 .priority = INT_MAX, /* before any real devices */
2444};
2445
2446static void bcache_exit(void)
2447{
2448 bch_debug_exit();
cafe5635 2449 bch_request_exit();
2450 if (bcache_kobj)
2451 kobject_put(bcache_kobj);
2452 if (bcache_wq)
2453 destroy_workqueue(bcache_wq);
2454 if (bch_journal_wq)
2455 destroy_workqueue(bch_journal_wq);
2456
2457 if (bcache_major)
2458 unregister_blkdev(bcache_major, "bcache");
cafe5635 2459 unregister_reboot_notifier(&reboot);
330a4db8 2460 mutex_destroy(&bch_register_lock);
2461}
2462
2463/* Check and fixup module parameters */
2464static void check_module_parameters(void)
2465{
2466 if (bch_cutoff_writeback_sync == 0)
2467 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2468 else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2469 pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
2470 bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2471 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2472 }
2473
2474 if (bch_cutoff_writeback == 0)
2475 bch_cutoff_writeback = CUTOFF_WRITEBACK;
2476 else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2477 pr_warn("set bch_cutoff_writeback (%u) to max value %u",
2478 bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2479 bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2480 }
2481
2482 if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2483 pr_warn("set bch_cutoff_writeback (%u) to %u",
2484 bch_cutoff_writeback, bch_cutoff_writeback_sync);
2485 bch_cutoff_writeback = bch_cutoff_writeback_sync;
2486 }
2487}
2488
2489static int __init bcache_init(void)
2490{
2491 static const struct attribute *files[] = {
2492 &ksysfs_register.attr,
2493 &ksysfs_register_quiet.attr,
2494 NULL
2495 };
2496
2497 check_module_parameters();
2498
2499 mutex_init(&bch_register_lock);
2500 init_waitqueue_head(&unregister_wait);
2501 register_reboot_notifier(&reboot);
2502
2503 bcache_major = register_blkdev(0, "bcache");
2504 if (bcache_major < 0) {
2505 unregister_reboot_notifier(&reboot);
330a4db8 2506 mutex_destroy(&bch_register_lock);
cafe5635 2507 return bcache_major;
2ecf0cdb 2508 }
cafe5635 2509
2510 bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2511 if (!bcache_wq)
2512 goto err;
2513
2514 bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2515 if (!bch_journal_wq)
2516 goto err;
2517
2518 bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2519 if (!bcache_kobj)
2520 goto err;
2521
2522 if (bch_request_init() ||
330a4db8 2523 sysfs_create_files(bcache_kobj, files))
2524 goto err;
2525
91bafdf0 2526 bch_debug_init();
2527 closure_debug_init();
2528
2529 return 0;
2530err:
2531 bcache_exit();
2532 return -ENOMEM;
2533}
2534
2535/*
2536 * Module hooks
2537 */
2538module_exit(bcache_exit);
2539module_init(bcache_init);
009673d0 2540
2541module_param(bch_cutoff_writeback, uint, 0);
2542MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2543
2544module_param(bch_cutoff_writeback_sync, uint, 0);
2545MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
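/*
 * Both parameters use permissions 0, so they can only be set at module
 * load time, for example (values shown are illustrative):
 *
 *   modprobe bcache bch_cutoff_writeback=40 bch_cutoff_writeback_sync=70
 *
 * A value of 0 falls back to the built-in default, and out-of-range values
 * are clamped by check_module_parameters() during bcache_init().
 */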
2546
2547MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2548MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2549MODULE_LICENSE("GPL");