]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - drivers/md/dm-cache-target.c
dm cache: submit writethrough writes in parallel to origin and cache
[mirror_ubuntu-bionic-kernel.git] / drivers / md / dm-cache-target.c
1 /*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7 #include "dm.h"
8 #include "dm-bio-prison-v2.h"
9 #include "dm-bio-record.h"
10 #include "dm-cache-metadata.h"
11
12 #include <linux/dm-io.h>
13 #include <linux/dm-kcopyd.h>
14 #include <linux/jiffies.h>
15 #include <linux/init.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/rwsem.h>
19 #include <linux/slab.h>
20 #include <linux/vmalloc.h>
21
22 #define DM_MSG_PREFIX "cache"
23
24 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
25 "A percentage of time allocated for copying to and/or from cache");
26
27 /*----------------------------------------------------------------*/
28
29 /*
30 * Glossary:
31 *
32 * oblock: index of an origin block
33 * cblock: index of a cache block
34 * promotion: movement of a block from origin to cache
35 * demotion: movement of a block from cache to origin
36 * migration: movement of a block between the origin and cache device,
37 * either direction
38 */
39
40 /*----------------------------------------------------------------*/
41
/*
 * Bookkeeping for the amount of IO currently outstanding, used to decide
 * whether the device has been idle for some period.
 */
struct io_tracker {
	/* Taken around accesses to in_flight and idle_time by the iot_*() helpers. */
	spinlock_t lock;

	/*
	 * Sectors of in-flight IO.
	 */
	sector_t in_flight;

	/*
	 * The time, in jiffies, when this device became idle (if it is
	 * indeed idle).
	 */
	unsigned long idle_time;
	/* Stamped with the current jiffies by iot_init(). */
	unsigned long last_update_time;
};
57
58 static void iot_init(struct io_tracker *iot)
59 {
60 spin_lock_init(&iot->lock);
61 iot->in_flight = 0ul;
62 iot->idle_time = 0ul;
63 iot->last_update_time = jiffies;
64 }
65
66 static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
67 {
68 if (iot->in_flight)
69 return false;
70
71 return time_after(jiffies, iot->idle_time + jifs);
72 }
73
74 static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
75 {
76 bool r;
77 unsigned long flags;
78
79 spin_lock_irqsave(&iot->lock, flags);
80 r = __iot_idle_for(iot, jifs);
81 spin_unlock_irqrestore(&iot->lock, flags);
82
83 return r;
84 }
85
86 static void iot_io_begin(struct io_tracker *iot, sector_t len)
87 {
88 unsigned long flags;
89
90 spin_lock_irqsave(&iot->lock, flags);
91 iot->in_flight += len;
92 spin_unlock_irqrestore(&iot->lock, flags);
93 }
94
95 static void __iot_io_end(struct io_tracker *iot, sector_t len)
96 {
97 if (!len)
98 return;
99
100 iot->in_flight -= len;
101 if (!iot->in_flight)
102 iot->idle_time = jiffies;
103 }
104
105 static void iot_io_end(struct io_tracker *iot, sector_t len)
106 {
107 unsigned long flags;
108
109 spin_lock_irqsave(&iot->lock, flags);
110 __iot_io_end(iot, len);
111 spin_unlock_irqrestore(&iot->lock, flags);
112 }
113
114 /*----------------------------------------------------------------*/
115
116 /*
117 * Represents a chunk of future work. 'input' allows continuations to pass
118 * values between themselves, typically error values.
119 */
struct continuation {
	/* Work item that is queued to run the continuation's function. */
	struct work_struct ws;
	/* Status handed to the continuation; __commit() stores the commit result here. */
	blk_status_t input;
};
124
125 static inline void init_continuation(struct continuation *k,
126 void (*fn)(struct work_struct *))
127 {
128 INIT_WORK(&k->ws, fn);
129 k->input = 0;
130 }
131
132 static inline void queue_continuation(struct workqueue_struct *wq,
133 struct continuation *k)
134 {
135 queue_work(wq, &k->ws);
136 }
137
138 /*----------------------------------------------------------------*/
139
140 /*
141 * The batcher collects together pieces of work that need a particular
142 * operation to occur before they can proceed (typically a commit).
143 */
struct batcher {
	/*
	 * The operation that everyone is waiting for.
	 */
	blk_status_t (*commit_op)(void *context);
	void *commit_context;

	/*
	 * This is how bios should be issued once the commit op is complete
	 * (accounted_request).
	 */
	void (*issue_op)(struct bio *bio, void *context);
	void *issue_context;

	/*
	 * Queued work gets put on here after commit.
	 */
	struct workqueue_struct *wq;

	/* Taken around accesses to work_items, bios and commit_scheduled. */
	spinlock_t lock;
	/* Continuations waiting for the next commit (linked via ws.entry). */
	struct list_head work_items;
	/* Bios waiting for the next commit; errored if the commit fails. */
	struct bio_list bios;
	/* Runs __commit() on wq. */
	struct work_struct commit_work;

	/* Set by schedule_commit(), cleared by __commit() when it grabs the batch. */
	bool commit_scheduled;
};
170
171 static void __commit(struct work_struct *_ws)
172 {
173 struct batcher *b = container_of(_ws, struct batcher, commit_work);
174 blk_status_t r;
175 unsigned long flags;
176 struct list_head work_items;
177 struct work_struct *ws, *tmp;
178 struct continuation *k;
179 struct bio *bio;
180 struct bio_list bios;
181
182 INIT_LIST_HEAD(&work_items);
183 bio_list_init(&bios);
184
185 /*
186 * We have to grab these before the commit_op to avoid a race
187 * condition.
188 */
189 spin_lock_irqsave(&b->lock, flags);
190 list_splice_init(&b->work_items, &work_items);
191 bio_list_merge(&bios, &b->bios);
192 bio_list_init(&b->bios);
193 b->commit_scheduled = false;
194 spin_unlock_irqrestore(&b->lock, flags);
195
196 r = b->commit_op(b->commit_context);
197
198 list_for_each_entry_safe(ws, tmp, &work_items, entry) {
199 k = container_of(ws, struct continuation, ws);
200 k->input = r;
201 INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
202 queue_work(b->wq, ws);
203 }
204
205 while ((bio = bio_list_pop(&bios))) {
206 if (r) {
207 bio->bi_status = r;
208 bio_endio(bio);
209 } else
210 b->issue_op(bio, b->issue_context);
211 }
212 }
213
214 static void batcher_init(struct batcher *b,
215 blk_status_t (*commit_op)(void *),
216 void *commit_context,
217 void (*issue_op)(struct bio *bio, void *),
218 void *issue_context,
219 struct workqueue_struct *wq)
220 {
221 b->commit_op = commit_op;
222 b->commit_context = commit_context;
223 b->issue_op = issue_op;
224 b->issue_context = issue_context;
225 b->wq = wq;
226
227 spin_lock_init(&b->lock);
228 INIT_LIST_HEAD(&b->work_items);
229 bio_list_init(&b->bios);
230 INIT_WORK(&b->commit_work, __commit);
231 b->commit_scheduled = false;
232 }
233
234 static void async_commit(struct batcher *b)
235 {
236 queue_work(b->wq, &b->commit_work);
237 }
238
239 static void continue_after_commit(struct batcher *b, struct continuation *k)
240 {
241 unsigned long flags;
242 bool commit_scheduled;
243
244 spin_lock_irqsave(&b->lock, flags);
245 commit_scheduled = b->commit_scheduled;
246 list_add_tail(&k->ws.entry, &b->work_items);
247 spin_unlock_irqrestore(&b->lock, flags);
248
249 if (commit_scheduled)
250 async_commit(b);
251 }
252
253 /*
254 * Bios are errored if commit failed.
255 */
256 static void issue_after_commit(struct batcher *b, struct bio *bio)
257 {
258 unsigned long flags;
259 bool commit_scheduled;
260
261 spin_lock_irqsave(&b->lock, flags);
262 commit_scheduled = b->commit_scheduled;
263 bio_list_add(&b->bios, bio);
264 spin_unlock_irqrestore(&b->lock, flags);
265
266 if (commit_scheduled)
267 async_commit(b);
268 }
269
270 /*
271 * Call this if some urgent work is waiting for the commit to complete.
272 */
273 static void schedule_commit(struct batcher *b)
274 {
275 bool immediate;
276 unsigned long flags;
277
278 spin_lock_irqsave(&b->lock, flags);
279 immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
280 b->commit_scheduled = true;
281 spin_unlock_irqrestore(&b->lock, flags);
282
283 if (immediate)
284 async_commit(b);
285 }
286
287 /*
288 * There are a couple of places where we let a bio run, but want to do some
289 * work before calling its endio function. We do this by temporarily
290 * changing the endio fn.
291 */
struct dm_hook_info {
	/* The bio's original end_io, saved by dm_hook_bio() and put back by dm_unhook_bio(). */
	bio_end_io_t *bi_end_io;
};
295
296 static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
297 bio_end_io_t *bi_end_io, void *bi_private)
298 {
299 h->bi_end_io = bio->bi_end_io;
300
301 bio->bi_end_io = bi_end_io;
302 bio->bi_private = bi_private;
303 }
304
305 static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
306 {
307 bio->bi_end_io = h->bi_end_io;
308 }
309
310 /*----------------------------------------------------------------*/
311
/* NOTE(review): presumably the reserve of cache->migration_pool -- confirm at the mempool creation site. */
#define MIGRATION_POOL_SIZE 128
/* Expressed in jiffies: HZ jiffies is one second. */
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.  Both limits are expressed in sectors.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
322
/*
 * The order of these values matters: code in this file tests for
 * "read-only or worse" with 'mode >= CM_READ_ONLY'.
 */
enum cache_metadata_mode {
	CM_WRITE, /* metadata may be changed */
	CM_READ_ONLY, /* metadata may not be changed */
	CM_FAIL /* metadata is set read-only; set_cache_mode() never leaves this mode */
};
328
enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};
351
/*
 * Feature settings of a cache instance.
 */
struct cache_features {
	/* Current metadata mode; changed only through set_cache_mode(). */
	enum cache_metadata_mode mode;
	/* Writeback, writethrough or passthrough. */
	enum cache_io_mode io_mode;
	unsigned metadata_version;
};
357
/*
 * Event counters.  The four hit/miss counters are loaded from and saved
 * to the metadata by load_stats()/save_stats(); promotion, demotion and
 * writeback are bumped by update_stats(); discard_count by set_discard().
 */
struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};
371
/*
 * Per-target state of a cache instance.
 */
struct cache {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Fields for converting from sectors to blocks.
	 * sectors_per_block_shift is negative when the block size is not a
	 * power of two (see block_size_is_power_of_two()).
	 */
	sector_t sectors_per_block;
	int sectors_per_block_shift;

	/*
	 * Taken (irqsave) around the deferred bio lists, the discard bitset
	 * and need_tick_bio by the helpers in this file.
	 */
	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bios;
	struct bio_list deferred_writethrough_bios;
	sector_t migration_threshold;
	/* Woken by free_migration() when nr_allocated_migrations drops to zero. */
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	struct rw_semaphore quiesce_lock;

	/*
	 * cache_size entries, dirty if set
	 */
	atomic_t nr_dirty;
	unsigned long *dirty_bitset;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct workqueue_struct *wq;
	struct work_struct deferred_bio_worker;
	struct work_struct deferred_writethrough_worker;
	struct work_struct migration_worker;
	struct delayed_work waker;
	struct dm_bio_prison_v2 *prison;
	/* Bio set used to clone the origin-bound bio for writethrough writes. */
	struct bio_set *bs;

	mempool_t *migration_pool;

	struct dm_cache_policy *policy;
	unsigned policy_nr_args;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	/* Sectors of accounted bios in flight (see accounted_begin()). */
	struct io_tracker tracker;

	struct work_struct commit_ws;
	struct batcher committer;

	/*
	 * Read-acquired (trylock) by background_work_begin(), write-acquired
	 * by prevent_background_work().
	 */
	struct rw_semaphore background_work_lock;
};
487
/*
 * State carried with each bio.  In non-writethrough modes only the part
 * before 'cache' is allocated (see PB_DATA_SIZE_WB / PB_DATA_SIZE_WT).
 */
struct per_bio_data {
	/* Set by check_if_tick_bio_needed(). */
	bool tick:1;
	/* Value of dm_bio_get_target_bio_nr() for this bio. */
	unsigned req_nr:2;
	/* Prison cell this bio holds, set by bio_detain_shared(); NULL if none. */
	struct dm_bio_prison_cell_v2 *cell;
	struct dm_hook_info hook_info;
	/* Sectors accounted to the io tracker by accounted_begin(); 0 if none. */
	sector_t len;

	/*
	 * writethrough fields.  These MUST remain at the end of this
	 * structure and the 'cache' member must be the first as it
	 * is used to determine the offset of the writethrough fields.
	 */
	struct cache *cache;
	dm_cblock_t cblock;
	struct dm_bio_details bio_details;
};
504
/*
 * State of a single migration.  Allocated from cache->migration_pool by
 * alloc_migration() and returned by free_migration().
 */
struct dm_cache_migration {
	struct continuation k;
	/* Owning cache; set by alloc_migration(). */
	struct cache *cache;

	struct policy_work *op;
	struct bio *overwrite_bio;
	struct dm_bio_prison_cell_v2 *cell;

	dm_cblock_t invalidate_cblock;
	dm_oblock_t invalidate_oblock;
};
516
517 /*----------------------------------------------------------------*/
518
519 static bool writethrough_mode(struct cache *cache)
520 {
521 return cache->features.io_mode == CM_IO_WRITETHROUGH;
522 }
523
524 static bool writeback_mode(struct cache *cache)
525 {
526 return cache->features.io_mode == CM_IO_WRITEBACK;
527 }
528
529 static inline bool passthrough_mode(struct cache *cache)
530 {
531 return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
532 }
533
534 /*----------------------------------------------------------------*/
535
536 static void wake_deferred_bio_worker(struct cache *cache)
537 {
538 queue_work(cache->wq, &cache->deferred_bio_worker);
539 }
540
541 static void wake_deferred_writethrough_worker(struct cache *cache)
542 {
543 queue_work(cache->wq, &cache->deferred_writethrough_worker);
544 }
545
546 static void wake_migration_worker(struct cache *cache)
547 {
548 if (passthrough_mode(cache))
549 return;
550
551 queue_work(cache->wq, &cache->migration_worker);
552 }
553
554 /*----------------------------------------------------------------*/
555
556 static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
557 {
558 return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
559 }
560
561 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
562 {
563 dm_bio_prison_free_cell_v2(cache->prison, cell);
564 }
565
566 static struct dm_cache_migration *alloc_migration(struct cache *cache)
567 {
568 struct dm_cache_migration *mg;
569
570 mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
571 if (mg) {
572 mg->cache = cache;
573 atomic_inc(&mg->cache->nr_allocated_migrations);
574 }
575
576 return mg;
577 }
578
579 static void free_migration(struct dm_cache_migration *mg)
580 {
581 struct cache *cache = mg->cache;
582
583 if (atomic_dec_and_test(&cache->nr_allocated_migrations))
584 wake_up(&cache->migration_wait);
585
586 mempool_free(mg, cache->migration_pool);
587 }
588
589 /*----------------------------------------------------------------*/
590
591 static inline dm_oblock_t oblock_succ(dm_oblock_t b)
592 {
593 return to_oblock(from_oblock(b) + 1ull);
594 }
595
596 static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
597 {
598 key->virtual = 0;
599 key->dev = 0;
600 key->block_begin = from_oblock(begin);
601 key->block_end = from_oblock(end);
602 }
603
604 /*
605 * We have two lock levels. Level 0, which is used to prevent WRITEs, and
606 * level 1 which prevents *both* READs and WRITEs.
607 */
#define WRITE_LOCK_LEVEL 0 /* prevents WRITEs only */
#define READ_WRITE_LOCK_LEVEL 1 /* prevents READs and WRITEs */
610
611 static unsigned lock_level(struct bio *bio)
612 {
613 return bio_data_dir(bio) == WRITE ?
614 WRITE_LOCK_LEVEL :
615 READ_WRITE_LOCK_LEVEL;
616 }
617
618 /*----------------------------------------------------------------
619 * Per bio data
620 *--------------------------------------------------------------*/
621
622 /*
623 * If using writeback, leave out struct per_bio_data's writethrough fields.
624 */
/* Everything before 'cache', i.e. without the trailing writethrough fields. */
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
/* The whole structure, writethrough fields included. */
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
627
628 static size_t get_per_bio_data_size(struct cache *cache)
629 {
630 return writethrough_mode(cache) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
631 }
632
633 static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
634 {
635 struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
636 BUG_ON(!pb);
637 return pb;
638 }
639
640 static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
641 {
642 struct per_bio_data *pb = get_per_bio_data(bio, data_size);
643
644 pb->tick = false;
645 pb->req_nr = dm_bio_get_target_bio_nr(bio);
646 pb->cell = NULL;
647 pb->len = 0;
648
649 return pb;
650 }
651
652 /*----------------------------------------------------------------*/
653
654 static void defer_bio(struct cache *cache, struct bio *bio)
655 {
656 unsigned long flags;
657
658 spin_lock_irqsave(&cache->lock, flags);
659 bio_list_add(&cache->deferred_bios, bio);
660 spin_unlock_irqrestore(&cache->lock, flags);
661
662 wake_deferred_bio_worker(cache);
663 }
664
665 static void defer_bios(struct cache *cache, struct bio_list *bios)
666 {
667 unsigned long flags;
668
669 spin_lock_irqsave(&cache->lock, flags);
670 bio_list_merge(&cache->deferred_bios, bios);
671 bio_list_init(bios);
672 spin_unlock_irqrestore(&cache->lock, flags);
673
674 wake_deferred_bio_worker(cache);
675 }
676
677 /*----------------------------------------------------------------*/
678
679 static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
680 {
681 bool r;
682 size_t pb_size;
683 struct per_bio_data *pb;
684 struct dm_cell_key_v2 key;
685 dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
686 struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
687
688 cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
689 if (!cell_prealloc) {
690 defer_bio(cache, bio);
691 return false;
692 }
693
694 build_key(oblock, end, &key);
695 r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
696 if (!r) {
697 /*
698 * Failed to get the lock.
699 */
700 free_prison_cell(cache, cell_prealloc);
701 return r;
702 }
703
704 if (cell != cell_prealloc)
705 free_prison_cell(cache, cell_prealloc);
706
707 pb_size = get_per_bio_data_size(cache);
708 pb = get_per_bio_data(bio, pb_size);
709 pb->cell = cell;
710
711 return r;
712 }
713
714 /*----------------------------------------------------------------*/
715
716 static bool is_dirty(struct cache *cache, dm_cblock_t b)
717 {
718 return test_bit(from_cblock(b), cache->dirty_bitset);
719 }
720
721 static void set_dirty(struct cache *cache, dm_cblock_t cblock)
722 {
723 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
724 atomic_inc(&cache->nr_dirty);
725 policy_set_dirty(cache->policy, cblock);
726 }
727 }
728
729 /*
730 * These two are called when setting after migrations to force the policy
731 * and dirty bitset to be in sync.
732 */
733 static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
734 {
735 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
736 atomic_inc(&cache->nr_dirty);
737 policy_set_dirty(cache->policy, cblock);
738 }
739
740 static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
741 {
742 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
743 if (atomic_dec_return(&cache->nr_dirty) == 0)
744 dm_table_event(cache->ti->table);
745 }
746
747 policy_clear_dirty(cache->policy, cblock);
748 }
749
750 /*----------------------------------------------------------------*/
751
752 static bool block_size_is_power_of_two(struct cache *cache)
753 {
754 return cache->sectors_per_block_shift >= 0;
755 }
756
757 /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
758 #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
759 __always_inline
760 #endif
761 static dm_block_t block_div(dm_block_t b, uint32_t n)
762 {
763 do_div(b, n);
764
765 return b;
766 }
767
768 static dm_block_t oblocks_per_dblock(struct cache *cache)
769 {
770 dm_block_t oblocks = cache->discard_block_size;
771
772 if (block_size_is_power_of_two(cache))
773 oblocks >>= cache->sectors_per_block_shift;
774 else
775 oblocks = block_div(oblocks, cache->sectors_per_block);
776
777 return oblocks;
778 }
779
780 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
781 {
782 return to_dblock(block_div(from_oblock(oblock),
783 oblocks_per_dblock(cache)));
784 }
785
786 static void set_discard(struct cache *cache, dm_dblock_t b)
787 {
788 unsigned long flags;
789
790 BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
791 atomic_inc(&cache->stats.discard_count);
792
793 spin_lock_irqsave(&cache->lock, flags);
794 set_bit(from_dblock(b), cache->discard_bitset);
795 spin_unlock_irqrestore(&cache->lock, flags);
796 }
797
798 static void clear_discard(struct cache *cache, dm_dblock_t b)
799 {
800 unsigned long flags;
801
802 spin_lock_irqsave(&cache->lock, flags);
803 clear_bit(from_dblock(b), cache->discard_bitset);
804 spin_unlock_irqrestore(&cache->lock, flags);
805 }
806
807 static bool is_discarded(struct cache *cache, dm_dblock_t b)
808 {
809 int r;
810 unsigned long flags;
811
812 spin_lock_irqsave(&cache->lock, flags);
813 r = test_bit(from_dblock(b), cache->discard_bitset);
814 spin_unlock_irqrestore(&cache->lock, flags);
815
816 return r;
817 }
818
819 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
820 {
821 int r;
822 unsigned long flags;
823
824 spin_lock_irqsave(&cache->lock, flags);
825 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
826 cache->discard_bitset);
827 spin_unlock_irqrestore(&cache->lock, flags);
828
829 return r;
830 }
831
832 /*----------------------------------------------------------------
833 * Remapping
834 *--------------------------------------------------------------*/
835 static void remap_to_origin(struct cache *cache, struct bio *bio)
836 {
837 bio_set_dev(bio, cache->origin_dev->bdev);
838 }
839
840 static void remap_to_cache(struct cache *cache, struct bio *bio,
841 dm_cblock_t cblock)
842 {
843 sector_t bi_sector = bio->bi_iter.bi_sector;
844 sector_t block = from_cblock(cblock);
845
846 bio_set_dev(bio, cache->cache_dev->bdev);
847 if (!block_size_is_power_of_two(cache))
848 bio->bi_iter.bi_sector =
849 (block * cache->sectors_per_block) +
850 sector_div(bi_sector, cache->sectors_per_block);
851 else
852 bio->bi_iter.bi_sector =
853 (block << cache->sectors_per_block_shift) |
854 (bi_sector & (cache->sectors_per_block - 1));
855 }
856
857 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
858 {
859 unsigned long flags;
860 size_t pb_data_size = get_per_bio_data_size(cache);
861 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
862
863 spin_lock_irqsave(&cache->lock, flags);
864 if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
865 bio_op(bio) != REQ_OP_DISCARD) {
866 pb->tick = true;
867 cache->need_tick_bio = false;
868 }
869 spin_unlock_irqrestore(&cache->lock, flags);
870 }
871
872 static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
873 dm_oblock_t oblock, bool bio_has_pbd)
874 {
875 if (bio_has_pbd)
876 check_if_tick_bio_needed(cache, bio);
877 remap_to_origin(cache, bio);
878 if (bio_data_dir(bio) == WRITE)
879 clear_discard(cache, oblock_to_dblock(cache, oblock));
880 }
881
882 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
883 dm_oblock_t oblock)
884 {
885 // FIXME: check_if_tick_bio_needed() is called way too much through this interface
886 __remap_to_origin_clear_discard(cache, bio, oblock, true);
887 }
888
889 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
890 dm_oblock_t oblock, dm_cblock_t cblock)
891 {
892 check_if_tick_bio_needed(cache, bio);
893 remap_to_cache(cache, bio, cblock);
894 if (bio_data_dir(bio) == WRITE) {
895 set_dirty(cache, cblock);
896 clear_discard(cache, oblock_to_dblock(cache, oblock));
897 }
898 }
899
900 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
901 {
902 sector_t block_nr = bio->bi_iter.bi_sector;
903
904 if (!block_size_is_power_of_two(cache))
905 (void) sector_div(block_nr, cache->sectors_per_block);
906 else
907 block_nr >>= cache->sectors_per_block_shift;
908
909 return to_oblock(block_nr);
910 }
911
912 static bool accountable_bio(struct cache *cache, struct bio *bio)
913 {
914 return bio_op(bio) != REQ_OP_DISCARD;
915 }
916
917 static void accounted_begin(struct cache *cache, struct bio *bio)
918 {
919 size_t pb_data_size = get_per_bio_data_size(cache);
920 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
921
922 if (accountable_bio(cache, bio)) {
923 pb->len = bio_sectors(bio);
924 iot_io_begin(&cache->tracker, pb->len);
925 }
926 }
927
928 static void accounted_complete(struct cache *cache, struct bio *bio)
929 {
930 size_t pb_data_size = get_per_bio_data_size(cache);
931 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
932
933 iot_io_end(&cache->tracker, pb->len);
934 }
935
/*
 * Submit a bio with generic_make_request() after starting its accounting.
 */
static void accounted_request(struct cache *cache, struct bio *bio)
{
	/* Accounting must be in place before the bio can complete. */
	accounted_begin(cache, bio);

	generic_make_request(bio);
}
941
/*
 * Matches the batcher's issue_op callback signature; 'context' is the
 * struct cache the bio belongs to.
 */
static void issue_op(struct bio *bio, void *context)
{
	accounted_request((struct cache *) context, bio);
}
947
948 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
949 {
950 unsigned long flags;
951
952 spin_lock_irqsave(&cache->lock, flags);
953 bio_list_add(&cache->deferred_writethrough_bios, bio);
954 spin_unlock_irqrestore(&cache->lock, flags);
955
956 wake_deferred_writethrough_worker(cache);
957 }
958
959 static void writethrough_endio(struct bio *bio)
960 {
961 struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
962
963 dm_unhook_bio(&pb->hook_info, bio);
964
965 if (bio->bi_status) {
966 bio_endio(bio);
967 return;
968 }
969
970 dm_bio_restore(&pb->bio_details, bio);
971 remap_to_cache(pb->cache, bio, pb->cblock);
972
973 /*
974 * We can't issue this bio directly, since we're in interrupt
975 * context. So it gets put on a bio list for processing by the
976 * worker thread.
977 */
978 defer_writethrough_bio(pb->cache, bio);
979 }
980
981 /*
982 * When running in writethrough mode we need to send writes to clean blocks
983 * to both the cache and origin devices. Clone the bio and send them in parallel.
984 */
985 static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
986 dm_oblock_t oblock, dm_cblock_t cblock)
987 {
988 struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs);
989
990 BUG_ON(!origin_bio);
991
992 bio_chain(origin_bio, bio);
993 /*
994 * Passing false to __remap_to_origin_clear_discard() skips
995 * all code that might use per_bio_data (since clone doesn't have it)
996 */
997 __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
998 submit_bio(origin_bio);
999
1000 remap_to_cache(cache, bio, cblock);
1001 }
1002
1003 /*----------------------------------------------------------------
1004 * Failure modes
1005 *--------------------------------------------------------------*/
1006 static enum cache_metadata_mode get_cache_mode(struct cache *cache)
1007 {
1008 return cache->features.mode;
1009 }
1010
1011 static const char *cache_device_name(struct cache *cache)
1012 {
1013 return dm_device_name(dm_table_get_md(cache->ti->table));
1014 }
1015
1016 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
1017 {
1018 const char *descs[] = {
1019 "write",
1020 "read-only",
1021 "fail"
1022 };
1023
1024 dm_table_event(cache->ti->table);
1025 DMINFO("%s: switching cache to %s mode",
1026 cache_device_name(cache), descs[(int)mode]);
1027 }
1028
1029 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
1030 {
1031 bool needs_check;
1032 enum cache_metadata_mode old_mode = get_cache_mode(cache);
1033
1034 if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
1035 DMERR("%s: unable to read needs_check flag, setting failure mode.",
1036 cache_device_name(cache));
1037 new_mode = CM_FAIL;
1038 }
1039
1040 if (new_mode == CM_WRITE && needs_check) {
1041 DMERR("%s: unable to switch cache to write mode until repaired.",
1042 cache_device_name(cache));
1043 if (old_mode != new_mode)
1044 new_mode = old_mode;
1045 else
1046 new_mode = CM_READ_ONLY;
1047 }
1048
1049 /* Never move out of fail mode */
1050 if (old_mode == CM_FAIL)
1051 new_mode = CM_FAIL;
1052
1053 switch (new_mode) {
1054 case CM_FAIL:
1055 case CM_READ_ONLY:
1056 dm_cache_metadata_set_read_only(cache->cmd);
1057 break;
1058
1059 case CM_WRITE:
1060 dm_cache_metadata_set_read_write(cache->cmd);
1061 break;
1062 }
1063
1064 cache->features.mode = new_mode;
1065
1066 if (new_mode != old_mode)
1067 notify_mode_switch(cache, new_mode);
1068 }
1069
1070 static void abort_transaction(struct cache *cache)
1071 {
1072 const char *dev_name = cache_device_name(cache);
1073
1074 if (get_cache_mode(cache) >= CM_READ_ONLY)
1075 return;
1076
1077 if (dm_cache_metadata_set_needs_check(cache->cmd)) {
1078 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1079 set_cache_mode(cache, CM_FAIL);
1080 }
1081
1082 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1083 if (dm_cache_metadata_abort(cache->cmd)) {
1084 DMERR("%s: failed to abort metadata transaction", dev_name);
1085 set_cache_mode(cache, CM_FAIL);
1086 }
1087 }
1088
1089 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1090 {
1091 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1092 cache_device_name(cache), op, r);
1093 abort_transaction(cache);
1094 set_cache_mode(cache, CM_READ_ONLY);
1095 }
1096
1097 /*----------------------------------------------------------------*/
1098
1099 static void load_stats(struct cache *cache)
1100 {
1101 struct dm_cache_statistics stats;
1102
1103 dm_cache_metadata_get_stats(cache->cmd, &stats);
1104 atomic_set(&cache->stats.read_hit, stats.read_hits);
1105 atomic_set(&cache->stats.read_miss, stats.read_misses);
1106 atomic_set(&cache->stats.write_hit, stats.write_hits);
1107 atomic_set(&cache->stats.write_miss, stats.write_misses);
1108 }
1109
1110 static void save_stats(struct cache *cache)
1111 {
1112 struct dm_cache_statistics stats;
1113
1114 if (get_cache_mode(cache) >= CM_READ_ONLY)
1115 return;
1116
1117 stats.read_hits = atomic_read(&cache->stats.read_hit);
1118 stats.read_misses = atomic_read(&cache->stats.read_miss);
1119 stats.write_hits = atomic_read(&cache->stats.write_hit);
1120 stats.write_misses = atomic_read(&cache->stats.write_miss);
1121
1122 dm_cache_metadata_set_stats(cache->cmd, &stats);
1123 }
1124
1125 static void update_stats(struct cache_stats *stats, enum policy_operation op)
1126 {
1127 switch (op) {
1128 case POLICY_PROMOTE:
1129 atomic_inc(&stats->promotion);
1130 break;
1131
1132 case POLICY_DEMOTE:
1133 atomic_inc(&stats->demotion);
1134 break;
1135
1136 case POLICY_WRITEBACK:
1137 atomic_inc(&stats->writeback);
1138 break;
1139 }
1140 }
1141
1142 /*----------------------------------------------------------------
1143 * Migration processing
1144 *
1145 * Migration covers moving data from the origin device to the cache, or
1146 * vice versa.
1147 *--------------------------------------------------------------*/
1148
1149 static void inc_io_migrations(struct cache *cache)
1150 {
1151 atomic_inc(&cache->nr_io_migrations);
1152 }
1153
1154 static void dec_io_migrations(struct cache *cache)
1155 {
1156 atomic_dec(&cache->nr_io_migrations);
1157 }
1158
1159 static bool discard_or_flush(struct bio *bio)
1160 {
1161 return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1162 }
1163
1164 static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1165 dm_dblock_t *b, dm_dblock_t *e)
1166 {
1167 sector_t sb = bio->bi_iter.bi_sector;
1168 sector_t se = bio_end_sector(bio);
1169
1170 *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1171
1172 if (se - sb < cache->discard_block_size)
1173 *e = *b;
1174 else
1175 *e = to_dblock(block_div(se, cache->discard_block_size));
1176 }
1177
1178 /*----------------------------------------------------------------*/
1179
1180 static void prevent_background_work(struct cache *cache)
1181 {
1182 lockdep_off();
1183 down_write(&cache->background_work_lock);
1184 lockdep_on();
1185 }
1186
1187 static void allow_background_work(struct cache *cache)
1188 {
1189 lockdep_off();
1190 up_write(&cache->background_work_lock);
1191 lockdep_on();
1192 }
1193
1194 static bool background_work_begin(struct cache *cache)
1195 {
1196 bool r;
1197
1198 lockdep_off();
1199 r = down_read_trylock(&cache->background_work_lock);
1200 lockdep_on();
1201
1202 return r;
1203 }
1204
1205 static void background_work_end(struct cache *cache)
1206 {
1207 lockdep_off();
1208 up_read(&cache->background_work_lock);
1209 lockdep_on();
1210 }
1211
1212 /*----------------------------------------------------------------*/
1213
1214 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1215 {
1216 return (bio_data_dir(bio) == WRITE) &&
1217 (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1218 }
1219
1220 static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1221 {
1222 return writeback_mode(cache) &&
1223 (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1224 }
1225
1226 static void quiesce(struct dm_cache_migration *mg,
1227 void (*continuation)(struct work_struct *))
1228 {
1229 init_continuation(&mg->k, continuation);
1230 dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1231 }
1232
1233 static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1234 {
1235 struct continuation *k = container_of(ws, struct continuation, ws);
1236 return container_of(k, struct dm_cache_migration, k);
1237 }
1238
1239 static void copy_complete(int read_err, unsigned long write_err, void *context)
1240 {
1241 struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1242
1243 if (read_err || write_err)
1244 mg->k.input = BLK_STS_IOERR;
1245
1246 queue_continuation(mg->cache->wq, &mg->k);
1247 }
1248
1249 static int copy(struct dm_cache_migration *mg, bool promote)
1250 {
1251 int r;
1252 struct dm_io_region o_region, c_region;
1253 struct cache *cache = mg->cache;
1254
1255 o_region.bdev = cache->origin_dev->bdev;
1256 o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1257 o_region.count = cache->sectors_per_block;
1258
1259 c_region.bdev = cache->cache_dev->bdev;
1260 c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1261 c_region.count = cache->sectors_per_block;
1262
1263 if (promote)
1264 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1265 else
1266 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1267
1268 return r;
1269 }
1270
1271 static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1272 {
1273 size_t pb_data_size = get_per_bio_data_size(cache);
1274 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1275
1276 if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1277 free_prison_cell(cache, pb->cell);
1278 pb->cell = NULL;
1279 }
1280
1281 static void overwrite_endio(struct bio *bio)
1282 {
1283 struct dm_cache_migration *mg = bio->bi_private;
1284 struct cache *cache = mg->cache;
1285 size_t pb_data_size = get_per_bio_data_size(cache);
1286 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1287
1288 dm_unhook_bio(&pb->hook_info, bio);
1289
1290 if (bio->bi_status)
1291 mg->k.input = bio->bi_status;
1292
1293 queue_continuation(mg->cache->wq, &mg->k);
1294 }
1295
1296 static void overwrite(struct dm_cache_migration *mg,
1297 void (*continuation)(struct work_struct *))
1298 {
1299 struct bio *bio = mg->overwrite_bio;
1300 size_t pb_data_size = get_per_bio_data_size(mg->cache);
1301 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1302
1303 dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1304
1305 /*
1306 * The overwrite bio is part of the copy operation, as such it does
1307 * not set/clear discard or dirty flags.
1308 */
1309 if (mg->op->op == POLICY_PROMOTE)
1310 remap_to_cache(mg->cache, bio, mg->op->cblock);
1311 else
1312 remap_to_origin(mg->cache, bio);
1313
1314 init_continuation(&mg->k, continuation);
1315 accounted_request(mg->cache, bio);
1316 }
1317
1318 /*
1319 * Migration steps:
1320 *
1321 * 1) exclusive lock preventing WRITEs
1322 * 2) quiesce
1323 * 3) copy or issue overwrite bio
1324 * 4) upgrade to exclusive lock preventing READs and WRITEs
1325 * 5) quiesce
1326 * 6) update metadata and commit
1327 * 7) unlock
1328 */
1329 static void mg_complete(struct dm_cache_migration *mg, bool success)
1330 {
1331 struct bio_list bios;
1332 struct cache *cache = mg->cache;
1333 struct policy_work *op = mg->op;
1334 dm_cblock_t cblock = op->cblock;
1335
1336 if (success)
1337 update_stats(&cache->stats, op->op);
1338
1339 switch (op->op) {
1340 case POLICY_PROMOTE:
1341 clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1342 policy_complete_background_work(cache->policy, op, success);
1343
1344 if (mg->overwrite_bio) {
1345 if (success)
1346 force_set_dirty(cache, cblock);
1347 else if (mg->k.input)
1348 mg->overwrite_bio->bi_status = mg->k.input;
1349 else
1350 mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1351 bio_endio(mg->overwrite_bio);
1352 } else {
1353 if (success)
1354 force_clear_dirty(cache, cblock);
1355 dec_io_migrations(cache);
1356 }
1357 break;
1358
1359 case POLICY_DEMOTE:
1360 /*
1361 * We clear dirty here to update the nr_dirty counter.
1362 */
1363 if (success)
1364 force_clear_dirty(cache, cblock);
1365 policy_complete_background_work(cache->policy, op, success);
1366 dec_io_migrations(cache);
1367 break;
1368
1369 case POLICY_WRITEBACK:
1370 if (success)
1371 force_clear_dirty(cache, cblock);
1372 policy_complete_background_work(cache->policy, op, success);
1373 dec_io_migrations(cache);
1374 break;
1375 }
1376
1377 bio_list_init(&bios);
1378 if (mg->cell) {
1379 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1380 free_prison_cell(cache, mg->cell);
1381 }
1382
1383 free_migration(mg);
1384 defer_bios(cache, &bios);
1385 wake_migration_worker(cache);
1386
1387 background_work_end(cache);
1388 }
1389
1390 static void mg_success(struct work_struct *ws)
1391 {
1392 struct dm_cache_migration *mg = ws_to_mg(ws);
1393 mg_complete(mg, mg->k.input == 0);
1394 }
1395
1396 static void mg_update_metadata(struct work_struct *ws)
1397 {
1398 int r;
1399 struct dm_cache_migration *mg = ws_to_mg(ws);
1400 struct cache *cache = mg->cache;
1401 struct policy_work *op = mg->op;
1402
1403 switch (op->op) {
1404 case POLICY_PROMOTE:
1405 r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1406 if (r) {
1407 DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1408 cache_device_name(cache));
1409 metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1410
1411 mg_complete(mg, false);
1412 return;
1413 }
1414 mg_complete(mg, true);
1415 break;
1416
1417 case POLICY_DEMOTE:
1418 r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1419 if (r) {
1420 DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1421 cache_device_name(cache));
1422 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1423
1424 mg_complete(mg, false);
1425 return;
1426 }
1427
1428 /*
1429 * It would be nice if we only had to commit when a REQ_FLUSH
1430 * comes through. But there's one scenario that we have to
1431 * look out for:
1432 *
1433 * - vblock x in a cache block
1434 * - domotion occurs
1435 * - cache block gets reallocated and over written
1436 * - crash
1437 *
1438 * When we recover, because there was no commit the cache will
1439 * rollback to having the data for vblock x in the cache block.
1440 * But the cache block has since been overwritten, so it'll end
1441 * up pointing to data that was never in 'x' during the history
1442 * of the device.
1443 *
1444 * To avoid this issue we require a commit as part of the
1445 * demotion operation.
1446 */
1447 init_continuation(&mg->k, mg_success);
1448 continue_after_commit(&cache->committer, &mg->k);
1449 schedule_commit(&cache->committer);
1450 break;
1451
1452 case POLICY_WRITEBACK:
1453 mg_complete(mg, true);
1454 break;
1455 }
1456 }
1457
1458 static void mg_update_metadata_after_copy(struct work_struct *ws)
1459 {
1460 struct dm_cache_migration *mg = ws_to_mg(ws);
1461
1462 /*
1463 * Did the copy succeed?
1464 */
1465 if (mg->k.input)
1466 mg_complete(mg, false);
1467 else
1468 mg_update_metadata(ws);
1469 }
1470
1471 static void mg_upgrade_lock(struct work_struct *ws)
1472 {
1473 int r;
1474 struct dm_cache_migration *mg = ws_to_mg(ws);
1475
1476 /*
1477 * Did the copy succeed?
1478 */
1479 if (mg->k.input)
1480 mg_complete(mg, false);
1481
1482 else {
1483 /*
1484 * Now we want the lock to prevent both reads and writes.
1485 */
1486 r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1487 READ_WRITE_LOCK_LEVEL);
1488 if (r < 0)
1489 mg_complete(mg, false);
1490
1491 else if (r)
1492 quiesce(mg, mg_update_metadata);
1493
1494 else
1495 mg_update_metadata(ws);
1496 }
1497 }
1498
1499 static void mg_full_copy(struct work_struct *ws)
1500 {
1501 struct dm_cache_migration *mg = ws_to_mg(ws);
1502 struct cache *cache = mg->cache;
1503 struct policy_work *op = mg->op;
1504 bool is_policy_promote = (op->op == POLICY_PROMOTE);
1505
1506 if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1507 is_discarded_oblock(cache, op->oblock)) {
1508 mg_upgrade_lock(ws);
1509 return;
1510 }
1511
1512 init_continuation(&mg->k, mg_upgrade_lock);
1513
1514 if (copy(mg, is_policy_promote)) {
1515 DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
1516 mg->k.input = BLK_STS_IOERR;
1517 mg_complete(mg, false);
1518 }
1519 }
1520
1521 static void mg_copy(struct work_struct *ws)
1522 {
1523 struct dm_cache_migration *mg = ws_to_mg(ws);
1524
1525 if (mg->overwrite_bio) {
1526 /*
1527 * No exclusive lock was held when we last checked if the bio
1528 * was optimisable. So we have to check again in case things
1529 * have changed (eg, the block may no longer be discarded).
1530 */
1531 if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
1532 /*
1533 * Fallback to a real full copy after doing some tidying up.
1534 */
1535 bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
1536 BUG_ON(rb); /* An exclussive lock must _not_ be held for this block */
1537 mg->overwrite_bio = NULL;
1538 inc_io_migrations(mg->cache);
1539 mg_full_copy(ws);
1540 return;
1541 }
1542
1543 /*
1544 * It's safe to do this here, even though it's new data
1545 * because all IO has been locked out of the block.
1546 *
1547 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1548 * so _not_ using mg_upgrade_lock() as continutation.
1549 */
1550 overwrite(mg, mg_update_metadata_after_copy);
1551
1552 } else
1553 mg_full_copy(ws);
1554 }
1555
1556 static int mg_lock_writes(struct dm_cache_migration *mg)
1557 {
1558 int r;
1559 struct dm_cell_key_v2 key;
1560 struct cache *cache = mg->cache;
1561 struct dm_bio_prison_cell_v2 *prealloc;
1562
1563 prealloc = alloc_prison_cell(cache);
1564 if (!prealloc) {
1565 DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1566 mg_complete(mg, false);
1567 return -ENOMEM;
1568 }
1569
1570 /*
1571 * Prevent writes to the block, but allow reads to continue.
1572 * Unless we're using an overwrite bio, in which case we lock
1573 * everything.
1574 */
1575 build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1576 r = dm_cell_lock_v2(cache->prison, &key,
1577 mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1578 prealloc, &mg->cell);
1579 if (r < 0) {
1580 free_prison_cell(cache, prealloc);
1581 mg_complete(mg, false);
1582 return r;
1583 }
1584
1585 if (mg->cell != prealloc)
1586 free_prison_cell(cache, prealloc);
1587
1588 if (r == 0)
1589 mg_copy(&mg->k.ws);
1590 else
1591 quiesce(mg, mg_copy);
1592
1593 return 0;
1594 }
1595
1596 static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1597 {
1598 struct dm_cache_migration *mg;
1599
1600 if (!background_work_begin(cache)) {
1601 policy_complete_background_work(cache->policy, op, false);
1602 return -EPERM;
1603 }
1604
1605 mg = alloc_migration(cache);
1606 if (!mg) {
1607 policy_complete_background_work(cache->policy, op, false);
1608 background_work_end(cache);
1609 return -ENOMEM;
1610 }
1611
1612 memset(mg, 0, sizeof(*mg));
1613
1614 mg->cache = cache;
1615 mg->op = op;
1616 mg->overwrite_bio = bio;
1617
1618 if (!bio)
1619 inc_io_migrations(cache);
1620
1621 return mg_lock_writes(mg);
1622 }
1623
1624 /*----------------------------------------------------------------
1625 * invalidation processing
1626 *--------------------------------------------------------------*/
1627
1628 static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1629 {
1630 struct bio_list bios;
1631 struct cache *cache = mg->cache;
1632
1633 bio_list_init(&bios);
1634 if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1635 free_prison_cell(cache, mg->cell);
1636
1637 if (!success && mg->overwrite_bio)
1638 bio_io_error(mg->overwrite_bio);
1639
1640 free_migration(mg);
1641 defer_bios(cache, &bios);
1642
1643 background_work_end(cache);
1644 }
1645
1646 static void invalidate_completed(struct work_struct *ws)
1647 {
1648 struct dm_cache_migration *mg = ws_to_mg(ws);
1649 invalidate_complete(mg, !mg->k.input);
1650 }
1651
1652 static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1653 {
1654 int r = policy_invalidate_mapping(cache->policy, cblock);
1655 if (!r) {
1656 r = dm_cache_remove_mapping(cache->cmd, cblock);
1657 if (r) {
1658 DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1659 cache_device_name(cache));
1660 metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1661 }
1662
1663 } else if (r == -ENODATA) {
1664 /*
1665 * Harmless, already unmapped.
1666 */
1667 r = 0;
1668
1669 } else
1670 DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1671
1672 return r;
1673 }
1674
1675 static void invalidate_remove(struct work_struct *ws)
1676 {
1677 int r;
1678 struct dm_cache_migration *mg = ws_to_mg(ws);
1679 struct cache *cache = mg->cache;
1680
1681 r = invalidate_cblock(cache, mg->invalidate_cblock);
1682 if (r) {
1683 invalidate_complete(mg, false);
1684 return;
1685 }
1686
1687 init_continuation(&mg->k, invalidate_completed);
1688 continue_after_commit(&cache->committer, &mg->k);
1689 remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1690 mg->overwrite_bio = NULL;
1691 schedule_commit(&cache->committer);
1692 }
1693
1694 static int invalidate_lock(struct dm_cache_migration *mg)
1695 {
1696 int r;
1697 struct dm_cell_key_v2 key;
1698 struct cache *cache = mg->cache;
1699 struct dm_bio_prison_cell_v2 *prealloc;
1700
1701 prealloc = alloc_prison_cell(cache);
1702 if (!prealloc) {
1703 invalidate_complete(mg, false);
1704 return -ENOMEM;
1705 }
1706
1707 build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1708 r = dm_cell_lock_v2(cache->prison, &key,
1709 READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1710 if (r < 0) {
1711 free_prison_cell(cache, prealloc);
1712 invalidate_complete(mg, false);
1713 return r;
1714 }
1715
1716 if (mg->cell != prealloc)
1717 free_prison_cell(cache, prealloc);
1718
1719 if (r)
1720 quiesce(mg, invalidate_remove);
1721
1722 else {
1723 /*
1724 * We can't call invalidate_remove() directly here because we
1725 * might still be in request context.
1726 */
1727 init_continuation(&mg->k, invalidate_remove);
1728 queue_work(cache->wq, &mg->k.ws);
1729 }
1730
1731 return 0;
1732 }
1733
1734 static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1735 dm_oblock_t oblock, struct bio *bio)
1736 {
1737 struct dm_cache_migration *mg;
1738
1739 if (!background_work_begin(cache))
1740 return -EPERM;
1741
1742 mg = alloc_migration(cache);
1743 if (!mg) {
1744 background_work_end(cache);
1745 return -ENOMEM;
1746 }
1747
1748 memset(mg, 0, sizeof(*mg));
1749
1750 mg->cache = cache;
1751 mg->overwrite_bio = bio;
1752 mg->invalidate_cblock = cblock;
1753 mg->invalidate_oblock = oblock;
1754
1755 return invalidate_lock(mg);
1756 }
1757
1758 /*----------------------------------------------------------------
1759 * bio processing
1760 *--------------------------------------------------------------*/
1761
/* Result of spare_migration_bandwidth(). */
enum busy {
	IDLE = 0,	/* there is room for more migration work */
	BUSY = 1	/* there is not */
};
1766
1767 static enum busy spare_migration_bandwidth(struct cache *cache)
1768 {
1769 bool idle = iot_idle_for(&cache->tracker, HZ);
1770 sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1771 cache->sectors_per_block;
1772
1773 if (idle && current_volume <= cache->migration_threshold)
1774 return IDLE;
1775 else
1776 return BUSY;
1777 }
1778
1779 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1780 {
1781 atomic_inc(bio_data_dir(bio) == READ ?
1782 &cache->stats.read_hit : &cache->stats.write_hit);
1783 }
1784
1785 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1786 {
1787 atomic_inc(bio_data_dir(bio) == READ ?
1788 &cache->stats.read_miss : &cache->stats.write_miss);
1789 }
1790
1791 /*----------------------------------------------------------------*/
1792
1793 static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1794 bool *commit_needed)
1795 {
1796 int r, data_dir;
1797 bool rb, background_queued;
1798 dm_cblock_t cblock;
1799 size_t pb_data_size = get_per_bio_data_size(cache);
1800 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1801
1802 *commit_needed = false;
1803
1804 rb = bio_detain_shared(cache, block, bio);
1805 if (!rb) {
1806 /*
1807 * An exclusive lock is held for this block, so we have to
1808 * wait. We set the commit_needed flag so the current
1809 * transaction will be committed asap, allowing this lock
1810 * to be dropped.
1811 */
1812 *commit_needed = true;
1813 return DM_MAPIO_SUBMITTED;
1814 }
1815
1816 data_dir = bio_data_dir(bio);
1817
1818 if (optimisable_bio(cache, bio, block)) {
1819 struct policy_work *op = NULL;
1820
1821 r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1822 if (unlikely(r && r != -ENOENT)) {
1823 DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1824 cache_device_name(cache), r);
1825 bio_io_error(bio);
1826 return DM_MAPIO_SUBMITTED;
1827 }
1828
1829 if (r == -ENOENT && op) {
1830 bio_drop_shared_lock(cache, bio);
1831 BUG_ON(op->op != POLICY_PROMOTE);
1832 mg_start(cache, op, bio);
1833 return DM_MAPIO_SUBMITTED;
1834 }
1835 } else {
1836 r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1837 if (unlikely(r && r != -ENOENT)) {
1838 DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1839 cache_device_name(cache), r);
1840 bio_io_error(bio);
1841 return DM_MAPIO_SUBMITTED;
1842 }
1843
1844 if (background_queued)
1845 wake_migration_worker(cache);
1846 }
1847
1848 if (r == -ENOENT) {
1849 /*
1850 * Miss.
1851 */
1852 inc_miss_counter(cache, bio);
1853 if (pb->req_nr == 0) {
1854 accounted_begin(cache, bio);
1855 remap_to_origin_clear_discard(cache, bio, block);
1856
1857 } else {
1858 /*
1859 * This is a duplicate writethrough io that is no
1860 * longer needed because the block has been demoted.
1861 */
1862 bio_endio(bio);
1863 return DM_MAPIO_SUBMITTED;
1864 }
1865 } else {
1866 /*
1867 * Hit.
1868 */
1869 inc_hit_counter(cache, bio);
1870
1871 /*
1872 * Passthrough always maps to the origin, invalidating any
1873 * cache blocks that are written to.
1874 */
1875 if (passthrough_mode(cache)) {
1876 if (bio_data_dir(bio) == WRITE) {
1877 bio_drop_shared_lock(cache, bio);
1878 atomic_inc(&cache->stats.demotion);
1879 invalidate_start(cache, cblock, block, bio);
1880 } else
1881 remap_to_origin_clear_discard(cache, bio, block);
1882
1883 } else {
1884 if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1885 !is_dirty(cache, cblock)) {
1886 remap_to_origin_and_cache(cache, bio, block, cblock);
1887 accounted_begin(cache, bio);
1888 } else
1889 remap_to_cache_dirty(cache, bio, block, cblock);
1890 }
1891 }
1892
1893 /*
1894 * dm core turns FUA requests into a separate payload and FLUSH req.
1895 */
1896 if (bio->bi_opf & REQ_FUA) {
1897 /*
1898 * issue_after_commit will call accounted_begin a second time. So
1899 * we call accounted_complete() to avoid double accounting.
1900 */
1901 accounted_complete(cache, bio);
1902 issue_after_commit(&cache->committer, bio);
1903 *commit_needed = true;
1904 return DM_MAPIO_SUBMITTED;
1905 }
1906
1907 return DM_MAPIO_REMAPPED;
1908 }
1909
1910 static bool process_bio(struct cache *cache, struct bio *bio)
1911 {
1912 bool commit_needed;
1913
1914 if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1915 generic_make_request(bio);
1916
1917 return commit_needed;
1918 }
1919
1920 /*
1921 * A non-zero return indicates read_only or fail_io mode.
1922 */
1923 static int commit(struct cache *cache, bool clean_shutdown)
1924 {
1925 int r;
1926
1927 if (get_cache_mode(cache) >= CM_READ_ONLY)
1928 return -EINVAL;
1929
1930 atomic_inc(&cache->stats.commit_count);
1931 r = dm_cache_commit(cache->cmd, clean_shutdown);
1932 if (r)
1933 metadata_operation_failed(cache, "dm_cache_commit", r);
1934
1935 return r;
1936 }
1937
1938 /*
1939 * Used by the batcher.
1940 */
1941 static blk_status_t commit_op(void *context)
1942 {
1943 struct cache *cache = context;
1944
1945 if (dm_cache_changed_this_transaction(cache->cmd))
1946 return errno_to_blk_status(commit(cache, false));
1947
1948 return 0;
1949 }
1950
1951 /*----------------------------------------------------------------*/
1952
1953 static bool process_flush_bio(struct cache *cache, struct bio *bio)
1954 {
1955 size_t pb_data_size = get_per_bio_data_size(cache);
1956 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
1957
1958 if (!pb->req_nr)
1959 remap_to_origin(cache, bio);
1960 else
1961 remap_to_cache(cache, bio, 0);
1962
1963 issue_after_commit(&cache->committer, bio);
1964 return true;
1965 }
1966
1967 static bool process_discard_bio(struct cache *cache, struct bio *bio)
1968 {
1969 dm_dblock_t b, e;
1970
1971 // FIXME: do we need to lock the region? Or can we just assume the
1972 // user wont be so foolish as to issue discard concurrently with
1973 // other IO?
1974 calc_discard_block_range(cache, bio, &b, &e);
1975 while (b != e) {
1976 set_discard(cache, b);
1977 b = to_dblock(from_dblock(b) + 1);
1978 }
1979
1980 bio_endio(bio);
1981
1982 return false;
1983 }
1984
1985 static void process_deferred_bios(struct work_struct *ws)
1986 {
1987 struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1988
1989 unsigned long flags;
1990 bool commit_needed = false;
1991 struct bio_list bios;
1992 struct bio *bio;
1993
1994 bio_list_init(&bios);
1995
1996 spin_lock_irqsave(&cache->lock, flags);
1997 bio_list_merge(&bios, &cache->deferred_bios);
1998 bio_list_init(&cache->deferred_bios);
1999 spin_unlock_irqrestore(&cache->lock, flags);
2000
2001 while ((bio = bio_list_pop(&bios))) {
2002 if (bio->bi_opf & REQ_PREFLUSH)
2003 commit_needed = process_flush_bio(cache, bio) || commit_needed;
2004
2005 else if (bio_op(bio) == REQ_OP_DISCARD)
2006 commit_needed = process_discard_bio(cache, bio) || commit_needed;
2007
2008 else
2009 commit_needed = process_bio(cache, bio) || commit_needed;
2010 }
2011
2012 if (commit_needed)
2013 schedule_commit(&cache->committer);
2014 }
2015
2016 static void process_deferred_writethrough_bios(struct work_struct *ws)
2017 {
2018 struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker);
2019
2020 unsigned long flags;
2021 struct bio_list bios;
2022 struct bio *bio;
2023
2024 bio_list_init(&bios);
2025
2026 spin_lock_irqsave(&cache->lock, flags);
2027 bio_list_merge(&bios, &cache->deferred_writethrough_bios);
2028 bio_list_init(&cache->deferred_writethrough_bios);
2029 spin_unlock_irqrestore(&cache->lock, flags);
2030
2031 /*
2032 * These bios have already been through accounted_begin()
2033 */
2034 while ((bio = bio_list_pop(&bios)))
2035 generic_make_request(bio);
2036 }
2037
2038 /*----------------------------------------------------------------
2039 * Main worker loop
2040 *--------------------------------------------------------------*/
2041
2042 static void requeue_deferred_bios(struct cache *cache)
2043 {
2044 struct bio *bio;
2045 struct bio_list bios;
2046
2047 bio_list_init(&bios);
2048 bio_list_merge(&bios, &cache->deferred_bios);
2049 bio_list_init(&cache->deferred_bios);
2050
2051 while ((bio = bio_list_pop(&bios))) {
2052 bio->bi_status = BLK_STS_DM_REQUEUE;
2053 bio_endio(bio);
2054 }
2055 }
2056
2057 /*
2058 * We want to commit periodically so that not too much
2059 * unwritten metadata builds up.
2060 */
2061 static void do_waker(struct work_struct *ws)
2062 {
2063 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
2064
2065 policy_tick(cache->policy, true);
2066 wake_migration_worker(cache);
2067 schedule_commit(&cache->committer);
2068 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
2069 }
2070
2071 static void check_migrations(struct work_struct *ws)
2072 {
2073 int r;
2074 struct policy_work *op;
2075 struct cache *cache = container_of(ws, struct cache, migration_worker);
2076 enum busy b;
2077
2078 for (;;) {
2079 b = spare_migration_bandwidth(cache);
2080
2081 r = policy_get_background_work(cache->policy, b == IDLE, &op);
2082 if (r == -ENODATA)
2083 break;
2084
2085 if (r) {
2086 DMERR_LIMIT("%s: policy_background_work failed",
2087 cache_device_name(cache));
2088 break;
2089 }
2090
2091 r = mg_start(cache, op, NULL);
2092 if (r)
2093 break;
2094 }
2095 }
2096
2097 /*----------------------------------------------------------------
2098 * Target methods
2099 *--------------------------------------------------------------*/
2100
2101 /*
2102 * This function gets called on the error paths of the constructor, so we
2103 * have to cope with a partially initialised struct.
2104 */
2105 static void destroy(struct cache *cache)
2106 {
2107 unsigned i;
2108
2109 mempool_destroy(cache->migration_pool);
2110
2111 if (cache->prison)
2112 dm_bio_prison_destroy_v2(cache->prison);
2113
2114 if (cache->wq)
2115 destroy_workqueue(cache->wq);
2116
2117 if (cache->dirty_bitset)
2118 free_bitset(cache->dirty_bitset);
2119
2120 if (cache->discard_bitset)
2121 free_bitset(cache->discard_bitset);
2122
2123 if (cache->copier)
2124 dm_kcopyd_client_destroy(cache->copier);
2125
2126 if (cache->cmd)
2127 dm_cache_metadata_close(cache->cmd);
2128
2129 if (cache->metadata_dev)
2130 dm_put_device(cache->ti, cache->metadata_dev);
2131
2132 if (cache->origin_dev)
2133 dm_put_device(cache->ti, cache->origin_dev);
2134
2135 if (cache->cache_dev)
2136 dm_put_device(cache->ti, cache->cache_dev);
2137
2138 if (cache->policy)
2139 dm_cache_policy_destroy(cache->policy);
2140
2141 for (i = 0; i < cache->nr_ctr_args ; i++)
2142 kfree(cache->ctr_args[i]);
2143 kfree(cache->ctr_args);
2144
2145 if (cache->bs)
2146 bioset_free(cache->bs);
2147
2148 kfree(cache);
2149 }
2150
2151 static void cache_dtr(struct dm_target *ti)
2152 {
2153 struct cache *cache = ti->private;
2154
2155 destroy(cache);
2156 }
2157
2158 static sector_t get_dev_size(struct dm_dev *dev)
2159 {
2160 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2161 }
2162
2163 /*----------------------------------------------------------------*/
2164
2165 /*
2166 * Construct a cache device mapping.
2167 *
2168 * cache <metadata dev> <cache dev> <origin dev> <block size>
2169 * <#feature args> [<feature arg>]*
2170 * <policy> <#policy args> [<policy arg>]*
2171 *
2172 * metadata dev : fast device holding the persistent metadata
2173 * cache dev : fast device holding cached data blocks
2174 * origin dev : slow device holding original data blocks
2175 * block size : cache unit size in sectors
2176 *
2177 * #feature args : number of feature arguments passed
2178 * feature args : writethrough. (The default is writeback.)
2179 *
2180 * policy : the replacement policy to use
2181 * #policy args : an even number of policy arguments corresponding
2182 * to key/value pairs passed to the policy
2183 * policy args : key/value pairs passed to the policy
2184 * E.g. 'sequential_threshold 1024'
2185 * See cache-policies.txt for details.
2186 *
2187 * Optional feature arguments are:
2188 * writethrough : write through caching that prohibits cache block
2189 * content from being different from origin block content.
2190 * Without this argument, the default behaviour is to write
2191 * back cache block contents later for performance reasons,
2192 * so they may differ from the corresponding origin blocks.
2193 */
2194 struct cache_args {
2195 struct dm_target *ti;
2196
2197 struct dm_dev *metadata_dev;
2198
2199 struct dm_dev *cache_dev;
2200 sector_t cache_sectors;
2201
2202 struct dm_dev *origin_dev;
2203 sector_t origin_sectors;
2204
2205 uint32_t block_size;
2206
2207 const char *policy_name;
2208 int policy_argc;
2209 const char **policy_argv;
2210
2211 struct cache_features features;
2212 };
2213
2214 static void destroy_cache_args(struct cache_args *ca)
2215 {
2216 if (ca->metadata_dev)
2217 dm_put_device(ca->ti, ca->metadata_dev);
2218
2219 if (ca->cache_dev)
2220 dm_put_device(ca->ti, ca->cache_dev);
2221
2222 if (ca->origin_dev)
2223 dm_put_device(ca->ti, ca->origin_dev);
2224
2225 kfree(ca);
2226 }
2227
2228 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2229 {
2230 if (!as->argc) {
2231 *error = "Insufficient args";
2232 return false;
2233 }
2234
2235 return true;
2236 }
2237
2238 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2239 char **error)
2240 {
2241 int r;
2242 sector_t metadata_dev_size;
2243 char b[BDEVNAME_SIZE];
2244
2245 if (!at_least_one_arg(as, error))
2246 return -EINVAL;
2247
2248 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2249 &ca->metadata_dev);
2250 if (r) {
2251 *error = "Error opening metadata device";
2252 return r;
2253 }
2254
2255 metadata_dev_size = get_dev_size(ca->metadata_dev);
2256 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2257 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2258 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2259
2260 return 0;
2261 }
2262
2263 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2264 char **error)
2265 {
2266 int r;
2267
2268 if (!at_least_one_arg(as, error))
2269 return -EINVAL;
2270
2271 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2272 &ca->cache_dev);
2273 if (r) {
2274 *error = "Error opening cache device";
2275 return r;
2276 }
2277 ca->cache_sectors = get_dev_size(ca->cache_dev);
2278
2279 return 0;
2280 }
2281
2282 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2283 char **error)
2284 {
2285 int r;
2286
2287 if (!at_least_one_arg(as, error))
2288 return -EINVAL;
2289
2290 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2291 &ca->origin_dev);
2292 if (r) {
2293 *error = "Error opening origin device";
2294 return r;
2295 }
2296
2297 ca->origin_sectors = get_dev_size(ca->origin_dev);
2298 if (ca->ti->len > ca->origin_sectors) {
2299 *error = "Device size larger than cached device";
2300 return -EINVAL;
2301 }
2302
2303 return 0;
2304 }
2305
2306 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2307 char **error)
2308 {
2309 unsigned long block_size;
2310
2311 if (!at_least_one_arg(as, error))
2312 return -EINVAL;
2313
2314 if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2315 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2316 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2317 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2318 *error = "Invalid data block size";
2319 return -EINVAL;
2320 }
2321
2322 if (block_size > ca->cache_sectors) {
2323 *error = "Data block size is larger than the cache device";
2324 return -EINVAL;
2325 }
2326
2327 ca->block_size = block_size;
2328
2329 return 0;
2330 }
2331
2332 static void init_features(struct cache_features *cf)
2333 {
2334 cf->mode = CM_WRITE;
2335 cf->io_mode = CM_IO_WRITEBACK;
2336 cf->metadata_version = 1;
2337 }
2338
2339 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2340 char **error)
2341 {
2342 static const struct dm_arg _args[] = {
2343 {0, 2, "Invalid number of cache feature arguments"},
2344 };
2345
2346 int r;
2347 unsigned argc;
2348 const char *arg;
2349 struct cache_features *cf = &ca->features;
2350
2351 init_features(cf);
2352
2353 r = dm_read_arg_group(_args, as, &argc, error);
2354 if (r)
2355 return -EINVAL;
2356
2357 while (argc--) {
2358 arg = dm_shift_arg(as);
2359
2360 if (!strcasecmp(arg, "writeback"))
2361 cf->io_mode = CM_IO_WRITEBACK;
2362
2363 else if (!strcasecmp(arg, "writethrough"))
2364 cf->io_mode = CM_IO_WRITETHROUGH;
2365
2366 else if (!strcasecmp(arg, "passthrough"))
2367 cf->io_mode = CM_IO_PASSTHROUGH;
2368
2369 else if (!strcasecmp(arg, "metadata2"))
2370 cf->metadata_version = 2;
2371
2372 else {
2373 *error = "Unrecognised cache feature requested";
2374 return -EINVAL;
2375 }
2376 }
2377
2378 return 0;
2379 }
2380
2381 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2382 char **error)
2383 {
2384 static const struct dm_arg _args[] = {
2385 {0, 1024, "Invalid number of policy arguments"},
2386 };
2387
2388 int r;
2389
2390 if (!at_least_one_arg(as, error))
2391 return -EINVAL;
2392
2393 ca->policy_name = dm_shift_arg(as);
2394
2395 r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2396 if (r)
2397 return -EINVAL;
2398
2399 ca->policy_argv = (const char **)as->argv;
2400 dm_consume_args(as, ca->policy_argc);
2401
2402 return 0;
2403 }
2404
2405 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2406 char **error)
2407 {
2408 int r;
2409 struct dm_arg_set as;
2410
2411 as.argc = argc;
2412 as.argv = argv;
2413
2414 r = parse_metadata_dev(ca, &as, error);
2415 if (r)
2416 return r;
2417
2418 r = parse_cache_dev(ca, &as, error);
2419 if (r)
2420 return r;
2421
2422 r = parse_origin_dev(ca, &as, error);
2423 if (r)
2424 return r;
2425
2426 r = parse_block_size(ca, &as, error);
2427 if (r)
2428 return r;
2429
2430 r = parse_features(ca, &as, error);
2431 if (r)
2432 return r;
2433
2434 r = parse_policy(ca, &as, error);
2435 if (r)
2436 return r;
2437
2438 return 0;
2439 }
2440
2441 /*----------------------------------------------------------------*/
2442
2443 static struct kmem_cache *migration_cache;
2444
2445 #define NOT_CORE_OPTION 1
2446
2447 static int process_config_option(struct cache *cache, const char *key, const char *value)
2448 {
2449 unsigned long tmp;
2450
2451 if (!strcasecmp(key, "migration_threshold")) {
2452 if (kstrtoul(value, 10, &tmp))
2453 return -EINVAL;
2454
2455 cache->migration_threshold = tmp;
2456 return 0;
2457 }
2458
2459 return NOT_CORE_OPTION;
2460 }
2461
2462 static int set_config_value(struct cache *cache, const char *key, const char *value)
2463 {
2464 int r = process_config_option(cache, key, value);
2465
2466 if (r == NOT_CORE_OPTION)
2467 r = policy_set_config_value(cache->policy, key, value);
2468
2469 if (r)
2470 DMWARN("bad config value for %s: %s", key, value);
2471
2472 return r;
2473 }
2474
2475 static int set_config_values(struct cache *cache, int argc, const char **argv)
2476 {
2477 int r = 0;
2478
2479 if (argc & 1) {
2480 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2481 return -EINVAL;
2482 }
2483
2484 while (argc) {
2485 r = set_config_value(cache, argv[0], argv[1]);
2486 if (r)
2487 break;
2488
2489 argc -= 2;
2490 argv += 2;
2491 }
2492
2493 return r;
2494 }
2495
2496 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2497 char **error)
2498 {
2499 struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2500 cache->cache_size,
2501 cache->origin_sectors,
2502 cache->sectors_per_block);
2503 if (IS_ERR(p)) {
2504 *error = "Error creating cache's policy";
2505 return PTR_ERR(p);
2506 }
2507 cache->policy = p;
2508 BUG_ON(!cache->policy);
2509
2510 return 0;
2511 }
2512
2513 /*
2514 * We want the discard block size to be at least the size of the cache
2515 * block size and have no more than 2^14 discard blocks across the origin.
2516 */
2517 #define MAX_DISCARD_BLOCKS (1 << 14)
2518
2519 static bool too_many_discard_blocks(sector_t discard_block_size,
2520 sector_t origin_size)
2521 {
2522 (void) sector_div(origin_size, discard_block_size);
2523
2524 return origin_size > MAX_DISCARD_BLOCKS;
2525 }
2526
2527 static sector_t calculate_discard_block_size(sector_t cache_block_size,
2528 sector_t origin_size)
2529 {
2530 sector_t discard_block_size = cache_block_size;
2531
2532 if (origin_size)
2533 while (too_many_discard_blocks(discard_block_size, origin_size))
2534 discard_block_size *= 2;
2535
2536 return discard_block_size;
2537 }
2538
2539 static void set_cache_size(struct cache *cache, dm_cblock_t size)
2540 {
2541 dm_block_t nr_blocks = from_cblock(size);
2542
2543 if (nr_blocks > (1 << 20) && cache->cache_size != size)
2544 DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2545 "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2546 "Please consider increasing the cache block size to reduce the overall cache block count.",
2547 (unsigned long long) nr_blocks);
2548
2549 cache->cache_size = size;
2550 }
2551
2552 static int is_congested(struct dm_dev *dev, int bdi_bits)
2553 {
2554 struct request_queue *q = bdev_get_queue(dev->bdev);
2555 return bdi_congested(q->backing_dev_info, bdi_bits);
2556 }
2557
2558 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2559 {
2560 struct cache *cache = container_of(cb, struct cache, callbacks);
2561
2562 return is_congested(cache->origin_dev, bdi_bits) ||
2563 is_congested(cache->cache_dev, bdi_bits);
2564 }
2565
2566 #define DEFAULT_MIGRATION_THRESHOLD 2048
2567
2568 static int cache_create(struct cache_args *ca, struct cache **result)
2569 {
2570 int r = 0;
2571 char **error = &ca->ti->error;
2572 struct cache *cache;
2573 struct dm_target *ti = ca->ti;
2574 dm_block_t origin_blocks;
2575 struct dm_cache_metadata *cmd;
2576 bool may_format = ca->features.mode == CM_WRITE;
2577
2578 cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2579 if (!cache)
2580 return -ENOMEM;
2581
2582 cache->ti = ca->ti;
2583 ti->private = cache;
2584 ti->num_flush_bios = 2;
2585 ti->flush_supported = true;
2586
2587 ti->num_discard_bios = 1;
2588 ti->discards_supported = true;
2589 ti->split_discard_bios = false;
2590
2591 cache->features = ca->features;
2592 ti->per_io_data_size = get_per_bio_data_size(cache);
2593
2594 if (writethrough_mode(cache)) {
2595 /* Create bioset for writethrough bios issued to origin */
2596 cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0);
2597 if (!cache->bs)
2598 goto bad;
2599 }
2600
2601 cache->callbacks.congested_fn = cache_is_congested;
2602 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2603
2604 cache->metadata_dev = ca->metadata_dev;
2605 cache->origin_dev = ca->origin_dev;
2606 cache->cache_dev = ca->cache_dev;
2607
2608 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2609
2610 origin_blocks = cache->origin_sectors = ca->origin_sectors;
2611 origin_blocks = block_div(origin_blocks, ca->block_size);
2612 cache->origin_blocks = to_oblock(origin_blocks);
2613
2614 cache->sectors_per_block = ca->block_size;
2615 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2616 r = -EINVAL;
2617 goto bad;
2618 }
2619
2620 if (ca->block_size & (ca->block_size - 1)) {
2621 dm_block_t cache_size = ca->cache_sectors;
2622
2623 cache->sectors_per_block_shift = -1;
2624 cache_size = block_div(cache_size, ca->block_size);
2625 set_cache_size(cache, to_cblock(cache_size));
2626 } else {
2627 cache->sectors_per_block_shift = __ffs(ca->block_size);
2628 set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2629 }
2630
2631 r = create_cache_policy(cache, ca, error);
2632 if (r)
2633 goto bad;
2634
2635 cache->policy_nr_args = ca->policy_argc;
2636 cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2637
2638 r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2639 if (r) {
2640 *error = "Error setting cache policy's config values";
2641 goto bad;
2642 }
2643
2644 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2645 ca->block_size, may_format,
2646 dm_cache_policy_get_hint_size(cache->policy),
2647 ca->features.metadata_version);
2648 if (IS_ERR(cmd)) {
2649 *error = "Error creating metadata object";
2650 r = PTR_ERR(cmd);
2651 goto bad;
2652 }
2653 cache->cmd = cmd;
2654 set_cache_mode(cache, CM_WRITE);
2655 if (get_cache_mode(cache) != CM_WRITE) {
2656 *error = "Unable to get write access to metadata, please check/repair metadata.";
2657 r = -EINVAL;
2658 goto bad;
2659 }
2660
2661 if (passthrough_mode(cache)) {
2662 bool all_clean;
2663
2664 r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2665 if (r) {
2666 *error = "dm_cache_metadata_all_clean() failed";
2667 goto bad;
2668 }
2669
2670 if (!all_clean) {
2671 *error = "Cannot enter passthrough mode unless all blocks are clean";
2672 r = -EINVAL;
2673 goto bad;
2674 }
2675
2676 policy_allow_migrations(cache->policy, false);
2677 }
2678
2679 spin_lock_init(&cache->lock);
2680 INIT_LIST_HEAD(&cache->deferred_cells);
2681 bio_list_init(&cache->deferred_bios);
2682 bio_list_init(&cache->deferred_writethrough_bios);
2683 atomic_set(&cache->nr_allocated_migrations, 0);
2684 atomic_set(&cache->nr_io_migrations, 0);
2685 init_waitqueue_head(&cache->migration_wait);
2686
2687 r = -ENOMEM;
2688 atomic_set(&cache->nr_dirty, 0);
2689 cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2690 if (!cache->dirty_bitset) {
2691 *error = "could not allocate dirty bitset";
2692 goto bad;
2693 }
2694 clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2695
2696 cache->discard_block_size =
2697 calculate_discard_block_size(cache->sectors_per_block,
2698 cache->origin_sectors);
2699 cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2700 cache->discard_block_size));
2701 cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2702 if (!cache->discard_bitset) {
2703 *error = "could not allocate discard bitset";
2704 goto bad;
2705 }
2706 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2707
2708 cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2709 if (IS_ERR(cache->copier)) {
2710 *error = "could not create kcopyd client";
2711 r = PTR_ERR(cache->copier);
2712 goto bad;
2713 }
2714
2715 cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2716 if (!cache->wq) {
2717 *error = "could not create workqueue for metadata object";
2718 goto bad;
2719 }
2720 INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2721 INIT_WORK(&cache->deferred_writethrough_worker,
2722 process_deferred_writethrough_bios);
2723 INIT_WORK(&cache->migration_worker, check_migrations);
2724 INIT_DELAYED_WORK(&cache->waker, do_waker);
2725
2726 cache->prison = dm_bio_prison_create_v2(cache->wq);
2727 if (!cache->prison) {
2728 *error = "could not create bio prison";
2729 goto bad;
2730 }
2731
2732 cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
2733 migration_cache);
2734 if (!cache->migration_pool) {
2735 *error = "Error creating cache's migration mempool";
2736 goto bad;
2737 }
2738
2739 cache->need_tick_bio = true;
2740 cache->sized = false;
2741 cache->invalidate = false;
2742 cache->commit_requested = false;
2743 cache->loaded_mappings = false;
2744 cache->loaded_discards = false;
2745
2746 load_stats(cache);
2747
2748 atomic_set(&cache->stats.demotion, 0);
2749 atomic_set(&cache->stats.promotion, 0);
2750 atomic_set(&cache->stats.copies_avoided, 0);
2751 atomic_set(&cache->stats.cache_cell_clash, 0);
2752 atomic_set(&cache->stats.commit_count, 0);
2753 atomic_set(&cache->stats.discard_count, 0);
2754
2755 spin_lock_init(&cache->invalidation_lock);
2756 INIT_LIST_HEAD(&cache->invalidation_requests);
2757
2758 batcher_init(&cache->committer, commit_op, cache,
2759 issue_op, cache, cache->wq);
2760 iot_init(&cache->tracker);
2761
2762 init_rwsem(&cache->background_work_lock);
2763 prevent_background_work(cache);
2764
2765 *result = cache;
2766 return 0;
2767 bad:
2768 destroy(cache);
2769 return r;
2770 }
2771
2772 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2773 {
2774 unsigned i;
2775 const char **copy;
2776
2777 copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2778 if (!copy)
2779 return -ENOMEM;
2780 for (i = 0; i < argc; i++) {
2781 copy[i] = kstrdup(argv[i], GFP_KERNEL);
2782 if (!copy[i]) {
2783 while (i--)
2784 kfree(copy[i]);
2785 kfree(copy);
2786 return -ENOMEM;
2787 }
2788 }
2789
2790 cache->nr_ctr_args = argc;
2791 cache->ctr_args = copy;
2792
2793 return 0;
2794 }
2795
2796 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2797 {
2798 int r = -EINVAL;
2799 struct cache_args *ca;
2800 struct cache *cache = NULL;
2801
2802 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2803 if (!ca) {
2804 ti->error = "Error allocating memory for cache";
2805 return -ENOMEM;
2806 }
2807 ca->ti = ti;
2808
2809 r = parse_cache_args(ca, argc, argv, &ti->error);
2810 if (r)
2811 goto out;
2812
2813 r = cache_create(ca, &cache);
2814 if (r)
2815 goto out;
2816
2817 r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2818 if (r) {
2819 destroy(cache);
2820 goto out;
2821 }
2822
2823 ti->private = cache;
2824 out:
2825 destroy_cache_args(ca);
2826 return r;
2827 }
2828
2829 /*----------------------------------------------------------------*/
2830
2831 static int cache_map(struct dm_target *ti, struct bio *bio)
2832 {
2833 struct cache *cache = ti->private;
2834
2835 int r;
2836 bool commit_needed;
2837 dm_oblock_t block = get_bio_block(cache, bio);
2838 size_t pb_data_size = get_per_bio_data_size(cache);
2839
2840 init_per_bio_data(bio, pb_data_size);
2841 if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2842 /*
2843 * This can only occur if the io goes to a partial block at
2844 * the end of the origin device. We don't cache these.
2845 * Just remap to the origin and carry on.
2846 */
2847 remap_to_origin(cache, bio);
2848 accounted_begin(cache, bio);
2849 return DM_MAPIO_REMAPPED;
2850 }
2851
2852 if (discard_or_flush(bio)) {
2853 defer_bio(cache, bio);
2854 return DM_MAPIO_SUBMITTED;
2855 }
2856
2857 r = map_bio(cache, bio, block, &commit_needed);
2858 if (commit_needed)
2859 schedule_commit(&cache->committer);
2860
2861 return r;
2862 }
2863
2864 static int cache_end_io(struct dm_target *ti, struct bio *bio,
2865 blk_status_t *error)
2866 {
2867 struct cache *cache = ti->private;
2868 unsigned long flags;
2869 size_t pb_data_size = get_per_bio_data_size(cache);
2870 struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
2871
2872 if (pb->tick) {
2873 policy_tick(cache->policy, false);
2874
2875 spin_lock_irqsave(&cache->lock, flags);
2876 cache->need_tick_bio = true;
2877 spin_unlock_irqrestore(&cache->lock, flags);
2878 }
2879
2880 bio_drop_shared_lock(cache, bio);
2881 accounted_complete(cache, bio);
2882
2883 return DM_ENDIO_DONE;
2884 }
2885
2886 static int write_dirty_bitset(struct cache *cache)
2887 {
2888 int r;
2889
2890 if (get_cache_mode(cache) >= CM_READ_ONLY)
2891 return -EINVAL;
2892
2893 r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2894 if (r)
2895 metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2896
2897 return r;
2898 }
2899
2900 static int write_discard_bitset(struct cache *cache)
2901 {
2902 unsigned i, r;
2903
2904 if (get_cache_mode(cache) >= CM_READ_ONLY)
2905 return -EINVAL;
2906
2907 r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2908 cache->discard_nr_blocks);
2909 if (r) {
2910 DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2911 metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2912 return r;
2913 }
2914
2915 for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2916 r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2917 is_discarded(cache, to_dblock(i)));
2918 if (r) {
2919 metadata_operation_failed(cache, "dm_cache_set_discard", r);
2920 return r;
2921 }
2922 }
2923
2924 return 0;
2925 }
2926
2927 static int write_hints(struct cache *cache)
2928 {
2929 int r;
2930
2931 if (get_cache_mode(cache) >= CM_READ_ONLY)
2932 return -EINVAL;
2933
2934 r = dm_cache_write_hints(cache->cmd, cache->policy);
2935 if (r) {
2936 metadata_operation_failed(cache, "dm_cache_write_hints", r);
2937 return r;
2938 }
2939
2940 return 0;
2941 }
2942
2943 /*
2944 * returns true on success
2945 */
2946 static bool sync_metadata(struct cache *cache)
2947 {
2948 int r1, r2, r3, r4;
2949
2950 r1 = write_dirty_bitset(cache);
2951 if (r1)
2952 DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2953
2954 r2 = write_discard_bitset(cache);
2955 if (r2)
2956 DMERR("%s: could not write discard bitset", cache_device_name(cache));
2957
2958 save_stats(cache);
2959
2960 r3 = write_hints(cache);
2961 if (r3)
2962 DMERR("%s: could not write hints", cache_device_name(cache));
2963
2964 /*
2965 * If writing the above metadata failed, we still commit, but don't
2966 * set the clean shutdown flag. This will effectively force every
2967 * dirty bit to be set on reload.
2968 */
2969 r4 = commit(cache, !r1 && !r2 && !r3);
2970 if (r4)
2971 DMERR("%s: could not write cache metadata", cache_device_name(cache));
2972
2973 return !r1 && !r2 && !r3 && !r4;
2974 }
2975
2976 static void cache_postsuspend(struct dm_target *ti)
2977 {
2978 struct cache *cache = ti->private;
2979
2980 prevent_background_work(cache);
2981 BUG_ON(atomic_read(&cache->nr_io_migrations));
2982
2983 cancel_delayed_work(&cache->waker);
2984 flush_workqueue(cache->wq);
2985 WARN_ON(cache->tracker.in_flight);
2986
2987 /*
2988 * If it's a flush suspend there won't be any deferred bios, so this
2989 * call is harmless.
2990 */
2991 requeue_deferred_bios(cache);
2992
2993 if (get_cache_mode(cache) == CM_WRITE)
2994 (void) sync_metadata(cache);
2995 }
2996
2997 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2998 bool dirty, uint32_t hint, bool hint_valid)
2999 {
3000 int r;
3001 struct cache *cache = context;
3002
3003 if (dirty) {
3004 set_bit(from_cblock(cblock), cache->dirty_bitset);
3005 atomic_inc(&cache->nr_dirty);
3006 } else
3007 clear_bit(from_cblock(cblock), cache->dirty_bitset);
3008
3009 r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
3010 if (r)
3011 return r;
3012
3013 return 0;
3014 }
3015
3016 /*
3017 * The discard block size in the on disk metadata is not
3018 * neccessarily the same as we're currently using. So we have to
3019 * be careful to only set the discarded attribute if we know it
3020 * covers a complete block of the new size.
3021 */
3022 struct discard_load_info {
3023 struct cache *cache;
3024
3025 /*
3026 * These blocks are sized using the on disk dblock size, rather
3027 * than the current one.
3028 */
3029 dm_block_t block_size;
3030 dm_block_t discard_begin, discard_end;
3031 };
3032
3033 static void discard_load_info_init(struct cache *cache,
3034 struct discard_load_info *li)
3035 {
3036 li->cache = cache;
3037 li->discard_begin = li->discard_end = 0;
3038 }
3039
3040 static void set_discard_range(struct discard_load_info *li)
3041 {
3042 sector_t b, e;
3043
3044 if (li->discard_begin == li->discard_end)
3045 return;
3046
3047 /*
3048 * Convert to sectors.
3049 */
3050 b = li->discard_begin * li->block_size;
3051 e = li->discard_end * li->block_size;
3052
3053 /*
3054 * Then convert back to the current dblock size.
3055 */
3056 b = dm_sector_div_up(b, li->cache->discard_block_size);
3057 sector_div(e, li->cache->discard_block_size);
3058
3059 /*
3060 * The origin may have shrunk, so we need to check we're still in
3061 * bounds.
3062 */
3063 if (e > from_dblock(li->cache->discard_nr_blocks))
3064 e = from_dblock(li->cache->discard_nr_blocks);
3065
3066 for (; b < e; b++)
3067 set_discard(li->cache, to_dblock(b));
3068 }
3069
3070 static int load_discard(void *context, sector_t discard_block_size,
3071 dm_dblock_t dblock, bool discard)
3072 {
3073 struct discard_load_info *li = context;
3074
3075 li->block_size = discard_block_size;
3076
3077 if (discard) {
3078 if (from_dblock(dblock) == li->discard_end)
3079 /*
3080 * We're already in a discard range, just extend it.
3081 */
3082 li->discard_end = li->discard_end + 1ULL;
3083
3084 else {
3085 /*
3086 * Emit the old range and start a new one.
3087 */
3088 set_discard_range(li);
3089 li->discard_begin = from_dblock(dblock);
3090 li->discard_end = li->discard_begin + 1ULL;
3091 }
3092 } else {
3093 set_discard_range(li);
3094 li->discard_begin = li->discard_end = 0;
3095 }
3096
3097 return 0;
3098 }
3099
3100 static dm_cblock_t get_cache_dev_size(struct cache *cache)
3101 {
3102 sector_t size = get_dev_size(cache->cache_dev);
3103 (void) sector_div(size, cache->sectors_per_block);
3104 return to_cblock(size);
3105 }
3106
3107 static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3108 {
3109 if (from_cblock(new_size) > from_cblock(cache->cache_size))
3110 return true;
3111
3112 /*
3113 * We can't drop a dirty block when shrinking the cache.
3114 */
3115 while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3116 new_size = to_cblock(from_cblock(new_size) + 1);
3117 if (is_dirty(cache, new_size)) {
3118 DMERR("%s: unable to shrink cache; cache block %llu is dirty",
3119 cache_device_name(cache),
3120 (unsigned long long) from_cblock(new_size));
3121 return false;
3122 }
3123 }
3124
3125 return true;
3126 }
3127
3128 static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3129 {
3130 int r;
3131
3132 r = dm_cache_resize(cache->cmd, new_size);
3133 if (r) {
3134 DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3135 metadata_operation_failed(cache, "dm_cache_resize", r);
3136 return r;
3137 }
3138
3139 set_cache_size(cache, new_size);
3140
3141 return 0;
3142 }
3143
3144 static int cache_preresume(struct dm_target *ti)
3145 {
3146 int r = 0;
3147 struct cache *cache = ti->private;
3148 dm_cblock_t csize = get_cache_dev_size(cache);
3149
3150 /*
3151 * Check to see if the cache has resized.
3152 */
3153 if (!cache->sized) {
3154 r = resize_cache_dev(cache, csize);
3155 if (r)
3156 return r;
3157
3158 cache->sized = true;
3159
3160 } else if (csize != cache->cache_size) {
3161 if (!can_resize(cache, csize))
3162 return -EINVAL;
3163
3164 r = resize_cache_dev(cache, csize);
3165 if (r)
3166 return r;
3167 }
3168
3169 if (!cache->loaded_mappings) {
3170 r = dm_cache_load_mappings(cache->cmd, cache->policy,
3171 load_mapping, cache);
3172 if (r) {
3173 DMERR("%s: could not load cache mappings", cache_device_name(cache));
3174 metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3175 return r;
3176 }
3177
3178 cache->loaded_mappings = true;
3179 }
3180
3181 if (!cache->loaded_discards) {
3182 struct discard_load_info li;
3183
3184 /*
3185 * The discard bitset could have been resized, or the
3186 * discard block size changed. To be safe we start by
3187 * setting every dblock to not discarded.
3188 */
3189 clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3190
3191 discard_load_info_init(cache, &li);
3192 r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3193 if (r) {
3194 DMERR("%s: could not load origin discards", cache_device_name(cache));
3195 metadata_operation_failed(cache, "dm_cache_load_discards", r);
3196 return r;
3197 }
3198 set_discard_range(&li);
3199
3200 cache->loaded_discards = true;
3201 }
3202
3203 return r;
3204 }
3205
3206 static void cache_resume(struct dm_target *ti)
3207 {
3208 struct cache *cache = ti->private;
3209
3210 cache->need_tick_bio = true;
3211 allow_background_work(cache);
3212 do_waker(&cache->waker.work);
3213 }
3214
3215 /*
3216 * Status format:
3217 *
3218 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3219 * <cache block size> <#used cache blocks>/<#total cache blocks>
3220 * <#read hits> <#read misses> <#write hits> <#write misses>
3221 * <#demotions> <#promotions> <#dirty>
3222 * <#features> <features>*
3223 * <#core args> <core args>
3224 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3225 */
3226 static void cache_status(struct dm_target *ti, status_type_t type,
3227 unsigned status_flags, char *result, unsigned maxlen)
3228 {
3229 int r = 0;
3230 unsigned i;
3231 ssize_t sz = 0;
3232 dm_block_t nr_free_blocks_metadata = 0;
3233 dm_block_t nr_blocks_metadata = 0;
3234 char buf[BDEVNAME_SIZE];
3235 struct cache *cache = ti->private;
3236 dm_cblock_t residency;
3237 bool needs_check;
3238
3239 switch (type) {
3240 case STATUSTYPE_INFO:
3241 if (get_cache_mode(cache) == CM_FAIL) {
3242 DMEMIT("Fail");
3243 break;
3244 }
3245
3246 /* Commit to ensure statistics aren't out-of-date */
3247 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3248 (void) commit(cache, false);
3249
3250 r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3251 if (r) {
3252 DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3253 cache_device_name(cache), r);
3254 goto err;
3255 }
3256
3257 r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3258 if (r) {
3259 DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3260 cache_device_name(cache), r);
3261 goto err;
3262 }
3263
3264 residency = policy_residency(cache->policy);
3265
3266 DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3267 (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3268 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3269 (unsigned long long)nr_blocks_metadata,
3270 (unsigned long long)cache->sectors_per_block,
3271 (unsigned long long) from_cblock(residency),
3272 (unsigned long long) from_cblock(cache->cache_size),
3273 (unsigned) atomic_read(&cache->stats.read_hit),
3274 (unsigned) atomic_read(&cache->stats.read_miss),
3275 (unsigned) atomic_read(&cache->stats.write_hit),
3276 (unsigned) atomic_read(&cache->stats.write_miss),
3277 (unsigned) atomic_read(&cache->stats.demotion),
3278 (unsigned) atomic_read(&cache->stats.promotion),
3279 (unsigned long) atomic_read(&cache->nr_dirty));
3280
3281 if (cache->features.metadata_version == 2)
3282 DMEMIT("2 metadata2 ");
3283 else
3284 DMEMIT("1 ");
3285
3286 if (writethrough_mode(cache))
3287 DMEMIT("writethrough ");
3288
3289 else if (passthrough_mode(cache))
3290 DMEMIT("passthrough ");
3291
3292 else if (writeback_mode(cache))
3293 DMEMIT("writeback ");
3294
3295 else {
3296 DMERR("%s: internal error: unknown io mode: %d",
3297 cache_device_name(cache), (int) cache->features.io_mode);
3298 goto err;
3299 }
3300
3301 DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3302
3303 DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3304 if (sz < maxlen) {
3305 r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3306 if (r)
3307 DMERR("%s: policy_emit_config_values returned %d",
3308 cache_device_name(cache), r);
3309 }
3310
3311 if (get_cache_mode(cache) == CM_READ_ONLY)
3312 DMEMIT("ro ");
3313 else
3314 DMEMIT("rw ");
3315
3316 r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3317
3318 if (r || needs_check)
3319 DMEMIT("needs_check ");
3320 else
3321 DMEMIT("- ");
3322
3323 break;
3324
3325 case STATUSTYPE_TABLE:
3326 format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3327 DMEMIT("%s ", buf);
3328 format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3329 DMEMIT("%s ", buf);
3330 format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3331 DMEMIT("%s", buf);
3332
3333 for (i = 0; i < cache->nr_ctr_args - 1; i++)
3334 DMEMIT(" %s", cache->ctr_args[i]);
3335 if (cache->nr_ctr_args)
3336 DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3337 }
3338
3339 return;
3340
3341 err:
3342 DMEMIT("Error");
3343 }
3344
3345 /*
3346 * Defines a range of cblocks, begin to (end - 1) are in the range. end is
3347 * the one-past-the-end value.
3348 */
3349 struct cblock_range {
3350 dm_cblock_t begin;
3351 dm_cblock_t end;
3352 };
3353
3354 /*
3355 * A cache block range can take two forms:
3356 *
3357 * i) A single cblock, eg. '3456'
3358 * ii) A begin and end cblock with a dash between, eg. 123-234
3359 */
3360 static int parse_cblock_range(struct cache *cache, const char *str,
3361 struct cblock_range *result)
3362 {
3363 char dummy;
3364 uint64_t b, e;
3365 int r;
3366
3367 /*
3368 * Try and parse form (ii) first.
3369 */
3370 r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3371 if (r < 0)
3372 return r;
3373
3374 if (r == 2) {
3375 result->begin = to_cblock(b);
3376 result->end = to_cblock(e);
3377 return 0;
3378 }
3379
3380 /*
3381 * That didn't work, try form (i).
3382 */
3383 r = sscanf(str, "%llu%c", &b, &dummy);
3384 if (r < 0)
3385 return r;
3386
3387 if (r == 1) {
3388 result->begin = to_cblock(b);
3389 result->end = to_cblock(from_cblock(result->begin) + 1u);
3390 return 0;
3391 }
3392
3393 DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3394 return -EINVAL;
3395 }
3396
3397 static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3398 {
3399 uint64_t b = from_cblock(range->begin);
3400 uint64_t e = from_cblock(range->end);
3401 uint64_t n = from_cblock(cache->cache_size);
3402
3403 if (b >= n) {
3404 DMERR("%s: begin cblock out of range: %llu >= %llu",
3405 cache_device_name(cache), b, n);
3406 return -EINVAL;
3407 }
3408
3409 if (e > n) {
3410 DMERR("%s: end cblock out of range: %llu > %llu",
3411 cache_device_name(cache), e, n);
3412 return -EINVAL;
3413 }
3414
3415 if (b >= e) {
3416 DMERR("%s: invalid cblock range: %llu >= %llu",
3417 cache_device_name(cache), b, e);
3418 return -EINVAL;
3419 }
3420
3421 return 0;
3422 }
3423
3424 static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3425 {
3426 return to_cblock(from_cblock(b) + 1);
3427 }
3428
3429 static int request_invalidation(struct cache *cache, struct cblock_range *range)
3430 {
3431 int r = 0;
3432
3433 /*
3434 * We don't need to do any locking here because we know we're in
3435 * passthrough mode. There's is potential for a race between an
3436 * invalidation triggered by an io and an invalidation message. This
3437 * is harmless, we must not worry if the policy call fails.
3438 */
3439 while (range->begin != range->end) {
3440 r = invalidate_cblock(cache, range->begin);
3441 if (r)
3442 return r;
3443
3444 range->begin = cblock_succ(range->begin);
3445 }
3446
3447 cache->commit_requested = true;
3448 return r;
3449 }
3450
3451 static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3452 const char **cblock_ranges)
3453 {
3454 int r = 0;
3455 unsigned i;
3456 struct cblock_range range;
3457
3458 if (!passthrough_mode(cache)) {
3459 DMERR("%s: cache has to be in passthrough mode for invalidation",
3460 cache_device_name(cache));
3461 return -EPERM;
3462 }
3463
3464 for (i = 0; i < count; i++) {
3465 r = parse_cblock_range(cache, cblock_ranges[i], &range);
3466 if (r)
3467 break;
3468
3469 r = validate_cblock_range(cache, &range);
3470 if (r)
3471 break;
3472
3473 /*
3474 * Pass begin and end origin blocks to the worker and wake it.
3475 */
3476 r = request_invalidation(cache, &range);
3477 if (r)
3478 break;
3479 }
3480
3481 return r;
3482 }
3483
3484 /*
3485 * Supports
3486 * "<key> <value>"
3487 * and
3488 * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
3489 *
3490 * The key migration_threshold is supported by the cache target core.
3491 */
3492 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
3493 {
3494 struct cache *cache = ti->private;
3495
3496 if (!argc)
3497 return -EINVAL;
3498
3499 if (get_cache_mode(cache) >= CM_READ_ONLY) {
3500 DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3501 cache_device_name(cache));
3502 return -EOPNOTSUPP;
3503 }
3504
3505 if (!strcasecmp(argv[0], "invalidate_cblocks"))
3506 return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3507
3508 if (argc != 2)
3509 return -EINVAL;
3510
3511 return set_config_value(cache, argv[0], argv[1]);
3512 }
3513
3514 static int cache_iterate_devices(struct dm_target *ti,
3515 iterate_devices_callout_fn fn, void *data)
3516 {
3517 int r = 0;
3518 struct cache *cache = ti->private;
3519
3520 r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3521 if (!r)
3522 r = fn(ti, cache->origin_dev, 0, ti->len, data);
3523
3524 return r;
3525 }
3526
3527 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3528 {
3529 /*
3530 * FIXME: these limits may be incompatible with the cache device
3531 */
3532 limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3533 cache->origin_sectors);
3534 limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3535 }
3536
3537 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3538 {
3539 struct cache *cache = ti->private;
3540 uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3541
3542 /*
3543 * If the system-determined stacked limits are compatible with the
3544 * cache's blocksize (io_opt is a factor) do not override them.
3545 */
3546 if (io_opt_sectors < cache->sectors_per_block ||
3547 do_div(io_opt_sectors, cache->sectors_per_block)) {
3548 blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3549 blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3550 }
3551 set_discard_limits(cache, limits);
3552 }
3553
3554 /*----------------------------------------------------------------*/
3555
3556 static struct target_type cache_target = {
3557 .name = "cache",
3558 .version = {2, 0, 0},
3559 .module = THIS_MODULE,
3560 .ctr = cache_ctr,
3561 .dtr = cache_dtr,
3562 .map = cache_map,
3563 .end_io = cache_end_io,
3564 .postsuspend = cache_postsuspend,
3565 .preresume = cache_preresume,
3566 .resume = cache_resume,
3567 .status = cache_status,
3568 .message = cache_message,
3569 .iterate_devices = cache_iterate_devices,
3570 .io_hints = cache_io_hints,
3571 };
3572
3573 static int __init dm_cache_init(void)
3574 {
3575 int r;
3576
3577 r = dm_register_target(&cache_target);
3578 if (r) {
3579 DMERR("cache target registration failed: %d", r);
3580 return r;
3581 }
3582
3583 migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3584 if (!migration_cache) {
3585 dm_unregister_target(&cache_target);
3586 return -ENOMEM;
3587 }
3588
3589 return 0;
3590 }
3591
3592 static void __exit dm_cache_exit(void)
3593 {
3594 dm_unregister_target(&cache_target);
3595 kmem_cache_destroy(migration_cache);
3596 }
3597
3598 module_init(dm_cache_init);
3599 module_exit(dm_cache_exit);
3600
3601 MODULE_DESCRIPTION(DM_NAME " cache target");
3602 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3603 MODULE_LICENSE("GPL");