/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/uio.h>

#define DM_MSG_PREFIX "log-writes"

/*
 * This target will sequentially log all writes to the target device onto the
 * log device. This is helpful for replaying writes to check for fs consistency
 * at all times. This target provides a mechanism to mark specific events to
 * check data at a later time. So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed; this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache. So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * would result in the log looking like this:
 *
 * c,a,flush,fuad,b,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH. FUA bypasses the cache, so
 * once it completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass the cache so that they are logged
 * in order of completion along with the normal writes. If we didn't do it this
 * way we would process all the discards first and then write all the data, when
 * in fact we want to do the data and the discard in the order that they
 * completed.
 */
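
/*
 * Example setup (illustrative only; the device names and sizes below are
 * placeholders, not part of this driver):
 *
 *   TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc"
 *   dmsetup create log --table "$TABLE"
 *
 * This maps /dev/sdb through the target while logging to /dev/sdc, matching
 * the "log-writes <dev_path> <log_dev_path>" constructor arguments below.
 */
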
#define LOG_FLUSH_FLAG (1 << 0)
#define LOG_FUA_FLAG (1 << 1)
#define LOG_DISCARD_FLAG (1 << 2)
#define LOG_MARK_FLAG (1 << 3)

#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
#define WRITE_LOG_SUPER_SECTOR 0

/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [   1 sector    ][   entry->nr_sectors   ]
 * [log_write_entry][      data written      ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */

/*
 * Basic info about the log for userspace.
 */
struct log_write_super {
	__le64 magic;
	__le64 version;
	__le64 nr_entries;
	__le32 sectorsize;
};

/*
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry.
 * data_len - the size of the data in this log entry; this is for private log
 *	      entry data, such as the MARK data provided by userspace.
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};

struct log_writes_c {
	struct dm_dev *dev;
	struct dm_dev *logdev;
	u64 logged_entries;
	u32 sectorsize;
	u32 sectorshift;
	atomic_t io_blocks;
	atomic_t pending_blocks;
	sector_t next_sector;
	sector_t end_sector;
	bool logging_enabled;
	bool device_supports_discard;
	spinlock_t blocks_lock;
	struct list_head unflushed_blocks;
	struct list_head logging_blocks;
	wait_queue_head_t wait;
	struct task_struct *log_kthread;
	struct completion super_done;
};

struct pending_block {
	int vec_cnt;
	u64 flags;
	sector_t sector;
	sector_t nr_sectors;
	char *data;
	u32 datalen;
	struct list_head list;
	struct bio_vec vecs[0];
};

struct per_bio_data {
	struct pending_block *block;
};

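/*
 * bi_sector and bio_sectors() are always in 512-byte units (SECTOR_SHIFT),
 * while on-disk log entries use the configured sector size (lc->sectorsize,
 * the data device's logical block size). These helpers convert between the
 * two; e.g. on a 4096-byte-sector device (sectorshift == 12) counts are
 * shifted by 3.
 */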
static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors >> (lc->sectorshift - SECTOR_SHIFT);
}

static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors << (lc->sectorshift - SECTOR_SHIFT);
}

static void put_pending_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->pending_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void put_io_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->io_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void log_end_io(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	if (bio->bi_status) {
		unsigned long flags;

		DMERR("Error writing log block, error=%d", bio->bi_status);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	bio_free_pages(bio);
	put_io_block(lc);
	bio_put(bio);
}

static void log_end_super(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	complete(&lc->super_done);
	log_end_io(bio);
}

/*
 * Meant to be called if there is an error; it will free all the pages
 * associated with the block.
 */
static void free_pending_block(struct log_writes_c *lc,
			       struct pending_block *block)
{
	int i;

	for (i = 0; i < block->vec_cnt; i++) {
		if (block->vecs[i].bv_page)
			__free_page(block->vecs[i].bv_page);
	}
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
}

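/*
 * Write a single metadata sector to the log device: the entry (and, for
 * marks, its inline data) is copied into one page, zero-padded out to
 * lc->sectorsize and submitted at @sector. The super sector uses
 * log_end_super as its completion so log_super() can wait for it.
 */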
static int write_metadata(struct log_writes_c *lc, void *entry,
			  size_t entrylen, void *data, size_t datalen,
			  sector_t sector)
{
	struct bio *bio;
	struct page *page;
	void *ptr;
	size_t ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
			  log_end_super : log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		DMERR("Couldn't alloc log page");
		bio_put(bio);
		goto error;
	}

	ptr = kmap_atomic(page);
	memcpy(ptr, entry, entrylen);
	if (datalen)
		memcpy(ptr + entrylen, data, datalen);
	memset(ptr + entrylen + datalen, 0,
	       lc->sectorsize - entrylen - datalen);
	kunmap_atomic(ptr);

	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
		goto error_bio;
	}
	submit_bio(bio);
	return 0;
error_bio:
	bio_put(bio);
	__free_page(page);
error:
	put_io_block(lc);
	return -1;
}

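/*
 * Write out payload data that is carried in block->data rather than in
 * bio_vecs (currently the DAX path). The payload is copied page by page,
 * each page padded to a sector boundary, and may span several bios of up to
 * BIO_MAX_PAGES pages each.
 */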
static int write_inline_data(struct log_writes_c *lc, void *entry,
			     size_t entrylen, void *data, size_t datalen,
			     sector_t sector)
{
	int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
	struct page *page;
	struct bio *bio;
	size_t ret;
	void *ptr;

	while (datalen) {
		num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
		bio_pages = min(num_pages, BIO_MAX_PAGES);

		atomic_inc(&lc->io_blocks);

		bio = bio_alloc(GFP_KERNEL, bio_pages);
		if (!bio) {
			DMERR("Couldn't alloc inline data bio");
			goto error;
		}

		bio->bi_iter.bi_size = 0;
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, lc->logdev->bdev);
		bio->bi_end_io = log_end_io;
		bio->bi_private = lc;
		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

		for (i = 0; i < bio_pages; i++) {
			pg_datalen = min_t(int, datalen, PAGE_SIZE);
			pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);

			page = alloc_page(GFP_KERNEL);
			if (!page) {
				DMERR("Couldn't alloc inline data page");
				goto error_bio;
			}

			ptr = kmap_atomic(page);
			memcpy(ptr, data, pg_datalen);
			if (pg_sectorlen > pg_datalen)
				memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
			kunmap_atomic(ptr);

			ret = bio_add_page(bio, page, pg_sectorlen, 0);
			if (ret != pg_sectorlen) {
				DMERR("Couldn't add page of inline data");
				__free_page(page);
				goto error_bio;
			}

			datalen -= pg_datalen;
			data += pg_datalen;
		}
		submit_bio(bio);

		sector += bio_pages * PAGE_SECTORS;
	}
	return 0;
error_bio:
	bio_free_pages(bio);
	bio_put(bio);
error:
	put_io_block(lc);
	return -1;
}

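/*
 * Write one pending block to the log at @sector: first the metadata sector
 * built from the block, then either the inline data payload or the pages
 * copied from the original bio. The block is consumed on both success and
 * failure.
 */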
static int log_one_block(struct log_writes_c *lc,
			 struct pending_block *block, sector_t sector)
{
	struct bio *bio;
	struct log_write_entry entry;
	size_t metadatalen, ret;
	int i;

	entry.sector = cpu_to_le64(block->sector);
	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
	entry.flags = cpu_to_le64(block->flags);
	entry.data_len = cpu_to_le64(block->datalen);

	metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   metadatalen, sector)) {
		free_pending_block(lc, block);
		return -1;
	}

	sector += dev_to_bio_sectors(lc, 1);

	if (block->datalen && metadatalen == 0) {
		if (write_inline_data(lc, &entry, sizeof(entry), block->data,
				      block->datalen, sector)) {
			free_pending_block(lc, block);
			return -1;
		}
		/* we don't support both inline data & bio data */
		goto out;
	}

	if (!block->vec_cnt)
		goto out;

	atomic_inc(&lc->io_blocks);
	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	for (i = 0; i < block->vec_cnt; i++) {
		/*
		 * The page offset is always 0 because we allocate a new page
		 * for every bvec in the original bio for simplicity's sake.
		 */
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
		if (ret != block->vecs[i].bv_len) {
			atomic_inc(&lc->io_blocks);
			submit_bio(bio);
			bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
			if (!bio) {
				DMERR("Couldn't alloc log bio");
				goto error;
			}
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio_set_dev(bio, lc->logdev->bdev);
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
				bio_put(bio);
				goto error;
			}
		}
		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
	}
	submit_bio(bio);
out:
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
	return 0;
error:
	free_pending_block(lc, block);
	put_io_block(lc);
	return -1;
}

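/*
 * Rewrite the super sector so nr_entries reflects what has been logged so
 * far, and wait for that write to land before logging anything else.
 */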
static int log_super(struct log_writes_c *lc)
{
	struct log_write_super super;

	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
	super.version = cpu_to_le64(WRITE_LOG_VERSION);
	super.nr_entries = cpu_to_le64(lc->logged_entries);
	super.sectorsize = cpu_to_le32(lc->sectorsize);

	if (write_metadata(lc, &super, sizeof(super), NULL, 0,
			   WRITE_LOG_SUPER_SECTOR)) {
		DMERR("Couldn't write super");
		return -1;
	}

	/*
	 * The super sector should be written in order; otherwise the
	 * nr_entries could be rewritten incorrectly by an old bio.
	 */
	wait_for_completion_io(&lc->super_done);

	return 0;
}

static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}

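/*
 * Background thread: pull blocks off logging_blocks, reserve space for them
 * in the log (next_sector), write them out and, after FUA or MARK entries,
 * refresh the super sector. Logging is disabled if we run out of log space
 * or an I/O error occurs.
 */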
static int log_writes_kthread(void *arg)
{
	struct log_writes_c *lc = (struct log_writes_c *)arg;
	sector_t sector = 0;

	while (!kthread_should_stop()) {
		bool super = false;
		bool logging_enabled;
		struct pending_block *block = NULL;
		int ret;

		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
			if (!lc->logging_enabled)
				goto next;

			sector = lc->next_sector;
			if (!(block->flags & LOG_DISCARD_FLAG))
				lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
			lc->next_sector += dev_to_bio_sectors(lc, 1);

			/*
			 * Apparently the size of the device may not be known
			 * right away, so handle this properly.
			 */
			if (!lc->end_sector)
				lc->end_sector = logdev_last_sector(lc);
			if (lc->end_sector &&
			    lc->next_sector >= lc->end_sector) {
				DMERR("Ran out of space on the logdev");
				lc->logging_enabled = false;
				goto next;
			}
			lc->logged_entries++;
			atomic_inc(&lc->io_blocks);

			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
			if (super)
				atomic_inc(&lc->io_blocks);
		}
next:
		logging_enabled = lc->logging_enabled;
		spin_unlock_irq(&lc->blocks_lock);
		if (block) {
			if (logging_enabled) {
				ret = log_one_block(lc, block, sector);
				if (!ret && super)
					ret = log_super(lc);
				if (ret) {
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
				}
			} else
				free_pending_block(lc, block);
			continue;
		}

		if (!try_to_freeze()) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    list_empty(&lc->logging_blocks))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
	return 0;
}

/*
 * Construct a log-writes mapping:
 * log-writes <dev_path> <log_dev_path>
 */
static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct log_writes_c *lc;
	struct dm_arg_set as;
	const char *devname, *logdevname;
	int ret;

	as.argc = argc;
	as.argv = argv;

	if (argc < 2) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
	if (!lc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}
	spin_lock_init(&lc->blocks_lock);
	INIT_LIST_HEAD(&lc->unflushed_blocks);
	INIT_LIST_HEAD(&lc->logging_blocks);
	init_waitqueue_head(&lc->wait);
	init_completion(&lc->super_done);
	atomic_set(&lc->io_blocks, 0);
	atomic_set(&lc->pending_blocks, 0);

	devname = dm_shift_arg(&as);
	ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	logdevname = dm_shift_arg(&as);
	ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
			    &lc->logdev);
	if (ret) {
		ti->error = "Log device lookup failed";
		dm_put_device(ti, lc->dev);
		goto bad;
	}

	lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
	lc->sectorshift = ilog2(lc->sectorsize);
	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
	if (IS_ERR(lc->log_kthread)) {
		ret = PTR_ERR(lc->log_kthread);
		ti->error = "Couldn't alloc kthread";
		dm_put_device(ti, lc->dev);
		dm_put_device(ti, lc->logdev);
		goto bad;
	}

	/*
	 * next_sector is in 512b sectors to correspond to what bi_sector expects.
	 * The super starts at sector 0, and the next_sector is the next logical
	 * one based on the sectorsize of the device.
	 */
	lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
	lc->logging_enabled = true;
	lc->end_sector = logdev_last_sector(lc);
	lc->device_supports_discard = true;

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->per_io_data_size = sizeof(struct per_bio_data);
	ti->private = lc;
	return 0;

bad:
	kfree(lc);
	return ret;
}

static int log_mark(struct log_writes_c *lc, char *data)
{
	struct pending_block *block;
	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating pending block");
		return -ENOMEM;
	}

	block->data = kstrndup(data, maxsize, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error copying mark data");
		kfree(block);
		return -ENOMEM;
	}
	atomic_inc(&lc->pending_blocks);
	block->datalen = strlen(block->data);
	block->flags |= LOG_MARK_FLAG;
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);
	return 0;
}

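/*
 * Log a DAX write. There is no bio to copy pages from, so the data is
 * duplicated straight from the caller's iov_iter into block->data and the
 * block is queued on unflushed_blocks like any other completed write.
 */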
static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
		   struct iov_iter *i)
{
	struct pending_block *block;

	if (!bytes)
		return 0;

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating dax pending block");
		return -ENOMEM;
	}

	block->data = kzalloc(bytes, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error allocating dax data space");
		kfree(block);
		return -ENOMEM;
	}

	/* write data provided via the iterator */
	if (!copy_from_iter(block->data, bytes, i)) {
		DMERR("Error copying dax data");
		kfree(block->data);
		kfree(block);
		return -EIO;
	}

	/* rewind the iterator so that the block driver can use it */
	iov_iter_revert(i, bytes);

	block->datalen = bytes;
	block->sector = bio_to_dev_sectors(lc, sector);
	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;

	atomic_inc(&lc->pending_blocks);
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->unflushed_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);

	return 0;
}

static void log_writes_dtr(struct dm_target *ti)
{
	struct log_writes_c *lc = ti->private;

	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);

	/*
	 * This is just nice to have since it'll update the super to include
	 * the unflushed blocks; if it fails we don't really care.
	 */
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
		   !atomic_read(&lc->pending_blocks));
	kthread_stop(lc->log_kthread);

	WARN_ON(!list_empty(&lc->logging_blocks));
	WARN_ON(!list_empty(&lc->unflushed_blocks));
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
}

static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;

	bio_set_dev(bio, lc->dev->bdev);
}

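/*
 * Remap every bio to the underlying data device; for writes, also capture a
 * pending_block (flags, sectors and a private copy of the payload pages) so
 * the log thread can replay it once the write is known to be stable.
 */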
static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	struct pending_block *block;
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t alloc_size;
	int i = 0;
	bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
	bool fua_bio = (bio->bi_opf & REQ_FUA);
	bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);

	pb->block = NULL;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto map_bio;

	/*
	 * Map reads as normal.
	 */
	if (bio_data_dir(bio) == READ)
		goto map_bio;

	/* No sectors and not a flush? Don't care */
	if (!bio_sectors(bio) && !flush_bio)
		goto map_bio;

	/*
	 * Discards will have bi_size set but there's no actual data, so just
	 * allocate the size of the pending block.
	 */
	if (discard_bio)
		alloc_size = sizeof(struct pending_block);
	else
		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);

	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return DM_MAPIO_KILL;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);

	if (flush_bio)
		block->flags |= LOG_FLUSH_FLAG;
	if (fua_bio)
		block->flags |= LOG_FUA_FLAG;
	if (discard_bio)
		block->flags |= LOG_DISCARD_FLAG;

	block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
	block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));

	/* We don't need the data, just submit */
	if (discard_bio) {
		WARN_ON(flush_bio || fua_bio);
		if (lc->device_supports_discard)
			goto map_bio;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flush bio, splice the unflushed blocks onto this list and submit */
	if (flush_bio && !bio_sectors(bio)) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
		goto map_bio;
	}

	/*
	 * We will write this bio somewhere else way later, so we need to copy
	 * the actual contents into new pages so we know the data will always
	 * be there.
	 *
	 * We do this because this could be a bio from O_DIRECT, in which case
	 * we can't just hold onto the page until some later point; we have to
	 * manually copy the contents.
	 */
	bio_for_each_segment(bv, bio, iter) {
		struct page *page;
		void *src, *dst;

		page = alloc_page(GFP_NOIO);
		if (!page) {
			DMERR("Error allocating page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
			return DM_MAPIO_KILL;
		}

		src = kmap_atomic(bv.bv_page);
		dst = kmap_atomic(page);
		memcpy(dst, src + bv.bv_offset, bv.bv_len);
		kunmap_atomic(dst);
		kunmap_atomic(src);
		block->vecs[i].bv_page = page;
		block->vecs[i].bv_len = bv.bv_len;
		block->vec_cnt++;
		i++;
	}

	/* Had a flush with data in it, weird */
	if (flush_bio) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
	}
map_bio:
	normal_map_bio(ti, bio);
	return DM_MAPIO_REMAPPED;
}

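/*
 * Completion on the data device decides when a block becomes loggable:
 * FLUSH completions splice all unflushed blocks (plus themselves) onto
 * logging_blocks, FUA completions queue themselves directly, and everything
 * else waits on unflushed_blocks for the next flush.
 */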
static int normal_end_io(struct dm_target *ti, struct bio *bio,
		blk_status_t *error)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

	if (bio_data_dir(bio) == WRITE && pb->block) {
		struct pending_block *block = pb->block;
		unsigned long flags;

		spin_lock_irqsave(&lc->blocks_lock, flags);
		if (block->flags & LOG_FLUSH_FLAG) {
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	return DM_ENDIO_DONE;
}

/*
 * INFO format: <logged entries> <highest allocated sector>
 */
static void log_writes_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result,
			      unsigned maxlen)
{
	unsigned sz = 0;
	struct log_writes_c *lc = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%llu %llu", lc->logged_entries,
		       (unsigned long long)lc->next_sector - 1);
		if (!lc->logging_enabled)
			DMEMIT(" logging_disabled");
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
		break;
	}
}

static int log_writes_prepare_ioctl(struct dm_target *ti,
		struct block_device **bdev, fmode_t *mode)
{
	struct log_writes_c *lc = ti->private;
	struct dm_dev *dev = lc->dev;

	*bdev = dev->bdev;
	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return 0;
}

static int log_writes_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn,
				      void *data)
{
	struct log_writes_c *lc = ti->private;

	return fn(ti, lc->dev, 0, ti->len, data);
}

/*
 * Messages supported:
 * mark <mark data> - specify the marked data.
 */
static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct log_writes_c *lc = ti->private;

	if (argc != 2) {
		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
		return r;
	}

	if (!strcasecmp(argv[0], "mark"))
		r = log_mark(lc, argv[1]);
	else
		DMWARN("Unrecognised log writes target message received: %s", argv[0]);

	return r;
}

static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q || !blk_queue_discard(q)) {
		lc->device_supports_discard = false;
		limits->discard_granularity = lc->sectorsize;
		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
	}
	limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
	limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
	limits->io_min = limits->physical_block_size;
}

static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
					 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int ret;

	ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
	if (ret)
		return ret;
	return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
}

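/*
 * DAX writes bypass the bio path, so log the payload via log_dax() before
 * handing the copy off to the underlying dax device.
 */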
static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
					    pgoff_t pgoff, void *addr, size_t bytes,
					    struct iov_iter *i)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int err;

	if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
		return 0;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto dax_copy;

	err = log_dax(lc, sector, bytes, i);
	if (err) {
		DMWARN("Error %d logging DAX write", err);
		return 0;
	}
dax_copy:
	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}

static struct target_type log_writes_target = {
	.name = "log-writes",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = log_writes_ctr,
	.dtr = log_writes_dtr,
	.map = log_writes_map,
	.end_io = normal_end_io,
	.status = log_writes_status,
	.prepare_ioctl = log_writes_prepare_ioctl,
	.message = log_writes_message,
	.iterate_devices = log_writes_iterate_devices,
	.io_hints = log_writes_io_hints,
	.direct_access = log_writes_dax_direct_access,
	.dax_copy_from_iter = log_writes_dax_copy_from_iter,
};

static int __init dm_log_writes_init(void)
{
	int r = dm_register_target(&log_writes_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void __exit dm_log_writes_exit(void)
{
	dm_unregister_target(&log_writes_target);
}

module_init(dm_log_writes_init);
module_exit(dm_log_writes_exit);

MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");