]>
Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * dm-snapshot.c | |
3 | * | |
4 | * Copyright (C) 2001-2002 Sistina Software (UK) Limited. | |
5 | * | |
6 | * This file is released under the GPL. | |
7 | */ | |
8 | ||
9 | #include "dm.h" | |
10 | #include "dm-snap.h" | |
11 | #include "dm-io.h" | |
12 | #include "kcopyd.h" | |
13 | ||
14 | #include <linux/mm.h> | |
15 | #include <linux/pagemap.h> | |
16 | #include <linux/vmalloc.h> | |
17 | #include <linux/slab.h> | |
18 | ||
19 | /*----------------------------------------------------------------- | |
20 | * Persistent snapshots, by persistent we mean that the snapshot | |
21 | * will survive a reboot. | |
22 | *---------------------------------------------------------------*/ | |
23 | ||
24 | /* | |
25 | * We need to store a record of which parts of the origin have | |
26 | * been copied to the snapshot device. The snapshot code | |
27 | * requires that we copy exception chunks to chunk aligned areas | |
28 | * of the COW store. It makes sense therefore, to store the | |
29 | * metadata in chunk size blocks. | |
30 | * | |
31 | * There is no backward or forward compatibility implemented, | |
32 | * snapshots with different disk versions than the kernel will | |
33 | * not be usable. It is expected that "lvcreate" will blank out | |
34 | * the start of a fresh COW device before calling the snapshot | |
35 | * constructor. | |
36 | * | |
37 | * The first chunk of the COW device just contains the header. | |
38 | * After this there is a chunk filled with exception metadata, | |
39 | * followed by as many exception chunks as can fit in the | |
40 | * metadata areas. | |
41 | * | |
42 | * All on disk structures are in little-endian format. The end | |
43 | * of the exceptions info is indicated by an exception with a | |
44 | * new_chunk of 0, which is invalid since it would point to the | |
45 | * header chunk. | |
46 | */ | |
47 | ||
48 | /* | |
49 | * Magic for persistent snapshots: "SnAp" - Feeble isn't it. | |
50 | */ | |
51 | #define SNAP_MAGIC 0x70416e53 | |
52 | ||
53 | /* | |
54 | * The on-disk version of the metadata. | |
55 | */ | |
56 | #define SNAPSHOT_DISK_VERSION 1 | |
57 | ||
/*
 * Header record placed at the very start of the COW device's first chunk.
 * Every field is converted with cpu_to_le32()/le32_to_cpu() on its way
 * to and from disk.
 */
struct disk_header {
	/* SNAP_MAGIC once initialised; zero is read as "fresh device". */
	uint32_t magic;

	/*
	 * Non-zero while the snapshot can be used.  An invalid snapshot
	 * cannot be recovered.
	 */
	uint32_t valid;

	/*
	 * Plain incrementing format version; no backward compatibility
	 * is offered.
	 */
	uint32_t version;

	/* Chunk size, expressed in sectors. */
	uint32_t chunk_size;
};
76 | ||
/*
 * A single exception record as laid out in a metadata area.  Both
 * members are kept little-endian on disk.  A new_chunk of zero marks
 * the end of the used records in an area.
 */
struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};
81 | ||
/*
 * A completion notification queued by the commit path: 'callback' is
 * invoked with 'context' and a success flag once the metadata write
 * for the exception has been attempted.
 */
struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};
86 | ||
87 | /* | |
88 | * The top level structure for a persistent exception store. | |
89 | */ | |
90 | struct pstore { | |
91 | struct dm_snapshot *snap; /* up pointer to my snapshot */ | |
92 | int version; | |
93 | int valid; | |
94 | uint32_t chunk_size; | |
95 | uint32_t exceptions_per_area; | |
96 | ||
97 | /* | |
98 | * Now that we have an asynchronous kcopyd there is no | |
99 | * need for large chunk sizes, so it wont hurt to have a | |
100 | * whole chunks worth of metadata in memory at once. | |
101 | */ | |
102 | void *area; | |
103 | ||
104 | /* | |
105 | * Used to keep track of which metadata area the data in | |
106 | * 'chunk' refers to. | |
107 | */ | |
108 | uint32_t current_area; | |
109 | ||
110 | /* | |
111 | * The next free chunk for an exception. | |
112 | */ | |
113 | uint32_t next_free; | |
114 | ||
115 | /* | |
116 | * The index of next free exception in the current | |
117 | * metadata area. | |
118 | */ | |
119 | uint32_t current_committed; | |
120 | ||
121 | atomic_t pending_count; | |
122 | uint32_t callback_count; | |
123 | struct commit_callback *callbacks; | |
124 | }; | |
125 | ||
126 | static inline unsigned int sectors_to_pages(unsigned int sectors) | |
127 | { | |
128 | return sectors / (PAGE_SIZE >> 9); | |
129 | } | |
130 | ||
131 | static int alloc_area(struct pstore *ps) | |
132 | { | |
133 | int r = -ENOMEM; | |
134 | size_t len; | |
135 | ||
136 | len = ps->chunk_size << SECTOR_SHIFT; | |
137 | ||
138 | /* | |
139 | * Allocate the chunk_size block of memory that will hold | |
140 | * a single metadata area. | |
141 | */ | |
142 | ps->area = vmalloc(len); | |
143 | if (!ps->area) | |
144 | return r; | |
145 | ||
146 | return 0; | |
147 | } | |
148 | ||
149 | static void free_area(struct pstore *ps) | |
150 | { | |
151 | vfree(ps->area); | |
152 | } | |
153 | ||
154 | /* | |
155 | * Read or write a chunk aligned and sized block of data from a device. | |
156 | */ | |
157 | static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) | |
158 | { | |
159 | struct io_region where; | |
160 | unsigned long bits; | |
161 | ||
162 | where.bdev = ps->snap->cow->bdev; | |
163 | where.sector = ps->chunk_size * chunk; | |
164 | where.count = ps->chunk_size; | |
165 | ||
166 | return dm_io_sync_vm(1, &where, rw, ps->area, &bits); | |
167 | } | |
168 | ||
169 | /* | |
170 | * Read or write a metadata area. Remembering to skip the first | |
171 | * chunk which holds the header. | |
172 | */ | |
173 | static int area_io(struct pstore *ps, uint32_t area, int rw) | |
174 | { | |
175 | int r; | |
176 | uint32_t chunk; | |
177 | ||
178 | /* convert a metadata area index to a chunk index */ | |
179 | chunk = 1 + ((ps->exceptions_per_area + 1) * area); | |
180 | ||
181 | r = chunk_io(ps, chunk, rw); | |
182 | if (r) | |
183 | return r; | |
184 | ||
185 | ps->current_area = area; | |
186 | return 0; | |
187 | } | |
188 | ||
189 | static int zero_area(struct pstore *ps, uint32_t area) | |
190 | { | |
191 | memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | |
192 | return area_io(ps, area, WRITE); | |
193 | } | |
194 | ||
195 | static int read_header(struct pstore *ps, int *new_snapshot) | |
196 | { | |
197 | int r; | |
198 | struct disk_header *dh; | |
199 | ||
200 | r = chunk_io(ps, 0, READ); | |
201 | if (r) | |
202 | return r; | |
203 | ||
204 | dh = (struct disk_header *) ps->area; | |
205 | ||
206 | if (le32_to_cpu(dh->magic) == 0) { | |
207 | *new_snapshot = 1; | |
208 | ||
209 | } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { | |
210 | *new_snapshot = 0; | |
211 | ps->valid = le32_to_cpu(dh->valid); | |
212 | ps->version = le32_to_cpu(dh->version); | |
213 | ps->chunk_size = le32_to_cpu(dh->chunk_size); | |
214 | ||
215 | } else { | |
216 | DMWARN("Invalid/corrupt snapshot"); | |
217 | r = -ENXIO; | |
218 | } | |
219 | ||
220 | return r; | |
221 | } | |
222 | ||
223 | static int write_header(struct pstore *ps) | |
224 | { | |
225 | struct disk_header *dh; | |
226 | ||
227 | memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); | |
228 | ||
229 | dh = (struct disk_header *) ps->area; | |
230 | dh->magic = cpu_to_le32(SNAP_MAGIC); | |
231 | dh->valid = cpu_to_le32(ps->valid); | |
232 | dh->version = cpu_to_le32(ps->version); | |
233 | dh->chunk_size = cpu_to_le32(ps->chunk_size); | |
234 | ||
235 | return chunk_io(ps, 0, WRITE); | |
236 | } | |
237 | ||
238 | /* | |
239 | * Access functions for the disk exceptions, these do the endian conversions. | |
240 | */ | |
241 | static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) | |
242 | { | |
243 | if (index >= ps->exceptions_per_area) | |
244 | return NULL; | |
245 | ||
246 | return ((struct disk_exception *) ps->area) + index; | |
247 | } | |
248 | ||
249 | static int read_exception(struct pstore *ps, | |
250 | uint32_t index, struct disk_exception *result) | |
251 | { | |
252 | struct disk_exception *e; | |
253 | ||
254 | e = get_exception(ps, index); | |
255 | if (!e) | |
256 | return -EINVAL; | |
257 | ||
258 | /* copy it */ | |
259 | result->old_chunk = le64_to_cpu(e->old_chunk); | |
260 | result->new_chunk = le64_to_cpu(e->new_chunk); | |
261 | ||
262 | return 0; | |
263 | } | |
264 | ||
265 | static int write_exception(struct pstore *ps, | |
266 | uint32_t index, struct disk_exception *de) | |
267 | { | |
268 | struct disk_exception *e; | |
269 | ||
270 | e = get_exception(ps, index); | |
271 | if (!e) | |
272 | return -EINVAL; | |
273 | ||
274 | /* copy it */ | |
275 | e->old_chunk = cpu_to_le64(de->old_chunk); | |
276 | e->new_chunk = cpu_to_le64(de->new_chunk); | |
277 | ||
278 | return 0; | |
279 | } | |
280 | ||
281 | /* | |
282 | * Registers the exceptions that are present in the current area. | |
283 | * 'full' is filled in to indicate if the area has been | |
284 | * filled. | |
285 | */ | |
286 | static int insert_exceptions(struct pstore *ps, int *full) | |
287 | { | |
288 | int r; | |
289 | unsigned int i; | |
290 | struct disk_exception de; | |
291 | ||
292 | /* presume the area is full */ | |
293 | *full = 1; | |
294 | ||
295 | for (i = 0; i < ps->exceptions_per_area; i++) { | |
296 | r = read_exception(ps, i, &de); | |
297 | ||
298 | if (r) | |
299 | return r; | |
300 | ||
301 | /* | |
302 | * If the new_chunk is pointing at the start of | |
303 | * the COW device, where the first metadata area | |
304 | * is we know that we've hit the end of the | |
305 | * exceptions. Therefore the area is not full. | |
306 | */ | |
307 | if (de.new_chunk == 0LL) { | |
308 | ps->current_committed = i; | |
309 | *full = 0; | |
310 | break; | |
311 | } | |
312 | ||
313 | /* | |
314 | * Keep track of the start of the free chunks. | |
315 | */ | |
316 | if (ps->next_free <= de.new_chunk) | |
317 | ps->next_free = de.new_chunk + 1; | |
318 | ||
319 | /* | |
320 | * Otherwise we add the exception to the snapshot. | |
321 | */ | |
322 | r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); | |
323 | if (r) | |
324 | return r; | |
325 | } | |
326 | ||
327 | return 0; | |
328 | } | |
329 | ||
330 | static int read_exceptions(struct pstore *ps) | |
331 | { | |
332 | uint32_t area; | |
333 | int r, full = 1; | |
334 | ||
335 | /* | |
336 | * Keeping reading chunks and inserting exceptions until | |
337 | * we find a partially full area. | |
338 | */ | |
339 | for (area = 0; full; area++) { | |
340 | r = area_io(ps, area, READ); | |
341 | if (r) | |
342 | return r; | |
343 | ||
344 | r = insert_exceptions(ps, &full); | |
345 | if (r) | |
346 | return r; | |
347 | } | |
348 | ||
349 | return 0; | |
350 | } | |
351 | ||
352 | static inline struct pstore *get_info(struct exception_store *store) | |
353 | { | |
354 | return (struct pstore *) store->context; | |
355 | } | |
356 | ||
357 | static void persistent_fraction_full(struct exception_store *store, | |
358 | sector_t *numerator, sector_t *denominator) | |
359 | { | |
360 | *numerator = get_info(store)->next_free * store->snap->chunk_size; | |
361 | *denominator = get_dev_size(store->snap->cow->bdev); | |
362 | } | |
363 | ||
364 | static void persistent_destroy(struct exception_store *store) | |
365 | { | |
366 | struct pstore *ps = get_info(store); | |
367 | ||
368 | dm_io_put(sectors_to_pages(ps->chunk_size)); | |
369 | vfree(ps->callbacks); | |
370 | free_area(ps); | |
371 | kfree(ps); | |
372 | } | |
373 | ||
374 | static int persistent_read_metadata(struct exception_store *store) | |
375 | { | |
376 | int r, new_snapshot; | |
377 | struct pstore *ps = get_info(store); | |
378 | ||
379 | /* | |
380 | * Read the snapshot header. | |
381 | */ | |
382 | r = read_header(ps, &new_snapshot); | |
383 | if (r) | |
384 | return r; | |
385 | ||
386 | /* | |
387 | * Do we need to setup a new snapshot ? | |
388 | */ | |
389 | if (new_snapshot) { | |
390 | r = write_header(ps); | |
391 | if (r) { | |
392 | DMWARN("write_header failed"); | |
393 | return r; | |
394 | } | |
395 | ||
396 | r = zero_area(ps, 0); | |
397 | if (r) { | |
398 | DMWARN("zero_area(0) failed"); | |
399 | return r; | |
400 | } | |
401 | ||
402 | } else { | |
403 | /* | |
404 | * Sanity checks. | |
405 | */ | |
406 | if (!ps->valid) { | |
407 | DMWARN("snapshot is marked invalid"); | |
408 | return -EINVAL; | |
409 | } | |
410 | ||
411 | if (ps->version != SNAPSHOT_DISK_VERSION) { | |
412 | DMWARN("unable to handle snapshot disk version %d", | |
413 | ps->version); | |
414 | return -EINVAL; | |
415 | } | |
416 | ||
417 | /* | |
418 | * Read the metadata. | |
419 | */ | |
420 | r = read_exceptions(ps); | |
421 | if (r) | |
422 | return r; | |
423 | } | |
424 | ||
425 | return 0; | |
426 | } | |
427 | ||
428 | static int persistent_prepare(struct exception_store *store, | |
429 | struct exception *e) | |
430 | { | |
431 | struct pstore *ps = get_info(store); | |
432 | uint32_t stride; | |
433 | sector_t size = get_dev_size(store->snap->cow->bdev); | |
434 | ||
435 | /* Is there enough room ? */ | |
436 | if (size < ((ps->next_free + 1) * store->snap->chunk_size)) | |
437 | return -ENOSPC; | |
438 | ||
439 | e->new_chunk = ps->next_free; | |
440 | ||
441 | /* | |
442 | * Move onto the next free pending, making sure to take | |
443 | * into account the location of the metadata chunks. | |
444 | */ | |
445 | stride = (ps->exceptions_per_area + 1); | |
446 | if ((++ps->next_free % stride) == 1) | |
447 | ps->next_free++; | |
448 | ||
449 | atomic_inc(&ps->pending_count); | |
450 | return 0; | |
451 | } | |
452 | ||
453 | static void persistent_commit(struct exception_store *store, | |
454 | struct exception *e, | |
455 | void (*callback) (void *, int success), | |
456 | void *callback_context) | |
457 | { | |
458 | int r; | |
459 | unsigned int i; | |
460 | struct pstore *ps = get_info(store); | |
461 | struct disk_exception de; | |
462 | struct commit_callback *cb; | |
463 | ||
464 | de.old_chunk = e->old_chunk; | |
465 | de.new_chunk = e->new_chunk; | |
466 | write_exception(ps, ps->current_committed++, &de); | |
467 | ||
468 | /* | |
469 | * Add the callback to the back of the array. This code | |
470 | * is the only place where the callback array is | |
471 | * manipulated, and we know that it will never be called | |
472 | * multiple times concurrently. | |
473 | */ | |
474 | cb = ps->callbacks + ps->callback_count++; | |
475 | cb->callback = callback; | |
476 | cb->context = callback_context; | |
477 | ||
478 | /* | |
479 | * If there are no more exceptions in flight, or we have | |
480 | * filled this metadata area we commit the exceptions to | |
481 | * disk. | |
482 | */ | |
483 | if (atomic_dec_and_test(&ps->pending_count) || | |
484 | (ps->current_committed == ps->exceptions_per_area)) { | |
485 | r = area_io(ps, ps->current_area, WRITE); | |
486 | if (r) | |
487 | ps->valid = 0; | |
488 | ||
489 | for (i = 0; i < ps->callback_count; i++) { | |
490 | cb = ps->callbacks + i; | |
491 | cb->callback(cb->context, r == 0 ? 1 : 0); | |
492 | } | |
493 | ||
494 | ps->callback_count = 0; | |
495 | } | |
496 | ||
497 | /* | |
498 | * Have we completely filled the current area ? | |
499 | */ | |
500 | if (ps->current_committed == ps->exceptions_per_area) { | |
501 | ps->current_committed = 0; | |
502 | r = zero_area(ps, ps->current_area + 1); | |
503 | if (r) | |
504 | ps->valid = 0; | |
505 | } | |
506 | } | |
507 | ||
508 | static void persistent_drop(struct exception_store *store) | |
509 | { | |
510 | struct pstore *ps = get_info(store); | |
511 | ||
512 | ps->valid = 0; | |
513 | if (write_header(ps)) | |
514 | DMWARN("write header failed"); | |
515 | } | |
516 | ||
517 | int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) | |
518 | { | |
519 | int r; | |
520 | struct pstore *ps; | |
521 | ||
522 | r = dm_io_get(sectors_to_pages(chunk_size)); | |
523 | if (r) | |
524 | return r; | |
525 | ||
526 | /* allocate the pstore */ | |
527 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | |
528 | if (!ps) { | |
529 | r = -ENOMEM; | |
530 | goto bad; | |
531 | } | |
532 | ||
533 | ps->snap = store->snap; | |
534 | ps->valid = 1; | |
535 | ps->version = SNAPSHOT_DISK_VERSION; | |
536 | ps->chunk_size = chunk_size; | |
537 | ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / | |
538 | sizeof(struct disk_exception); | |
539 | ps->next_free = 2; /* skipping the header and first area */ | |
540 | ps->current_committed = 0; | |
541 | ||
542 | r = alloc_area(ps); | |
543 | if (r) | |
544 | goto bad; | |
545 | ||
546 | /* | |
547 | * Allocate space for all the callbacks. | |
548 | */ | |
549 | ps->callback_count = 0; | |
550 | atomic_set(&ps->pending_count, 0); | |
551 | ps->callbacks = dm_vcalloc(ps->exceptions_per_area, | |
552 | sizeof(*ps->callbacks)); | |
553 | ||
554 | if (!ps->callbacks) { | |
555 | r = -ENOMEM; | |
556 | goto bad; | |
557 | } | |
558 | ||
559 | store->destroy = persistent_destroy; | |
560 | store->read_metadata = persistent_read_metadata; | |
561 | store->prepare_exception = persistent_prepare; | |
562 | store->commit_exception = persistent_commit; | |
563 | store->drop_snapshot = persistent_drop; | |
564 | store->fraction_full = persistent_fraction_full; | |
565 | store->context = ps; | |
566 | ||
567 | return 0; | |
568 | ||
569 | bad: | |
570 | dm_io_put(sectors_to_pages(chunk_size)); | |
571 | if (ps) { | |
572 | if (ps->area) | |
573 | free_area(ps); | |
574 | ||
575 | kfree(ps); | |
576 | } | |
577 | return r; | |
578 | } | |
579 | ||
580 | /*----------------------------------------------------------------- | |
581 | * Implementation of the store for non-persistent snapshots. | |
582 | *---------------------------------------------------------------*/ | |
583 | struct transient_c { | |
584 | sector_t next_free; | |
585 | }; | |
586 | ||
587 | static void transient_destroy(struct exception_store *store) | |
588 | { | |
589 | kfree(store->context); | |
590 | } | |
591 | ||
/*
 * A transient store keeps nothing on disk, so there is no metadata to
 * load; always reports success.
 */
static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}
596 | ||
597 | static int transient_prepare(struct exception_store *store, struct exception *e) | |
598 | { | |
599 | struct transient_c *tc = (struct transient_c *) store->context; | |
600 | sector_t size = get_dev_size(store->snap->cow->bdev); | |
601 | ||
602 | if (size < (tc->next_free + store->snap->chunk_size)) | |
603 | return -1; | |
604 | ||
605 | e->new_chunk = sector_to_chunk(store->snap, tc->next_free); | |
606 | tc->next_free += store->snap->chunk_size; | |
607 | ||
608 | return 0; | |
609 | } | |
610 | ||
/*
 * Nothing has to be written for a transient store, so the commit
 * completes at once: 'callback' is invoked with success == 1.
 */
static void transient_commit(struct exception_store *store,
			     struct exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	const int success = 1;

	callback(callback_context, success);
}
619 | ||
620 | static void transient_fraction_full(struct exception_store *store, | |
621 | sector_t *numerator, sector_t *denominator) | |
622 | { | |
623 | *numerator = ((struct transient_c *) store->context)->next_free; | |
624 | *denominator = get_dev_size(store->snap->cow->bdev); | |
625 | } | |
626 | ||
627 | int dm_create_transient(struct exception_store *store, | |
628 | struct dm_snapshot *s, int blocksize) | |
629 | { | |
630 | struct transient_c *tc; | |
631 | ||
632 | memset(store, 0, sizeof(*store)); | |
633 | store->destroy = transient_destroy; | |
634 | store->read_metadata = transient_read_metadata; | |
635 | store->prepare_exception = transient_prepare; | |
636 | store->commit_exception = transient_commit; | |
637 | store->fraction_full = transient_fraction_full; | |
638 | store->snap = s; | |
639 | ||
640 | tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); | |
641 | if (!tc) | |
642 | return -ENOMEM; | |
643 | ||
644 | tc->next_free = 0; | |
645 | store->context = tc; | |
646 | ||
647 | return 0; | |
648 | } |