[mirror_ubuntu-zesty-kernel.git] / drivers / md / dm-exception-store.c

/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

/*-----------------------------------------------------------------
 * Persistent snapshots, by persistent we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk aligned areas
 * of the COW store.  It makes sense therefore, to store the
 * metadata in chunk size blocks.
 *
 * There is no backward or forward compatibility implemented,
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

/*
 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

/*
 * Header kept in the first chunk (chunk 0) of the COW device.
 * Fields are stored little-endian; read_header() and write_header()
 * convert them with le32_to_cpu()/cpu_to_le32().
 */
struct disk_header {
	/* SNAP_MAGIC once initialised; read_header() treats 0 as a new snapshot */
	uint32_t magic;

	/*
	 * Is this snapshot valid.  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version. no backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

/*
 * One exception record as stored in a metadata area (little-endian on
 * disk).  A record whose new_chunk is 0 marks the end of the list,
 * since chunk 0 of the COW device holds the header.
 */
struct disk_exception {
	uint64_t old_chunk;	/* chunk number the exception was recorded for */
	uint64_t new_chunk;	/* chunk number of the copy in the COW store */
};

/*
 * A completion callback queued by persistent_commit() until the
 * metadata area holding its exception has been written.
 */
struct commit_callback {
	/* invoked with 'context' and success = 1 if the area write worked, else 0 */
	void (*callback)(void *, int success);
	void *context;		/* opaque pointer handed back to 'callback' */
};

/*
 * The top level structure for a persistent exception store.
 * Allocated by dm_create_persistent() and reached through
 * exception_store->context (see get_info()).
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	/* metadata version: SNAPSHOT_DISK_VERSION, or the value read from disk */
	int version;
	/* non-zero while the snapshot is usable; cleared on I/O error or drop */
	int valid;
	/* chunk size in sectors */
	uint32_t chunk_size;
	/* number of struct disk_exception records that fit in one chunk */
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it wont hurt to have a
	 * whole chunks worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.  Updated by area_io() after successful I/O.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	/* exceptions prepared but not yet committed */
	atomic_t pending_count;
	/* number of entries currently queued in 'callbacks' */
	uint32_t callback_count;
	/* array of exceptions_per_area callbacks awaiting the next area write */
	struct commit_callback *callbacks;
};

/*
 * Convert a sector count into a number of whole pages, rounding down.
 * PAGE_SIZE >> 9 is the number of 512-byte units in one page.
 */
static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	unsigned int sectors_per_page = PAGE_SIZE >> 9;

	return sectors / sectors_per_page;
}

/*
 * Allocate the in-core buffer that holds one chunk of metadata.
 *
 * The buffer is ps->chunk_size sectors long and is stored in ps->area.
 * Returns 0 on success or -ENOMEM if vmalloc() fails (ps->area is then
 * NULL).
 */
static int alloc_area(struct pstore *ps)
{
	size_t bytes = ps->chunk_size << SECTOR_SHIFT;

	ps->area = vmalloc(bytes);

	return ps->area ? 0 : -ENOMEM;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
}

/*
 * Read or write a chunk aligned and sized block of data from a device.
 *
 * Transfers chunk number 'chunk' of the COW device to or from ps->area
 * using dm_io_sync_vm().  'rw' is passed straight through (callers use
 * READ or WRITE).  Returns whatever dm_io_sync_vm() returns; the error
 * bits it fills in are not examined here.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where;
	unsigned long bits;

	where.bdev = ps->snap->cow->bdev;

	/*
	 * Do the multiplication in the type of where.sector rather than
	 * in 32 bits: chunk_size * chunk can exceed 2^32 sectors on a
	 * large COW device and would otherwise wrap before being stored.
	 */
	where.sector = chunk;
	where.sector *= ps->chunk_size;
	where.count = ps->chunk_size;

	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}

/*
 * Read or write one metadata area through ps->area.
 *
 * Chunk 0 is the header and every metadata chunk is followed by
 * exceptions_per_area data chunks, so area N lives at chunk
 * 1 + N * (exceptions_per_area + 1).  ps->current_area is updated only
 * when the I/O succeeds.  Returns 0 or the error from chunk_io().
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	uint32_t stride = ps->exceptions_per_area + 1;
	uint32_t chunk = 1 + (stride * area);
	int r = chunk_io(ps, chunk, rw);

	if (!r)
		ps->current_area = area;

	return r;
}

/*
 * Clear the in-core metadata buffer and write it out as metadata area
 * 'area'.  Returns the result of area_io().
 */
static int zero_area(struct pstore *ps, uint32_t area)
{
	size_t bytes = ps->chunk_size << SECTOR_SHIFT;

	memset(ps->area, 0, bytes);

	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;

	r = chunk_io(ps, 0, READ);
	if (r)
		return r;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;

	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
		*new_snapshot = 0;
		ps->valid = le32_to_cpu(dh->valid);
		ps->version = le32_to_cpu(dh->version);
		ps->chunk_size = le32_to_cpu(dh->chunk_size);

	} else {
		DMWARN("Invalid/corrupt snapshot");
		r = -ENXIO;
	}

	return r;
}

/*
 * Build a header from the current pstore state and write it to chunk 0
 * of the COW device.
 *
 * Note that this reuses ps->area as scratch space, so whatever metadata
 * was held there is overwritten.  Returns the result of chunk_io().
 */
static int write_header(struct pstore *ps)
{
	struct disk_header *hdr = (struct disk_header *) ps->area;

	/* the rest of the chunk after the header is written as zeroes */
	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);

	hdr->chunk_size = cpu_to_le32(ps->chunk_size);
	hdr->version = cpu_to_le32(ps->version);
	hdr->valid = cpu_to_le32(ps->valid);
	hdr->magic = cpu_to_le32(SNAP_MAGIC);

	return chunk_io(ps, 0, WRITE);
}

/*
 * Access functions for the disk exceptions, these do the endian conversions.
 */
/*
 * Return a pointer to record 'index' inside the in-core metadata area,
 * or NULL if 'index' is outside the area.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	struct disk_exception *records = (struct disk_exception *) ps->area;

	return (index < ps->exceptions_per_area) ? &records[index] : NULL;
}

/*
 * Fetch record 'index' from the in-core metadata area into 'result',
 * converting from little-endian to CPU byte order.
 * Returns 0, or -EINVAL if 'index' is out of range.
 */
static int read_exception(struct pstore *ps,
			  uint32_t index, struct disk_exception *result)
{
	struct disk_exception *on_disk = get_exception(ps, index);

	if (on_disk == NULL)
		return -EINVAL;

	result->old_chunk = le64_to_cpu(on_disk->old_chunk);
	result->new_chunk = le64_to_cpu(on_disk->new_chunk);

	return 0;
}

/*
 * Store 'de' as record 'index' of the in-core metadata area,
 * converting from CPU byte order to little-endian.  Nothing is written
 * to disk here.  Returns 0, or -EINVAL if 'index' is out of range.
 */
static int write_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *de)
{
	struct disk_exception *on_disk = get_exception(ps, index);

	if (on_disk == NULL)
		return -EINVAL;

	on_disk->old_chunk = cpu_to_le64(de->old_chunk);
	on_disk->new_chunk = cpu_to_le64(de->new_chunk);

	return 0;
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate if the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		r = read_exception(ps, i, &de);

		if (r)
			return r;

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the first metadata area
		 * is we know that we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keeping reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Recover the persistent store state hung off store->context.
 */
static inline struct pstore *get_info(struct exception_store *store)
{
	struct pstore *ps = (struct pstore *) store->context;

	return ps;
}

/*
 * Report how much of the COW device is in use as a fraction:
 * '*numerator' is next_free chunks multiplied by the snapshot chunk
 * size, '*denominator' is the size of the COW device as given by
 * get_dev_size().
 */
static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	struct pstore *ps = get_info(store);

	*numerator = ps->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

/*
 * Tear down a persistent store: drop the dm_io reservation taken in
 * dm_create_persistent() and free the callback array, the metadata
 * buffer and the pstore itself.
 */
static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);
	unsigned int pages = sectors_to_pages(ps->chunk_size);

	dm_io_put(pages);

	vfree(ps->callbacks);
	free_area(ps);

	kfree(ps);
}

/*
 * Bring the store in line with what is on the COW device.
 *
 * A blank device (read_header() reports a new snapshot) gets a fresh
 * header and a zeroed first metadata area.  An existing snapshot is
 * checked for validity and version, then all of its exceptions are
 * loaded.
 *
 * Returns 0 on success, -EINVAL for an invalid snapshot or unsupported
 * version, or the error from the failing helper.
 */
static int persistent_read_metadata(struct exception_store *store)
{
	struct pstore *ps = get_info(store);
	int new_snapshot;
	int r;

	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	if (!new_snapshot) {
		/* existing snapshot: sanity check it before trusting it */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		return read_exceptions(ps);
	}

	/* brand new snapshot: lay down the header and an empty area 0 */
	r = write_header(ps);
	if (r) {
		DMWARN("write_header failed");
		return r;
	}

	r = zero_area(ps, 0);
	if (r)
		DMWARN("zero_area(0) failed");

	return r;
}

/*
 * Reserve the next free COW chunk for exception 'e'.
 *
 * On success e->new_chunk holds the reserved chunk, ps->next_free has
 * moved on (skipping any metadata chunk) and the pending count is
 * bumped until persistent_commit() is called for this exception.
 * Returns 0, or -ENOSPC when the COW device has no room left.
 */
static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	sector_t cow_sectors = get_dev_size(store->snap->cow->bdev);
	uint32_t stride = ps->exceptions_per_area + 1;

	/* would the chunk we are about to hand out run off the device? */
	if (((ps->next_free + 1) * store->snap->chunk_size) > cow_sectors)
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Advance to the following chunk.  Chunks whose number is
	 * 1 modulo the stride hold metadata areas, so hop over those.
	 */
	ps->next_free++;
	if ((ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

/*
 * Record a completed exception in the current metadata area and queue
 * its completion callback.
 *
 * The area is written to disk, and all queued callbacks are run, once
 * no prepared exceptions remain outstanding or the area has become
 * full.  Callbacks receive 1 if the write succeeded and 0 otherwise; a
 * failed write also marks the store invalid.  When the area is full a
 * zeroed successor area is written and becomes the current one.
 */
static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	struct pstore *ps = get_info(store);
	struct commit_callback *slot;
	struct disk_exception de;
	unsigned int n;
	int r;

	/* add the mapping to the in-core copy of the current area */
	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed, &de);
	ps->current_committed++;

	/*
	 * Append the callback to the array.  Nothing else touches the
	 * array, and this function is never run concurrently with
	 * itself, so no locking is needed.
	 */
	slot = &ps->callbacks[ps->callback_count];
	ps->callback_count++;
	slot->callback = callback;
	slot->context = callback_context;

	/*
	 * Flush the area when nothing else is in flight or when it
	 * has no free slots left.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		for (n = 0; n < ps->callback_count; n++) {
			slot = &ps->callbacks[n];
			slot->callback(slot->context, !r);
		}

		ps->callback_count = 0;
	}

	/* still room in this area: nothing more to do */
	if (ps->current_committed != ps->exceptions_per_area)
		return;

	/* area exhausted: start the next one with an empty table */
	ps->current_committed = 0;
	if (zero_area(ps, ps->current_area + 1))
		ps->valid = 0;
}

/*
 * Mark the snapshot invalid and write the header back out so the
 * invalid state is recorded on the COW device.  A failed write is
 * only logged.
 */
static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);
	int r;

	ps->valid = 0;

	r = write_header(ps);
	if (r)
		DMWARN("write header failed");
}

/*
 * Set up 'store' as a persistent exception store.
 *
 * 'chunk_size' is in sectors.  Reserves dm_io pages for one chunk,
 * allocates the pstore, its one-chunk metadata buffer and an array of
 * exceptions_per_area callback slots, then installs the persistent_*
 * methods in 'store'.  store->snap must already be set by the caller.
 *
 * Returns 0 on success; on failure everything acquired here is
 * released and the dm_io_get() error or -ENOMEM is returned.
 */
int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
{
	unsigned int pages = sectors_to_pages(chunk_size);
	struct pstore *ps;
	int r;

	r = dm_io_get(pages);
	if (r)
		return r;

	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps) {
		r = -ENOMEM;
		goto bad;
	}

	ps->snap = store->snap;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->valid = 1;
	ps->chunk_size = chunk_size;
	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
	    sizeof(struct disk_exception);
	/* chunk 0 is the header and chunk 1 the first metadata area */
	ps->next_free = 2;
	ps->current_committed = 0;

	r = alloc_area(ps);
	if (r)
		goto bad;

	/* one callback slot per exception that can share an area write */
	atomic_set(&ps->pending_count, 0);
	ps->callback_count = 0;
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks) {
		r = -ENOMEM;
		goto bad;
	}

	store->context = ps;
	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;

	return 0;

bad:
	dm_io_put(pages);
	if (!ps)
		return r;

	if (ps->area)
		free_area(ps);
	kfree(ps);

	return r;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
/*
 * State of a transient store, kept in exception_store->context.
 */
struct transient_c {
	/* sector offset on the COW device of the next unused chunk; starts at 0 */
	sector_t next_free;
};

/*
 * Free the transient store state allocated by dm_create_transient().
 */
static void transient_destroy(struct exception_store *store)
{
	struct transient_c *tc = (struct transient_c *) store->context;

	kfree(tc);
}

/*
 * A transient store does nothing at load time.
 * Always returns 0.
 */
static int transient_read_metadata(struct exception_store *store)
{
	/* no metadata to read for this store type */
	return 0;
}

static int transient_prepare(struct exception_store *store, struct exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -1;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

/*
 * Commit an exception to a transient store.  There is no metadata to
 * write, so the callback is invoked immediately with success = 1.
 * 'store' and 'e' are unused.
 */
static void transient_commit(struct exception_store *store,
		      struct exception *e,
		      void (*callback) (void *, int success),
		      void *callback_context)
{
	const int success = 1;

	callback(callback_context, success);
}

/*
 * Report COW usage as a fraction: '*numerator' is the sector offset of
 * the next free chunk, '*denominator' the size of the COW device as
 * given by get_dev_size().
 */
static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	struct transient_c *tc = (struct transient_c *) store->context;

	*numerator = tc->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

/*
 * Set up 'store' as a transient (non-persistent) exception store for
 * snapshot 's'.
 *
 * 'store' is zeroed first, so any method not installed here (for
 * example drop_snapshot) is left zeroed.  'blocksize' is not used.
 *
 * Returns 0, or -ENOMEM if the store state cannot be allocated; in
 * that case the methods are already installed but store->context is
 * still zero.
 */
int dm_create_transient(struct exception_store *store,
			struct dm_snapshot *s, int blocksize)
{
	struct transient_c *tc;

	memset(store, 0, sizeof(*store));
	store->snap = s;
	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->fraction_full = transient_fraction_full;

	tc = kmalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	/* allocation starts at the very beginning of the COW device */
	tc->next_free = 0;
	store->context = tc;

	return 0;
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* dm-snapshot.c
	3	*
	4	* Copyright (C) 2001-2002 Sistina Software (UK) Limited.
	5	*
	6	* This file is released under the GPL.
	7	*/
	8
	9	#include "dm.h"
	10	#include "dm-snap.h"
	11	#include "dm-io.h"
	12	#include "kcopyd.h"
	13
	14	#include <linux/mm.h>
	15	#include <linux/pagemap.h>
	16	#include <linux/vmalloc.h>
	17	#include <linux/slab.h>
	18
	19	/*-----------------------------------------------------------------
	20	* Persistent snapshots, by persistent we mean that the snapshot
	21	* will survive a reboot.
	22	---------------------------------------------------------------/
	23
	24	/*
	25	* We need to store a record of which parts of the origin have
	26	* been copied to the snapshot device. The snapshot code
	27	* requires that we copy exception chunks to chunk aligned areas
	28	* of the COW store. It makes sense therefore, to store the
	29	* metadata in chunk size blocks.
	30	*
	31	* There is no backward or forward compatibility implemented,
	32	* snapshots with different disk versions than the kernel will
	33	* not be usable. It is expected that "lvcreate" will blank out
	34	* the start of a fresh COW device before calling the snapshot
	35	* constructor.
	36	*
	37	* The first chunk of the COW device just contains the header.
	38	* After this there is a chunk filled with exception metadata,
	39	* followed by as many exception chunks as can fit in the
	40	* metadata areas.
	41	*
	42	* All on disk structures are in little-endian format. The end
	43	* of the exceptions info is indicated by an exception with a
	44	* new_chunk of 0, which is invalid since it would point to the
	45	* header chunk.
	46	*/
	47
	48	/*
	49	* Magic for persistent snapshots: "SnAp" - Feeble isn't it.
	50	*/
	51	#define SNAP_MAGIC 0x70416e53
	52
	53	/*
	54	* The on-disk version of the metadata.
	55	*/
	56	#define SNAPSHOT_DISK_VERSION 1
	57
	58	struct disk_header {
	59	uint32_t magic;
	60
	61	/*
	62	* Is this snapshot valid. There is no way of recovering
	63	* an invalid snapshot.
	64	*/
65	uint32_t valid;
66
67	/*
68	* Simple, incrementing version. no backward
69	* compatibility.
70	*/
71	uint32_t version;
72
73	/* In sectors */
74	uint32_t chunk_size;
75	};
76
77	struct disk_exception {
78	uint64_t old_chunk;
79	uint64_t new_chunk;
80	};
81
82	struct commit_callback {
83	void (callback)(void , int success);
84	void *context;
85	};
86
87	/*
88	* The top level structure for a persistent exception store.
89	*/
90	struct pstore {
91	struct dm_snapshot snap; / up pointer to my snapshot */
92	int version;
93	int valid;
94	uint32_t chunk_size;
95	uint32_t exceptions_per_area;
96
97	/*
98	* Now that we have an asynchronous kcopyd there is no
99	* need for large chunk sizes, so it wont hurt to have a
100	* whole chunks worth of metadata in memory at once.
101	*/
102	void *area;
103
104	/*
105	* Used to keep track of which metadata area the data in
106	* 'chunk' refers to.
107	*/
108	uint32_t current_area;
109
110	/*
111	* The next free chunk for an exception.
112	*/
113	uint32_t next_free;
114
115	/*
116	* The index of next free exception in the current
117	* metadata area.
118	*/
119	uint32_t current_committed;
120
121	atomic_t pending_count;
122	uint32_t callback_count;
123	struct commit_callback *callbacks;
124	};
125
126	static inline unsigned int sectors_to_pages(unsigned int sectors)
127	{
128	return sectors / (PAGE_SIZE >> 9);
129	}
130
131	static int alloc_area(struct pstore *ps)
132	{
133	int r = -ENOMEM;
134	size_t len;
135
136	len = ps->chunk_size << SECTOR_SHIFT;
137
138	/*
139	* Allocate the chunk_size block of memory that will hold
140	* a single metadata area.
141	*/
142	ps->area = vmalloc(len);
143	if (!ps->area)
144	return r;
145
146	return 0;
147	}
148
149	static void free_area(struct pstore *ps)
150	{
151	vfree(ps->area);
152	}
153
154	/*
155	* Read or write a chunk aligned and sized block of data from a device.
156	*/
157	static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
158	{
159	struct io_region where;
160	unsigned long bits;
161
162	where.bdev = ps->snap->cow->bdev;
163	where.sector = ps->chunk_size * chunk;
164	where.count = ps->chunk_size;
165
166	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
167	}
168
169	/*
170	* Read or write a metadata area. Remembering to skip the first
171	* chunk which holds the header.
172	*/
173	static int area_io(struct pstore *ps, uint32_t area, int rw)
174	{
175	int r;
176	uint32_t chunk;
177
178	/* convert a metadata area index to a chunk index */
179	chunk = 1 + ((ps->exceptions_per_area + 1) * area);
180
181	r = chunk_io(ps, chunk, rw);
182	if (r)
183	return r;
184
185	ps->current_area = area;
186	return 0;
187	}
188
189	static int zero_area(struct pstore *ps, uint32_t area)
190	{
191	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
192	return area_io(ps, area, WRITE);
193	}
194
195	static int read_header(struct pstore ps, int new_snapshot)
196	{
197	int r;
198	struct disk_header *dh;
199
200	r = chunk_io(ps, 0, READ);
201	if (r)
202	return r;
203
204	dh = (struct disk_header *) ps->area;
205
206	if (le32_to_cpu(dh->magic) == 0) {
207	*new_snapshot = 1;
208
209	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
210	*new_snapshot = 0;
211	ps->valid = le32_to_cpu(dh->valid);
212	ps->version = le32_to_cpu(dh->version);
213	ps->chunk_size = le32_to_cpu(dh->chunk_size);
214
215	} else {
216	DMWARN("Invalid/corrupt snapshot");
217	r = -ENXIO;
218	}
219
220	return r;
221	}
222
223	static int write_header(struct pstore *ps)
224	{
225	struct disk_header *dh;
226
227	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
228
229	dh = (struct disk_header *) ps->area;
230	dh->magic = cpu_to_le32(SNAP_MAGIC);
231	dh->valid = cpu_to_le32(ps->valid);
232	dh->version = cpu_to_le32(ps->version);
233	dh->chunk_size = cpu_to_le32(ps->chunk_size);
234
235	return chunk_io(ps, 0, WRITE);
236	}
237
238	/*
239	* Access functions for the disk exceptions, these do the endian conversions.
240	*/
241	static struct disk_exception get_exception(struct pstore ps, uint32_t index)
242	{
243	if (index >= ps->exceptions_per_area)
244	return NULL;
245
246	return ((struct disk_exception *) ps->area) + index;
247	}
248
249	static int read_exception(struct pstore *ps,
250	uint32_t index, struct disk_exception *result)
251	{
252	struct disk_exception *e;
253
254	e = get_exception(ps, index);
255	if (!e)
256	return -EINVAL;
257
258	/* copy it */
259	result->old_chunk = le64_to_cpu(e->old_chunk);
260	result->new_chunk = le64_to_cpu(e->new_chunk);
261
262	return 0;
263	}
264
265	static int write_exception(struct pstore *ps,
266	uint32_t index, struct disk_exception *de)
267	{
268	struct disk_exception *e;
269
270	e = get_exception(ps, index);
271	if (!e)
272	return -EINVAL;
273
274	/* copy it */
275	e->old_chunk = cpu_to_le64(de->old_chunk);
276	e->new_chunk = cpu_to_le64(de->new_chunk);
277
278	return 0;
279	}
280
281	/*
282	* Registers the exceptions that are present in the current area.
283	* 'full' is filled in to indicate if the area has been
284	* filled.
285	*/
286	static int insert_exceptions(struct pstore ps, int full)
287	{
288	int r;
289	unsigned int i;
290	struct disk_exception de;
291
292	/* presume the area is full */
293	*full = 1;
294
295	for (i = 0; i < ps->exceptions_per_area; i++) {
296	r = read_exception(ps, i, &de);
297
298	if (r)
299	return r;
300
301	/*
302	* If the new_chunk is pointing at the start of
303	* the COW device, where the first metadata area
304	* is we know that we've hit the end of the
305	* exceptions. Therefore the area is not full.
306	*/
307	if (de.new_chunk == 0LL) {
308	ps->current_committed = i;
309	*full = 0;
310	break;
311	}
312
313	/*
314	* Keep track of the start of the free chunks.
315	*/
316	if (ps->next_free <= de.new_chunk)
317	ps->next_free = de.new_chunk + 1;
318
319	/*
320	* Otherwise we add the exception to the snapshot.
321	*/
322	r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
323	if (r)
324	return r;
325	}
326
327	return 0;
328	}
329
330	static int read_exceptions(struct pstore *ps)
331	{
332	uint32_t area;
333	int r, full = 1;
334
335	/*
336	* Keeping reading chunks and inserting exceptions until
337	* we find a partially full area.
338	*/
339	for (area = 0; full; area++) {
340	r = area_io(ps, area, READ);
341	if (r)
342	return r;
343
344	r = insert_exceptions(ps, &full);
345	if (r)
346	return r;
347	}
348
349	return 0;
350	}
351
352	static inline struct pstore get_info(struct exception_store store)
353	{
354	return (struct pstore *) store->context;
355	}
356
357	static void persistent_fraction_full(struct exception_store *store,
358	sector_t numerator, sector_t denominator)
359	{
360	numerator = get_info(store)->next_free store->snap->chunk_size;
361	*denominator = get_dev_size(store->snap->cow->bdev);
362	}
363
364	static void persistent_destroy(struct exception_store *store)
365	{
366	struct pstore *ps = get_info(store);
367
368	dm_io_put(sectors_to_pages(ps->chunk_size));
369	vfree(ps->callbacks);
370	free_area(ps);
371	kfree(ps);
372	}
373
374	static int persistent_read_metadata(struct exception_store *store)
375	{
376	int r, new_snapshot;
377	struct pstore *ps = get_info(store);
378
379	/*
380	* Read the snapshot header.
381	*/
382	r = read_header(ps, &new_snapshot);
383	if (r)
384	return r;
385
386	/*
387	* Do we need to setup a new snapshot ?
388	*/
389	if (new_snapshot) {
390	r = write_header(ps);
391	if (r) {
392	DMWARN("write_header failed");
393	return r;
394	}
395
396	r = zero_area(ps, 0);
397	if (r) {
398	DMWARN("zero_area(0) failed");
399	return r;
400	}
401
402	} else {
403	/*
404	* Sanity checks.
405	*/
406	if (!ps->valid) {
407	DMWARN("snapshot is marked invalid");
408	return -EINVAL;
409	}
410
411	if (ps->version != SNAPSHOT_DISK_VERSION) {
412	DMWARN("unable to handle snapshot disk version %d",
413	ps->version);
414	return -EINVAL;
415	}
416
417	/*
418	* Read the metadata.
419	*/
420	r = read_exceptions(ps);
421	if (r)
422	return r;
423	}
424
425	return 0;
426	}
427
428	static int persistent_prepare(struct exception_store *store,
429	struct exception *e)
430	{
431	struct pstore *ps = get_info(store);
432	uint32_t stride;
433	sector_t size = get_dev_size(store->snap->cow->bdev);
434
435	/* Is there enough room ? */
436	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
437	return -ENOSPC;
438
439	e->new_chunk = ps->next_free;
440
441	/*
442	* Move onto the next free pending, making sure to take
443	* into account the location of the metadata chunks.
444	*/
445	stride = (ps->exceptions_per_area + 1);
446	if ((++ps->next_free % stride) == 1)
447	ps->next_free++;
448
449	atomic_inc(&ps->pending_count);
450	return 0;
451	}
452
453	static void persistent_commit(struct exception_store *store,
454	struct exception *e,
455	void (callback) (void , int success),
456	void *callback_context)
457	{
458	int r;
459	unsigned int i;
460	struct pstore *ps = get_info(store);
461	struct disk_exception de;
462	struct commit_callback *cb;
463
464	de.old_chunk = e->old_chunk;
465	de.new_chunk = e->new_chunk;
466	write_exception(ps, ps->current_committed++, &de);
467
468	/*
469	* Add the callback to the back of the array. This code
470	* is the only place where the callback array is
471	* manipulated, and we know that it will never be called
472	* multiple times concurrently.
473	*/
474	cb = ps->callbacks + ps->callback_count++;
475	cb->callback = callback;
476	cb->context = callback_context;
477
478	/*
479	* If there are no more exceptions in flight, or we have
480	* filled this metadata area we commit the exceptions to
481	* disk.
482	*/
483	if (atomic_dec_and_test(&ps->pending_count) \|\|
484	(ps->current_committed == ps->exceptions_per_area)) {
485	r = area_io(ps, ps->current_area, WRITE);
486	if (r)
487	ps->valid = 0;
488
489	for (i = 0; i < ps->callback_count; i++) {
490	cb = ps->callbacks + i;
491	cb->callback(cb->context, r == 0 ? 1 : 0);
492	}
493
494	ps->callback_count = 0;
495	}
496
497	/*
498	* Have we completely filled the current area ?
499	*/
500	if (ps->current_committed == ps->exceptions_per_area) {
501	ps->current_committed = 0;
502	r = zero_area(ps, ps->current_area + 1);
503	if (r)
504	ps->valid = 0;
505	}
506	}
507
508	static void persistent_drop(struct exception_store *store)
509	{
510	struct pstore *ps = get_info(store);
511
512	ps->valid = 0;
513	if (write_header(ps))
514	DMWARN("write header failed");
515	}
516
517	int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
518	{
519	int r;
520	struct pstore *ps;
521
522	r = dm_io_get(sectors_to_pages(chunk_size));
523	if (r)
524	return r;
525
526	/* allocate the pstore */
527	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
528	if (!ps) {
529	r = -ENOMEM;
530	goto bad;
531	}
532
533	ps->snap = store->snap;
534	ps->valid = 1;
535	ps->version = SNAPSHOT_DISK_VERSION;
536	ps->chunk_size = chunk_size;
537	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
538	sizeof(struct disk_exception);
539	ps->next_free = 2; /* skipping the header and first area */
540	ps->current_committed = 0;
541
542	r = alloc_area(ps);
543	if (r)
544	goto bad;
545
546	/*
547	* Allocate space for all the callbacks.
548	*/
549	ps->callback_count = 0;
550	atomic_set(&ps->pending_count, 0);
551	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
552	sizeof(*ps->callbacks));
553
554	if (!ps->callbacks) {
555	r = -ENOMEM;
556	goto bad;
557	}
558
559	store->destroy = persistent_destroy;
560	store->read_metadata = persistent_read_metadata;
561	store->prepare_exception = persistent_prepare;
562	store->commit_exception = persistent_commit;
563	store->drop_snapshot = persistent_drop;
564	store->fraction_full = persistent_fraction_full;
565	store->context = ps;
566
567	return 0;
568
569	bad:
570	dm_io_put(sectors_to_pages(chunk_size));
571	if (ps) {
572	if (ps->area)
573	free_area(ps);
574
575	kfree(ps);
576	}
577	return r;
578	}
579
580	/*-----------------------------------------------------------------
581	* Implementation of the store for non-persistent snapshots.
582	---------------------------------------------------------------/
583	struct transient_c {
584	sector_t next_free;
585	};
586
587	static void transient_destroy(struct exception_store *store)
588	{
589	kfree(store->context);
590	}
591
592	static int transient_read_metadata(struct exception_store *store)
593	{
594	return 0;
595	}
596
597	static int transient_prepare(struct exception_store store, struct exception e)
598	{
599	struct transient_c tc = (struct transient_c ) store->context;
600	sector_t size = get_dev_size(store->snap->cow->bdev);
601
602	if (size < (tc->next_free + store->snap->chunk_size))
603	return -1;
604
605	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
606	tc->next_free += store->snap->chunk_size;
607
608	return 0;
609	}
610
611	static void transient_commit(struct exception_store *store,
612	struct exception *e,
613	void (callback) (void , int success),
614	void *callback_context)
615	{
616	/* Just succeed */
617	callback(callback_context, 1);
618	}
619
620	static void transient_fraction_full(struct exception_store *store,
621	sector_t numerator, sector_t denominator)
622	{
623	numerator = ((struct transient_c ) store->context)->next_free;
624	*denominator = get_dev_size(store->snap->cow->bdev);
625	}
626
627	int dm_create_transient(struct exception_store *store,
628	struct dm_snapshot *s, int blocksize)
629	{
630	struct transient_c *tc;
631
632	memset(store, 0, sizeof(*store));
633	store->destroy = transient_destroy;
634	store->read_metadata = transient_read_metadata;
635	store->prepare_exception = transient_prepare;
636	store->commit_exception = transient_commit;
637	store->fraction_full = transient_fraction_full;
638	store->snap = s;
639
640	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
641	if (!tc)
642	return -ENOMEM;
643
644	tc->next_free = 0;
645	store->context = tc;
646
647	return 0;
648	}