/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}
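
/*
 * For illustration (example values, not from the original source): given
 * a node whose keys are { 10, 20, 30 }:
 *
 *	lower_bound(n, 20) == 1    (exact match)
 *	lower_bound(n, 25) == 1    (largest key < 25)
 *	lower_bound(n, 5)  == -1   (no key <= 5)
 *	upper_bound(n, 25) == 2    (smallest key > 25)
 *	upper_bound(n, 35) == 3    (== nr_entries, past the end)
 *
 * Both bounds can therefore return an out of range index, so callers
 * must check the result against 0 and nr_entries before using it.
 */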

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3; /* rounds down */

	return 3 * n;
}
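
/*
 * Worked example (the 4096 byte block and 32 byte node_header are
 * assumed here purely for the sake of the arithmetic): with 8 byte
 * values, elt_size = 8 + 8 = 16, so total = (4096 - 32) / 16 = 254,
 * n = 254 / 3 = 84, and the node holds at most 3 * 84 = 252 entries.
 */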

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
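
/*
 * A minimal usage sketch, assuming a transaction manager 'tm' is already
 * open and error handling is elided:
 *
 *	struct dm_btree_info info;
 *	dm_block_t root;
 *	int r;
 *
 *	memset(&info, 0, sizeof(info));
 *	info.tm = tm;
 *	info.levels = 1;
 *	info.value_type.size = sizeof(__le64);
 *	r = dm_btree_empty(&info, &root);
 *
 * 'root' can then be handed to dm_btree_insert()/dm_btree_lookup().
 */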

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_NOIO);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
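
/*
 * Note: dm_btree_del() walks the whole tree using the explicit stack
 * above, decrementing reference counts as it goes; subtrees that are
 * still shared are not descended into.  A typical call (error handling
 * elided):
 *
 *	r = dm_btree_del(&info, root);
 *
 * After a successful return the tree rooted at 'root' has been released
 * and 'root' must not be used again.
 */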

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
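
/*
 * A lookup sketch for a one level tree (setup as in the dm_btree_empty()
 * example above; error handling elided):
 *
 *	uint64_t key = 123;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *	if (r == -ENODATA)
 *		... key not present ...
 *
 * 'keys' is an array with one entry per btree level, which is why a
 * single key is passed by address here.
 */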

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
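
/*
 * dm_btree_lookup_next() returns, via *rkey and value_le, the entry with
 * the smallest key >= the one asked for, which makes it cheap to skip
 * over unmapped ranges.  Sketch (error handling elided):
 *
 *	uint64_t key = 100, rkey;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_lookup_next(&info, root, &key, &rkey, &value_le);
 *
 * -ENODATA means there is no mapped key at or above 'key'.
 */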

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *       +--------+
 *       | Parent |
 *       +--------+
 *          |
 *          v
 *     +----------+
 *     | A ++++++ |
 *     +----------+
 *
 *
 * After:
 *            +--------+
 *            | Parent |
 *            +--------+
 *              |    |
 *              v    +------+
 *       +---------+        |
 *       | A* +++  |        v
 *       +---------+    +-------+
 *                      | B +++ |
 *                      +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *     +----------+
 *     | A ++++++ |
 *     +----------+
 *
 *
 * After:
 *     +------------+
 *     | A (shadow) |
 *     +------------+
 *         |    |
 *   +-----+    +-----+
 *   |                |
 *   v                v
 * +-------+      +-------+
 * | B +++ |      | C +++ |
 * +-------+      +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * rejig the spine.  This is ugly, since it knows too
	 * much about the spine
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
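
/*
 * Insertion sketch (error handling elided).  The btree is copy on write,
 * so a new root is returned rather than the tree being updated in place:
 *
 *	uint64_t key = 123;
 *	__le64 value_le = cpu_to_le64(456);
 *	dm_block_t new_root;
 *	int r;
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 *
 * On success the caller should use 'new_root' from then on.
 */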

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
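
/*
 * Sketch: result_keys must have room for one key per btree level, and
 * the return value is the number of levels that yielded a key (error
 * handling elided):
 *
 *	uint64_t highest[1];
 *	int r;
 *
 *	r = dm_btree_find_highest_key(&info, root, highest);
 *	if (r == 1)
 *		... highest[0] is the largest key in the tree ...
 */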

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
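
/*
 * A walk callback sketch ('count_entry' is a hypothetical helper;
 * single level trees only, as the BUG_ON above enforces):
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(unsigned *) context)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	int r = dm_btree_walk(&info, root, count_entry, &count);
 *
 * Returning non zero from the callback aborts the walk and propagates
 * that value back to the caller.
 */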