]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/vdev_indirect_mapping.c
Get rid of space_map_update() for ms_synced_length
[mirror_zfs.git] / module / zfs / vdev_indirect_mapping.c
CommitLineData
a1d477c2
MA
1/*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15
16/*
4d044c4c 17 * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
a1d477c2
MA
18 */
19
20#include <sys/dmu_tx.h>
21#include <sys/dsl_pool.h>
22#include <sys/spa.h>
23#include <sys/vdev_impl.h>
24#include <sys/vdev_indirect_mapping.h>
25#include <sys/zfeature.h>
26#include <sys/dmu_objset.h>
27
#ifdef ZFS_DEBUG
/*
 * Sanity-check the in-core state of an open indirect mapping.  Debug-only;
 * always returns B_TRUE so it can be used inside ASSERT().
 */
static boolean_t
vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
{
	ASSERT(vim != NULL);

	ASSERT(vim->vim_object != 0);
	ASSERT(vim->vim_objset != NULL);
	ASSERT(vim->vim_phys != NULL);
	ASSERT(vim->vim_dbuf != NULL);

	/* The in-core entry array is loaded iff there are entries on disk. */
	EQUIV(vim->vim_phys->vimp_num_entries > 0,
	    vim->vim_entries != NULL);
	if (vim->vim_phys->vimp_num_entries > 0) {
		/* ASSERTV: these locals exist only in debug builds. */
		ASSERTV(vdev_indirect_mapping_entry_phys_t *last_entry =
		    &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]);
		ASSERTV(uint64_t offset =
		    DVA_MAPPING_GET_SRC_OFFSET(last_entry));
		ASSERTV(uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst));

		/* vimp_max_offset must cover the end of the last entry. */
		ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
	}
	if (vim->vim_havecounts) {
		/* Obsolete counts imply a counts object was allocated. */
		ASSERT(vim->vim_phys->vimp_counts_object != 0);
	}

	return (B_TRUE);
}
#endif
57
58uint64_t
59vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
60{
61 ASSERT(vdev_indirect_mapping_verify(vim));
62
63 return (vim->vim_phys->vimp_num_entries);
64}
65
66uint64_t
67vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
68{
69 ASSERT(vdev_indirect_mapping_verify(vim));
70
71 return (vim->vim_phys->vimp_max_offset);
72}
73
74uint64_t
75vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
76{
77 ASSERT(vdev_indirect_mapping_verify(vim));
78
79 return (vim->vim_object);
80}
81
82uint64_t
83vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
84{
85 ASSERT(vdev_indirect_mapping_verify(vim));
86
87 return (vim->vim_phys->vimp_bytes_mapped);
88}
89
90/*
91 * The length (in bytes) of the mapping object array in memory and
92 * (logically) on disk.
93 *
94 * Note that unlike most of our accessor functions,
95 * we don't assert that the struct is consistent; therefore it can be
96 * called while there may be concurrent changes, if we don't care about
97 * the value being immediately stale (e.g. from spa_removal_get_stats()).
98 */
99uint64_t
100vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
101{
102 return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
103}
104
105/*
106 * Compare an offset with an indirect mapping entry; there are three
107 * possible scenarios:
108 *
109 * 1. The offset is "less than" the mapping entry; meaning the
110 * offset is less than the source offset of the mapping entry. In
111 * this case, there is no overlap between the offset and the
112 * mapping entry and -1 will be returned.
113 *
114 * 2. The offset is "greater than" the mapping entry; meaning the
115 * offset is greater than the mapping entry's source offset plus
116 * the entry's size. In this case, there is no overlap between
117 * the offset and the mapping entry and 1 will be returned.
118 *
119 * NOTE: If the offset is actually equal to the entry's offset
120 * plus size, this is considered to be "greater" than the entry,
121 * and this case applies (i.e. 1 will be returned). Thus, the
122 * entry's "range" can be considered to be inclusive at its
123 * start, but exclusive at its end: e.g. [src, src + size).
124 *
125 * 3. The last case to consider is if the offset actually falls
126 * within the mapping entry's range. If this is the case, the
127 * offset is considered to be "equal to" the mapping entry and
128 * 0 will be returned.
129 *
130 * NOTE: If the offset is equal to the entry's source offset,
131 * this case applies and 0 will be returned. If the offset is
132 * equal to the entry's source plus its size, this case does
133 * *not* apply (see "NOTE" above for scenario 2), and 1 will be
134 * returned.
135 */
136static int
137dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
138{
d68ac65e 139 const uint64_t * const key = v_key;
140 const vdev_indirect_mapping_entry_phys_t * const array_elem =
a1d477c2
MA
141 v_array_elem;
142 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
143
144 if (*key < src_offset) {
145 return (-1);
146 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
147 return (0);
148 } else {
149 return (1);
150 }
151}
152
/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset), in which case, the
 * return value value depends on the "next_if_missing" parameter.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_FALSE, then NULL will always be returned. The behavior is intended
 * to allow consumers to get the entry corresponding to the offset
 * parameter, iff the offset overlaps with an entry in the table.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_TRUE, then the entry nearest to the given offset will be returned,
 * such that the entry's source offset is greater than the offset
 * passed in (i.e. the "next" mapping entry in the table is returned, if
 * the offset is missing from the table). If there are no entries whose
 * source offset is greater than the passed in offset, NULL is returned.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
    uint64_t offset, boolean_t next_if_missing)
{
	ASSERT(vdev_indirect_mapping_verify(vim));
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry = NULL;

	/* Standard binary search over the entry array, sorted by src offset. */
	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	while (last >= base) {
		mid = base + ((last - base) >> 1);

		result = dva_mapping_overlap_compare(&offset,
		    &vim->vim_entries[mid]);

		if (result == 0) {
			/* Exact overlap: "offset" lies within entry "mid". */
			entry = &vim->vim_entries[mid];
			break;
		} else if (result < 0) {
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}

	if (entry == NULL && next_if_missing) {
		/* Loop terminated without a hit: base has crossed last. */
		ASSERT3U(base, ==, last + 1);
		ASSERT(mid == base || mid == last);
		ASSERT3S(result, !=, 0);

		/*
		 * The offset we're looking for isn't actually contained
		 * in the mapping table, thus we need to return the
		 * closest mapping entry that is greater than the
		 * offset. We reuse the result of the last comparison,
		 * comparing the mapping entry at index "mid" and the
		 * offset. The offset is guaranteed to lie between
		 * indices one less than "mid", and one greater than
		 * "mid"; we just need to determine if offset is greater
		 * than, or less than the mapping entry contained at
		 * index "mid".
		 */

		uint64_t index;
		if (result < 0)
			index = mid;
		else
			index = mid + 1;

		ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);

		if (index == vim->vim_phys->vimp_num_entries) {
			/*
			 * If "index" is past the end of the entries
			 * array, then not only is the offset not in the
			 * mapping table, but it's actually greater than
			 * all entries in the table. In this case, we
			 * can't return a mapping entry greater than the
			 * offset (since none exist), so we return NULL.
			 */

			ASSERT3S(dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index - 1]), >, 0);

			return (NULL);
		} else {
			/*
			 * Just to be safe, we verify the offset falls
			 * in between the mapping entries at index and
			 * one less than index. Since we know the offset
			 * doesn't overlap an entry, and we're supposed
			 * to return the entry just greater than the
			 * offset, both of the following tests must be
			 * true.
			 */
			ASSERT3S(dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index]), <, 0);
			IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index - 1]) > 0);

			return (&vim->vim_entries[index]);
		}
	} else {
		return (entry);
	}
}
268
269vdev_indirect_mapping_entry_phys_t *
270vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
271 uint64_t offset)
272{
273 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
274 B_FALSE));
275}
276
277vdev_indirect_mapping_entry_phys_t *
278vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
279 uint64_t offset)
280{
281 return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
282 B_TRUE));
283}
284
a1d477c2
MA
285void
286vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
287{
288 ASSERT(vdev_indirect_mapping_verify(vim));
289
290 if (vim->vim_phys->vimp_num_entries > 0) {
291 uint64_t map_size = vdev_indirect_mapping_size(vim);
292 vmem_free(vim->vim_entries, map_size);
293 vim->vim_entries = NULL;
294 }
295
296 dmu_buf_rele(vim->vim_dbuf, vim);
297
298 vim->vim_objset = NULL;
299 vim->vim_object = 0;
300 vim->vim_dbuf = NULL;
301 vim->vim_phys = NULL;
302
303 kmem_free(vim, sizeof (*vim));
304}
305
306uint64_t
307vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
308{
309 uint64_t object;
310 ASSERT(dmu_tx_is_syncing(tx));
311 uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
312
313 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
314 bonus_size = sizeof (vdev_indirect_mapping_phys_t);
315 }
316
317 object = dmu_object_alloc(os,
318 DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
319 DMU_OTN_UINT64_METADATA, bonus_size,
320 tx);
321
322 if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
323 dmu_buf_t *dbuf;
324 vdev_indirect_mapping_phys_t *vimp;
325
326 VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
327 dmu_buf_will_dirty(dbuf, tx);
328 vimp = dbuf->db_data;
329 vimp->vimp_counts_object = dmu_object_alloc(os,
330 DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
331 DMU_OT_NONE, 0, tx);
332 spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
333 dmu_buf_rele(dbuf, FTAG);
334 }
335
336 return (object);
337}
338
339
340vdev_indirect_mapping_t *
341vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
342{
343 vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
344 dmu_object_info_t doi;
345 VERIFY0(dmu_object_info(os, mapping_object, &doi));
346
347 vim->vim_objset = os;
348 vim->vim_object = mapping_object;
349
350 VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
351 &vim->vim_dbuf));
352 vim->vim_phys = vim->vim_dbuf->db_data;
353
354 vim->vim_havecounts =
355 (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
356
357 if (vim->vim_phys->vimp_num_entries > 0) {
358 uint64_t map_size = vdev_indirect_mapping_size(vim);
359 vim->vim_entries = vmem_alloc(map_size, KM_SLEEP);
360 VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
361 vim->vim_entries, DMU_READ_PREFETCH));
362 }
363
364 ASSERT(vdev_indirect_mapping_verify(vim));
365
366 return (vim);
367}
368
369void
370vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
371{
372 vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
373 if (vim->vim_havecounts) {
374 VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
375 tx));
376 spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
377 }
378 vdev_indirect_mapping_close(vim);
379
380 VERIFY0(dmu_object_free(os, object, tx));
381}
382
/*
 * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
 * mapping object. Also remove the entries from the list and free them.
 * This also implicitly extends the max_offset of the mapping (to the end
 * of the last entry).
 */
void
vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
    list_t *list, dmu_tx_t *tx)
{
	vdev_indirect_mapping_entry_phys_t *mapbuf;
	uint64_t old_size;
	uint32_t *countbuf = NULL;
	vdev_indirect_mapping_entry_phys_t *old_entries;
	uint64_t old_count;
	uint64_t entries_written = 0;

	ASSERT(vdev_indirect_mapping_verify(vim));
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
	ASSERT(!list_is_empty(list));

	/* Snapshot the pre-append state for the array rebuild below. */
	old_size = vdev_indirect_mapping_size(vim);
	old_entries = vim->vim_entries;
	old_count = vim->vim_phys->vimp_num_entries;

	dmu_buf_will_dirty(vim->vim_dbuf, tx);

	mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
	if (vim->vim_havecounts) {
		countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
		ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
		    SPA_FEATURE_OBSOLETE_COUNTS));
	}
	while (!list_is_empty(list)) {
		uint64_t i;
		/*
		 * Write entries from the list to the
		 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
		 */
		for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
			vdev_indirect_mapping_entry_t *entry =
			    list_remove_head(list);
			if (entry == NULL)
				break;

			uint64_t size =
			    DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
			uint64_t src_offset =
			    DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);

			/*
			 * We shouldn't be adding an entry which is fully
			 * obsolete.
			 */
			ASSERT3U(entry->vime_obsolete_count, <, size);
			IMPLY(entry->vime_obsolete_count != 0,
			    vim->vim_havecounts);

			mapbuf[i] = entry->vime_mapping;
			if (vim->vim_havecounts)
				countbuf[i] = entry->vime_obsolete_count;

			vim->vim_phys->vimp_bytes_mapped += size;
			/* Entries must arrive in ascending source order. */
			ASSERT3U(src_offset, >=,
			    vim->vim_phys->vimp_max_offset);
			vim->vim_phys->vimp_max_offset = src_offset + size;

			entries_written++;

			vmem_free(entry, sizeof (*entry));
		}
		/*
		 * Append this batch at the current end of the object;
		 * vimp_num_entries is only advanced after the writes so
		 * the offset math below stays consistent per batch.
		 */
		dmu_write(vim->vim_objset, vim->vim_object,
		    vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
		    i * sizeof (*mapbuf),
		    mapbuf, tx);
		if (vim->vim_havecounts) {
			dmu_write(vim->vim_objset,
			    vim->vim_phys->vimp_counts_object,
			    vim->vim_phys->vimp_num_entries *
			    sizeof (*countbuf),
			    i * sizeof (*countbuf), countbuf, tx);
		}
		vim->vim_phys->vimp_num_entries += i;
	}
	vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
	if (vim->vim_havecounts)
		vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE);

	/*
	 * Update the entry array to reflect the new entries. First, copy
	 * over any old entries then read back the new entries we just wrote.
	 */
	uint64_t new_size = vdev_indirect_mapping_size(vim);
	ASSERT3U(new_size, >, old_size);
	ASSERT3U(new_size - old_size, ==,
	    entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
	vim->vim_entries = vmem_alloc(new_size, KM_SLEEP);
	if (old_size > 0) {
		bcopy(old_entries, vim->vim_entries, old_size);
		vmem_free(old_entries, old_size);
	}
	VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
	    new_size - old_size, &vim->vim_entries[old_count],
	    DMU_READ_PREFETCH));

	zfs_dbgmsg("txg %llu: wrote %llu entries to "
	    "indirect mapping obj %llu; max offset=0x%llx",
	    (u_longlong_t)dmu_tx_get_txg(tx),
	    (u_longlong_t)entries_written,
	    (u_longlong_t)vim->vim_object,
	    (u_longlong_t)vim->vim_phys->vimp_max_offset);
}
496
497/*
498 * Increment the relevant counts for the specified offset and length.
499 * The counts array must be obtained from
500 * vdev_indirect_mapping_load_obsolete_counts().
501 */
502void
503vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
504 uint64_t offset, uint64_t length, uint32_t *counts)
505{
506 vdev_indirect_mapping_entry_phys_t *mapping;
507 uint64_t index;
508
509 mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
510
511 ASSERT(length > 0);
512 ASSERT3P(mapping, !=, NULL);
513
514 index = mapping - vim->vim_entries;
515
516 while (length > 0) {
517 ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
518
519 uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
520 uint64_t inner_offset = offset -
521 DVA_MAPPING_GET_SRC_OFFSET(mapping);
522 VERIFY3U(inner_offset, <, size);
523 uint64_t inner_size = MIN(length, size - inner_offset);
524
525 VERIFY3U(counts[index] + inner_size, <=, size);
526 counts[index] += inner_size;
527
528 offset += inner_size;
529 length -= inner_size;
530 mapping++;
531 index++;
532 }
533}
534
/*
 * Callback argument for load_obsolete_sm_callback(): the mapping being
 * processed and the obsolete-count array being accumulated into.
 */
typedef struct load_obsolete_space_map_arg {
	vdev_indirect_mapping_t	*losma_vim;
	uint32_t		*losma_counts;
} load_obsolete_space_map_arg_t;
539
540static int
4d044c4c 541load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
a1d477c2
MA
542{
543 load_obsolete_space_map_arg_t *losma = arg;
4d044c4c 544 ASSERT3S(sme->sme_type, ==, SM_ALLOC);
a1d477c2
MA
545
546 vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
4d044c4c 547 sme->sme_offset, sme->sme_run, losma->losma_counts);
a1d477c2
MA
548
549 return (0);
550}
551
552/*
553 * Modify the counts (increment them) based on the spacemap.
554 */
555void
556vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
557 uint32_t *counts, space_map_t *obsolete_space_sm)
558{
559 load_obsolete_space_map_arg_t losma;
560 losma.losma_counts = counts;
561 losma.losma_vim = vim;
562 VERIFY0(space_map_iterate(obsolete_space_sm,
425d3237 563 space_map_length(obsolete_space_sm),
a1d477c2
MA
564 load_obsolete_sm_callback, &losma));
565}
566
567/*
568 * Read the obsolete counts from disk, returning them in an array.
569 */
570uint32_t *
571vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
572{
573 ASSERT(vdev_indirect_mapping_verify(vim));
574
575 uint64_t counts_size =
576 vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
577 uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP);
578 if (vim->vim_havecounts) {
579 VERIFY0(dmu_read(vim->vim_objset,
580 vim->vim_phys->vimp_counts_object,
581 0, counts_size,
582 counts, DMU_READ_PREFETCH));
583 } else {
584 bzero(counts, counts_size);
585 }
586 return (counts);
587}
588
589extern void
590vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
591 uint32_t *counts)
592{
593 ASSERT(vdev_indirect_mapping_verify(vim));
594
595 vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
596}
597
#if defined(_KERNEL)
/* Export the public vdev_indirect_mapping interface to other kernel code. */
EXPORT_SYMBOL(vdev_indirect_mapping_add_entries);
EXPORT_SYMBOL(vdev_indirect_mapping_alloc);
EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped);
EXPORT_SYMBOL(vdev_indirect_mapping_close);
EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset);
EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next);
EXPORT_SYMBOL(vdev_indirect_mapping_free);
EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts);
EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count);
EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts);
EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap);
EXPORT_SYMBOL(vdev_indirect_mapping_max_offset);
EXPORT_SYMBOL(vdev_indirect_mapping_num_entries);
EXPORT_SYMBOL(vdev_indirect_mapping_object);
EXPORT_SYMBOL(vdev_indirect_mapping_open);
EXPORT_SYMBOL(vdev_indirect_mapping_size);
#endif