/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>

/*
 * An indirect vdev corresponds to a vdev that has been removed. Since
 * we cannot rewrite block pointers of snapshots, etc., we keep a
 * mapping from old location on the removed device to the new location
 * on another device in the pool and use this mapping whenever we need
 * to access the DVA. Unfortunately, this mapping did not respect
 * logical block boundaries when it was first created, and so a DVA on
 * this indirect vdev may be "split" into multiple sections that each
 * map to a different location. As a consequence, not all DVAs can be
 * translated to an equivalent new DVA. Instead we must provide a
 * "vdev_remap" operation that executes a callback on each contiguous
 * segment of the new location. This function is used in multiple ways:
 *
 *  - reads and repair writes to this device use the callback to create
 *    a child io for each mapped segment.
 *
 *  - frees and claims to this device use the callback to free or claim
 *    each mapped segment. (Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs. However, zdb uses zio_claim() for its leak
 *    detection.)
 */
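
/*
 * Editorial sketch (not part of the original source): a caller that
 * wants to know how many bytes of a DVA ultimately land on concrete
 * vdevs could pass a callback like the hypothetical sum_concrete_cb()
 * below to vdev_indirect_remap() (defined later in this file).
 * Indirect vdevs visited along the way are skipped, as callbacks that
 * only care about concrete vdevs should do:
 *
 *	static void
 *	sum_concrete_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
 *	    uint64_t size, void *arg)
 *	{
 *		uint64_t *totalp = arg;
 *
 *		if (vd->vdev_ops != &vdev_indirect_ops)
 *			*totalp += size;
 *	}
 *
 *	uint64_t total = 0;
 *	vdev_indirect_remap(vd, offset, asize, sum_concrete_cb, &total);
 *
 * Each invocation of the callback covers one contiguous segment of the
 * (possibly split) mapping.
 */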

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete"). We
 * keep track of how much of each mapping entry is obsolete. When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping. The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects. Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object. This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object. When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts. Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed. We use this object in order to improve performance
 *    when marking a DVA as obsolete. Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object. When a DVA becomes obsolete, it is
 *    added to the obsolete space map. This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object. This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot. Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj. This is a list of blocks
 *    in the pool that need to be marked obsolete. When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 *  - When freeing a block: if any DVA is on an indirect vdev, append to
 *    vic_obsolete_sm_object.
 *  - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *    references; otherwise append to vic_obsolete_sm_object).
 *  - When freeing a snapshot: move parts of ds_remap_deadlist to
 *    dp_obsolete_bpobj (same algorithm as ds_deadlist).
 *  - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *    individual vdev's vic_obsolete_sm_object.
 */
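
/*
 * Worked example (editorial, with made-up numbers): suppose one mapping
 * entry covers a 100KB extent of the removed vdev. If 30KB of that
 * extent was already unreferenced at the last condense, the entry's
 * obsolete count is 30KB. If another 20KB is later freed, an alloc
 * record for those 20KB is appended to vic_obsolete_sm_object rather
 * than updating the counts array in place. The entry can only be
 * dropped from the mapping once the folded total (the 30KB from the
 * counts plus all segments from the space map) reaches the full 100KB,
 * which is detected during the next condense.
 */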

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage. In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()). This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only. Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length). (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
 *
 * == Restarting from failure ==
 *
 * To restart the condense when we import/open the pool, we must start
 * at the 2nd step above: reconstruct the precise counts in memory,
 * based on the space map + counts. Then in the 3rd step, we start
 * iterating where we left off: at vimp_max_offset of the new mapping
 * object.
 */
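
/*
 * Editorial sketch of step 2's fold (the real logic lives in
 * vdev_indirect_mapping_load_obsolete_spacemap()): each alloc segment
 * read from the obsolete space map is apportioned to the mapping
 * entries it overlaps. Here counts[] is assumed to parallel
 * vim_entries[], and seg_start/seg_size describe one alloc record:
 *
 *	while (seg_size > 0) {
 *		vdev_indirect_mapping_entry_phys_t *vimep =
 *		    vdev_indirect_mapping_entry_for_offset(vim, seg_start);
 *		uint64_t src_end = DVA_MAPPING_GET_SRC_OFFSET(vimep) +
 *		    DVA_GET_ASIZE(&vimep->vimep_dst);
 *		uint64_t overlap = MIN(seg_size, src_end - seg_start);
 *
 *		counts[vimep - vim->vim_entries] += overlap;
 *		seg_start += overlap;
 *		seg_size -= overlap;
 *	}
 *
 * After every segment has been folded in, counts[i] is the precise
 * number of obsolete bytes for entry i, and step 3 skips any entry
 * whose count equals its mapping length.
 */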

boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;

/*
 * Condense if at least this percent of the bytes in the mapping is
 * obsolete. With the default of 25%, the amount of space mapped
 * will be reduced to 1% of its original size after at most 16
 * condenses (each condense keeps at most 75% of the previous mapping,
 * and 0.75^16 is approximately 0.01). Higher values will condense
 * less often (causing less i/o); lower values will reduce the
 * mapping size more quickly.
 */
int zfs_indirect_condense_obsolete_pct = 25;

/*
 * Condense if the obsolete space map takes up more than this amount of
 * space on disk (logically). This limits the amount of disk space
 * consumed by the obsolete space map; the default of 1GB is small enough
 * that we typically don't mind "wasting" it.
 */
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;

/*
 * Don't bother condensing if the mapping uses less than this amount of
 * memory. The default of 128KB is considered a "trivial" amount of
 * memory and not worth reducing.
 */
unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;

/*
 * This is used by the test suite so that it can ensure that certain
 * actions happen while in the middle of a condense (which might otherwise
 * complete too quickly). If used to reduce the performance impact of
 * condensing in production, a maximum value of 1 should be sufficient.
 */
int zfs_condense_indirect_commit_entry_delay_ms = 0;

/*
 * Mark the given offset and size as being obsolete in the given txg.
 */
void
vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	ASSERT3U(spa_syncing_txg(spa), ==, txg);
	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(size > 0);
	VERIFY(vdev_indirect_mapping_entry_for_offset(
	    vd->vdev_indirect_mapping, offset) != NULL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		mutex_enter(&vd->vdev_obsolete_lock);
		range_tree_add(vd->vdev_obsolete_segments, offset, size);
		mutex_exit(&vd->vdev_obsolete_lock);
		vdev_dirty(vd, 0, NULL, txg);
	}
}

/*
 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
 * wrapper is provided because the DMU does not know about vdev_t's and
 * cannot directly call vdev_indirect_mark_obsolete.
 */
void
spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	ASSERT(dmu_tx_is_syncing(tx));

	/* The DMU can only remap indirect vdevs. */
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	vdev_indirect_mark_obsolete(vd, offset, size, dmu_tx_get_txg(tx));
}

static spa_condensing_indirect_t *
spa_condensing_indirect_create(spa_t *spa)
{
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
	objset_t *mos = spa->spa_meta_objset;

	for (int i = 0; i < TXG_SIZE; i++) {
		list_create(&sci->sci_new_mapping_entries[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	sci->sci_new_mapping =
	    vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);

	return (sci);
}

static void
spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
{
	for (int i = 0; i < TXG_SIZE; i++)
		list_destroy(&sci->sci_new_mapping_entries[i]);

	if (sci->sci_new_mapping != NULL)
		vdev_indirect_mapping_close(sci->sci_new_mapping);

	kmem_free(sci, sizeof (*sci));
}

boolean_t
vdev_indirect_should_condense(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;

	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));

	if (!zfs_condense_indirect_vdevs_enable)
		return (B_FALSE);

	/*
	 * We can only condense one indirect vdev at a time.
	 */
	if (spa->spa_condensing_indirect != NULL)
		return (B_FALSE);

	if (spa_shutting_down(spa))
		return (B_FALSE);

	/*
	 * The mapping object size must not change while we are
	 * condensing, so we can only condense indirect vdevs
	 * (not vdevs that are still in the middle of being removed).
	 */
	if (vd->vdev_ops != &vdev_indirect_ops)
		return (B_FALSE);

	/*
	 * If nothing new has been marked obsolete, there is no
	 * point in condensing.
	 */
	if (vd->vdev_obsolete_sm == NULL) {
		ASSERT0(vdev_obsolete_sm_object(vd));
		return (B_FALSE);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);

	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);

	ASSERT3U(bytes_obsolete, <=, bytes_mapped);

	/*
	 * If a high percentage of the bytes that are mapped have become
	 * obsolete, condense (unless the mapping is already small enough).
	 * This has a good chance of reducing the amount of memory used
	 * by the mapping.
	 */
	if (bytes_obsolete * 100 / bytes_mapped >=
	    zfs_indirect_condense_obsolete_pct &&
	    mapping_size > zfs_condense_min_mapping_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete "
		    "spacemap covers %d%% of %lluMB mapping",
		    (u_longlong_t)vd->vdev_id,
		    (int)(bytes_obsolete * 100 / bytes_mapped),
		    (u_longlong_t)bytes_mapped / 1024 / 1024);
		return (B_TRUE);
	}

	/*
	 * If the obsolete space map takes up too much space on disk,
	 * condense in order to free up this disk space.
	 */
	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
		    "length %lluMB >= max size %lluMB",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
		    1024 / 1024);
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * This sync task completes (finishes) a condense, deleting the old
 * mapping and replacing it with the new one.
 */
static void
spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	objset_t *mos = spa->spa_meta_objset;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
	uint64_t new_count =
	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}
	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);

	/*
	 * Reset vdev_indirect_mapping to refer to the new object.
	 */
	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
	vd->vdev_indirect_mapping = sci->sci_new_mapping;
	rw_exit(&vd->vdev_indirect_rwlock);

	sci->sci_new_mapping = NULL;
	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
	vic->vic_mapping_object = scip->scip_next_mapping_object;
	scip->scip_next_mapping_object = 0;

	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	scip->scip_prev_obsolete_sm_object = 0;

	scip->scip_vdev = 0;

	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, tx));
	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
	spa->spa_condensing_indirect = NULL;

	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
	    "new mapping object %llu has %llu entries "
	    "(was %llu entries)",
	    vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
	    new_count, old_count);

	vdev_config_dirty(spa->spa_root_vdev);
}

/*
 * This sync task appends entries to the new mapping object.
 */
static void
spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
{
	spa_condensing_indirect_t *sci = arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	ASSERTV(spa_t *spa = dmu_tx_pool(tx)->dp_spa);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(sci, ==, spa->spa_condensing_indirect);

	vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
	    &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
	ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
}

/*
 * Open-context function to add one entry to the new mapping. The new
 * entry will be remembered and written from syncing context.
 */
static void
spa_condense_indirect_commit_entry(spa_t *spa,
    vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
{
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;

	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * If we are the first entry committed this txg, kick off the sync
	 * task to write to the MOS on our behalf.
	 */
	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
		dsl_sync_task_nowait(dmu_tx_pool(tx),
		    spa_condense_indirect_commit_sync, sci,
		    0, ZFS_SPACE_CHECK_NONE, tx);
	}

	vdev_indirect_mapping_entry_t *vime =
	    kmem_alloc(sizeof (*vime), KM_SLEEP);
	vime->vime_mapping = *vimep;
	vime->vime_obsolete_count = count;
	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);

	dmu_tx_commit(tx);
}

static void
spa_condense_indirect_generate_new_mapping(vdev_t *vd,
    uint32_t *obsolete_counts, uint64_t start_index)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t mapi = start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	uint64_t old_num_entries =
	    vdev_indirect_mapping_num_entries(old_mapping);

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);

	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
	    (u_longlong_t)vd->vdev_id,
	    (u_longlong_t)mapi);

	while (mapi < old_num_entries && !spa_shutting_down(spa)) {
		vdev_indirect_mapping_entry_phys_t *entry =
		    &old_mapping->vim_entries[mapi];
		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
		ASSERT3U(obsolete_counts[mapi], <=, entry_size);
		if (obsolete_counts[mapi] < entry_size) {
			spa_condense_indirect_commit_entry(spa, entry,
			    obsolete_counts[mapi]);

			/*
			 * This delay may be requested for testing, debugging,
			 * or performance reasons.
			 */
			hrtime_t now = gethrtime();
			hrtime_t sleep_until = now + MSEC2NSEC(
			    zfs_condense_indirect_commit_entry_delay_ms);
			zfs_sleep_until(sleep_until);
		}

		mapi++;
	}
	if (spa_shutting_down(spa)) {
		zfs_dbgmsg("pausing condense of vdev %llu at index %llu",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)mapi);
	}
}

static void
spa_condense_indirect_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint32_t *counts;
	uint64_t start_index;
	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
	space_map_t *prev_obsolete_sm = NULL;

	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
	ASSERT(scip->scip_next_mapping_object != 0);
	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		/*
		 * The list must start out empty in order for the
		 * _commit_sync() sync task to be properly registered
		 * on the first call to _commit_entry(); so it's wise
		 * to double check and ensure we actually are starting
		 * with empty lists.
		 */
		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
	}

	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
	space_map_update(prev_obsolete_sm);
	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
	if (prev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
		    counts, prev_obsolete_sm);
	}
	space_map_close(prev_obsolete_sm);

	/*
	 * Generate new mapping. Determine what index to continue from
	 * based on the max offset that we've already written in the
	 * new mapping.
	 */
	uint64_t max_offset =
	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
	if (max_offset == 0) {
		/* We haven't written anything to the new mapping yet. */
		start_index = 0;
	} else {
		/*
		 * Pick up from where we left off. The
		 * _entry_for_offset_or_next() call returns a pointer into
		 * the vim_entries array. If max_offset is greater than any
		 * of the mappings contained in the table NULL will be
		 * returned and that indicates we've exhausted our iteration
		 * of the old_mapping.
		 */

		vdev_indirect_mapping_entry_phys_t *entry =
		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
		    max_offset);

		if (entry == NULL) {
			/*
			 * We've already written the whole new mapping.
			 * This special value will cause us to skip the
			 * generate_new_mapping step and just do the sync
			 * task to complete the condense.
			 */
			start_index = UINT64_MAX;
		} else {
			start_index = entry - old_mapping->vim_entries;
			ASSERT3U(start_index, <,
			    vdev_indirect_mapping_num_entries(old_mapping));
		}
	}

	spa_condense_indirect_generate_new_mapping(vd, counts, start_index);

	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);

	/*
	 * We may have bailed early from generate_new_mapping(), if
	 * the spa is shutting down. In this case, do not complete
	 * the condense.
	 */
	if (!spa_shutting_down(spa)) {
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    spa_condense_indirect_complete_sync, sci, 0,
		    ZFS_SPACE_CHECK_NONE));
	}

	mutex_enter(&spa->spa_async_lock);
	spa->spa_condense_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Sync task to begin the condensing process.
 */
void
spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;

	ASSERT0(scip->scip_next_mapping_object);
	ASSERT0(scip->scip_prev_obsolete_sm_object);
	ASSERT0(scip->scip_vdev);
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));

	uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd);
	ASSERT(obsolete_sm_obj != 0);

	scip->scip_vdev = vd->vdev_id;
	scip->scip_next_mapping_object =
	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);

	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;

	/*
	 * We don't need to allocate a new space map object, since
	 * vdev_indirect_sync_obsolete will allocate one when needed.
	 */
	space_map_close(vd->vdev_obsolete_sm);
	vd->vdev_obsolete_sm = NULL;
	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));

	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (*scip) / sizeof (uint64_t), scip, tx));

	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);

	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
	    "posm=%llu nm=%llu",
	    vd->vdev_id, dmu_tx_get_txg(tx),
	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
	    (u_longlong_t)scip->scip_next_mapping_object);

	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	spa->spa_condense_thread = thread_create(NULL, 0,
	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Sync to the given vdev's obsolete space map any segments that are no longer
 * referenced as of the given txg.
 *
 * If the obsolete space map doesn't exist yet, create and open it.
 */
void
vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);

	ASSERT3U(vic->vic_mapping_object, !=, 0);
	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));

	if (vdev_obsolete_sm_object(vd) == 0) {
		uint64_t obsolete_sm_object =
		    space_map_alloc(spa->spa_meta_objset, tx);

		ASSERT(vd->vdev_top_zap != 0);
		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
		ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0);

		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
		    spa->spa_meta_objset, obsolete_sm_object,
		    0, vd->vdev_asize, 0));
		space_map_update(vd->vdev_obsolete_sm);
	}

	ASSERT(vd->vdev_obsolete_sm != NULL);
	ASSERT3U(vdev_obsolete_sm_object(vd), ==,
	    space_map_object(vd->vdev_obsolete_sm));

	space_map_write(vd->vdev_obsolete_sm,
	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
	space_map_update(vd->vdev_obsolete_sm);
	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}

int
spa_condense_init(spa_t *spa)
{
	int error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
	    sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
	    &spa->spa_condensing_indirect_phys);
	if (error == 0) {
		if (spa_writeable(spa)) {
			spa->spa_condensing_indirect =
			    spa_condensing_indirect_create(spa);
		}
		return (0);
	} else if (error == ENOENT) {
		return (0);
	} else {
		return (error);
	}
}

void
spa_condense_fini(spa_t *spa)
{
	if (spa->spa_condensing_indirect != NULL) {
		spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
		spa->spa_condensing_indirect = NULL;
	}
}

/*
 * Restart the condense - called when the pool is opened.
 */
void
spa_condense_indirect_restart(spa_t *spa)
{
	vdev_t *vd;
	ASSERT(spa->spa_condensing_indirect != NULL);
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(spa,
	    spa->spa_condensing_indirect_phys.scip_vdev);
	ASSERT(vd != NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT3P(spa->spa_condense_thread, ==, NULL);
	spa->spa_condense_thread = thread_create(NULL, 0,
	    spa_condense_indirect_thread, vd, 0, &p0, TS_RUN,
	    minclsyspri);
}

/*
 * Gets the obsolete spacemap object from the vdev's ZAP.
 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't
 * exist yet.
 */
uint64_t
vdev_obsolete_sm_object(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (0);
	}

	uint64_t sm_obj = 0;
	int err;
	err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj);

	ASSERT(err == 0 || err == ENOENT);

	return (sm_obj);
}

boolean_t
vdev_obsolete_counts_are_precise(vdev_t *vd)
{
	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
	if (vd->vdev_top_zap == 0) {
		return (B_FALSE);
	}

	uint64_t val = 0;
	int err;
	err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);

	ASSERT(err == 0 || err == ENOENT);

	return (val != 0);
}

/* ARGSUSED */
static void
vdev_indirect_close(vdev_t *vd)
{
}

/* ARGSUSED */
static void
vdev_indirect_io_done(zio_t *zio)
{
}

/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	*psize = *max_psize = vd->vdev_asize +
	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	*ashift = vd->vdev_ashift;
	return (0);
}

typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
	rs->rs_vd = vd;
	rs->rs_offset = offset;
	rs->rs_asize = asize;
	rs->rs_split_offset = split_offset;
	return (rs);
}

/*
 * Goes through the relevant indirect mappings until it hits a concrete vdev
 * and issues the callback. On the way to the concrete vdev, if any other
 * indirect vdevs are encountered, then the callback will also be called on
 * each of those indirect vdevs. For example, if the segment is mapped to
 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
 * mapped to segment B on concrete vdev 2, then the callback will be called on
 * both vdev 1 and vdev 2.
 *
 * While the callback passed to vdev_indirect_remap() is called on every vdev
 * the function encounters, certain callbacks only care about concrete vdevs.
 * These types of callbacks should return immediately and explicitly when they
 * are called on an indirect vdev.
 *
 * Because there is a possibility that a DVA section in the indirect device
 * has been split into multiple sections in our mapping, we keep track
 * of the relevant contiguous segments of the new location (remap_segment_t)
 * in a stack. This way we can call the callback for each of the new sections
 * created by a single section of the indirect device. Note though, that in
 * this scenario the callbacks in each split block won't occur in-order in
 * terms of offset, so callers should not make any assumptions about that.
 *
 * For callbacks that don't handle split blocks and immediately return when
 * they encounter them (as is the case for remap_blkptr_cb), the caller can
 * assume that its callback will be applied from the first indirect vdev
 * encountered to the last one and then the concrete vdev, in that order.
 */
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
    void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
{
	list_t stack;
	spa_t *spa = vd->vdev_spa;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
	    rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;

		/*
		 * Note: this can be called from open context
		 * (eg. zio_read()), so we need the rwlock to prevent
		 * the mapping from being changed by condensing.
		 */
		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
		vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping;
		ASSERT3P(vim, !=, NULL);

		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
		ASSERT(rs->rs_asize > 0);

		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_entry_for_offset(vim, rs->rs_offset);
		ASSERT3P(mapping, !=, NULL);

		while (rs->rs_asize > 0) {
			/*
			 * Note: the vdev_indirect_mapping can not change
			 * while we are running. It only changes while the
			 * removal is in progress, and then only from syncing
			 * context. While a removal is in progress, this
			 * function is only called for frees, which also only
			 * happen from syncing context.
			 */

			uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
			uint64_t dst_offset =
			    DVA_GET_OFFSET(&mapping->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&mapping->vimep_dst);

			ASSERT3U(rs->rs_offset, >=,
			    DVA_MAPPING_GET_SRC_OFFSET(mapping));
			ASSERT3U(rs->rs_offset, <,
			    DVA_MAPPING_GET_SRC_OFFSET(mapping) + size);
			ASSERT3U(dst_vdev, !=, v->vdev_id);

			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(mapping);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);

			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
			ASSERT3P(dst_v, !=, NULL);

			if (dst_v->vdev_ops == &vdev_indirect_ops) {
				list_insert_head(&stack,
				    rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset));
			}

			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
				/*
				 * Note: This clause exists solely for
				 * testing purposes. We use it to ensure that
				 * split blocks work and that the callbacks
				 * using them yield the same result if issued
				 * in reverse order.
				 */
				uint64_t inner_half = inner_size / 2;

				func(rs->rs_split_offset + inner_half, dst_v,
				    dst_offset + inner_offset + inner_half,
				    inner_half, arg);

				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_half, arg);
			} else {
				func(rs->rs_split_offset, dst_v,
				    dst_offset + inner_offset,
				    inner_size, arg);
			}

			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
			mapping++;
		}

		rw_exit(&v->vdev_indirect_rwlock);
		kmem_free(rs, sizeof (remap_segment_t));
	}
	list_destroy(&stack);
}
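
/*
 * Editorial walk-through of the traversal above, with made-up numbers:
 * suppose a 24KB segment on indirect vdev 1 maps as 16KB to indirect
 * vdev 2 plus 8KB to concrete vdev 3, and vdev 2 in turn maps its 16KB
 * to concrete vdev 4. The loop visits vdev 1's two mapping entries,
 * calling func() on vdev 2 (16KB) and vdev 3 (8KB) and pushing the
 * vdev 2 segment onto the stack; it then pops that segment and calls
 * func() on vdev 4 (16KB). Callbacks that only care about concrete
 * vdevs (such as vdev_indirect_io_start_cb() below) act only on
 * vdev 3 and vdev 4.
 */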

static void
vdev_indirect_child_io_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);

	abd_put(zio->io_abd);
}

static void
vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	zio_t *zio = arg;

	ASSERT3P(vd, !=, NULL);

	if (vd->vdev_ops == &vdev_indirect_ops)
		return;

	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
	    abd_get_offset(zio->io_abd, split_offset),
	    size, zio->io_type, zio->io_priority,
	    0, vdev_indirect_child_io_done, zio));
}

static void
vdev_indirect_io_start(zio_t *zio)
{
	ASSERTV(spa_t *spa = zio->io_spa);

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
	if (zio->io_type != ZIO_TYPE_READ) {
		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
		ASSERT((zio->io_flags &
		    (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
	}

	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
	    vdev_indirect_io_start_cb, zio);

	zio_execute(zio);
}

vdev_ops_t vdev_indirect_ops = {
	vdev_indirect_open,
	vdev_indirect_close,
	vdev_default_asize,
	vdev_indirect_io_start,
	vdev_indirect_io_done,
	NULL,
	NULL,
	NULL,
	NULL,
	vdev_indirect_remap,
	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(rs_alloc);
EXPORT_SYMBOL(spa_condense_fini);
EXPORT_SYMBOL(spa_condense_indirect_restart);
EXPORT_SYMBOL(spa_condense_indirect_start_sync);
EXPORT_SYMBOL(spa_condense_init);
EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete);
EXPORT_SYMBOL(vdev_indirect_mark_obsolete);
EXPORT_SYMBOL(vdev_indirect_should_condense);
EXPORT_SYMBOL(vdev_indirect_sync_obsolete);
EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
EXPORT_SYMBOL(vdev_obsolete_sm_object);

/* CSTYLED */
module_param(zfs_condense_min_mapping_bytes, ulong, 0644);
MODULE_PARM_DESC(zfs_condense_min_mapping_bytes,
	"Minimum size of vdev mapping to condense");

module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
	"Delay while condensing vdev mapping");
#endif