]>
Commit | Line | Data |
---|---|---|
619f0976 GW |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
619f0976 GW |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
c183d164 | 23 | * Copyright (c) 2016, 2024 by Delphix. All rights reserved. |
619f0976 GW |
24 | */ |
25 | ||
26 | #include <sys/spa.h> | |
27 | #include <sys/spa_impl.h> | |
28 | #include <sys/txg.h> | |
29 | #include <sys/vdev_impl.h> | |
619f0976 GW |
30 | #include <sys/metaslab_impl.h> |
31 | #include <sys/dsl_synctask.h> | |
32 | #include <sys/zap.h> | |
33 | #include <sys/dmu_tx.h> | |
59055a01 | 34 | #include <sys/vdev_initialize.h> |
619f0976 | 35 | |
619f0976 GW |
36 | /* |
37 | * Value that is written to disk during initialization. | |
38 | */ | |
ab8d9c17 | 39 | static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL; |
619f0976 GW |
40 | |
41 | /* maximum number of I/Os outstanding per leaf vdev */ | |
18168da7 | 42 | static const int zfs_initialize_limit = 1; |
619f0976 GW |
43 | |
44 | /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ | |
ab8d9c17 | 45 | static uint64_t zfs_initialize_chunk_size = 1024 * 1024; |
619f0976 GW |
46 | |
47 | static boolean_t | |
48 | vdev_initialize_should_stop(vdev_t *vd) | |
49 | { | |
50 | return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || | |
5caeef02 DB |
51 | vd->vdev_detached || vd->vdev_top->vdev_removing || |
52 | vd->vdev_top->vdev_rz_expanding); | |
619f0976 GW |
53 | } |
54 | ||
55 | static void | |
56 | vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) | |
57 | { | |
58 | /* | |
59 | * We pass in the guid instead of the vdev_t since the vdev may | |
60 | * have been freed prior to the sync task being processed. This | |
61 | * happens when a vdev is detached as we call spa_config_vdev_exit(), | |
dd785b5b | 62 | * stop the initializing thread, schedule the sync task, and free |
619f0976 GW |
63 | * the vdev. Later when the scheduled sync task is invoked, it would |
64 | * find that the vdev has been freed. | |
65 | */ | |
66 | uint64_t guid = *(uint64_t *)arg; | |
67 | uint64_t txg = dmu_tx_get_txg(tx); | |
68 | kmem_free(arg, sizeof (uint64_t)); | |
69 | ||
70 | vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); | |
5caeef02 DB |
71 | if (vd == NULL || vd->vdev_top->vdev_removing || |
72 | !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) | |
619f0976 GW |
73 | return; |
74 | ||
75 | uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; | |
76 | vd->vdev_initialize_offset[txg & TXG_MASK] = 0; | |
77 | ||
78 | VERIFY(vd->vdev_leaf_zap != 0); | |
79 | ||
80 | objset_t *mos = vd->vdev_spa->spa_meta_objset; | |
81 | ||
82 | if (last_offset > 0) { | |
83 | vd->vdev_initialize_last_offset = last_offset; | |
84 | VERIFY0(zap_update(mos, vd->vdev_leaf_zap, | |
85 | VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, | |
86 | sizeof (last_offset), 1, &last_offset, tx)); | |
87 | } | |
88 | if (vd->vdev_initialize_action_time > 0) { | |
89 | uint64_t val = (uint64_t)vd->vdev_initialize_action_time; | |
90 | VERIFY0(zap_update(mos, vd->vdev_leaf_zap, | |
91 | VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), | |
92 | 1, &val, tx)); | |
93 | } | |
94 | ||
95 | uint64_t initialize_state = vd->vdev_initialize_state; | |
96 | VERIFY0(zap_update(mos, vd->vdev_leaf_zap, | |
97 | VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, | |
98 | &initialize_state, tx)); | |
99 | } | |
100 | ||
e34e15ed BB |
101 | static void |
102 | vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx) | |
103 | { | |
104 | uint64_t guid = *(uint64_t *)arg; | |
105 | ||
106 | kmem_free(arg, sizeof (uint64_t)); | |
107 | ||
108 | vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); | |
109 | if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) | |
110 | return; | |
111 | ||
112 | ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE); | |
113 | ASSERT3U(vd->vdev_leaf_zap, !=, 0); | |
114 | ||
115 | vd->vdev_initialize_last_offset = 0; | |
116 | vd->vdev_initialize_action_time = 0; | |
117 | ||
118 | objset_t *mos = vd->vdev_spa->spa_meta_objset; | |
119 | int error; | |
120 | ||
121 | error = zap_remove(mos, vd->vdev_leaf_zap, | |
122 | VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx); | |
123 | VERIFY(error == 0 || error == ENOENT); | |
124 | ||
125 | error = zap_remove(mos, vd->vdev_leaf_zap, | |
126 | VDEV_LEAF_ZAP_INITIALIZE_STATE, tx); | |
127 | VERIFY(error == 0 || error == ENOENT); | |
128 | ||
129 | error = zap_remove(mos, vd->vdev_leaf_zap, | |
130 | VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx); | |
131 | VERIFY(error == 0 || error == ENOENT); | |
132 | } | |
133 | ||
619f0976 GW |
134 | static void |
135 | vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) | |
136 | { | |
137 | ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); | |
138 | spa_t *spa = vd->vdev_spa; | |
139 | ||
140 | if (new_state == vd->vdev_initialize_state) | |
141 | return; | |
142 | ||
143 | /* | |
144 | * Copy the vd's guid, this will be freed by the sync task. | |
145 | */ | |
146 | uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); | |
147 | *guid = vd->vdev_guid; | |
148 | ||
149 | /* | |
150 | * If we're suspending, then preserving the original start time. | |
151 | */ | |
152 | if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { | |
153 | vd->vdev_initialize_action_time = gethrestime_sec(); | |
154 | } | |
b2255edc BB |
155 | |
156 | vdev_initializing_state_t old_state = vd->vdev_initialize_state; | |
619f0976 GW |
157 | vd->vdev_initialize_state = new_state; |
158 | ||
159 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
160 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
e34e15ed BB |
161 | |
162 | if (new_state != VDEV_INITIALIZE_NONE) { | |
163 | dsl_sync_task_nowait(spa_get_dsl(spa), | |
164 | vdev_initialize_zap_update_sync, guid, tx); | |
165 | } else { | |
166 | dsl_sync_task_nowait(spa_get_dsl(spa), | |
167 | vdev_initialize_zap_remove_sync, guid, tx); | |
168 | } | |
619f0976 GW |
169 | |
170 | switch (new_state) { | |
171 | case VDEV_INITIALIZE_ACTIVE: | |
172 | spa_history_log_internal(spa, "initialize", tx, | |
173 | "vdev=%s activated", vd->vdev_path); | |
174 | break; | |
175 | case VDEV_INITIALIZE_SUSPENDED: | |
176 | spa_history_log_internal(spa, "initialize", tx, | |
177 | "vdev=%s suspended", vd->vdev_path); | |
178 | break; | |
179 | case VDEV_INITIALIZE_CANCELED: | |
b2255edc BB |
180 | if (old_state == VDEV_INITIALIZE_ACTIVE || |
181 | old_state == VDEV_INITIALIZE_SUSPENDED) | |
182 | spa_history_log_internal(spa, "initialize", tx, | |
183 | "vdev=%s canceled", vd->vdev_path); | |
619f0976 GW |
184 | break; |
185 | case VDEV_INITIALIZE_COMPLETE: | |
186 | spa_history_log_internal(spa, "initialize", tx, | |
187 | "vdev=%s complete", vd->vdev_path); | |
188 | break; | |
e34e15ed BB |
189 | case VDEV_INITIALIZE_NONE: |
190 | spa_history_log_internal(spa, "uninitialize", tx, | |
191 | "vdev=%s", vd->vdev_path); | |
192 | break; | |
619f0976 GW |
193 | default: |
194 | panic("invalid state %llu", (unsigned long long)new_state); | |
195 | } | |
196 | ||
197 | dmu_tx_commit(tx); | |
e60e158e JG |
198 | |
199 | if (new_state != VDEV_INITIALIZE_ACTIVE) | |
200 | spa_notify_waiters(spa); | |
619f0976 GW |
201 | } |
202 | ||
203 | static void | |
204 | vdev_initialize_cb(zio_t *zio) | |
205 | { | |
206 | vdev_t *vd = zio->io_vd; | |
207 | mutex_enter(&vd->vdev_initialize_io_lock); | |
208 | if (zio->io_error == ENXIO && !vdev_writeable(vd)) { | |
209 | /* | |
210 | * The I/O failed because the vdev was unavailable; roll the | |
211 | * last offset back. (This works because spa_sync waits on | |
212 | * spa_txg_zio before it runs sync tasks.) | |
213 | */ | |
214 | uint64_t *off = | |
215 | &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; | |
216 | *off = MIN(*off, zio->io_offset); | |
217 | } else { | |
218 | /* | |
219 | * Since initializing is best-effort, we ignore I/O errors and | |
220 | * rely on vdev_probe to determine if the errors are more | |
221 | * critical. | |
222 | */ | |
223 | if (zio->io_error != 0) | |
224 | vd->vdev_stat.vs_initialize_errors++; | |
225 | ||
226 | vd->vdev_initialize_bytes_done += zio->io_orig_size; | |
227 | } | |
228 | ASSERT3U(vd->vdev_initialize_inflight, >, 0); | |
229 | vd->vdev_initialize_inflight--; | |
230 | cv_broadcast(&vd->vdev_initialize_io_cv); | |
231 | mutex_exit(&vd->vdev_initialize_io_lock); | |
232 | ||
233 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
234 | } | |
235 | ||
236 | /* Takes care of physical writing and limiting # of concurrent ZIOs. */ | |
237 | static int | |
238 | vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) | |
239 | { | |
240 | spa_t *spa = vd->vdev_spa; | |
241 | ||
242 | /* Limit inflight initializing I/Os */ | |
243 | mutex_enter(&vd->vdev_initialize_io_lock); | |
244 | while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { | |
245 | cv_wait(&vd->vdev_initialize_io_cv, | |
246 | &vd->vdev_initialize_io_lock); | |
247 | } | |
248 | vd->vdev_initialize_inflight++; | |
249 | mutex_exit(&vd->vdev_initialize_io_lock); | |
250 | ||
251 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
252 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
253 | uint64_t txg = dmu_tx_get_txg(tx); | |
254 | ||
255 | spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); | |
256 | mutex_enter(&vd->vdev_initialize_lock); | |
257 | ||
258 | if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { | |
259 | uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); | |
260 | *guid = vd->vdev_guid; | |
261 | ||
262 | /* This is the first write of this txg. */ | |
263 | dsl_sync_task_nowait(spa_get_dsl(spa), | |
38080324 | 264 | vdev_initialize_zap_update_sync, guid, tx); |
619f0976 GW |
265 | } |
266 | ||
267 | /* | |
268 | * We know the vdev struct will still be around since all | |
269 | * consumers of vdev_free must stop the initialization first. | |
270 | */ | |
271 | if (vdev_initialize_should_stop(vd)) { | |
272 | mutex_enter(&vd->vdev_initialize_io_lock); | |
273 | ASSERT3U(vd->vdev_initialize_inflight, >, 0); | |
274 | vd->vdev_initialize_inflight--; | |
275 | mutex_exit(&vd->vdev_initialize_io_lock); | |
276 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
277 | mutex_exit(&vd->vdev_initialize_lock); | |
278 | dmu_tx_commit(tx); | |
279 | return (SET_ERROR(EINTR)); | |
280 | } | |
281 | mutex_exit(&vd->vdev_initialize_lock); | |
282 | ||
283 | vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; | |
284 | zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, | |
285 | size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, | |
286 | ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); | |
287 | /* vdev_initialize_cb releases SCL_STATE_ALL */ | |
288 | ||
289 | dmu_tx_commit(tx); | |
290 | ||
291 | return (0); | |
292 | } | |
293 | ||
619f0976 GW |
294 | /* |
295 | * Callback to fill each ABD chunk with zfs_initialize_value. len must be | |
296 | * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD | |
297 | * allocation will guarantee these for us. | |
298 | */ | |
619f0976 GW |
299 | static int |
300 | vdev_initialize_block_fill(void *buf, size_t len, void *unused) | |
301 | { | |
14e4e3cb AZ |
302 | (void) unused; |
303 | ||
619f0976 | 304 | ASSERT0(len % sizeof (uint64_t)); |
619f0976 GW |
305 | for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { |
306 | *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; | |
307 | } | |
619f0976 GW |
308 | return (0); |
309 | } | |
310 | ||
311 | static abd_t * | |
312 | vdev_initialize_block_alloc(void) | |
313 | { | |
314 | /* Allocate ABD for filler data */ | |
315 | abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); | |
316 | ||
317 | ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); | |
318 | (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, | |
319 | vdev_initialize_block_fill, NULL); | |
320 | ||
321 | return (data); | |
322 | } | |
323 | ||
324 | static void | |
325 | vdev_initialize_block_free(abd_t *data) | |
326 | { | |
327 | abd_free(data); | |
328 | } | |
329 | ||
330 | static int | |
331 | vdev_initialize_ranges(vdev_t *vd, abd_t *data) | |
332 | { | |
ca577779 PD |
333 | range_tree_t *rt = vd->vdev_initialize_tree; |
334 | zfs_btree_t *bt = &rt->rt_root; | |
335 | zfs_btree_index_t where; | |
619f0976 | 336 | |
ca577779 PD |
337 | for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; |
338 | rs = zfs_btree_next(bt, &where, &where)) { | |
339 | uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); | |
619f0976 GW |
340 | |
341 | /* Split range into legally-sized physical chunks */ | |
342 | uint64_t writes_required = | |
343 | ((size - 1) / zfs_initialize_chunk_size) + 1; | |
344 | ||
345 | for (uint64_t w = 0; w < writes_required; w++) { | |
346 | int error; | |
347 | ||
348 | error = vdev_initialize_write(vd, | |
ca577779 | 349 | VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + |
619f0976 GW |
350 | (w * zfs_initialize_chunk_size), |
351 | MIN(size - (w * zfs_initialize_chunk_size), | |
352 | zfs_initialize_chunk_size), data); | |
353 | if (error != 0) | |
354 | return (error); | |
355 | } | |
356 | } | |
357 | return (0); | |
358 | } | |
359 | ||
b2255edc BB |
360 | static void |
361 | vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) | |
362 | { | |
363 | uint64_t *last_rs_end = (uint64_t *)arg; | |
364 | ||
365 | if (physical_rs->rs_end > *last_rs_end) | |
366 | *last_rs_end = physical_rs->rs_end; | |
367 | } | |
368 | ||
369 | static void | |
370 | vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) | |
371 | { | |
372 | vdev_t *vd = (vdev_t *)arg; | |
373 | ||
374 | uint64_t size = physical_rs->rs_end - physical_rs->rs_start; | |
375 | vd->vdev_initialize_bytes_est += size; | |
376 | ||
377 | if (vd->vdev_initialize_last_offset > physical_rs->rs_end) { | |
378 | vd->vdev_initialize_bytes_done += size; | |
379 | } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start && | |
380 | vd->vdev_initialize_last_offset < physical_rs->rs_end) { | |
381 | vd->vdev_initialize_bytes_done += | |
382 | vd->vdev_initialize_last_offset - physical_rs->rs_start; | |
383 | } | |
384 | } | |
385 | ||
619f0976 GW |
386 | static void |
387 | vdev_initialize_calculate_progress(vdev_t *vd) | |
388 | { | |
389 | ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || | |
390 | spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); | |
391 | ASSERT(vd->vdev_leaf_zap != 0); | |
392 | ||
393 | vd->vdev_initialize_bytes_est = 0; | |
394 | vd->vdev_initialize_bytes_done = 0; | |
395 | ||
396 | for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { | |
397 | metaslab_t *msp = vd->vdev_top->vdev_ms[i]; | |
398 | mutex_enter(&msp->ms_lock); | |
399 | ||
b2255edc BB |
400 | uint64_t ms_free = (msp->ms_size - |
401 | metaslab_allocated_space(msp)) / | |
402 | vdev_get_ndisks(vd->vdev_top); | |
619f0976 GW |
403 | |
404 | /* | |
405 | * Convert the metaslab range to a physical range | |
406 | * on our vdev. We use this to determine if we are | |
407 | * in the middle of this metaslab range. | |
408 | */ | |
b2255edc | 409 | range_seg64_t logical_rs, physical_rs, remain_rs; |
619f0976 GW |
410 | logical_rs.rs_start = msp->ms_start; |
411 | logical_rs.rs_end = msp->ms_start + msp->ms_size; | |
619f0976 | 412 | |
b2255edc BB |
413 | /* Metaslab space after this offset has not been initialized */ |
414 | vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); | |
619f0976 GW |
415 | if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { |
416 | vd->vdev_initialize_bytes_est += ms_free; | |
417 | mutex_exit(&msp->ms_lock); | |
418 | continue; | |
b2255edc BB |
419 | } |
420 | ||
421 | /* Metaslab space before this offset has been initialized */ | |
422 | uint64_t last_rs_end = physical_rs.rs_end; | |
423 | if (!vdev_xlate_is_empty(&remain_rs)) { | |
424 | vdev_xlate_walk(vd, &remain_rs, | |
425 | vdev_initialize_xlate_last_rs_end, &last_rs_end); | |
426 | } | |
427 | ||
428 | if (vd->vdev_initialize_last_offset > last_rs_end) { | |
619f0976 GW |
429 | vd->vdev_initialize_bytes_done += ms_free; |
430 | vd->vdev_initialize_bytes_est += ms_free; | |
431 | mutex_exit(&msp->ms_lock); | |
432 | continue; | |
433 | } | |
434 | ||
435 | /* | |
436 | * If we get here, we're in the middle of initializing this | |
437 | * metaslab. Load it and walk the free tree for more accurate | |
438 | * progress estimation. | |
439 | */ | |
b194fab0 | 440 | VERIFY0(metaslab_load(msp)); |
619f0976 | 441 | |
ca577779 PD |
442 | zfs_btree_index_t where; |
443 | range_tree_t *rt = msp->ms_allocatable; | |
444 | for (range_seg_t *rs = | |
445 | zfs_btree_first(&rt->rt_root, &where); rs; | |
446 | rs = zfs_btree_next(&rt->rt_root, &where, | |
447 | &where)) { | |
448 | logical_rs.rs_start = rs_get_start(rs, rt); | |
449 | logical_rs.rs_end = rs_get_end(rs, rt); | |
b2255edc BB |
450 | |
451 | vdev_xlate_walk(vd, &logical_rs, | |
452 | vdev_initialize_xlate_progress, vd); | |
619f0976 GW |
453 | } |
454 | mutex_exit(&msp->ms_lock); | |
455 | } | |
456 | } | |
457 | ||
458 | static int | |
459 | vdev_initialize_load(vdev_t *vd) | |
460 | { | |
461 | int err = 0; | |
462 | ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || | |
463 | spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); | |
464 | ASSERT(vd->vdev_leaf_zap != 0); | |
465 | ||
466 | if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || | |
467 | vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { | |
468 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
469 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, | |
470 | sizeof (vd->vdev_initialize_last_offset), 1, | |
471 | &vd->vdev_initialize_last_offset); | |
472 | if (err == ENOENT) { | |
473 | vd->vdev_initialize_last_offset = 0; | |
474 | err = 0; | |
475 | } | |
476 | } | |
477 | ||
478 | vdev_initialize_calculate_progress(vd); | |
479 | return (err); | |
480 | } | |
481 | ||
59055a01 | 482 | static void |
b2255edc | 483 | vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) |
619f0976 GW |
484 | { |
485 | vdev_t *vd = arg; | |
619f0976 GW |
486 | |
487 | /* Only add segments that we have not visited yet */ | |
b2255edc | 488 | if (physical_rs->rs_end <= vd->vdev_initialize_last_offset) |
619f0976 GW |
489 | return; |
490 | ||
491 | /* Pick up where we left off mid-range. */ | |
b2255edc | 492 | if (vd->vdev_initialize_last_offset > physical_rs->rs_start) { |
619f0976 GW |
493 | zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " |
494 | "(%llu, %llu)", vd->vdev_path, | |
b2255edc BB |
495 | (u_longlong_t)physical_rs->rs_start, |
496 | (u_longlong_t)physical_rs->rs_end, | |
619f0976 | 497 | (u_longlong_t)vd->vdev_initialize_last_offset, |
b2255edc BB |
498 | (u_longlong_t)physical_rs->rs_end); |
499 | ASSERT3U(physical_rs->rs_end, >, | |
619f0976 | 500 | vd->vdev_initialize_last_offset); |
b2255edc | 501 | physical_rs->rs_start = vd->vdev_initialize_last_offset; |
619f0976 | 502 | } |
619f0976 | 503 | |
b2255edc BB |
504 | ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); |
505 | ||
506 | range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, | |
507 | physical_rs->rs_end - physical_rs->rs_start); | |
508 | } | |
509 | ||
510 | /* | |
511 | * Convert the logical range into a physical range and add it to our | |
512 | * avl tree. | |
513 | */ | |
514 | static void | |
515 | vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) | |
516 | { | |
517 | vdev_t *vd = arg; | |
518 | range_seg64_t logical_rs; | |
519 | logical_rs.rs_start = start; | |
520 | logical_rs.rs_end = start + size; | |
521 | ||
522 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
523 | vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); | |
619f0976 GW |
524 | } |
525 | ||
460748d4 | 526 | static __attribute__((noreturn)) void |
619f0976 GW |
527 | vdev_initialize_thread(void *arg) |
528 | { | |
529 | vdev_t *vd = arg; | |
530 | spa_t *spa = vd->vdev_spa; | |
531 | int error = 0; | |
532 | uint64_t ms_count = 0; | |
533 | ||
534 | ASSERT(vdev_is_concrete(vd)); | |
535 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
536 | ||
537 | vd->vdev_initialize_last_offset = 0; | |
538 | VERIFY0(vdev_initialize_load(vd)); | |
539 | ||
540 | abd_t *deadbeef = vdev_initialize_block_alloc(); | |
541 | ||
ca577779 PD |
542 | vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, |
543 | 0, 0); | |
619f0976 GW |
544 | |
545 | for (uint64_t i = 0; !vd->vdev_detached && | |
546 | i < vd->vdev_top->vdev_ms_count; i++) { | |
547 | metaslab_t *msp = vd->vdev_top->vdev_ms[i]; | |
f09fda50 | 548 | boolean_t unload_when_done = B_FALSE; |
619f0976 GW |
549 | |
550 | /* | |
551 | * If we've expanded the top-level vdev or it's our | |
552 | * first pass, calculate our progress. | |
553 | */ | |
554 | if (vd->vdev_top->vdev_ms_count != ms_count) { | |
555 | vdev_initialize_calculate_progress(vd); | |
556 | ms_count = vd->vdev_top->vdev_ms_count; | |
557 | } | |
558 | ||
1b939560 BB |
559 | spa_config_exit(spa, SCL_CONFIG, FTAG); |
560 | metaslab_disable(msp); | |
619f0976 | 561 | mutex_enter(&msp->ms_lock); |
f09fda50 PD |
562 | if (!msp->ms_loaded && !msp->ms_loading) |
563 | unload_when_done = B_TRUE; | |
b194fab0 | 564 | VERIFY0(metaslab_load(msp)); |
619f0976 GW |
565 | |
566 | range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, | |
567 | vd); | |
568 | mutex_exit(&msp->ms_lock); | |
569 | ||
619f0976 | 570 | error = vdev_initialize_ranges(vd, deadbeef); |
f09fda50 | 571 | metaslab_enable(msp, B_TRUE, unload_when_done); |
619f0976 GW |
572 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); |
573 | ||
574 | range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); | |
575 | if (error != 0) | |
576 | break; | |
577 | } | |
578 | ||
579 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
580 | mutex_enter(&vd->vdev_initialize_io_lock); | |
581 | while (vd->vdev_initialize_inflight > 0) { | |
582 | cv_wait(&vd->vdev_initialize_io_cv, | |
583 | &vd->vdev_initialize_io_lock); | |
584 | } | |
585 | mutex_exit(&vd->vdev_initialize_io_lock); | |
586 | ||
587 | range_tree_destroy(vd->vdev_initialize_tree); | |
588 | vdev_initialize_block_free(deadbeef); | |
589 | vd->vdev_initialize_tree = NULL; | |
590 | ||
591 | mutex_enter(&vd->vdev_initialize_lock); | |
bedbc13d S |
592 | if (!vd->vdev_initialize_exit_wanted) { |
593 | if (vdev_writeable(vd)) { | |
594 | vdev_initialize_change_state(vd, | |
595 | VDEV_INITIALIZE_COMPLETE); | |
596 | } else if (vd->vdev_faulted) { | |
597 | vdev_initialize_change_state(vd, | |
598 | VDEV_INITIALIZE_CANCELED); | |
599 | } | |
619f0976 GW |
600 | } |
601 | ASSERT(vd->vdev_initialize_thread != NULL || | |
602 | vd->vdev_initialize_inflight == 0); | |
603 | ||
604 | /* | |
605 | * Drop the vdev_initialize_lock while we sync out the | |
606 | * txg since it's possible that a device might be trying to | |
607 | * come online and must check to see if it needs to restart an | |
608 | * initialization. That thread will be holding the spa_config_lock | |
609 | * which would prevent the txg_wait_synced from completing. | |
610 | */ | |
611 | mutex_exit(&vd->vdev_initialize_lock); | |
612 | txg_wait_synced(spa_get_dsl(spa), 0); | |
613 | mutex_enter(&vd->vdev_initialize_lock); | |
614 | ||
615 | vd->vdev_initialize_thread = NULL; | |
616 | cv_broadcast(&vd->vdev_initialize_cv); | |
617 | mutex_exit(&vd->vdev_initialize_lock); | |
eeb8fae9 JL |
618 | |
619 | thread_exit(); | |
619f0976 GW |
620 | } |
621 | ||
622 | /* | |
623 | * Initiates a device. Caller must hold vdev_initialize_lock. | |
624 | * Device must be a leaf and not already be initializing. | |
625 | */ | |
626 | void | |
627 | vdev_initialize(vdev_t *vd) | |
628 | { | |
629 | ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); | |
630 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
631 | ASSERT(vdev_is_concrete(vd)); | |
632 | ASSERT3P(vd->vdev_initialize_thread, ==, NULL); | |
633 | ASSERT(!vd->vdev_detached); | |
634 | ASSERT(!vd->vdev_initialize_exit_wanted); | |
635 | ASSERT(!vd->vdev_top->vdev_removing); | |
5caeef02 | 636 | ASSERT(!vd->vdev_top->vdev_rz_expanding); |
619f0976 GW |
637 | |
638 | vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); | |
639 | vd->vdev_initialize_thread = thread_create(NULL, 0, | |
640 | vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); | |
641 | } | |
642 | ||
e34e15ed BB |
643 | /* |
644 | * Uninitializes a device. Caller must hold vdev_initialize_lock. | |
645 | * Device must be a leaf and not already be initializing. | |
646 | */ | |
647 | void | |
648 | vdev_uninitialize(vdev_t *vd) | |
649 | { | |
650 | ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); | |
651 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
652 | ASSERT(vdev_is_concrete(vd)); | |
653 | ASSERT3P(vd->vdev_initialize_thread, ==, NULL); | |
654 | ASSERT(!vd->vdev_detached); | |
655 | ASSERT(!vd->vdev_initialize_exit_wanted); | |
656 | ASSERT(!vd->vdev_top->vdev_removing); | |
657 | ||
658 | vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); | |
659 | } | |
660 | ||
619f0976 | 661 | /* |
c10d37dd GW |
662 | * Wait for the initialize thread to be terminated (cancelled or stopped). |
663 | */ | |
664 | static void | |
665 | vdev_initialize_stop_wait_impl(vdev_t *vd) | |
666 | { | |
667 | ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); | |
668 | ||
669 | while (vd->vdev_initialize_thread != NULL) | |
670 | cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); | |
671 | ||
672 | ASSERT3P(vd->vdev_initialize_thread, ==, NULL); | |
673 | vd->vdev_initialize_exit_wanted = B_FALSE; | |
674 | } | |
675 | ||
676 | /* | |
677 | * Wait for vdev initialize threads which were either to cleanly exit. | |
619f0976 GW |
678 | */ |
679 | void | |
c10d37dd | 680 | vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) |
619f0976 | 681 | { |
14e4e3cb | 682 | (void) spa; |
c10d37dd GW |
683 | vdev_t *vd; |
684 | ||
975a1325 DB |
685 | ASSERT(MUTEX_HELD(&spa_namespace_lock) || |
686 | spa->spa_export_thread == curthread); | |
c10d37dd GW |
687 | |
688 | while ((vd = list_remove_head(vd_list)) != NULL) { | |
689 | mutex_enter(&vd->vdev_initialize_lock); | |
690 | vdev_initialize_stop_wait_impl(vd); | |
691 | mutex_exit(&vd->vdev_initialize_lock); | |
692 | } | |
693 | } | |
619f0976 | 694 | |
c10d37dd | 695 | /* |
e1cfd73f | 696 | * Stop initializing a device, with the resultant initializing state being |
c10d37dd GW |
697 | * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when |
698 | * a list_t is provided the stopping vdev is inserted in to the list. Callers | |
699 | * are then required to call vdev_initialize_stop_wait() to block for all the | |
700 | * initialization threads to exit. The caller must hold vdev_initialize_lock | |
701 | * and must not be writing to the spa config, as the initializing thread may | |
702 | * try to enter the config as a reader before exiting. | |
703 | */ | |
704 | void | |
705 | vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, | |
706 | list_t *vd_list) | |
707 | { | |
708 | ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); | |
619f0976 GW |
709 | ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); |
710 | ASSERT(vd->vdev_ops->vdev_op_leaf); | |
711 | ASSERT(vdev_is_concrete(vd)); | |
712 | ||
713 | /* | |
714 | * Allow cancel requests to proceed even if the initialize thread | |
715 | * has stopped. | |
716 | */ | |
717 | if (vd->vdev_initialize_thread == NULL && | |
718 | tgt_state != VDEV_INITIALIZE_CANCELED) { | |
719 | return; | |
720 | } | |
721 | ||
722 | vdev_initialize_change_state(vd, tgt_state); | |
723 | vd->vdev_initialize_exit_wanted = B_TRUE; | |
619f0976 | 724 | |
c10d37dd GW |
725 | if (vd_list == NULL) { |
726 | vdev_initialize_stop_wait_impl(vd); | |
727 | } else { | |
975a1325 DB |
728 | ASSERT(MUTEX_HELD(&spa_namespace_lock) || |
729 | vd->vdev_spa->spa_export_thread == curthread); | |
c10d37dd GW |
730 | list_insert_tail(vd_list, vd); |
731 | } | |
619f0976 GW |
732 | } |
733 | ||
734 | static void | |
c10d37dd GW |
735 | vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state, |
736 | list_t *vd_list) | |
619f0976 GW |
737 | { |
738 | if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { | |
739 | mutex_enter(&vd->vdev_initialize_lock); | |
c10d37dd | 740 | vdev_initialize_stop(vd, tgt_state, vd_list); |
619f0976 GW |
741 | mutex_exit(&vd->vdev_initialize_lock); |
742 | return; | |
743 | } | |
744 | ||
745 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
c10d37dd GW |
746 | vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state, |
747 | vd_list); | |
619f0976 GW |
748 | } |
749 | } | |
750 | ||
751 | /* | |
752 | * Convenience function to stop initializing of a vdev tree and set all | |
753 | * initialize thread pointers to NULL. | |
754 | */ | |
755 | void | |
756 | vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) | |
757 | { | |
c10d37dd GW |
758 | spa_t *spa = vd->vdev_spa; |
759 | list_t vd_list; | |
760 | ||
975a1325 DB |
761 | ASSERT(MUTEX_HELD(&spa_namespace_lock) || |
762 | spa->spa_export_thread == curthread); | |
c10d37dd GW |
763 | |
764 | list_create(&vd_list, sizeof (vdev_t), | |
765 | offsetof(vdev_t, vdev_initialize_node)); | |
766 | ||
767 | vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list); | |
768 | vdev_initialize_stop_wait(spa, &vd_list); | |
619f0976 GW |
769 | |
770 | if (vd->vdev_spa->spa_sync_on) { | |
771 | /* Make sure that our state has been synced to disk */ | |
772 | txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); | |
773 | } | |
c10d37dd GW |
774 | |
775 | list_destroy(&vd_list); | |
619f0976 GW |
776 | } |
777 | ||
778 | void | |
779 | vdev_initialize_restart(vdev_t *vd) | |
780 | { | |
c183d164 GW |
781 | ASSERT(MUTEX_HELD(&spa_namespace_lock) || |
782 | vd->vdev_spa->spa_load_thread == curthread); | |
619f0976 GW |
783 | ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); |
784 | ||
785 | if (vd->vdev_leaf_zap != 0) { | |
786 | mutex_enter(&vd->vdev_initialize_lock); | |
787 | uint64_t initialize_state = VDEV_INITIALIZE_NONE; | |
788 | int err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
789 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, | |
790 | sizeof (initialize_state), 1, &initialize_state); | |
791 | ASSERT(err == 0 || err == ENOENT); | |
792 | vd->vdev_initialize_state = initialize_state; | |
793 | ||
794 | uint64_t timestamp = 0; | |
795 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
796 | vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, | |
797 | sizeof (timestamp), 1, ×tamp); | |
798 | ASSERT(err == 0 || err == ENOENT); | |
2c3a8370 | 799 | vd->vdev_initialize_action_time = timestamp; |
619f0976 | 800 | |
5caeef02 DB |
801 | if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || |
802 | vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { | |
619f0976 GW |
803 | /* load progress for reporting, but don't resume */ |
804 | VERIFY0(vdev_initialize_load(vd)); | |
805 | } else if (vd->vdev_initialize_state == | |
dd785b5b BB |
806 | VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && |
807 | !vd->vdev_top->vdev_removing && | |
5caeef02 | 808 | !vd->vdev_top->vdev_rz_expanding && |
dd785b5b | 809 | vd->vdev_initialize_thread == NULL) { |
619f0976 GW |
810 | vdev_initialize(vd); |
811 | } | |
812 | ||
813 | mutex_exit(&vd->vdev_initialize_lock); | |
814 | } | |
815 | ||
816 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
817 | vdev_initialize_restart(vd->vdev_child[i]); | |
818 | } | |
819 | } | |
820 | ||
619f0976 | 821 | EXPORT_SYMBOL(vdev_initialize); |
e34e15ed | 822 | EXPORT_SYMBOL(vdev_uninitialize); |
619f0976 | 823 | EXPORT_SYMBOL(vdev_initialize_stop); |
c10d37dd GW |
824 | EXPORT_SYMBOL(vdev_initialize_stop_all); |
825 | EXPORT_SYMBOL(vdev_initialize_stop_wait); | |
1b939560 | 826 | EXPORT_SYMBOL(vdev_initialize_restart); |
619f0976 | 827 | |
ab8d9c17 | 828 | ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW, |
619f0976 | 829 | "Value written during zpool initialize"); |
e60e158e | 830 | |
ab8d9c17 | 831 | ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW, |
e60e158e | 832 | "Size in bytes of writes by zpool initialize"); |