/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/refcount.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
 * Maximum number of metaslabs per group that can be initialized
 * simultaneously.
 */
int max_initialize_ms = 3;

/*
 * Value that is written to disk during initialization.
 */
#ifdef _ILP32
unsigned long zfs_initialize_value = 0xdeadbeefUL;
#else
unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
#endif

/* maximum number of I/Os outstanding per leaf vdev */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
uint64_t zfs_initialize_chunk_size = 1024 * 1024;

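/*
 * Check whether the in-progress initialization should exit early: either
 * an exit was explicitly requested, or the vdev can no longer be
 * initialized (unwriteable, detached, or its top-level vdev is being
 * removed).
 */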
static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}

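/*
 * Sync task that persists the vdev's initialization state into its leaf
 * ZAP: the last offset written in this txg, the action time, and the
 * current initializing state.
 */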
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}

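/*
 * Transition the vdev to new_state and schedule a sync task to make the
 * change durable. The action time is only reset when we are not resuming
 * from a suspension, so a suspend/resume cycle keeps the original start
 * time. The transition is also recorded in the pool history.
 */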
static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_initialize_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, then preserve the original start time.
	 */
	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
		vd->vdev_initialize_action_time = gethrestime_sec();
	}
	vd->vdev_initialize_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);

	switch (new_state) {
	case VDEV_INITIALIZE_ACTIVE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_SUSPENDED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_CANCELED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_COMPLETE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);
}

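/*
 * Completion callback for initializing writes. If the vdev became
 * unwriteable, roll this txg's recorded offset back to the failed I/O so
 * the range will be retried; otherwise count errors and bytes done.
 * Always wakes any writer throttled on the inflight limit.
 */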
static void
vdev_initialize_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mutex_enter(&vd->vdev_initialize_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *off =
		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else {
		/*
		 * Since initializing is best-effort, we ignore I/O errors and
		 * rely on vdev_probe to determine if the errors are more
		 * critical.
		 */
		if (zio->io_error != 0)
			vd->vdev_stat.vs_initialize_errors++;

		vd->vdev_initialize_bytes_done += zio->io_orig_size;
	}
	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
	vd->vdev_initialize_inflight--;
	cv_broadcast(&vd->vdev_initialize_io_cv);
	mutex_exit(&vd->vdev_initialize_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Translate a logical range to the physical range for the specified vdev_t.
 * This function is initially called with a leaf vdev and will walk each
 * parent vdev until it reaches a top-level vdev. Once the top-level is
 * reached the physical range is initialized and the recursive function
 * begins to unwind. As it unwinds it calls the parent's vdev specific
 * translation function to do the real conversion.
 */
void
vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
{
	/*
	 * Walk up the vdev tree
	 */
	if (vd != vd->vdev_top) {
		vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
	} else {
		/*
		 * We've reached the top-level vdev, initialize the
		 * physical range to the logical range and start to
		 * unwind.
		 */
		physical_rs->rs_start = logical_rs->rs_start;
		physical_rs->rs_end = logical_rs->rs_end;
		return;
	}

	vdev_t *pvd = vd->vdev_parent;
	ASSERT3P(pvd, !=, NULL);
	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);

	/*
	 * As this recursive function unwinds, translate the logical
	 * range into its physical components by calling the
	 * vdev specific translate function.
	 */
	range_seg_t intermediate = { { { 0, 0 } } };
	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);

	physical_rs->rs_start = intermediate.rs_start;
	physical_rs->rs_end = intermediate.rs_end;
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
	ASSERT0(len % sizeof (uint64_t));
#ifdef _ILP32
	for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
		*(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#else
	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#endif
	return (0);
}

static abd_t *
vdev_initialize_block_alloc(void)
{
	/* Allocate ABD for filler data */
	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
	    vdev_initialize_block_fill, NULL);

	return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}

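/*
 * Issue initializing writes for every segment in the vdev's initialize
 * tree, splitting each segment into chunks no larger than
 * zfs_initialize_chunk_size. Returns nonzero if a write fails or the
 * initialization is interrupted.
 */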
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
	avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;

	for (range_seg_t *rs = avl_first(rt); rs != NULL;
	    rs = AVL_NEXT(rt, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required =
		    ((size - 1) / zfs_initialize_chunk_size) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			error = vdev_initialize_write(vd,
			    VDEV_LABEL_START_SIZE + rs->rs_start +
			    (w * zfs_initialize_chunk_size),
			    MIN(size - (w * zfs_initialize_chunk_size),
			    zfs_initialize_chunk_size), data);
			if (error != 0)
				return (error);
		}
	}
	return (0);
}

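/*
 * Wait until no other thread is updating the count of initializing
 * metaslabs for this metaslab group.
 */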
static void
vdev_initialize_mg_wait(metaslab_group_t *mg)
{
	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
	while (mg->mg_initialize_updating) {
		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
	}
}

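/*
 * Reserve a slot for another initializing metaslab in this group,
 * blocking while the group is already at max_initialize_ms.
 */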
static void
vdev_initialize_mg_mark(metaslab_group_t *mg)
{
	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
	ASSERT(mg->mg_initialize_updating);

	while (mg->mg_ms_initializing >= max_initialize_ms) {
		cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
	}
	mg->mg_ms_initializing++;
	ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
}

/*
 * Mark the metaslab as being initialized to prevent any allocations
 * on this metaslab. We must also track how many metaslabs are currently
 * being initialized within a metaslab group and limit them to prevent
 * allocation failures from occurring because all metaslabs are being
 * initialized.
 */
static void
vdev_initialize_ms_mark(metaslab_t *msp)
{
	ASSERT(!MUTEX_HELD(&msp->ms_lock));
	metaslab_group_t *mg = msp->ms_group;

	mutex_enter(&mg->mg_ms_initialize_lock);

	/*
	 * To keep an accurate count of how many threads are initializing
	 * a specific metaslab group, we only allow one thread to mark
	 * the metaslab group at a time. This ensures that the value of
	 * ms_initializing will be accurate when we decide to mark a metaslab
	 * group as being initialized. To do this we force all other threads
	 * to wait until the metaslab group's mg_initialize_updating flag is
	 * no longer set.
	 */
	vdev_initialize_mg_wait(mg);
	mg->mg_initialize_updating = B_TRUE;
	if (msp->ms_initializing == 0) {
		vdev_initialize_mg_mark(mg);
	}
	mutex_enter(&msp->ms_lock);
	msp->ms_initializing++;
	mutex_exit(&msp->ms_lock);

	mg->mg_initialize_updating = B_FALSE;
	cv_broadcast(&mg->mg_ms_initialize_cv);
	mutex_exit(&mg->mg_ms_initialize_lock);
}

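/*
 * Drop this thread's initializing mark on the metaslab; the last thread
 * to finish also releases the group's slot and wakes any waiters.
 */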
static void
vdev_initialize_ms_unmark(metaslab_t *msp)
{
	ASSERT(!MUTEX_HELD(&msp->ms_lock));
	metaslab_group_t *mg = msp->ms_group;
	mutex_enter(&mg->mg_ms_initialize_lock);
	mutex_enter(&msp->ms_lock);
	if (--msp->ms_initializing == 0) {
		mg->mg_ms_initializing--;
		cv_broadcast(&mg->mg_ms_initialize_cv);
	}
	mutex_exit(&msp->ms_lock);
	mutex_exit(&mg->mg_ms_initialize_lock);
}

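/*
 * Recompute vdev_initialize_bytes_est and vdev_initialize_bytes_done by
 * comparing the last synced offset against the physical extent of each
 * metaslab. Metaslabs that straddle the last offset are loaded so their
 * free trees can be walked for a more accurate estimate.
 */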
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    space_map_allocated(msp->ms_sm);

		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_initialize_last_offset >
		    physical_rs.rs_end) {
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
			logical_rs.rs_start = rs->rs_start;
			logical_rs.rs_end = rs->rs_end;
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_initialize_bytes_est += size;
			if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done += size;
			} else if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_initialize_last_offset <
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done +=
				    vd->vdev_initialize_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}

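/*
 * On resume, read the persisted last offset back from the leaf ZAP (a
 * missing entry simply means we start from offset 0), then recompute the
 * progress counters.
 */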
static int
vdev_initialize_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (vd->vdev_initialize_last_offset), 1,
		    &vd->vdev_initialize_last_offset);
		if (err == ENOENT) {
			vd->vdev_initialize_last_offset = 0;
			err = 0;
		}
	}

	vdev_initialize_calculate_progress(vd);
	return (err);
}

/*
 * Convert the logical range into a physical range and add it to our
 * AVL tree.
 */
void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/* Only add segments that we have not visited yet */
	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs.rs_start,
		    (u_longlong_t)physical_rs.rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs.rs_end);
		ASSERT3U(physical_rs.rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs.rs_start = vd->vdev_initialize_last_offset;
	}
	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev's tree
	 * if it has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}

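/*
 * Body of the per-vdev initializing thread. For each metaslab we mark it,
 * load it, gather its free ranges into vdev_initialize_tree, and write the
 * pattern over those ranges, dropping SCL_CONFIG while the writes are in
 * flight. On exit we wait for outstanding I/Os and sync our final state.
 */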
static void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = range_tree_create(NULL, NULL);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_initialize_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		vdev_initialize_ms_mark(msp);
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));

		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
		    vd);
		mutex_exit(&msp->ms_lock);

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		error = vdev_initialize_ranges(vd, deadbeef);
		vdev_initialize_ms_unmark(msp);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight > 0) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	mutex_exit(&vd->vdev_initialize_io_lock);

	range_tree_destroy(vd->vdev_initialize_tree);
	vdev_initialize_block_free(deadbeef);
	vd->vdev_initialize_tree = NULL;

	mutex_enter(&vd->vdev_initialize_lock);
	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
	}
	ASSERT(vd->vdev_initialize_thread != NULL ||
	    vd->vdev_initialize_inflight == 0);

	/*
	 * Drop the vdev_initialize_lock while we sync out the
	 * txg since it's possible that a device might be trying to
	 * come online and must check to see if it needs to restart an
	 * initialization. That thread will be holding the spa_config_lock
	 * which would prevent the txg_wait_synced from completing.
	 */
	mutex_exit(&vd->vdev_initialize_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_initialize_lock);

	vd->vdev_initialize_thread = NULL;
	cv_broadcast(&vd->vdev_initialize_cv);
	mutex_exit(&vd->vdev_initialize_lock);
}

/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_initialize_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
	vd->vdev_initialize_thread = thread_create(NULL, 0,
	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}

/*
 * Wait for the initialize thread to be terminated (cancelled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

	while (vd->vdev_initialize_thread != NULL)
		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev initialize threads which were listed to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_initialize_lock);
	}
}

/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list. Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit. The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the initialize thread
	 * has stopped.
	 */
	if (vd->vdev_initialize_thread == NULL &&
	    tgt_state != VDEV_INITIALIZE_CANCELED) {
		return;
	}

	vdev_initialize_change_state(vd, tgt_state);
	vd->vdev_initialize_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_initialize_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}

static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_initialize_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop initializing of a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_initialize_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

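/*
 * Restore initialization state when a vdev comes back (e.g. at pool import
 * or when a device is brought online): reload the persisted state and
 * action time from the leaf ZAP, resume an active initialization if the
 * vdev is writeable, or just load progress for a suspended or offline one.
 * Recurses over all children.
 */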
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = (time_t)timestamp;

		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}

#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_initialize_restart);
EXPORT_SYMBOL(vdev_xlate);
EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);

/* CSTYLED */
module_param(zfs_initialize_value, ulong, 0644);
MODULE_PARM_DESC(zfs_initialize_value,
	"Value written during zpool initialize");
#endif