/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/refcount.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
 * Value that is written to disk during initialization.
 */
#ifdef _ILP32
unsigned long zfs_initialize_value = 0xdeadbeefUL;
#else
unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
#endif

/* maximum number of I/Os outstanding per leaf vdev */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
uint64_t zfs_initialize_chunk_size = 1024 * 1024;

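/*
 * Determine whether the initializing thread should terminate early, e.g.
 * because the vdev is no longer writeable or is being detached or removed.
 */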
static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}

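/*
 * Sync task that records the initializing state, action time, and last
 * offset for a vdev in its leaf ZAP.
 */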
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}

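/*
 * Record the new initializing state on disk and log the state change to
 * the pool history. The caller must hold vdev_initialize_lock.
 */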
static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_initialize_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, preserve the original start time.
	 */
	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
		vd->vdev_initialize_action_time = gethrestime_sec();
	}
	vd->vdev_initialize_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

	switch (new_state) {
	case VDEV_INITIALIZE_ACTIVE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_SUSPENDED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_CANCELED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_COMPLETE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);
}

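/*
 * Completion callback for initializing writes: account for completed bytes,
 * count errors, and roll the last offset back if the vdev became
 * unwriteable mid-write.
 */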
static void
vdev_initialize_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mutex_enter(&vd->vdev_initialize_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *off =
		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else {
		/*
		 * Since initializing is best-effort, we ignore I/O errors and
		 * rely on vdev_probe to determine if the errors are more
		 * critical.
		 */
		if (zio->io_error != 0)
			vd->vdev_stat.vs_initialize_errors++;

		vd->vdev_initialize_bytes_done += zio->io_orig_size;
	}
	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
	vd->vdev_initialize_inflight--;
	cv_broadcast(&vd->vdev_initialize_io_cv);
	mutex_exit(&vd->vdev_initialize_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
	ASSERT0(len % sizeof (uint64_t));
#ifdef _ILP32
	for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
		*(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#else
	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#endif
	return (0);
}

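/* Allocate a chunk-sized ABD and fill it with the initializing pattern. */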
static abd_t *
vdev_initialize_block_alloc(void)
{
	/* Allocate ABD for filler data */
	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
	    vdev_initialize_block_fill, NULL);

	return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}

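/*
 * Issue initializing writes for every segment in the vdev's range tree,
 * splitting each segment into chunks of at most zfs_initialize_chunk_size.
 */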
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
	avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;

	for (range_seg_t *rs = avl_first(rt); rs != NULL;
	    rs = AVL_NEXT(rt, rs)) {
		uint64_t size = rs->rs_end - rs->rs_start;

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required =
		    ((size - 1) / zfs_initialize_chunk_size) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			error = vdev_initialize_write(vd,
			    VDEV_LABEL_START_SIZE + rs->rs_start +
			    (w * zfs_initialize_chunk_size),
			    MIN(size - (w * zfs_initialize_chunk_size),
			    zfs_initialize_chunk_size), data);
			if (error != 0)
				return (error);
		}
	}
	return (0);
}

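/*
 * Estimate the total and completed bytes for this vdev's initialization
 * by walking the metaslabs of its top-level vdev.
 */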
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_initialize_last_offset >
		    physical_rs.rs_end) {
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
			logical_rs.rs_start = rs->rs_start;
			logical_rs.rs_end = rs->rs_end;
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_initialize_bytes_est += size;
			if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done += size;
			} else if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_initialize_last_offset <
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done +=
				    vd->vdev_initialize_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}

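/*
 * Load the last initialized offset from the leaf vdev ZAP when resuming an
 * active or suspended initialization, then recompute progress estimates.
 */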
static int
vdev_initialize_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (vd->vdev_initialize_last_offset), 1,
		    &vd->vdev_initialize_last_offset);
		if (err == ENOENT) {
			vd->vdev_initialize_last_offset = 0;
			err = 0;
		}
	}

	vdev_initialize_calculate_progress(vd);
	return (err);
}

/*
 * Convert the logical range into a physical range and add it to our
 * avl tree.
 */
void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/* Only add segments that we have not visited yet */
	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs.rs_start,
		    (u_longlong_t)physical_rs.rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs.rs_end);
		ASSERT3U(physical_rs.rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs.rs_start = vd->vdev_initialize_last_offset;
	}
	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev's tree
	 * if it has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}

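/*
 * Body of the per-vdev initializing thread: walk every metaslab of the
 * top-level vdev, collect its free segments into a range tree, and
 * overwrite them with the initializing pattern until done or stopped.
 */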
static void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = range_tree_create(NULL, NULL);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_initialize_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));

		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
		    vd);
		mutex_exit(&msp->ms_lock);

		error = vdev_initialize_ranges(vd, deadbeef);
		metaslab_enable(msp, B_TRUE);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight > 0) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	mutex_exit(&vd->vdev_initialize_io_lock);

	range_tree_destroy(vd->vdev_initialize_tree);
	vdev_initialize_block_free(deadbeef);
	vd->vdev_initialize_tree = NULL;

	mutex_enter(&vd->vdev_initialize_lock);
	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
	}
	ASSERT(vd->vdev_initialize_thread != NULL ||
	    vd->vdev_initialize_inflight == 0);

	/*
	 * Drop the vdev_initialize_lock while we sync out the
	 * txg since it's possible that a device might be trying to
	 * come online and must check to see if it needs to restart an
	 * initialization. That thread will be holding the spa_config_lock
	 * which would prevent the txg_wait_synced from completing.
	 */
	mutex_exit(&vd->vdev_initialize_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_initialize_lock);

	vd->vdev_initialize_thread = NULL;
	cv_broadcast(&vd->vdev_initialize_cv);
	mutex_exit(&vd->vdev_initialize_lock);
}

/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_initialize_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
	vd->vdev_initialize_thread = thread_create(NULL, 0,
	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}

/*
 * Wait for the initialize thread to be terminated (canceled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

	while (vd->vdev_initialize_thread != NULL)
		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for the vdev initialize threads on the provided list to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_initialize_lock);
	}
}

/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list. Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit. The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the initialize thread
	 * has stopped.
	 */
	if (vd->vdev_initialize_thread == NULL &&
	    tgt_state != VDEV_INITIALIZE_CANCELED) {
		return;
	}

	vdev_initialize_change_state(vd, tgt_state);
	vd->vdev_initialize_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_initialize_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}

static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_initialize_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop initializing of a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_initialize_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

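/*
 * Restore the initializing state from the leaf vdev ZAPs and restart any
 * initialization that was previously active, recursing over all children.
 */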
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = (time_t)timestamp;

		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
		    !vd->vdev_top->vdev_removing &&
		    vd->vdev_initialize_thread == NULL) {
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}

#if defined(_KERNEL)
EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
EXPORT_SYMBOL(vdev_initialize_restart);

/* CSTYLED */
module_param(zfs_initialize_value, ulong, 0644);
MODULE_PARM_DESC(zfs_initialize_value,
    "Value written during zpool initialize");
#endif