/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_initialize.h>

/*
 * Value that is written to disk during initialization.
 */
#ifdef _ILP32
static unsigned long zfs_initialize_value = 0xdeadbeefUL;
#else
static unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
#endif

/* maximum number of I/Os outstanding per leaf vdev */
static const int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
static unsigned long zfs_initialize_chunk_size = 1024 * 1024;

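/*
 * Check whether the initializing thread is being asked to exit early:
 * an exit was requested, the vdev is no longer writeable, it has been
 * detached, or its top-level vdev is being removed.
 */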
static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}

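/*
 * Sync task that persists the current initializing offset, action time,
 * and state for a leaf vdev into its ZAP in the MOS.
 */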
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}

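/*
 * Update the in-core initializing state of a leaf vdev, schedule
 * vdev_initialize_zap_update_sync() to persist it, and log the
 * transition to the pool history.
 */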
static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_initialize_state)
		return;

	/*
	 * Copy the vd's guid; it will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, then preserve the original start time.
	 */
	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
		vd->vdev_initialize_action_time = gethrestime_sec();
	}

	vdev_initializing_state_t old_state = vd->vdev_initialize_state;
	vd->vdev_initialize_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
	    guid, tx);

	switch (new_state) {
	case VDEV_INITIALIZE_ACTIVE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_SUSPENDED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_CANCELED:
		if (old_state == VDEV_INITIALIZE_ACTIVE ||
		    old_state == VDEV_INITIALIZE_SUSPENDED)
			spa_history_log_internal(spa, "initialize", tx,
			    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_COMPLETE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);

	if (new_state != VDEV_INITIALIZE_ACTIVE)
		spa_notify_waiters(spa);
}

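/*
 * Completion callback for initializing writes. On ENXIO from an
 * unavailable vdev, roll the recorded offset back so the range is
 * retried; otherwise tally errors and bytes written. Either way, wake
 * any thread waiting on the inflight I/O limit.
 */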
static void
vdev_initialize_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mutex_enter(&vd->vdev_initialize_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *off =
		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else {
		/*
		 * Since initializing is best-effort, we ignore I/O errors and
		 * rely on vdev_probe to determine if the errors are more
		 * critical.
		 */
		if (zio->io_error != 0)
			vd->vdev_stat.vs_initialize_errors++;

		vd->vdev_initialize_bytes_done += zio->io_orig_size;
	}
	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
	vd->vdev_initialize_inflight--;
	cv_broadcast(&vd->vdev_initialize_io_cv);
	mutex_exit(&vd->vdev_initialize_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
	(void) unused;

	ASSERT0(len % sizeof (uint64_t));
#ifdef _ILP32
	for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
		*(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#else
	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#endif
	return (0);
}

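/*
 * Allocate an ABD of zfs_initialize_chunk_size bytes and fill it with
 * the initialization pattern via vdev_initialize_block_fill().
 */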
static abd_t *
vdev_initialize_block_alloc(void)
{
	/* Allocate ABD for filler data */
	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
	    vdev_initialize_block_fill, NULL);

	return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}

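/*
 * Issue pattern writes for every range in vd->vdev_initialize_tree,
 * splitting each range into chunks of at most zfs_initialize_chunk_size
 * bytes.
 */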
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
	range_tree_t *rt = vd->vdev_initialize_tree;
	zfs_btree_t *bt = &rt->rt_root;
	zfs_btree_index_t where;

	for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
	    rs = zfs_btree_next(bt, &where, &where)) {
		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required =
		    ((size - 1) / zfs_initialize_chunk_size) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			error = vdev_initialize_write(vd,
			    VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
			    (w * zfs_initialize_chunk_size),
			    MIN(size - (w * zfs_initialize_chunk_size),
			    zfs_initialize_chunk_size), data);
			if (error != 0)
				return (error);
		}
	}
	return (0);
}

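/*
 * vdev_xlate_walk() callback: track the largest physical end offset seen
 * across the translated segments.
 */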
static void
vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
{
	uint64_t *last_rs_end = (uint64_t *)arg;

	if (physical_rs->rs_end > *last_rs_end)
		*last_rs_end = physical_rs->rs_end;
}

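/*
 * vdev_xlate_walk() callback: credit each translated physical segment to
 * the estimated total, and to bytes done for the portion that lies below
 * vdev_initialize_last_offset.
 */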
static void
vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
{
	vdev_t *vd = (vdev_t *)arg;

	uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
	vd->vdev_initialize_bytes_est += size;

	if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
		vd->vdev_initialize_bytes_done += size;
	} else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
	    vd->vdev_initialize_last_offset < physical_rs->rs_end) {
		vd->vdev_initialize_bytes_done +=
		    vd->vdev_initialize_last_offset - physical_rs->rs_start;
	}
}

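/*
 * Recompute vdev_initialize_bytes_est and vdev_initialize_bytes_done by
 * walking the top-level vdev's metaslabs and comparing their physical
 * ranges against vdev_initialize_last_offset.
 */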
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = (msp->ms_size -
		    metaslab_allocated_space(msp)) /
		    vdev_get_ndisks(vd->vdev_top);

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs, remain_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;

		/* Metaslab space after this offset has not been initialized */
		vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/* Metaslab space before this offset has been initialized */
		uint64_t last_rs_end = physical_rs.rs_end;
		if (!vdev_xlate_is_empty(&remain_rs)) {
			vdev_xlate_walk(vd, &remain_rs,
			    vdev_initialize_xlate_last_rs_end, &last_rs_end);
		}

		if (vd->vdev_initialize_last_offset > last_rs_end) {
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		zfs_btree_index_t where;
		range_tree_t *rt = msp->ms_allocatable;
		for (range_seg_t *rs =
		    zfs_btree_first(&rt->rt_root, &where); rs;
		    rs = zfs_btree_next(&rt->rt_root, &where,
		    &where)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);

			vdev_xlate_walk(vd, &logical_rs,
			    vdev_initialize_xlate_progress, vd);
		}
		mutex_exit(&msp->ms_lock);
	}
}

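/*
 * Load the persisted last offset from the leaf ZAP when resuming an
 * active or suspended initialization, then recalculate progress.
 */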
static int
vdev_initialize_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (vd->vdev_initialize_last_offset), 1,
		    &vd->vdev_initialize_last_offset);
		if (err == ENOENT) {
			vd->vdev_initialize_last_offset = 0;
			err = 0;
		}
	}

	vdev_initialize_calculate_progress(vd);
	return (err);
}

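/*
 * vdev_xlate_walk() callback: clip a translated physical segment against
 * vdev_initialize_last_offset and add the unvisited remainder to the
 * initialize tree.
 */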
static void
vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
	vdev_t *vd = arg;

	/* Only add segments that we have not visited yet */
	if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs->rs_start,
		    (u_longlong_t)physical_rs->rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs->rs_end);
		ASSERT3U(physical_rs->rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs->rs_start = vd->vdev_initialize_last_offset;
	}

	ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);

	range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
	    physical_rs->rs_end - physical_rs->rs_start);
}

/*
 * Convert the logical range into a physical range and add it to our
 * range tree.
 */
static void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg64_t logical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
}

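/*
 * Main loop of the initializing thread: for each metaslab of the
 * top-level vdev, collect its free ranges and overwrite them with the
 * initialization pattern, persisting progress along the way, then record
 * the final state before exiting.
 */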
static void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
	    0, 0);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		boolean_t unload_when_done = B_FALSE;

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_initialize_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		if (!msp->ms_loaded && !msp->ms_loading)
			unload_when_done = B_TRUE;
		VERIFY0(metaslab_load(msp));

		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
		    vd);
		mutex_exit(&msp->ms_lock);

		error = vdev_initialize_ranges(vd, deadbeef);
		metaslab_enable(msp, B_TRUE, unload_when_done);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight > 0) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	mutex_exit(&vd->vdev_initialize_io_lock);

	range_tree_destroy(vd->vdev_initialize_tree);
	vdev_initialize_block_free(deadbeef);
	vd->vdev_initialize_tree = NULL;

	mutex_enter(&vd->vdev_initialize_lock);
	if (!vd->vdev_initialize_exit_wanted) {
		if (vdev_writeable(vd)) {
			vdev_initialize_change_state(vd,
			    VDEV_INITIALIZE_COMPLETE);
		} else if (vd->vdev_faulted) {
			vdev_initialize_change_state(vd,
			    VDEV_INITIALIZE_CANCELED);
		}
	}
	ASSERT(vd->vdev_initialize_thread != NULL ||
	    vd->vdev_initialize_inflight == 0);

	/*
	 * Drop the vdev_initialize_lock while we sync out the
	 * txg since it's possible that a device might be trying to
	 * come online and must check to see if it needs to restart an
	 * initialization. That thread will be holding the spa_config_lock
	 * which would prevent the txg_wait_synced from completing.
	 */
	mutex_exit(&vd->vdev_initialize_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_initialize_lock);

	vd->vdev_initialize_thread = NULL;
	cv_broadcast(&vd->vdev_initialize_cv);
	mutex_exit(&vd->vdev_initialize_lock);

	thread_exit();
}

/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_initialize_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
	vd->vdev_initialize_thread = thread_create(NULL, 0,
	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}

/*
 * Wait for the initialize thread to be terminated (cancelled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

	while (vd->vdev_initialize_thread != NULL)
		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev initialize threads which were listed to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
	(void) spa;
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_initialize_lock);
	}
}

/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list. Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit. The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the initialize thread
	 * has stopped.
	 */
	if (vd->vdev_initialize_thread == NULL &&
	    tgt_state != VDEV_INITIALIZE_CANCELED) {
		return;
	}

	vdev_initialize_change_state(vd, tgt_state);
	vd->vdev_initialize_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_initialize_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}

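/*
 * Recursively walk a vdev tree, requesting that every concrete leaf stop
 * initializing with the given target state.
 */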
static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_initialize_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop initializing of a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_initialize_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

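/*
 * Restore the initializing state persisted in each leaf vdev's ZAP at
 * import or online time, and resume any initialization that was active.
 */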
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = timestamp;

		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
		    !vd->vdev_top->vdev_removing &&
		    vd->vdev_initialize_thread == NULL) {
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}

EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
EXPORT_SYMBOL(vdev_initialize_restart);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW,
	"Value written during zpool initialize");

ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW,
	"Size in bytes of writes by zpool initialize");
/* END CSTYLED */