]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
34dc7c2f BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. |
0409d332 | 23 | * Copyright (c) 2013, 2014, Delphix. All rights reserved. |
e8cf3a4f | 24 | * Copyright (c) 2019 Datto Inc. |
0c4064d9 | 25 | * Copyright (c) 2021, 2022, George Amanakis. All rights reserved. |
34dc7c2f BB |
26 | */ |
27 | ||
34dc7c2f BB |
28 | /* |
29 | * Routines to manage the on-disk persistent error log. | |
30 | * | |
31 | * Each pool stores a log of all logical data errors seen during normal | |
32 | * operation. This is actually the union of two distinct logs: the last log, | |
33 | * and the current log. All errors seen are logged to the current log. When a | |
34 | * scrub completes, the current log becomes the last log, the last log is thrown | |
35 | * out, and the current log is reinitialized. This way, if an error is somehow | |
e1cfd73f | 36 | * corrected, a new scrub will show that it no longer exists, and will be |
34dc7c2f BB |
37 | * deleted from the log when the scrub completes. |
38 | * | |
39 | * The log is stored using a ZAP object whose key is a string form of the | |
5dbd68a3 | 40 | * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an |
34dc7c2f BB |
41 | * optional 'objset:object' human-readable string describing the data. When an |
42 | * error is first logged, this string will be empty, indicating that no name is | |
43 | * known. This prevents us from having to issue a potentially large amount of | |
44 | * I/O to discover the object name during an error path. Instead, we do the | |
45 | * calculation when the data is requested, storing the result so future queries | |
46 | * will be faster. | |
47 | * | |
0409d332 GA |
48 | * If the head_errlog feature is enabled, a different on-disk format is used. |
49 | * The error log of each head dataset is stored separately in the zap object | |
50 | * and keyed by the head id. This enables listing every dataset affected in | |
51 | * userland. In order to be able to track whether an error block has been | |
52 | * modified or added to snapshots since it was marked as an error, a new tuple | |
53 | * is introduced: zbookmark_err_phys_t. It allows the storage of the birth | |
54 | * transaction group of an error block on-disk. The birth transaction group is | |
55 | * used by check_filesystem() to assess whether this block was freed, | |
56 | * re-written or added to a snapshot since its marking as an error. | |
57 | * | |
34dc7c2f BB |
58 | * This log is then shipped into an nvlist where the key is the dataset name and |
59 | * the value is the object name. Userland is then responsible for uniquifying | |
60 | * this list and displaying it to the user. | |
61 | */ | |
62 | ||
63 | #include <sys/dmu_tx.h> | |
64 | #include <sys/spa.h> | |
65 | #include <sys/spa_impl.h> | |
66 | #include <sys/zap.h> | |
67 | #include <sys/zio.h> | |
0409d332 GA |
68 | #include <sys/dsl_dir.h> |
69 | #include <sys/dmu_objset.h> | |
70 | #include <sys/dbuf.h> | |
0c4064d9 | 71 | #include <sys/zfs_znode.h> |
34dc7c2f | 72 | |
e8cf3a4f AP |
73 | #define NAME_MAX_LEN 64 |
74 | ||
0409d332 GA |
75 | /* |
76 | * spa_upgrade_errlog_limit : A zfs module parameter that controls the number | |
e8cf3a4f AP |
77 | * of on-disk error log entries that will be converted to the new |
78 | * format when enabling head_errlog. Defaults to 0 which converts | |
79 | * all log entries. | |
0409d332 | 80 | */ |
fdc2d303 | 81 | static uint_t spa_upgrade_errlog_limit = 0; |
34dc7c2f BB |
82 | |
83 | /* | |
84 | * Convert a bookmark to a string. | |
85 | */ | |
86 | static void | |
5dbd68a3 | 87 | bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) |
34dc7c2f BB |
88 | { |
89 | (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", | |
90 | (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, | |
91 | (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); | |
92 | } | |
93 | ||
94 | /* | |
0409d332 GA |
95 | * Convert an err_phys to a string. |
96 | */ | |
97 | static void | |
98 | errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) | |
99 | { | |
100 | (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", | |
101 | (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, | |
102 | (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); | |
103 | } | |
104 | ||
105 | /* | |
106 | * Convert a string to a err_phys. | |
107 | */ | |
108 | static void | |
109 | name_to_errphys(char *buf, zbookmark_err_phys_t *zep) | |
110 | { | |
111 | zep->zb_object = zfs_strtonum(buf, &buf); | |
112 | ASSERT(*buf == ':'); | |
113 | zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); | |
114 | ASSERT(*buf == ':'); | |
115 | zep->zb_blkid = zfs_strtonum(buf + 1, &buf); | |
116 | ASSERT(*buf == ':'); | |
117 | zep->zb_birth = zfs_strtonum(buf + 1, &buf); | |
118 | ASSERT(*buf == '\0'); | |
119 | } | |
120 | ||
121 | /* | |
122 | * Convert a string to a bookmark. | |
34dc7c2f | 123 | */ |
34dc7c2f | 124 | static void |
5dbd68a3 | 125 | name_to_bookmark(char *buf, zbookmark_phys_t *zb) |
34dc7c2f | 126 | { |
e19572e4 | 127 | zb->zb_objset = zfs_strtonum(buf, &buf); |
34dc7c2f | 128 | ASSERT(*buf == ':'); |
e19572e4 | 129 | zb->zb_object = zfs_strtonum(buf + 1, &buf); |
34dc7c2f | 130 | ASSERT(*buf == ':'); |
e19572e4 | 131 | zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); |
34dc7c2f | 132 | ASSERT(*buf == ':'); |
e19572e4 | 133 | zb->zb_blkid = zfs_strtonum(buf + 1, &buf); |
34dc7c2f BB |
134 | ASSERT(*buf == '\0'); |
135 | } | |
0409d332 GA |
136 | |
137 | #ifdef _KERNEL | |
138 | static void | |
139 | zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) | |
140 | { | |
141 | zb->zb_objset = dataset; | |
142 | zb->zb_object = zep->zb_object; | |
143 | zb->zb_level = zep->zb_level; | |
144 | zb->zb_blkid = zep->zb_blkid; | |
145 | } | |
34dc7c2f BB |
146 | #endif |
147 | ||
0409d332 GA |
148 | static void |
149 | name_to_object(char *buf, uint64_t *obj) | |
150 | { | |
151 | *obj = zfs_strtonum(buf, &buf); | |
152 | ASSERT(*buf == '\0'); | |
153 | } | |
154 | ||
155 | static int | |
156 | get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, | |
157 | uint64_t *head_dataset_id) | |
158 | { | |
159 | dsl_pool_t *dp = spa->spa_dsl_pool; | |
160 | dsl_dataset_t *ds; | |
161 | objset_t *os; | |
162 | ||
0409d332 GA |
163 | int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); |
164 | if (error != 0) { | |
0409d332 GA |
165 | return (error); |
166 | } | |
167 | ASSERT(head_dataset_id); | |
168 | *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; | |
169 | ||
170 | error = dmu_objset_from_ds(ds, &os); | |
171 | if (error != 0) { | |
172 | dsl_dataset_rele(ds, FTAG); | |
0409d332 GA |
173 | return (error); |
174 | } | |
175 | ||
0c4064d9 GA |
176 | /* |
177 | * If the key is not loaded dbuf_dnode_findbp() will error out with | |
178 | * EACCES. However in that case dnode_hold() will eventually call | |
179 | * dbuf_read()->zio_wait() which may call spa_log_error(). This will | |
180 | * lead to a deadlock due to us holding the mutex spa_errlist_lock. | |
181 | * Avoid this by checking here if the keys are loaded, if not return. | |
182 | * If the keys are not loaded the head_errlog feature is meaningless | |
183 | * as we cannot figure out the birth txg of the block pointer. | |
184 | */ | |
185 | if (dsl_dataset_get_keystatus(ds->ds_dir) == | |
186 | ZFS_KEYSTATUS_UNAVAILABLE) { | |
187 | zep->zb_birth = 0; | |
188 | dsl_dataset_rele(ds, FTAG); | |
0c4064d9 GA |
189 | return (0); |
190 | } | |
191 | ||
0409d332 GA |
192 | dnode_t *dn; |
193 | blkptr_t bp; | |
194 | ||
195 | error = dnode_hold(os, zep->zb_object, FTAG, &dn); | |
196 | if (error != 0) { | |
197 | dsl_dataset_rele(ds, FTAG); | |
0409d332 GA |
198 | return (error); |
199 | } | |
200 | ||
201 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
202 | error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, | |
203 | NULL); | |
0409d332 GA |
204 | if (error == 0 && BP_IS_HOLE(&bp)) |
205 | error = SET_ERROR(ENOENT); | |
206 | ||
0c4064d9 GA |
207 | /* |
208 | * If the key is loaded but the encrypted filesystem is unmounted when | |
209 | * a scrub is run, then dbuf_dnode_findbp() will still error out with | |
210 | * EACCES (possibly due to the key mapping being removed upon | |
211 | * unmounting). In that case the head_errlog feature is also | |
212 | * meaningless as we cannot figure out the birth txg of the block | |
213 | * pointer. | |
214 | */ | |
215 | if (error == EACCES) | |
216 | error = 0; | |
217 | else if (!error) | |
218 | zep->zb_birth = bp.blk_birth; | |
219 | ||
0409d332 GA |
220 | rw_exit(&dn->dn_struct_rwlock); |
221 | dnode_rele(dn, FTAG); | |
222 | dsl_dataset_rele(ds, FTAG); | |
0409d332 GA |
223 | return (error); |
224 | } | |
225 | ||
34dc7c2f BB |
226 | /* |
227 | * Log an uncorrectable error to the persistent error log. We add it to the | |
228 | * spa's list of pending errors. The changes are actually synced out to disk | |
229 | * during spa_errlog_sync(). | |
230 | */ | |
231 | void | |
b5256303 | 232 | spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) |
34dc7c2f | 233 | { |
34dc7c2f BB |
234 | spa_error_entry_t search; |
235 | spa_error_entry_t *new; | |
236 | avl_tree_t *tree; | |
237 | avl_index_t where; | |
238 | ||
239 | /* | |
240 | * If we are trying to import a pool, ignore any errors, as we won't be | |
241 | * writing to the pool any time soon. | |
242 | */ | |
428870ff | 243 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) |
34dc7c2f BB |
244 | return; |
245 | ||
246 | mutex_enter(&spa->spa_errlist_lock); | |
247 | ||
248 | /* | |
249 | * If we have had a request to rotate the log, log it to the next list | |
250 | * instead of the current one. | |
251 | */ | |
252 | if (spa->spa_scrub_active || spa->spa_scrub_finished) | |
253 | tree = &spa->spa_errlist_scrub; | |
254 | else | |
255 | tree = &spa->spa_errlist_last; | |
256 | ||
257 | search.se_bookmark = *zb; | |
258 | if (avl_find(tree, &search, &where) != NULL) { | |
259 | mutex_exit(&spa->spa_errlist_lock); | |
260 | return; | |
261 | } | |
262 | ||
263 | new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); | |
264 | new->se_bookmark = *zb; | |
265 | avl_insert(tree, new, where); | |
266 | ||
267 | mutex_exit(&spa->spa_errlist_lock); | |
268 | } | |
269 | ||
0409d332 GA |
270 | #ifdef _KERNEL |
271 | static int | |
272 | find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, | |
273 | uint64_t *birth_txg) | |
274 | { | |
275 | objset_t *os; | |
276 | int error = dmu_objset_from_ds(ds, &os); | |
277 | if (error != 0) | |
278 | return (error); | |
279 | ||
280 | dnode_t *dn; | |
281 | blkptr_t bp; | |
282 | ||
283 | error = dnode_hold(os, zep->zb_object, FTAG, &dn); | |
284 | if (error != 0) | |
285 | return (error); | |
286 | ||
287 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
288 | error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, | |
289 | NULL); | |
0409d332 GA |
290 | if (error == 0 && BP_IS_HOLE(&bp)) |
291 | error = SET_ERROR(ENOENT); | |
292 | ||
293 | *birth_txg = bp.blk_birth; | |
294 | rw_exit(&dn->dn_struct_rwlock); | |
295 | dnode_rele(dn, FTAG); | |
296 | return (error); | |
297 | } | |
298 | ||
299 | /* | |
018f2604 MA |
300 | * Copy the bookmark to the end of the user-space buffer which starts at |
301 | * uaddr and has *count unused entries, and decrement *count by 1. | |
302 | */ | |
303 | static int | |
304 | copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) | |
305 | { | |
306 | if (*count == 0) | |
307 | return (SET_ERROR(ENOMEM)); | |
308 | ||
309 | *count -= 1; | |
310 | if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t), | |
311 | sizeof (zbookmark_phys_t)) != 0) | |
312 | return (SET_ERROR(EFAULT)); | |
313 | return (0); | |
314 | } | |
315 | ||
316 | /* | |
317 | * Each time the error block is referenced by a snapshot or clone, add a | |
318 | * zbookmark_phys_t entry to the userspace array at uaddr. The array is | |
319 | * filled from the back and the in-out parameter *count is modified to be the | |
320 | * number of unused entries at the beginning of the array. | |
0409d332 GA |
321 | */ |
322 | static int | |
323 | check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, | |
018f2604 | 324 | void *uaddr, uint64_t *count) |
0409d332 GA |
325 | { |
326 | dsl_dataset_t *ds; | |
327 | dsl_pool_t *dp = spa->spa_dsl_pool; | |
328 | ||
329 | int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); | |
330 | if (error != 0) | |
331 | return (error); | |
332 | ||
333 | uint64_t latest_txg; | |
334 | uint64_t txg_to_consider = spa->spa_syncing_txg; | |
335 | boolean_t check_snapshot = B_TRUE; | |
336 | error = find_birth_txg(ds, zep, &latest_txg); | |
0c4064d9 GA |
337 | |
338 | /* | |
339 | * If we cannot figure out the current birth txg of the block pointer | |
340 | * error out. If the filesystem is encrypted and the key is not loaded | |
341 | * or the encrypted filesystem is not mounted the error will be EACCES. | |
342 | * In that case do not return an error. | |
343 | */ | |
344 | if (error == EACCES) { | |
345 | dsl_dataset_rele(ds, FTAG); | |
346 | return (0); | |
347 | } | |
348 | if (error) { | |
349 | dsl_dataset_rele(ds, FTAG); | |
350 | return (error); | |
351 | } | |
352 | if (zep->zb_birth == latest_txg) { | |
353 | /* Block neither free nor rewritten. */ | |
018f2604 MA |
354 | zbookmark_phys_t zb; |
355 | zep_to_zb(head_ds, zep, &zb); | |
356 | error = copyout_entry(&zb, uaddr, count); | |
357 | if (error != 0) { | |
358 | dsl_dataset_rele(ds, FTAG); | |
359 | return (error); | |
0409d332 | 360 | } |
0c4064d9 GA |
361 | check_snapshot = B_FALSE; |
362 | } else { | |
363 | ASSERT3U(zep->zb_birth, <, latest_txg); | |
364 | txg_to_consider = latest_txg; | |
0409d332 GA |
365 | } |
366 | ||
367 | /* How many snapshots reference this block. */ | |
368 | uint64_t snap_count; | |
369 | error = zap_count(spa->spa_meta_objset, | |
370 | dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); | |
371 | if (error != 0) { | |
372 | dsl_dataset_rele(ds, FTAG); | |
373 | return (error); | |
374 | } | |
375 | ||
376 | if (snap_count == 0) { | |
377 | /* File system has no snapshot. */ | |
378 | dsl_dataset_rele(ds, FTAG); | |
379 | return (0); | |
380 | } | |
381 | ||
382 | uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), | |
383 | KM_SLEEP); | |
384 | ||
385 | int aff_snap_count = 0; | |
386 | uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; | |
387 | uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; | |
388 | ||
389 | /* Check only snapshots created from this file system. */ | |
390 | while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && | |
391 | snap_obj_txg <= txg_to_consider) { | |
392 | ||
393 | dsl_dataset_rele(ds, FTAG); | |
394 | error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); | |
395 | if (error != 0) | |
396 | goto out; | |
397 | ||
398 | if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) | |
399 | break; | |
400 | ||
401 | boolean_t affected = B_TRUE; | |
402 | if (check_snapshot) { | |
403 | uint64_t blk_txg; | |
404 | error = find_birth_txg(ds, zep, &blk_txg); | |
405 | affected = (error == 0 && zep->zb_birth == blk_txg); | |
406 | } | |
407 | ||
408 | if (affected) { | |
409 | snap_obj_array[aff_snap_count] = snap_obj; | |
410 | aff_snap_count++; | |
411 | ||
018f2604 MA |
412 | zbookmark_phys_t zb; |
413 | zep_to_zb(snap_obj, zep, &zb); | |
414 | error = copyout_entry(&zb, uaddr, count); | |
415 | if (error != 0) { | |
416 | dsl_dataset_rele(ds, FTAG); | |
417 | goto out; | |
0409d332 GA |
418 | } |
419 | ||
420 | /* | |
421 | * Only clones whose origins were affected could also | |
422 | * have affected snapshots. | |
423 | */ | |
424 | zap_cursor_t zc; | |
425 | zap_attribute_t za; | |
426 | for (zap_cursor_init(&zc, spa->spa_meta_objset, | |
427 | dsl_dataset_phys(ds)->ds_next_clones_obj); | |
428 | zap_cursor_retrieve(&zc, &za) == 0; | |
429 | zap_cursor_advance(&zc)) { | |
430 | error = check_filesystem(spa, | |
018f2604 | 431 | za.za_first_integer, zep, uaddr, count); |
0409d332 GA |
432 | |
433 | if (error != 0) { | |
434 | zap_cursor_fini(&zc); | |
435 | goto out; | |
436 | } | |
437 | } | |
438 | zap_cursor_fini(&zc); | |
439 | } | |
440 | snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; | |
441 | snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; | |
442 | } | |
443 | dsl_dataset_rele(ds, FTAG); | |
444 | ||
445 | out: | |
446 | kmem_free(snap_obj_array, sizeof (*snap_obj_array)); | |
447 | return (error); | |
448 | } | |
449 | ||
450 | static int | |
451 | find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, | |
452 | uint64_t *top_affected_fs) | |
453 | { | |
454 | uint64_t oldest_dsobj; | |
455 | int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, | |
456 | &oldest_dsobj); | |
457 | if (error != 0) | |
458 | return (error); | |
459 | ||
460 | dsl_dataset_t *ds; | |
461 | error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, | |
462 | FTAG, &ds); | |
463 | if (error != 0) | |
464 | return (error); | |
465 | ||
466 | *top_affected_fs = | |
467 | dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; | |
468 | dsl_dataset_rele(ds, FTAG); | |
469 | return (0); | |
470 | } | |
471 | ||
472 | static int | |
473 | process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, | |
018f2604 | 474 | void *uaddr, uint64_t *count) |
0409d332 | 475 | { |
0c4064d9 GA |
476 | /* |
477 | * If the zb_birth is 0 it means we failed to retrieve the birth txg | |
478 | * of the block pointer. This happens when an encrypted filesystem is | |
479 | * not mounted or when the key is not loaded. Do not proceed to | |
480 | * check_filesystem(), instead do the accounting here. | |
481 | */ | |
482 | if (zep->zb_birth == 0) { | |
018f2604 MA |
483 | zbookmark_phys_t zb; |
484 | zep_to_zb(head_ds, zep, &zb); | |
485 | int error = copyout_entry(&zb, uaddr, count); | |
486 | if (error != 0) { | |
487 | return (error); | |
0c4064d9 GA |
488 | } |
489 | return (0); | |
490 | } | |
491 | ||
018f2604 | 492 | uint64_t top_affected_fs; |
0409d332 | 493 | int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); |
018f2604 MA |
494 | if (error == 0) { |
495 | error = check_filesystem(spa, top_affected_fs, zep, | |
496 | uaddr, count); | |
0409d332 | 497 | } |
0409d332 | 498 | |
018f2604 | 499 | return (error); |
0409d332 GA |
500 | } |
501 | #endif | |
502 | ||
e8cf3a4f AP |
503 | /* |
504 | * If a healed bookmark matches an entry in the error log we stash it in a tree | |
505 | * so that we can later remove the related log entries in sync context. | |
506 | */ | |
507 | static void | |
508 | spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb) | |
509 | { | |
510 | char name[NAME_MAX_LEN]; | |
511 | ||
512 | if (obj == 0) | |
513 | return; | |
514 | ||
515 | bookmark_to_name(healed_zb, name, sizeof (name)); | |
516 | mutex_enter(&spa->spa_errlog_lock); | |
517 | if (zap_contains(spa->spa_meta_objset, obj, name) == 0) { | |
518 | /* | |
519 | * Found an error matching healed zb, add zb to our | |
520 | * tree of healed errors | |
521 | */ | |
522 | avl_tree_t *tree = &spa->spa_errlist_healed; | |
523 | spa_error_entry_t search; | |
524 | spa_error_entry_t *new; | |
525 | avl_index_t where; | |
526 | search.se_bookmark = *healed_zb; | |
527 | mutex_enter(&spa->spa_errlist_lock); | |
528 | if (avl_find(tree, &search, &where) != NULL) { | |
529 | mutex_exit(&spa->spa_errlist_lock); | |
530 | mutex_exit(&spa->spa_errlog_lock); | |
531 | return; | |
532 | } | |
533 | new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); | |
534 | new->se_bookmark = *healed_zb; | |
535 | avl_insert(tree, new, where); | |
536 | mutex_exit(&spa->spa_errlist_lock); | |
537 | } | |
538 | mutex_exit(&spa->spa_errlog_lock); | |
539 | } | |
540 | ||
541 | /* | |
542 | * If this error exists in the given tree remove it. | |
543 | */ | |
544 | static void | |
545 | remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb) | |
546 | { | |
547 | spa_error_entry_t search, *found; | |
548 | avl_index_t where; | |
549 | ||
550 | mutex_enter(&spa->spa_errlist_lock); | |
551 | search.se_bookmark = *zb; | |
552 | if ((found = avl_find(t, &search, &where)) != NULL) { | |
553 | avl_remove(t, found); | |
554 | kmem_free(found, sizeof (spa_error_entry_t)); | |
555 | } | |
556 | mutex_exit(&spa->spa_errlist_lock); | |
557 | } | |
558 | ||
559 | ||
560 | /* | |
561 | * Removes all of the recv healed errors from both on-disk error logs | |
562 | */ | |
563 | static void | |
564 | spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) | |
565 | { | |
566 | char name[NAME_MAX_LEN]; | |
567 | spa_error_entry_t *se; | |
568 | void *cookie = NULL; | |
569 | ||
570 | ASSERT(MUTEX_HELD(&spa->spa_errlog_lock)); | |
571 | ||
572 | while ((se = avl_destroy_nodes(&spa->spa_errlist_healed, | |
573 | &cookie)) != NULL) { | |
574 | remove_error_from_list(spa, s, &se->se_bookmark); | |
575 | remove_error_from_list(spa, l, &se->se_bookmark); | |
576 | bookmark_to_name(&se->se_bookmark, name, sizeof (name)); | |
577 | kmem_free(se, sizeof (spa_error_entry_t)); | |
578 | (void) zap_remove(spa->spa_meta_objset, | |
579 | spa->spa_errlog_last, name, tx); | |
580 | (void) zap_remove(spa->spa_meta_objset, | |
581 | spa->spa_errlog_scrub, name, tx); | |
582 | } | |
583 | } | |
584 | ||
585 | /* | |
586 | * Stash away healed bookmarks to remove them from the on-disk error logs | |
587 | * later in spa_remove_healed_errors(). | |
588 | */ | |
589 | void | |
590 | spa_remove_error(spa_t *spa, zbookmark_phys_t *zb) | |
591 | { | |
592 | char name[NAME_MAX_LEN]; | |
593 | ||
594 | bookmark_to_name(zb, name, sizeof (name)); | |
595 | ||
596 | spa_add_healed_error(spa, spa->spa_errlog_last, zb); | |
597 | spa_add_healed_error(spa, spa->spa_errlog_scrub, zb); | |
598 | } | |
599 | ||
018f2604 MA |
600 | static uint64_t |
601 | approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj) | |
602 | { | |
603 | if (spa_err_obj == 0) | |
604 | return (0); | |
605 | uint64_t total = 0; | |
606 | ||
607 | zap_cursor_t zc; | |
608 | zap_attribute_t za; | |
609 | for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); | |
610 | zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { | |
611 | uint64_t count; | |
612 | if (zap_count(spa->spa_meta_objset, za.za_first_integer, | |
613 | &count) == 0) | |
614 | total += count; | |
615 | } | |
616 | zap_cursor_fini(&zc); | |
617 | return (total); | |
618 | } | |
619 | ||
34dc7c2f | 620 | /* |
018f2604 MA |
621 | * Return the approximate number of errors currently in the error log. This |
622 | * will be nonzero if there are some errors, but otherwise it may be more | |
623 | * or less than the number of entries returned by spa_get_errlog(). | |
34dc7c2f BB |
624 | */ |
625 | uint64_t | |
018f2604 | 626 | spa_approx_errlog_size(spa_t *spa) |
34dc7c2f | 627 | { |
0409d332 GA |
628 | uint64_t total = 0; |
629 | ||
630 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { | |
631 | mutex_enter(&spa->spa_errlog_lock); | |
632 | uint64_t count; | |
633 | if (spa->spa_errlog_scrub != 0 && | |
634 | zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, | |
635 | &count) == 0) | |
636 | total += count; | |
637 | ||
638 | if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && | |
639 | zap_count(spa->spa_meta_objset, spa->spa_errlog_last, | |
640 | &count) == 0) | |
641 | total += count; | |
642 | mutex_exit(&spa->spa_errlog_lock); | |
643 | ||
0409d332 | 644 | } else { |
0409d332 | 645 | mutex_enter(&spa->spa_errlog_lock); |
018f2604 MA |
646 | total += approx_errlog_size_impl(spa, spa->spa_errlog_last); |
647 | total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub); | |
0409d332 | 648 | mutex_exit(&spa->spa_errlog_lock); |
0409d332 | 649 | } |
018f2604 MA |
650 | mutex_enter(&spa->spa_errlist_lock); |
651 | total += avl_numnodes(&spa->spa_errlist_last); | |
652 | total += avl_numnodes(&spa->spa_errlist_scrub); | |
653 | mutex_exit(&spa->spa_errlist_lock); | |
0409d332 GA |
654 | return (total); |
655 | } | |
34dc7c2f | 656 | |
0409d332 GA |
657 | /* |
658 | * This function sweeps through an on-disk error log and stores all bookmarks | |
659 | * as error bookmarks in a new ZAP object. At the end we discard the old one, | |
660 | * and spa_update_errlog() will set the spa's on-disk error log to new ZAP | |
661 | * object. | |
662 | */ | |
663 | static void | |
664 | sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, | |
665 | dmu_tx_t *tx) | |
666 | { | |
667 | zap_cursor_t zc; | |
668 | zap_attribute_t za; | |
669 | zbookmark_phys_t zb; | |
670 | uint64_t count; | |
34dc7c2f | 671 | |
0409d332 GA |
672 | *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, |
673 | DMU_OT_NONE, 0, tx); | |
34dc7c2f | 674 | |
0409d332 GA |
675 | /* |
676 | * If we cannnot perform the upgrade we should clear the old on-disk | |
677 | * error logs. | |
678 | */ | |
679 | if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { | |
680 | VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); | |
681 | return; | |
682 | } | |
683 | ||
684 | for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); | |
685 | zap_cursor_retrieve(&zc, &za) == 0; | |
686 | zap_cursor_advance(&zc)) { | |
687 | if (spa_upgrade_errlog_limit != 0 && | |
688 | zc.zc_cd == spa_upgrade_errlog_limit) | |
689 | break; | |
690 | ||
691 | name_to_bookmark(za.za_name, &zb); | |
692 | ||
693 | zbookmark_err_phys_t zep; | |
694 | zep.zb_object = zb.zb_object; | |
695 | zep.zb_level = zb.zb_level; | |
696 | zep.zb_blkid = zb.zb_blkid; | |
0c4064d9 | 697 | zep.zb_birth = 0; |
0409d332 GA |
698 | |
699 | /* | |
700 | * We cannot use get_head_and_birth_txg() because it will | |
701 | * acquire the pool config lock, which we already have. In case | |
702 | * of an error we simply continue. | |
703 | */ | |
704 | uint64_t head_dataset_obj; | |
705 | dsl_pool_t *dp = spa->spa_dsl_pool; | |
706 | dsl_dataset_t *ds; | |
707 | objset_t *os; | |
708 | ||
709 | int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds); | |
710 | if (error != 0) | |
711 | continue; | |
712 | ||
713 | head_dataset_obj = | |
714 | dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; | |
715 | ||
716 | /* | |
717 | * The objset and the dnode are required for getting the block | |
718 | * pointer, which is used to determine if BP_IS_HOLE(). If | |
719 | * getting the objset or the dnode fails, do not create a | |
720 | * zap entry (presuming we know the dataset) as this may create | |
721 | * spurious errors that we cannot ever resolve. If an error is | |
722 | * truly persistent, it should re-appear after a scan. | |
723 | */ | |
724 | if (dmu_objset_from_ds(ds, &os) != 0) { | |
725 | dsl_dataset_rele(ds, FTAG); | |
726 | continue; | |
727 | } | |
728 | ||
729 | dnode_t *dn; | |
730 | blkptr_t bp; | |
731 | ||
732 | if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { | |
733 | dsl_dataset_rele(ds, FTAG); | |
734 | continue; | |
735 | } | |
736 | ||
737 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
738 | error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, | |
739 | NULL, NULL); | |
0c4064d9 GA |
740 | if (error == EACCES) |
741 | error = 0; | |
742 | else if (!error) | |
743 | zep.zb_birth = bp.blk_birth; | |
0409d332 | 744 | |
0409d332 GA |
745 | rw_exit(&dn->dn_struct_rwlock); |
746 | dnode_rele(dn, FTAG); | |
747 | dsl_dataset_rele(ds, FTAG); | |
748 | ||
749 | if (error != 0 || BP_IS_HOLE(&bp)) | |
750 | continue; | |
751 | ||
752 | uint64_t err_obj; | |
753 | error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, | |
754 | head_dataset_obj, &err_obj); | |
755 | ||
756 | if (error == ENOENT) { | |
757 | err_obj = zap_create(spa->spa_meta_objset, | |
758 | DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); | |
759 | ||
760 | (void) zap_update_int_key(spa->spa_meta_objset, | |
761 | *newobj, head_dataset_obj, err_obj, tx); | |
762 | } | |
763 | ||
764 | char buf[64]; | |
0409d332 GA |
765 | errphys_to_name(&zep, buf, sizeof (buf)); |
766 | ||
a926aab9 | 767 | const char *name = ""; |
0409d332 GA |
768 | (void) zap_update(spa->spa_meta_objset, err_obj, |
769 | buf, 1, strlen(name) + 1, name, tx); | |
770 | } | |
771 | zap_cursor_fini(&zc); | |
772 | ||
773 | VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); | |
774 | } | |
775 | ||
776 | void | |
777 | spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) | |
778 | { | |
779 | uint64_t newobj = 0; | |
780 | ||
781 | mutex_enter(&spa->spa_errlog_lock); | |
782 | if (spa->spa_errlog_last != 0) { | |
783 | sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); | |
784 | spa->spa_errlog_last = newobj; | |
785 | } | |
786 | ||
787 | if (spa->spa_errlog_scrub != 0) { | |
788 | sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); | |
789 | spa->spa_errlog_scrub = newobj; | |
790 | } | |
791 | mutex_exit(&spa->spa_errlog_lock); | |
34dc7c2f BB |
792 | } |
793 | ||
794 | #ifdef _KERNEL | |
0409d332 | 795 | /* |
018f2604 | 796 | * If an error block is shared by two datasets it will be counted twice. |
0409d332 | 797 | */ |
34dc7c2f | 798 | static int |
0409d332 | 799 | process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) |
34dc7c2f BB |
800 | { |
801 | zap_cursor_t zc; | |
802 | zap_attribute_t za; | |
34dc7c2f BB |
803 | |
804 | if (obj == 0) | |
805 | return (0); | |
806 | ||
0409d332 GA |
807 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { |
808 | for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); | |
809 | zap_cursor_retrieve(&zc, &za) == 0; | |
810 | zap_cursor_advance(&zc)) { | |
811 | if (*count == 0) { | |
812 | zap_cursor_fini(&zc); | |
813 | return (SET_ERROR(ENOMEM)); | |
814 | } | |
815 | ||
816 | zbookmark_phys_t zb; | |
817 | name_to_bookmark(za.za_name, &zb); | |
818 | ||
018f2604 MA |
819 | int error = copyout_entry(&zb, uaddr, count); |
820 | if (error != 0) { | |
0409d332 | 821 | zap_cursor_fini(&zc); |
018f2604 | 822 | return (error); |
0409d332 | 823 | } |
34dc7c2f | 824 | } |
0409d332 GA |
825 | zap_cursor_fini(&zc); |
826 | return (0); | |
827 | } | |
34dc7c2f | 828 | |
0409d332 GA |
829 | for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); |
830 | zap_cursor_retrieve(&zc, &za) == 0; | |
831 | zap_cursor_advance(&zc)) { | |
34dc7c2f | 832 | |
0409d332 GA |
833 | zap_cursor_t head_ds_cursor; |
834 | zap_attribute_t head_ds_attr; | |
835 | ||
836 | uint64_t head_ds_err_obj = za.za_first_integer; | |
837 | uint64_t head_ds; | |
838 | name_to_object(za.za_name, &head_ds); | |
839 | for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, | |
840 | head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, | |
841 | &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { | |
842 | ||
843 | zbookmark_err_phys_t head_ds_block; | |
844 | name_to_errphys(head_ds_attr.za_name, &head_ds_block); | |
845 | int error = process_error_block(spa, head_ds, | |
018f2604 | 846 | &head_ds_block, uaddr, count); |
0409d332 GA |
847 | |
848 | if (error != 0) { | |
849 | zap_cursor_fini(&head_ds_cursor); | |
850 | zap_cursor_fini(&zc); | |
851 | return (error); | |
852 | } | |
3a84951d | 853 | } |
0409d332 | 854 | zap_cursor_fini(&head_ds_cursor); |
34dc7c2f | 855 | } |
34dc7c2f | 856 | zap_cursor_fini(&zc); |
34dc7c2f BB |
857 | return (0); |
858 | } | |
859 | ||
860 | static int | |
0409d332 | 861 | process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) |
34dc7c2f BB |
862 | { |
863 | spa_error_entry_t *se; | |
864 | ||
0409d332 GA |
865 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { |
866 | for (se = avl_first(list); se != NULL; | |
867 | se = AVL_NEXT(list, se)) { | |
018f2604 MA |
868 | int error = |
869 | copyout_entry(&se->se_bookmark, uaddr, count); | |
870 | if (error != 0) { | |
871 | return (error); | |
872 | } | |
0409d332 GA |
873 | } |
874 | return (0); | |
34dc7c2f BB |
875 | } |
876 | ||
0409d332 GA |
877 | for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { |
878 | zbookmark_err_phys_t zep; | |
879 | zep.zb_object = se->se_bookmark.zb_object; | |
880 | zep.zb_level = se->se_bookmark.zb_level; | |
881 | zep.zb_blkid = se->se_bookmark.zb_blkid; | |
0c4064d9 | 882 | zep.zb_birth = 0; |
0409d332 GA |
883 | |
884 | uint64_t head_ds_obj; | |
885 | int error = get_head_and_birth_txg(spa, &zep, | |
886 | se->se_bookmark.zb_objset, &head_ds_obj); | |
0409d332 | 887 | |
0c4064d9 GA |
888 | if (!error) |
889 | error = process_error_block(spa, head_ds_obj, &zep, | |
018f2604 | 890 | uaddr, count); |
0c4064d9 | 891 | if (error) |
0409d332 GA |
892 | return (error); |
893 | } | |
34dc7c2f BB |
894 | return (0); |
895 | } | |
896 | #endif | |
897 | ||
898 | /* | |
899 | * Copy all known errors to userland as an array of bookmarks. This is | |
900 | * actually a union of the on-disk last log and current log, as well as any | |
901 | * pending error requests. | |
902 | * | |
903 | * Because the act of reading the on-disk log could cause errors to be | |
904 | * generated, we have two separate locks: one for the error log and one for the | |
905 | * in-core error lists. We only need the error list lock to log and error, so | |
906 | * we grab the error log lock while we read the on-disk logs, and only pick up | |
907 | * the error list lock when we are finished. | |
908 | */ | |
909 | int | |
0409d332 | 910 | spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) |
34dc7c2f BB |
911 | { |
912 | int ret = 0; | |
913 | ||
914 | #ifdef _KERNEL | |
018f2604 MA |
915 | /* |
916 | * The pool config lock is needed to hold a dataset_t via (among other | |
917 | * places) process_error_list() -> get_head_and_birth_txg(), and lock | |
918 | * ordering requires that we get it before the spa_errlog_lock. | |
919 | */ | |
920 | dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); | |
34dc7c2f BB |
921 | mutex_enter(&spa->spa_errlog_lock); |
922 | ||
923 | ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); | |
924 | ||
925 | if (!ret && !spa->spa_scrub_finished) | |
926 | ret = process_error_log(spa, spa->spa_errlog_last, uaddr, | |
927 | count); | |
928 | ||
929 | mutex_enter(&spa->spa_errlist_lock); | |
930 | if (!ret) | |
0409d332 | 931 | ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, |
34dc7c2f BB |
932 | count); |
933 | if (!ret) | |
0409d332 | 934 | ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, |
34dc7c2f BB |
935 | count); |
936 | mutex_exit(&spa->spa_errlist_lock); | |
937 | ||
938 | mutex_exit(&spa->spa_errlog_lock); | |
018f2604 | 939 | dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); |
14e4e3cb AZ |
940 | #else |
941 | (void) spa, (void) uaddr, (void) count; | |
34dc7c2f BB |
942 | #endif |
943 | ||
944 | return (ret); | |
945 | } | |
946 | ||
947 | /* | |
948 | * Called when a scrub completes. This simply set a bit which tells which AVL | |
949 | * tree to add new errors. spa_errlog_sync() is responsible for actually | |
950 | * syncing the changes to the underlying objects. | |
951 | */ | |
952 | void | |
953 | spa_errlog_rotate(spa_t *spa) | |
954 | { | |
955 | mutex_enter(&spa->spa_errlist_lock); | |
34dc7c2f | 956 | spa->spa_scrub_finished = B_TRUE; |
34dc7c2f BB |
957 | mutex_exit(&spa->spa_errlist_lock); |
958 | } | |
959 | ||
960 | /* | |
961 | * Discard any pending errors from the spa_t. Called when unloading a faulted | |
962 | * pool, as the errors encountered during the open cannot be synced to disk. | |
963 | */ | |
964 | void | |
965 | spa_errlog_drain(spa_t *spa) | |
966 | { | |
967 | spa_error_entry_t *se; | |
968 | void *cookie; | |
969 | ||
970 | mutex_enter(&spa->spa_errlist_lock); | |
971 | ||
972 | cookie = NULL; | |
973 | while ((se = avl_destroy_nodes(&spa->spa_errlist_last, | |
974 | &cookie)) != NULL) | |
975 | kmem_free(se, sizeof (spa_error_entry_t)); | |
976 | cookie = NULL; | |
977 | while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, | |
978 | &cookie)) != NULL) | |
979 | kmem_free(se, sizeof (spa_error_entry_t)); | |
980 | ||
981 | mutex_exit(&spa->spa_errlist_lock); | |
982 | } | |
983 | ||
984 | /* | |
985 | * Process a list of errors into the current on-disk log. | |
986 | */ | |
0409d332 | 987 | void |
34dc7c2f BB |
988 | sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) |
989 | { | |
990 | spa_error_entry_t *se; | |
e8cf3a4f | 991 | char buf[NAME_MAX_LEN]; |
34dc7c2f BB |
992 | void *cookie; |
993 | ||
0409d332 GA |
994 | if (avl_numnodes(t) == 0) |
995 | return; | |
996 | ||
997 | /* create log if necessary */ | |
998 | if (*obj == 0) | |
999 | *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, | |
1000 | DMU_OT_NONE, 0, tx); | |
34dc7c2f | 1001 | |
0409d332 GA |
1002 | /* add errors to the current log */ |
1003 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { | |
34dc7c2f | 1004 | for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { |
34dc7c2f BB |
1005 | bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); |
1006 | ||
a926aab9 | 1007 | const char *name = se->se_name ? se->se_name : ""; |
0409d332 GA |
1008 | (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, |
1009 | strlen(name) + 1, name, tx); | |
1010 | } | |
1011 | } else { | |
1012 | for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { | |
0409d332 GA |
1013 | zbookmark_err_phys_t zep; |
1014 | zep.zb_object = se->se_bookmark.zb_object; | |
1015 | zep.zb_level = se->se_bookmark.zb_level; | |
1016 | zep.zb_blkid = se->se_bookmark.zb_blkid; | |
0c4064d9 | 1017 | zep.zb_birth = 0; |
0409d332 GA |
1018 | |
1019 | /* | |
1020 | * If we cannot find out the head dataset and birth txg | |
1021 | * of the present error block, we simply continue. | |
1022 | * Reinserting that error block to the error lists, | |
1023 | * even if we are not syncing the final txg, results | |
1024 | * in duplicate posting of errors. | |
1025 | */ | |
1026 | uint64_t head_dataset_obj; | |
1027 | int error = get_head_and_birth_txg(spa, &zep, | |
1028 | se->se_bookmark.zb_objset, &head_dataset_obj); | |
0c4064d9 | 1029 | if (error) |
0409d332 GA |
1030 | continue; |
1031 | ||
1032 | uint64_t err_obj; | |
1033 | error = zap_lookup_int_key(spa->spa_meta_objset, | |
1034 | *obj, head_dataset_obj, &err_obj); | |
1035 | ||
1036 | if (error == ENOENT) { | |
1037 | err_obj = zap_create(spa->spa_meta_objset, | |
1038 | DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); | |
1039 | ||
1040 | (void) zap_update_int_key(spa->spa_meta_objset, | |
1041 | *obj, head_dataset_obj, err_obj, tx); | |
1042 | } | |
1043 | errphys_to_name(&zep, buf, sizeof (buf)); | |
1044 | ||
a926aab9 | 1045 | const char *name = se->se_name ? se->se_name : ""; |
34dc7c2f | 1046 | (void) zap_update(spa->spa_meta_objset, |
0409d332 | 1047 | err_obj, buf, 1, strlen(name) + 1, name, tx); |
34dc7c2f | 1048 | } |
0409d332 GA |
1049 | } |
1050 | /* purge the error list */ | |
1051 | cookie = NULL; | |
1052 | while ((se = avl_destroy_nodes(t, &cookie)) != NULL) | |
1053 | kmem_free(se, sizeof (spa_error_entry_t)); | |
1054 | } | |
34dc7c2f | 1055 | |
0409d332 GA |
1056 | static void |
1057 | delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) | |
1058 | { | |
1059 | if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { | |
1060 | zap_cursor_t zc; | |
1061 | zap_attribute_t za; | |
1062 | for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); | |
1063 | zap_cursor_retrieve(&zc, &za) == 0; | |
1064 | zap_cursor_advance(&zc)) { | |
1065 | VERIFY0(dmu_object_free(spa->spa_meta_objset, | |
1066 | za.za_first_integer, tx)); | |
1067 | } | |
1068 | zap_cursor_fini(&zc); | |
34dc7c2f | 1069 | } |
0409d332 | 1070 | VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); |
34dc7c2f BB |
1071 | } |
1072 | ||
1073 | /* | |
1074 | * Sync the error log out to disk. This is a little tricky because the act of | |
1075 | * writing the error log requires the spa_errlist_lock. So, we need to lock the | |
1076 | * error lists, take a copy of the lists, and then reinitialize them. Then, we | |
1077 | * drop the error list lock and take the error log lock, at which point we | |
1078 | * do the errlog processing. Then, if we encounter an I/O error during this | |
1079 | * process, we can successfully add the error to the list. Note that this will | |
1080 | * result in the perpetual recycling of errors, but it is an unlikely situation | |
1081 | * and not a performance critical operation. | |
1082 | */ | |
1083 | void | |
1084 | spa_errlog_sync(spa_t *spa, uint64_t txg) | |
1085 | { | |
1086 | dmu_tx_t *tx; | |
1087 | avl_tree_t scrub, last; | |
1088 | int scrub_finished; | |
1089 | ||
1090 | mutex_enter(&spa->spa_errlist_lock); | |
1091 | ||
1092 | /* | |
1093 | * Bail out early under normal circumstances. | |
1094 | */ | |
1095 | if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && | |
1096 | avl_numnodes(&spa->spa_errlist_last) == 0 && | |
e8cf3a4f | 1097 | avl_numnodes(&spa->spa_errlist_healed) == 0 && |
34dc7c2f BB |
1098 | !spa->spa_scrub_finished) { |
1099 | mutex_exit(&spa->spa_errlist_lock); | |
1100 | return; | |
1101 | } | |
1102 | ||
1103 | spa_get_errlists(spa, &last, &scrub); | |
1104 | scrub_finished = spa->spa_scrub_finished; | |
1105 | spa->spa_scrub_finished = B_FALSE; | |
1106 | ||
1107 | mutex_exit(&spa->spa_errlist_lock); | |
018f2604 MA |
1108 | |
1109 | /* | |
1110 | * The pool config lock is needed to hold a dataset_t via | |
1111 | * sync_error_list() -> get_head_and_birth_txg(), and lock ordering | |
1112 | * requires that we get it before the spa_errlog_lock. | |
1113 | */ | |
1114 | dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); | |
34dc7c2f BB |
1115 | mutex_enter(&spa->spa_errlog_lock); |
1116 | ||
1117 | tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); | |
1118 | ||
e8cf3a4f AP |
1119 | /* |
1120 | * Remove healed errors from errors. | |
1121 | */ | |
1122 | spa_remove_healed_errors(spa, &last, &scrub, tx); | |
1123 | ||
34dc7c2f BB |
1124 | /* |
1125 | * Sync out the current list of errors. | |
1126 | */ | |
1127 | sync_error_list(spa, &last, &spa->spa_errlog_last, tx); | |
1128 | ||
1129 | /* | |
1130 | * Rotate the log if necessary. | |
1131 | */ | |
1132 | if (scrub_finished) { | |
1133 | if (spa->spa_errlog_last != 0) | |
0409d332 | 1134 | delete_errlog(spa, spa->spa_errlog_last, tx); |
34dc7c2f BB |
1135 | spa->spa_errlog_last = spa->spa_errlog_scrub; |
1136 | spa->spa_errlog_scrub = 0; | |
1137 | ||
1138 | sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); | |
1139 | } | |
1140 | ||
1141 | /* | |
1142 | * Sync out any pending scrub errors. | |
1143 | */ | |
1144 | sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); | |
1145 | ||
1146 | /* | |
1147 | * Update the MOS to reflect the new values. | |
1148 | */ | |
1149 | (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
1150 | DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, | |
1151 | &spa->spa_errlog_last, tx); | |
1152 | (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
1153 | DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, | |
1154 | &spa->spa_errlog_scrub, tx); | |
1155 | ||
1156 | dmu_tx_commit(tx); | |
1157 | ||
1158 | mutex_exit(&spa->spa_errlog_lock); | |
018f2604 | 1159 | dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); |
34dc7c2f | 1160 | } |
c28b2279 | 1161 | |
0409d332 GA |
1162 | static void |
1163 | delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, | |
1164 | dmu_tx_t *tx) | |
1165 | { | |
1166 | if (spa_err_obj == 0) | |
1167 | return; | |
1168 | ||
1169 | zap_cursor_t zc; | |
1170 | zap_attribute_t za; | |
1171 | for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); | |
1172 | zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { | |
1173 | uint64_t head_ds; | |
1174 | name_to_object(za.za_name, &head_ds); | |
1175 | if (head_ds == ds) { | |
1176 | (void) zap_remove(spa->spa_meta_objset, spa_err_obj, | |
1177 | za.za_name, tx); | |
1178 | VERIFY0(dmu_object_free(spa->spa_meta_objset, | |
1179 | za.za_first_integer, tx)); | |
1180 | break; | |
1181 | } | |
1182 | } | |
1183 | zap_cursor_fini(&zc); | |
1184 | } | |
1185 | ||
1186 | void | |
1187 | spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) | |
1188 | { | |
1189 | mutex_enter(&spa->spa_errlog_lock); | |
1190 | delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); | |
1191 | delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); | |
1192 | mutex_exit(&spa->spa_errlog_lock); | |
1193 | } | |
1194 | ||
1195 | static int | |
1196 | find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, | |
1197 | uint64_t *txg) | |
1198 | { | |
1199 | dsl_dataset_t *ds; | |
1200 | dsl_pool_t *dp = spa->spa_dsl_pool; | |
1201 | ||
1202 | int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); | |
1203 | if (error != 0) | |
1204 | return (error); | |
1205 | ||
1206 | uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; | |
1207 | uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; | |
1208 | ||
1209 | while (prev_obj != 0) { | |
1210 | dsl_dataset_rele(ds, FTAG); | |
1211 | if ((error = dsl_dataset_hold_obj(dp, prev_obj, | |
1212 | FTAG, &ds)) == 0 && | |
1213 | dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) | |
1214 | break; | |
1215 | ||
1216 | if (error != 0) | |
1217 | return (error); | |
1218 | ||
1219 | prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; | |
1220 | prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; | |
1221 | } | |
1222 | dsl_dataset_rele(ds, FTAG); | |
1223 | ASSERT(prev_obj != 0); | |
1224 | *txg = prev_obj_txg; | |
1225 | return (0); | |
1226 | } | |
1227 | ||
1228 | static void | |
1229 | swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t | |
1230 | old_head, dmu_tx_t *tx) | |
1231 | { | |
1232 | if (spa_err_obj == 0) | |
1233 | return; | |
1234 | ||
1235 | uint64_t old_head_errlog; | |
1236 | int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, | |
1237 | old_head, &old_head_errlog); | |
1238 | ||
1239 | /* If no error log, then there is nothing to do. */ | |
1240 | if (error != 0) | |
1241 | return; | |
1242 | ||
1243 | uint64_t txg; | |
1244 | error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); | |
1245 | if (error != 0) | |
1246 | return; | |
1247 | ||
1248 | /* | |
1249 | * Create an error log if the file system being promoted does not | |
1250 | * already have one. | |
1251 | */ | |
1252 | uint64_t new_head_errlog; | |
1253 | error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, | |
1254 | &new_head_errlog); | |
1255 | ||
1256 | if (error != 0) { | |
1257 | new_head_errlog = zap_create(spa->spa_meta_objset, | |
1258 | DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); | |
1259 | ||
1260 | (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, | |
1261 | new_head, new_head_errlog, tx); | |
1262 | } | |
1263 | ||
1264 | zap_cursor_t zc; | |
1265 | zap_attribute_t za; | |
1266 | zbookmark_err_phys_t err_block; | |
1267 | for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); | |
1268 | zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { | |
1269 | ||
a926aab9 | 1270 | const char *name = ""; |
0409d332 GA |
1271 | name_to_errphys(za.za_name, &err_block); |
1272 | if (err_block.zb_birth < txg) { | |
1273 | (void) zap_update(spa->spa_meta_objset, new_head_errlog, | |
1274 | za.za_name, 1, strlen(name) + 1, name, tx); | |
1275 | ||
1276 | (void) zap_remove(spa->spa_meta_objset, old_head_errlog, | |
1277 | za.za_name, tx); | |
1278 | } | |
1279 | } | |
1280 | zap_cursor_fini(&zc); | |
1281 | } | |
1282 | ||
1283 | void | |
1284 | spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, | |
1285 | dmu_tx_t *tx) | |
1286 | { | |
1287 | mutex_enter(&spa->spa_errlog_lock); | |
1288 | swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); | |
1289 | swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); | |
1290 | mutex_exit(&spa->spa_errlog_lock); | |
1291 | } | |
1292 | ||
#ifdef _KERNEL
/* Error-log routines published with EXPORT_SYMBOL() in kernel builds. */
EXPORT_SYMBOL(spa_log_error);
EXPORT_SYMBOL(spa_approx_errlog_size);
EXPORT_SYMBOL(spa_get_errlog);
EXPORT_SYMBOL(spa_errlog_rotate);
EXPORT_SYMBOL(spa_errlog_drain);
EXPORT_SYMBOL(spa_errlog_sync);
EXPORT_SYMBOL(spa_get_errlists);
EXPORT_SYMBOL(spa_delete_dataset_errlog);
EXPORT_SYMBOL(spa_swap_errlog);
EXPORT_SYMBOL(sync_error_list);
EXPORT_SYMBOL(spa_upgrade_errlog);
#endif
0409d332 GA |
1307 | |
1308 | /* BEGIN CSTYLED */ | |
fdc2d303 | 1309 | ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW, |
0409d332 GA |
1310 | "Limit the number of errors which will be upgraded to the new " |
1311 | "on-disk error log when enabling head_errlog"); | |
1312 | /* END CSTYLED */ |