]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. |
34dc7c2f BB |
23 | */ |
24 | ||
34dc7c2f BB |
25 | /* |
26 | * Routines to manage the on-disk persistent error log. | |
27 | * | |
28 | * Each pool stores a log of all logical data errors seen during normal | |
29 | * operation. This is actually the union of two distinct logs: the last log, | |
30 | * and the current log. All errors seen are logged to the current log. When a | |
31 | * scrub completes, the current log becomes the last log, the last log is thrown | |
32 | * out, and the current log is reinitialized. This way, if an error is somehow | |
33 | * corrected, a new scrub will show that it no longer exists, and will be | |
34 | * deleted from the log when the scrub completes. | |
35 | * | |
36 | * The log is stored using a ZAP object whose key is a string form of the | |
37 | * zbookmark tuple (objset, object, level, blkid), and whose contents is an | |
38 | * optional 'objset:object' human-readable string describing the data. When an | |
39 | * error is first logged, this string will be empty, indicating that no name is | |
40 | * known. This prevents us from having to issue a potentially large amount of | |
41 | * I/O to discover the object name during an error path. Instead, we do the | |
42 | * calculation when the data is requested, storing the result so future queries | |
43 | * will be faster. | |
44 | * | |
45 | * This log is then shipped into an nvlist where the key is the dataset name and | |
46 | * the value is the object name. Userland is then responsible for uniquifying | |
47 | * this list and displaying it to the user. | |
48 | */ | |
49 | ||
50 | #include <sys/dmu_tx.h> | |
51 | #include <sys/spa.h> | |
52 | #include <sys/spa_impl.h> | |
53 | #include <sys/zap.h> | |
54 | #include <sys/zio.h> | |
55 | ||
34dc7c2f BB |
56 | |
57 | /* | |
58 | * Convert a bookmark to a string. | |
59 | */ | |
60 | static void | |
61 | bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) | |
62 | { | |
63 | (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", | |
64 | (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, | |
65 | (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); | |
66 | } | |
67 | ||
/*
 * Parse a "objset:object:level:blkid" string back into a bookmark.
 *
 * Each field is read with strtonum(), which also reports where parsing
 * stopped.  The separators and the terminating NUL are only checked with
 * ASSERT(), so the string is expected to be well formed.
 */
#ifdef _KERNEL
static void
name_to_bookmark(char *buf, zbookmark_t *zb)
{
	char *cursor = buf;

	zb->zb_objset = strtonum(cursor, &cursor);
	ASSERT(*cursor == ':');
	cursor++;

	zb->zb_object = strtonum(cursor, &cursor);
	ASSERT(*cursor == ':');
	cursor++;

	zb->zb_level = (int)strtonum(cursor, &cursor);
	ASSERT(*cursor == ':');
	cursor++;

	zb->zb_blkid = strtonum(cursor, &cursor);
	ASSERT(*cursor == '\0');
}
#endif
85 | ||
86 | /* | |
87 | * Log an uncorrectable error to the persistent error log. We add it to the | |
88 | * spa's list of pending errors. The changes are actually synced out to disk | |
89 | * during spa_errlog_sync(). | |
90 | */ | |
91 | void | |
92 | spa_log_error(spa_t *spa, zio_t *zio) | |
93 | { | |
94 | zbookmark_t *zb = &zio->io_logical->io_bookmark; | |
95 | spa_error_entry_t search; | |
96 | spa_error_entry_t *new; | |
97 | avl_tree_t *tree; | |
98 | avl_index_t where; | |
99 | ||
100 | /* | |
101 | * If we are trying to import a pool, ignore any errors, as we won't be | |
102 | * writing to the pool any time soon. | |
103 | */ | |
428870ff | 104 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) |
34dc7c2f BB |
105 | return; |
106 | ||
107 | mutex_enter(&spa->spa_errlist_lock); | |
108 | ||
109 | /* | |
110 | * If we have had a request to rotate the log, log it to the next list | |
111 | * instead of the current one. | |
112 | */ | |
113 | if (spa->spa_scrub_active || spa->spa_scrub_finished) | |
114 | tree = &spa->spa_errlist_scrub; | |
115 | else | |
116 | tree = &spa->spa_errlist_last; | |
117 | ||
118 | search.se_bookmark = *zb; | |
119 | if (avl_find(tree, &search, &where) != NULL) { | |
120 | mutex_exit(&spa->spa_errlist_lock); | |
121 | return; | |
122 | } | |
123 | ||
124 | new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); | |
125 | new->se_bookmark = *zb; | |
126 | avl_insert(tree, new, where); | |
127 | ||
128 | mutex_exit(&spa->spa_errlist_lock); | |
129 | } | |
130 | ||
131 | /* | |
132 | * Return the number of errors currently in the error log. This is actually the | |
133 | * sum of both the last log and the current log, since we don't know the union | |
134 | * of these logs until we reach userland. | |
135 | */ | |
136 | uint64_t | |
137 | spa_get_errlog_size(spa_t *spa) | |
138 | { | |
139 | uint64_t total = 0, count; | |
140 | ||
141 | mutex_enter(&spa->spa_errlog_lock); | |
142 | if (spa->spa_errlog_scrub != 0 && | |
143 | zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, | |
144 | &count) == 0) | |
145 | total += count; | |
146 | ||
147 | if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && | |
148 | zap_count(spa->spa_meta_objset, spa->spa_errlog_last, | |
149 | &count) == 0) | |
150 | total += count; | |
151 | mutex_exit(&spa->spa_errlog_lock); | |
152 | ||
153 | mutex_enter(&spa->spa_errlist_lock); | |
154 | total += avl_numnodes(&spa->spa_errlist_last); | |
155 | total += avl_numnodes(&spa->spa_errlist_scrub); | |
156 | mutex_exit(&spa->spa_errlist_lock); | |
157 | ||
158 | return (total); | |
159 | } | |
160 | ||
161 | #ifdef _KERNEL | |
162 | static int | |
163 | process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) | |
164 | { | |
165 | zap_cursor_t zc; | |
166 | zap_attribute_t za; | |
167 | zbookmark_t zb; | |
168 | ||
169 | if (obj == 0) | |
170 | return (0); | |
171 | ||
172 | for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); | |
173 | zap_cursor_retrieve(&zc, &za) == 0; | |
174 | zap_cursor_advance(&zc)) { | |
175 | ||
176 | if (*count == 0) { | |
177 | zap_cursor_fini(&zc); | |
178 | return (ENOMEM); | |
179 | } | |
180 | ||
181 | name_to_bookmark(za.za_name, &zb); | |
182 | ||
183 | if (copyout(&zb, (char *)addr + | |
184 | (*count - 1) * sizeof (zbookmark_t), | |
185 | sizeof (zbookmark_t)) != 0) | |
186 | return (EFAULT); | |
187 | ||
188 | *count -= 1; | |
189 | } | |
190 | ||
191 | zap_cursor_fini(&zc); | |
192 | ||
193 | return (0); | |
194 | } | |
195 | ||
196 | static int | |
197 | process_error_list(avl_tree_t *list, void *addr, size_t *count) | |
198 | { | |
199 | spa_error_entry_t *se; | |
200 | ||
201 | for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { | |
202 | ||
203 | if (*count == 0) | |
204 | return (ENOMEM); | |
205 | ||
206 | if (copyout(&se->se_bookmark, (char *)addr + | |
207 | (*count - 1) * sizeof (zbookmark_t), | |
208 | sizeof (zbookmark_t)) != 0) | |
209 | return (EFAULT); | |
210 | ||
211 | *count -= 1; | |
212 | } | |
213 | ||
214 | return (0); | |
215 | } | |
216 | #endif | |
217 | ||
218 | /* | |
219 | * Copy all known errors to userland as an array of bookmarks. This is | |
220 | * actually a union of the on-disk last log and current log, as well as any | |
221 | * pending error requests. | |
222 | * | |
223 | * Because the act of reading the on-disk log could cause errors to be | |
224 | * generated, we have two separate locks: one for the error log and one for the | |
225 | * in-core error lists. We only need the error list lock to log and error, so | |
226 | * we grab the error log lock while we read the on-disk logs, and only pick up | |
227 | * the error list lock when we are finished. | |
228 | */ | |
229 | int | |
230 | spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) | |
231 | { | |
232 | int ret = 0; | |
233 | ||
234 | #ifdef _KERNEL | |
235 | mutex_enter(&spa->spa_errlog_lock); | |
236 | ||
237 | ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count); | |
238 | ||
239 | if (!ret && !spa->spa_scrub_finished) | |
240 | ret = process_error_log(spa, spa->spa_errlog_last, uaddr, | |
241 | count); | |
242 | ||
243 | mutex_enter(&spa->spa_errlist_lock); | |
244 | if (!ret) | |
245 | ret = process_error_list(&spa->spa_errlist_scrub, uaddr, | |
246 | count); | |
247 | if (!ret) | |
248 | ret = process_error_list(&spa->spa_errlist_last, uaddr, | |
249 | count); | |
250 | mutex_exit(&spa->spa_errlist_lock); | |
251 | ||
252 | mutex_exit(&spa->spa_errlog_lock); | |
253 | #endif | |
254 | ||
255 | return (ret); | |
256 | } | |
257 | ||
258 | /* | |
259 | * Called when a scrub completes. This simply set a bit which tells which AVL | |
260 | * tree to add new errors. spa_errlog_sync() is responsible for actually | |
261 | * syncing the changes to the underlying objects. | |
262 | */ | |
263 | void | |
264 | spa_errlog_rotate(spa_t *spa) | |
265 | { | |
266 | mutex_enter(&spa->spa_errlist_lock); | |
34dc7c2f | 267 | spa->spa_scrub_finished = B_TRUE; |
34dc7c2f BB |
268 | mutex_exit(&spa->spa_errlist_lock); |
269 | } | |
270 | ||
271 | /* | |
272 | * Discard any pending errors from the spa_t. Called when unloading a faulted | |
273 | * pool, as the errors encountered during the open cannot be synced to disk. | |
274 | */ | |
275 | void | |
276 | spa_errlog_drain(spa_t *spa) | |
277 | { | |
278 | spa_error_entry_t *se; | |
279 | void *cookie; | |
280 | ||
281 | mutex_enter(&spa->spa_errlist_lock); | |
282 | ||
283 | cookie = NULL; | |
284 | while ((se = avl_destroy_nodes(&spa->spa_errlist_last, | |
285 | &cookie)) != NULL) | |
286 | kmem_free(se, sizeof (spa_error_entry_t)); | |
287 | cookie = NULL; | |
288 | while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, | |
289 | &cookie)) != NULL) | |
290 | kmem_free(se, sizeof (spa_error_entry_t)); | |
291 | ||
292 | mutex_exit(&spa->spa_errlist_lock); | |
293 | } | |
294 | ||
295 | /* | |
296 | * Process a list of errors into the current on-disk log. | |
297 | */ | |
298 | static void | |
299 | sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) | |
300 | { | |
301 | spa_error_entry_t *se; | |
302 | char buf[64]; | |
303 | void *cookie; | |
304 | ||
305 | if (avl_numnodes(t) != 0) { | |
306 | /* create log if necessary */ | |
307 | if (*obj == 0) | |
308 | *obj = zap_create(spa->spa_meta_objset, | |
309 | DMU_OT_ERROR_LOG, DMU_OT_NONE, | |
310 | 0, tx); | |
311 | ||
312 | /* add errors to the current log */ | |
313 | for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { | |
314 | char *name = se->se_name ? se->se_name : ""; | |
315 | ||
316 | bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); | |
317 | ||
318 | (void) zap_update(spa->spa_meta_objset, | |
319 | *obj, buf, 1, strlen(name) + 1, name, tx); | |
320 | } | |
321 | ||
322 | /* purge the error list */ | |
323 | cookie = NULL; | |
324 | while ((se = avl_destroy_nodes(t, &cookie)) != NULL) | |
325 | kmem_free(se, sizeof (spa_error_entry_t)); | |
326 | } | |
327 | } | |
328 | ||
329 | /* | |
330 | * Sync the error log out to disk. This is a little tricky because the act of | |
331 | * writing the error log requires the spa_errlist_lock. So, we need to lock the | |
332 | * error lists, take a copy of the lists, and then reinitialize them. Then, we | |
333 | * drop the error list lock and take the error log lock, at which point we | |
334 | * do the errlog processing. Then, if we encounter an I/O error during this | |
335 | * process, we can successfully add the error to the list. Note that this will | |
336 | * result in the perpetual recycling of errors, but it is an unlikely situation | |
337 | * and not a performance critical operation. | |
338 | */ | |
339 | void | |
340 | spa_errlog_sync(spa_t *spa, uint64_t txg) | |
341 | { | |
342 | dmu_tx_t *tx; | |
343 | avl_tree_t scrub, last; | |
344 | int scrub_finished; | |
345 | ||
346 | mutex_enter(&spa->spa_errlist_lock); | |
347 | ||
348 | /* | |
349 | * Bail out early under normal circumstances. | |
350 | */ | |
351 | if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && | |
352 | avl_numnodes(&spa->spa_errlist_last) == 0 && | |
353 | !spa->spa_scrub_finished) { | |
354 | mutex_exit(&spa->spa_errlist_lock); | |
355 | return; | |
356 | } | |
357 | ||
358 | spa_get_errlists(spa, &last, &scrub); | |
359 | scrub_finished = spa->spa_scrub_finished; | |
360 | spa->spa_scrub_finished = B_FALSE; | |
361 | ||
362 | mutex_exit(&spa->spa_errlist_lock); | |
363 | mutex_enter(&spa->spa_errlog_lock); | |
364 | ||
365 | tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); | |
366 | ||
367 | /* | |
368 | * Sync out the current list of errors. | |
369 | */ | |
370 | sync_error_list(spa, &last, &spa->spa_errlog_last, tx); | |
371 | ||
372 | /* | |
373 | * Rotate the log if necessary. | |
374 | */ | |
375 | if (scrub_finished) { | |
376 | if (spa->spa_errlog_last != 0) | |
377 | VERIFY(dmu_object_free(spa->spa_meta_objset, | |
378 | spa->spa_errlog_last, tx) == 0); | |
379 | spa->spa_errlog_last = spa->spa_errlog_scrub; | |
380 | spa->spa_errlog_scrub = 0; | |
381 | ||
382 | sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); | |
383 | } | |
384 | ||
385 | /* | |
386 | * Sync out any pending scrub errors. | |
387 | */ | |
388 | sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); | |
389 | ||
390 | /* | |
391 | * Update the MOS to reflect the new values. | |
392 | */ | |
393 | (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
394 | DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, | |
395 | &spa->spa_errlog_last, tx); | |
396 | (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, | |
397 | DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, | |
398 | &spa->spa_errlog_scrub, tx); | |
399 | ||
400 | dmu_tx_commit(tx); | |
401 | ||
402 | mutex_exit(&spa->spa_errlog_lock); | |
403 | } |