/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2023, Klara Inc.
 */

#include <stddef.h>
#include <string.h>
#include <libuutil.h>
#include <libzfs.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>
#include <sys/zio.h>

#include "zfs_agents.h"
#include "fmd_api.h"

/*
 * Default values for the SERD engines used when processing checksum, io,
 * and slow io errors. The semantics are N <events> in T <seconds>.
 */
#define	DEFAULT_CHECKSUM_N	10	/* events */
#define	DEFAULT_CHECKSUM_T	600	/* seconds */
#define	DEFAULT_IO_N		10	/* events */
#define	DEFAULT_IO_T		600	/* seconds */
#define	DEFAULT_SLOW_IO_N	10	/* events */
#define	DEFAULT_SLOW_IO_T	30	/* seconds */
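
/*
 * For example, with the io defaults above, ten io ereports against the
 * same vdev within a 600-second window will fire that vdev's io SERD
 * engine; unless a device removal is detected first, the case is then
 * solved as fault.fs.zfs.vdev.io.
 */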

#define	CASE_GC_TIMEOUT_SECS	43200	/* 12 hours */

/*
 * Our serd engines are named in the following format:
 *     'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
 * This #define reserves enough space for two 64-bit hex values plus the
 * length of the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
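/*
 * For example, "zfs_1b2e3f4a5c6d7e8f_0123456789abcdef_checksum" fills the
 * buffer exactly, including the terminating NUL counted by the sizeof.
 */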

/*
 * On-disk case structure. This must maintain backwards compatibility with
 * previous versions of the DE. By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	char		zc_serd_slow_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
} zfs_case_data_t;

/*
 * Time-of-day
 */
typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_remove_timer;
	char		*zc_fru;
	er_timeval_t	zc_when;
} zfs_case_t;

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
} zfs_de_stats_t;

zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)" },
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)" },
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};

/* wait 15 seconds after a removal */
static hrtime_t zfs_remove_timeout = SEC2NSEC(15);

uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
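/*
 * For example, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO) expands to the class
 * string "ereport.fs.zfs.io" and ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED) to
 * "resource.fs.zfs.removed".
 */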

static void zfs_purge_cases(fmd_hdl_t *hdl);

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(zfs_case_t *zcp)
{
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
}

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}

/*
 * count other unique slow-io cases in a pool
 */
static uint_t
zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
{
	zfs_case_t *zcp;
	uint_t cases = 0;
	static hrtime_t next_check = 0;

	/*
	 * Note that plumbing in some external GC would require adding locking,
	 * since most of this module code is not thread safe and assumes there
	 * is only one thread running against the module. So we perform GC here
	 * inline periodically so that future delay induced faults will be
	 * possible once the issue causing multiple vdev delays is resolved.
	 */
	if (gethrestime_sec() > next_check) {
		/* Periodically purge old SERD entries and stale cases */
		fmd_serd_gc(hdl);
		zfs_purge_cases(hdl);
		next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
	}

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
		    zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
		    zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
		    fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
			cases++;
		}
	}
	return (cases);
}

/*
 * Iterate over any active cases. If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
{
	uint64_t vdev_guid = 0;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;

	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
			zcp->zc_present = B_TRUE;
			zcp->zc_when = *loaded;
		}
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}
}

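/*
 * zpool_iter() callback: mark any cases associated with this pool, then
 * walk the vdev tree (via zfs_mark_vdev()) to mark per-vdev cases and
 * record the pool's load time.
 */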
static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	(void) unused;
	zfs_case_t *zcp;
	uint64_t pool_guid;
	uint64_t *tod;
	er_timeval_t loaded = { 0 };
	nvlist_t *config, *vd;
	uint_t nelem = 0;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem);
	if (nelem == 2) {
		loaded.ertv_sec = tod[0];
		loaded.ertv_nsec = tod[1];
		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
		    zcp = uu_list_next(zfs_cases, zcp)) {
			if (zcp->zc_data.zc_pool_guid == pool_guid &&
			    zcp->zc_data.zc_vdev_guid == 0) {
				zcp->zc_when = loaded;
			}
		}
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	if (ret) {
		zpool_close(zhp);
		return (-1);
	}

	zfs_mark_vdev(pool_guid, vd, &loaded);

	zpool_close(zhp);

	return (0);
}

struct load_time_arg {
	uint64_t lt_guid;
	er_timeval_t *lt_time;
	boolean_t lt_found;
};

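/*
 * zpool_iter() callback: record the load time of the pool whose GUID
 * matches lt_guid.
 */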
static int
zpool_find_load_time(zpool_handle_t *zhp, void *arg)
{
	struct load_time_arg *lta = arg;
	uint64_t pool_guid;
	uint64_t *tod;
	nvlist_t *config;
	uint_t nelem;

	if (lta->lt_found) {
		zpool_close(zhp);
		return (0);
	}

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	if (pool_guid != lta->lt_guid) {
		zpool_close(zhp);
		return (0);
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem) == 0 && nelem == 2) {
		lta->lt_found = B_TRUE;
		lta->lt_time->ertv_sec = tod[0];
		lta->lt_time->ertv_nsec = tod[1];
	}

	zpool_close(zhp);

	return (0);
}

static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID. No
	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
	 * algorithm. In reality, both quantities are likely so small that
	 * neither will matter. Given that iterating over pools is more
	 * expensive than iterating over the in-memory case list, we opt for a
	 * 'present' flag in each case that starts off cleared. We then iterate
	 * over all pools, marking those that are still present, and removing
	 * those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark the cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found. If this
	 * fails (most probably because we're out of memory), then don't close
	 * any of the cases and we cannot be sure they are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type
 * (io, checksum, or slow_io).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
	    (long long unsigned int)pool_guid,
	    (long long unsigned int)vdev_guid, type);
}

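/*
 * Close a case without producing a fault diagnosis.
 */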
static void
zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
	fmd_hdl_debug(hdl, "retiring case");

	fmd_case_close(hdl, zcp->zc_case);
}

/*
 * Solve a given ZFS case. This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fru = NULL;

	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data. The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(zcp);

	nvlist_free(detector);
}

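/*
 * Return B_TRUE if timestamp 'a' is strictly earlier than timestamp 'b'.
 */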
static boolean_t
timeval_earlier(er_timeval_t *a, er_timeval_t *b)
{
	return (a->ertv_sec < b->ertv_sec ||
	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
}

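/*
 * Extract the time at which an ereport was posted (FM_EREPORT_TIME).
 * If the payload member is missing, fall back to the latest possible
 * time so the ereport is never mistaken for one predating the pool load.
 */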
static void
zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
{
	(void) hdl;
	int64_t *tod;
	uint_t nelem;

	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
	    &nelem) == 0 && nelem == 2) {
		when->ertv_sec = tod[0];
		when->ertv_nsec = tod[1];
	} else {
		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
	}
}

/*
 * Main fmd entry point.
 */
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	uint64_t checksum_n, checksum_t;
	uint64_t io_n, io_t;
	er_timeval_t pool_load;
	er_timeval_t er_when;
	nvlist_t *detector;
	boolean_t pool_found = B_FALSE;
	boolean_t isresource;
	const char *type;

	/*
	 * We subscribe to notifications for vdev or pool removal. In these
	 * situations, some open cases may no longer apply; purge them.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
		    strrchr(class, '.') + 1);
		zfs_purge_cases(hdl);
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (.pool) would result in import failure,
	 * and hence no persistent fault. Some day we may want to do something
	 * with these ereports, so we continue generating them internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT) {
		zfs_stats.import_drops.fmds_value.ui64++;
		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
		return;
	}

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
		zfs_stats.dev_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0) {
			zfs_stats.vdev_drops.fmds_value.ui64++;
			return;
		}
	}

	/*
	 * Determine if this ereport corresponds to an open case.
	 * Each vdev or pool can have a single case.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	zfs_ereport_when(hdl, nvl, &er_when);

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid) {
			pool_found = B_TRUE;
			pool_load = zcp->zc_when;
		}
		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
			break;
	}

	/*
	 * Avoid falsely accusing a pool of being faulty. Do so by
	 * not replaying ereports that were generated prior to the
	 * current import. If the failure that generated them was
	 * transient because the device was actually removed but we
	 * didn't receive the normal asynchronous notification, we
	 * don't want to mark it as faulted and potentially panic. If
	 * there is still a problem we'd expect not to be able to
	 * import the pool, or that new ereports will be generated
	 * once the pool is used.
	 */
	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
		fmd_hdl_debug(hdl, "ignoring pool %llx, "
		    "ereport time %lld.%lld, pool load time = %lld.%lld",
		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
		    pool_load.ertv_sec, pool_load.ertv_nsec);
		zfs_stats.old_drops.fmds_value.ui64++;
		return;
	}

	if (!pool_found) {
		/*
		 * Haven't yet seen this pool, but same situation
		 * may apply.
		 */
		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
		struct load_time_arg la;

		la.lt_guid = pool_guid;
		la.lt_time = &pool_load;
		la.lt_found = B_FALSE;

		if (zhdl != NULL &&
		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
		    la.lt_found == B_TRUE) {
			pool_found = B_TRUE;

			if (timeval_earlier(&er_when, &pool_load)) {
				fmd_hdl_debug(hdl, "ignoring pool %llx, "
				    "ereport time %lld.%lld, "
				    "pool load time = %lld.%lld",
				    pool_guid, er_when.ertv_sec,
				    er_when.ertv_nsec, pool_load.ertv_sec,
				    pool_load.ertv_nsec);
				zfs_stats.old_drops.fmds_value.ui64++;
				return;
			}
		}
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu",
			    class, vdev_guid);
			return;
		}

		/*
		 * Skip tracking some ereports
		 */
		if (strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			return;
		}

		/*
		 * Open a new case.
		 */
		cs = fmd_case_open(hdl, NULL);

		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
		    vdev_guid, class);

		/*
		 * Initialize the case buffer. To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);
		assert(zcp != NULL);
		if (pool_found)
			zcp->zc_when = pool_load;
	}

	if (isresource) {
		fmd_hdl_debug(hdl, "resource event '%s'", class);

		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set. In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously. If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engine, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
			if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_slow_io);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
			uint64_t state = 0;

			if (zcp != NULL &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
			    state == VDEV_STATE_HEALTHY) {
				fmd_hdl_debug(hdl, "closing case after a "
				    "device statechange to healthy");
				fmd_case_close(hdl, zcp->zc_case);
			}
		}
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	if (vdev_guid)
		fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
		    vdev_guid);
	else
		fmd_hdl_debug(hdl, "error event '%s'", class);

	/*
	 * Determine if we should solve the case and generate a fault. We solve
	 * a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.pool)
	 *	b. A device failed to open (ereport.fs.zfs.vdev.*) while a pool
	 *	   was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA. If the pool open succeeds, then
	 * we'll see no further ereports. To detect when a pool open has
	 * succeeded, we associate a timer with the event. When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault. Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay");
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		const char *failmode = NULL;
		boolean_t checkremove = B_FALSE;
		uint32_t pri = 0;
		int32_t flags = 0;

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated
		 * (persistent errors for a single data block, etc). For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
				    &io_n) != 0) {
					io_n = DEFAULT_IO_N;
				}
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
				    &io_t) != 0) {
					io_t = DEFAULT_IO_T;
				}
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    io_n, SEC2NSEC(io_t));
				zfs_case_serialize(zcp);
			}
			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
				checkremove = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
			uint64_t slow_io_n, slow_io_t;

			/*
			 * Create a slow io SERD engine when the VDEV has the
			 * 'vdev_slow_io_n' and 'vdev_slow_io_t' properties.
			 */
			if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
			    &slow_io_n) == 0 &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
			    &slow_io_t) == 0) {
				zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
				    pool_guid, vdev_guid, "slow_io");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_slow_io,
				    slow_io_n, SEC2NSEC(slow_io_t));
				zfs_case_serialize(zcp);
			}
			/* Pass event to SERD engine and see if this triggers */
			if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
			    fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
			    ep)) {
				/*
				 * Ignore a slow io diagnosis when other
				 * VDEVs in the pool show signs of being slow.
				 */
				if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
					zfs_case_retire(hdl, zcp);
					fmd_hdl_debug(hdl, "pool %llu has "
					    "multiple slow io cases -- skip "
					    "degrading vdev %llu",
					    (u_longlong_t)
					    zcp->zc_data.zc_pool_guid,
					    (u_longlong_t)
					    zcp->zc_data.zc_vdev_guid);
				} else {
					zfs_case_solve(hdl, zcp,
					    "fault.fs.zfs.vdev.slow_io");
				}
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			/*
			 * We ignore ereports for checksum errors generated by
			 * scrub/resilver I/O to avoid potentially further
			 * degrading the pool while it's being repaired.
			 */
			if (((nvlist_lookup_uint32(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) &&
			    (pri == ZIO_PRIORITY_SCRUB ||
			    pri == ZIO_PRIORITY_REBUILD)) ||
			    ((nvlist_lookup_int32(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) &&
			    (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) {
				fmd_hdl_debug(hdl, "ignoring '%s' for "
				    "scrub/resilver I/O", class);
				return;
			}

			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
				    &checksum_n) != 0) {
					checksum_n = DEFAULT_CHECKSUM_N;
				}
				if (nvlist_lookup_uint64(nvl,
				    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
				    &checksum_t) != 0) {
					checksum_t = DEFAULT_CHECKSUM_T;
				}

				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    checksum_n, SEC2NSEC(checksum_t));
				zfs_case_serialize(zcp);
			}
			if (fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep)) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue");
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
#ifndef __linux__
			/* This causes an unexpected fault diagnosis on linux */
			checkremove = B_TRUE;
#endif
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
			    zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(zcp);
			}
		}
	}
}

/*
 * The timeout fires when we diagnosed an I/O error, and it was not due to
 * device removal (which would have cancelled the timeout).
 */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io");
}

/*
 * The specified case has been closed and any case-specific
 * data structures should be deallocated.
 */
static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);

	uu_list_remove(zfs_cases, zcp);
	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	NULL,		/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ NULL, 0, NULL }
};

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

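/*
 * Module load entry point: initialize libzfs, create the case list, and
 * register this diagnosis engine with the fmd framework.
 */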
void
_zfs_diagnosis_init(fmd_hdl_t *hdl)
{
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
	    UU_LIST_DEBUG)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
}

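/*
 * Module unload entry point: discard all in-core cases and release the
 * libzfs handle.
 */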
void
_zfs_diagnosis_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		fmd_hdl_debug(hdl, "removing case ena %llu",
		    (long long unsigned)zcp->zc_data.zc_ena);
		uu_list_remove(zfs_cases, zcp);
		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}