/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 */

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <libuutil.h>
#include <libzfs.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/fs/zfs.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

#include "zfs_agents.h"
#include "fmd_api.h"

/*
 * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.
 * This #define reserves enough space for two 64-bit hex values plus the
 * length of the longest string.
 */
#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
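
/*
 * For example (hypothetical GUIDs), a per-vdev I/O engine might be named
 * "zfs_8d5544f1c5e0d3a2_4c3f2b1a09e8d7c6_io".  "zfs___checksum" is the
 * longest template, and sizeof () also accounts for its terminating NUL.
 */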

/*
 * On-disk case structure.  This must maintain backwards compatibility with
 * previous versions of the DE.  By default, any members appended to the end
 * will be filled with zeros if they don't exist in a previous version.
 */
typedef struct zfs_case_data {
	uint64_t	zc_version;
	uint64_t	zc_ena;
	uint64_t	zc_pool_guid;
	uint64_t	zc_vdev_guid;
	int		zc_pool_state;
	char		zc_serd_checksum[MAX_SERDLEN];
	char		zc_serd_io[MAX_SERDLEN];
	int		zc_has_remove_timer;
} zfs_case_data_t;
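
/*
 * Compatibility note: a version-1 (CASE_DATA_VERSION_INITIAL) buffer ends at
 * zc_pool_state, so when one is read back the SERD engine names and the
 * remove-timer flag above come back zero-filled, which the code treats as
 * "engine not created" / "no timer pending" (see zfs_case_unserialize()).
 */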

/*
 * Time-of-day
 */
typedef struct er_timeval {
	uint64_t	ertv_sec;
	uint64_t	ertv_nsec;
} er_timeval_t;

/*
 * In-core case structure.
 */
typedef struct zfs_case {
	boolean_t	zc_present;
	uint32_t	zc_version;
	zfs_case_data_t	zc_data;
	fmd_case_t	*zc_case;
	uu_list_node_t	zc_node;
	id_t		zc_remove_timer;
	char		*zc_fru;
	er_timeval_t	zc_when;
} zfs_case_t;
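
/*
 * zc_node links the case into the global zfs_cases list; zc_remove_timer is
 * only meaningful while zc_data.zc_has_remove_timer is set, since timer ids
 * are not persistent and are re-installed by zfs_case_unserialize().
 */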

#define	CASE_DATA			"data"
#define	CASE_FRU			"fru"
#define	CASE_DATA_VERSION_INITIAL	1
#define	CASE_DATA_VERSION_SERD		2

typedef struct zfs_de_stats {
	fmd_stat_t	old_drops;
	fmd_stat_t	dev_drops;
	fmd_stat_t	vdev_drops;
	fmd_stat_t	import_drops;
	fmd_stat_t	resource_drops;
} zfs_de_stats_t;

zfs_de_stats_t zfs_stats = {
	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)" },
	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)" },
	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};
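
/*
 * These counters record why ereports were dropped rather than diagnosed;
 * they are registered with the fmd framework via fmd_stat_create() in
 * _zfs_diagnosis_init() below.
 */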

static hrtime_t zfs_remove_timeout;

uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;

#define	ZFS_MAKE_RSRC(type)	\
	FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
#define	ZFS_MAKE_EREPORT(type)	\
	FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
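
/*
 * For example, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO) concatenates the string
 * literals "ereport", "fs.zfs", and "io" into the class string
 * "ereport.fs.zfs.io", and ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED) yields
 * "resource.fs.zfs.removed".
 */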

/*
 * Write out the persistent representation of an active case.
 */
static void
zfs_case_serialize(zfs_case_t *zcp)
{
	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
}

/*
 * Read back the persistent representation of an active case.
 */
static zfs_case_t *
zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	zfs_case_t *zcp;

	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
	zcp->zc_case = cp;

	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
	    sizeof (zcp->zc_data));

	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
		return (NULL);
	}

	/*
	 * fmd_buf_read() will have already zeroed out the remainder of the
	 * buffer, so we don't have to do anything special if the version
	 * doesn't include the SERD engine name.
	 */

	if (zcp->zc_data.zc_has_remove_timer)
		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
		    NULL, zfs_remove_timeout);

	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
	(void) uu_list_insert_before(zfs_cases, NULL, zcp);

	fmd_case_setspecific(hdl, cp, zcp);

	return (zcp);
}

/*
 * Iterate over any active cases.  If any cases are associated with a pool or
 * vdev which is no longer present on the system, close the associated case.
 */
static void
zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
{
	uint64_t vdev_guid = 0;
	uint_t c, children;
	nvlist_t **child;
	zfs_case_t *zcp;

	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);

	/*
	 * Mark any cases associated with this (pool, vdev) pair.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
			zcp->zc_present = B_TRUE;
			zcp->zc_when = *loaded;
		}
	}

	/*
	 * Iterate over all children.
	 */
	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}

	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
	    &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_mark_vdev(pool_guid, child[c], loaded);
	}
}

static int
zfs_mark_pool(zpool_handle_t *zhp, void *unused)
{
	(void) unused;
	zfs_case_t *zcp;
	uint64_t pool_guid;
	uint64_t *tod;
	er_timeval_t loaded = { 0 };
	nvlist_t *config, *vd;
	uint_t nelem = 0;
	int ret;

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	/*
	 * Mark any cases associated with just this pool.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid &&
		    zcp->zc_data.zc_vdev_guid == 0)
			zcp->zc_present = B_TRUE;
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem);
	if (nelem == 2) {
		loaded.ertv_sec = tod[0];
		loaded.ertv_nsec = tod[1];
		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
		    zcp = uu_list_next(zfs_cases, zcp)) {
			if (zcp->zc_data.zc_pool_guid == pool_guid &&
			    zcp->zc_data.zc_vdev_guid == 0) {
				zcp->zc_when = loaded;
			}
		}
	}

	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
	if (ret) {
		zpool_close(zhp);
		return (-1);
	}

	zfs_mark_vdev(pool_guid, vd, &loaded);

	zpool_close(zhp);

	return (0);
}

struct load_time_arg {
	uint64_t lt_guid;
	er_timeval_t *lt_time;
	boolean_t lt_found;
};
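
/*
 * zpool_find_load_time() below is a zpool_iter() callback.  Note that in
 * this module each callback closes the pool handle it is given, which is why
 * every early return here (and in zfs_mark_pool() above) is preceded by
 * zpool_close().
 */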

static int
zpool_find_load_time(zpool_handle_t *zhp, void *arg)
{
	struct load_time_arg *lta = arg;
	uint64_t pool_guid;
	uint64_t *tod;
	nvlist_t *config;
	uint_t nelem;

	if (lta->lt_found) {
		zpool_close(zhp);
		return (0);
	}

	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
	if (pool_guid != lta->lt_guid) {
		zpool_close(zhp);
		return (0);
	}

	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
		zpool_close(zhp);
		return (-1);
	}

	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
	    &tod, &nelem) == 0 && nelem == 2) {
		lta->lt_found = B_TRUE;
		lta->lt_time->ertv_sec = tod[0];
		lta->lt_time->ertv_nsec = tod[1];
	}

	zpool_close(zhp);

	return (0);
}

static void
zfs_purge_cases(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);

	/*
	 * There is no way to open a pool by GUID, or lookup a vdev by GUID.
	 * No matter what we do, we're going to have to stomach an
	 * O(vdevs * cases) algorithm.  In reality, both quantities are likely
	 * so small that neither will matter.  Given that iterating over pools
	 * is more expensive than iterating over the in-memory case list, we
	 * opt for a 'present' flag in each case that starts off cleared.  We
	 * then iterate over all pools, marking those that are still present,
	 * and removing those that aren't found.
	 *
	 * Note that we could also construct an FMRI and rely on
	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
	 */

	/*
	 * Mark the cases as not present.
	 */
	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp))
		zcp->zc_present = B_FALSE;

	/*
	 * Iterate over all pools and mark the pools and vdevs found.  If this
	 * fails (most probably because we're out of memory), then don't close
	 * any of the cases, since we can no longer be sure the 'present'
	 * flags are accurate.
	 */
	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
		return;

	/*
	 * Remove those cases which were not found.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		if (!zcp->zc_present)
			fmd_case_close(hdl, zcp->zc_case);
	}
	uu_list_walk_end(walk);
}

/*
 * Construct the name of a serd engine given the pool/vdev GUID and type (io
 * or checksum).
 */
static void
zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
    const char *type)
{
	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
	    (long long unsigned int)pool_guid,
	    (long long unsigned int)vdev_guid, type);
}
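
/*
 * Usage sketch (hypothetical GUIDs): given pool GUID 0xcafe and vdev GUID
 * 0xbeef,
 *
 *	char name[MAX_SERDLEN];
 *	zfs_serd_name(name, 0xcafeULL, 0xbeefULL, "io");
 *
 * leaves name holding "zfs_cafe_beef_io".
 */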

/*
 * Solve a given ZFS case.  This first checks to make sure the diagnosis is
 * still valid, as well as cleaning up any pending timer associated with the
 * case.
 */
static void
zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname)
{
	nvlist_t *detector, *fault;
	boolean_t serialize;
	nvlist_t *fru = NULL;

	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);

	/*
	 * Construct the detector from the case data.  The detector is in the
	 * ZFS scheme, and is either the pool or the vdev, depending on whether
	 * this is a vdev or pool fault.
	 */
	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);

	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
	    zcp->zc_data.zc_pool_guid);
	if (zcp->zc_data.zc_vdev_guid != 0) {
		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
		    zcp->zc_data.zc_vdev_guid);
	}

	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
	    fru, detector);
	fmd_case_add_suspect(hdl, zcp->zc_case, fault);

	nvlist_free(fru);

	fmd_case_solve(hdl, zcp->zc_case);

	serialize = B_FALSE;
	if (zcp->zc_data.zc_has_remove_timer) {
		fmd_timer_remove(hdl, zcp->zc_remove_timer);
		zcp->zc_data.zc_has_remove_timer = 0;
		serialize = B_TRUE;
	}
	if (serialize)
		zfs_case_serialize(zcp);

	nvlist_free(detector);
}
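
/*
 * The detector constructed above is a ZFS-scheme FMRI; viewed as an nvlist
 * it is roughly the following, with the vdev member present only for
 * vdev-level faults:
 *
 *	version = 0x0
 *	scheme = "zfs"
 *	pool = <pool GUID>
 *	vdev = <vdev GUID>
 */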

static boolean_t
timeval_earlier(er_timeval_t *a, er_timeval_t *b)
{
	return (a->ertv_sec < b->ertv_sec ||
	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
}

static void
zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
{
	(void) hdl;
	int64_t *tod;
	uint_t nelem;

	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
	    &nelem) == 0 && nelem == 2) {
		when->ertv_sec = tod[0];
		when->ertv_nsec = tod[1];
	} else {
		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
	}
}
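
/*
 * FM_EREPORT_TIME carries a [seconds, nanoseconds] pair.  When it is absent
 * the timestamp is pinned to UINT64_MAX so that timeval_earlier() never
 * considers the ereport older than the pool load time, and it therefore
 * won't be dropped as stale.
 */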

/*
 * Main fmd entry point.
 */
static void
zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
	zfs_case_t *zcp, *dcp;
	int32_t pool_state;
	uint64_t ena, pool_guid, vdev_guid;
	er_timeval_t pool_load;
	er_timeval_t er_when;
	nvlist_t *detector;
	boolean_t pool_found = B_FALSE;
	boolean_t isresource;
	char *type;

	/*
	 * We subscribe to notifications for vdev or pool removal.  Either
	 * event may orphan existing cases, so purge any that no longer
	 * apply.
	 */
	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
		    strrchr(class, '.') + 1);
		zfs_purge_cases(hdl);
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");

	if (isresource) {
		/*
		 * For resources, we don't have a normal payload.
		 */
		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
		    &vdev_guid) != 0)
			pool_state = SPA_LOAD_OPEN;
		else
			pool_state = SPA_LOAD_NONE;
		detector = NULL;
	} else {
		(void) nvlist_lookup_nvlist(nvl,
		    FM_EREPORT_DETECTOR, &detector);
		(void) nvlist_lookup_int32(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
	}

	/*
	 * We also ignore all ereports generated during an import of a pool,
	 * since the only possible fault (fault.fs.zfs.pool) would result in
	 * import failure, and hence no persistent fault.  Some day we may
	 * want to do something with these ereports, so we continue generating
	 * them internally.
	 */
	if (pool_state == SPA_LOAD_IMPORT) {
		zfs_stats.import_drops.fmds_value.ui64++;
		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
		return;
	}

	/*
	 * Device I/O errors are ignored during pool open.
	 */
	if (pool_state == SPA_LOAD_OPEN &&
	    (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
		zfs_stats.dev_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * We ignore ereports for anything except disks and files.
	 */
	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
	    &type) == 0) {
		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
		    strcmp(type, VDEV_TYPE_FILE) != 0) {
			zfs_stats.vdev_drops.fmds_value.ui64++;
			return;
		}
	}

	/*
	 * Determine if this ereport corresponds to an open case.
	 * Each vdev or pool can have a single case.
	 */
	(void) nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
	if (nvlist_lookup_uint64(nvl,
	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
		vdev_guid = 0;
	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
		ena = 0;

	zfs_ereport_when(hdl, nvl, &er_when);

	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
	    zcp = uu_list_next(zfs_cases, zcp)) {
		if (zcp->zc_data.zc_pool_guid == pool_guid) {
			pool_found = B_TRUE;
			pool_load = zcp->zc_when;
		}
		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
			break;
	}

	/*
	 * Avoid falsely accusing a pool of being faulty.  Do so by
	 * not replaying ereports that were generated prior to the
	 * current import.  If the failure that generated them was
	 * transient because the device was actually removed but we
	 * didn't receive the normal asynchronous notification, we
	 * don't want to mark it as faulted and potentially panic.  If
	 * there is still a problem we'd expect not to be able to
	 * import the pool, or that new ereports will be generated
	 * once the pool is used.
	 */
	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
		fmd_hdl_debug(hdl, "ignoring pool %llx, "
		    "ereport time %lld.%lld, pool load time = %lld.%lld",
		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
		    pool_load.ertv_sec, pool_load.ertv_nsec);
		zfs_stats.old_drops.fmds_value.ui64++;
		return;
	}

	if (!pool_found) {
		/*
		 * Haven't yet seen this pool, but same situation
		 * may apply.
		 */
		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
		struct load_time_arg la;

		la.lt_guid = pool_guid;
		la.lt_time = &pool_load;
		la.lt_found = B_FALSE;

		if (zhdl != NULL &&
		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
		    la.lt_found == B_TRUE) {
			pool_found = B_TRUE;

			if (timeval_earlier(&er_when, &pool_load)) {
				fmd_hdl_debug(hdl, "ignoring pool %llx, "
				    "ereport time %lld.%lld, "
				    "pool load time = %lld.%lld",
				    pool_guid, er_when.ertv_sec,
				    er_when.ertv_nsec, pool_load.ertv_sec,
				    pool_load.ertv_nsec);
				zfs_stats.old_drops.fmds_value.ui64++;
				return;
			}
		}
	}

	if (zcp == NULL) {
		fmd_case_t *cs;
		zfs_case_data_t data = { 0 };

		/*
		 * If this is one of our 'fake' resource ereports, and there is
		 * no case open, simply discard it.
		 */
		if (isresource) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu",
			    class, vdev_guid);
			return;
		}

		/*
		 * Skip tracking some ereports
		 */
		if (strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
		    strcmp(class,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
			zfs_stats.resource_drops.fmds_value.ui64++;
			return;
		}

		/*
		 * Open a new case.
		 */
		cs = fmd_case_open(hdl, NULL);

		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
		    vdev_guid, class);

		/*
		 * Initialize the case buffer.  To commonize code, we actually
		 * create the buffer with existing data, and then call
		 * zfs_case_unserialize() to instantiate the in-core structure.
		 */
		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));

		data.zc_version = CASE_DATA_VERSION_SERD;
		data.zc_ena = ena;
		data.zc_pool_guid = pool_guid;
		data.zc_vdev_guid = vdev_guid;
		data.zc_pool_state = (int)pool_state;

		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));

		zcp = zfs_case_unserialize(hdl, cs);
		assert(zcp != NULL);
		if (pool_found)
			zcp->zc_when = pool_load;
	}

	if (isresource) {
		fmd_hdl_debug(hdl, "resource event '%s'", class);

		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
			/*
			 * The 'resource.fs.zfs.autoreplace' event indicates
			 * that the pool was loaded with the 'autoreplace'
			 * property set.  In this case, any pending device
			 * failures should be ignored, as the asynchronous
			 * autoreplace handling will take care of them.
			 */
			fmd_case_close(hdl, zcp->zc_case);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
			/*
			 * The 'resource.fs.zfs.removed' event indicates that
			 * device removal was detected, and the device was
			 * closed asynchronously.  If this is the case, we
			 * assume that any recent I/O errors were due to the
			 * device removal, not any fault of the device itself.
			 * We reset the SERD engine, and cancel any pending
			 * timers.
			 */
			if (zcp->zc_data.zc_has_remove_timer) {
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
				zcp->zc_data.zc_has_remove_timer = 0;
				zfs_case_serialize(zcp);
			}
			if (zcp->zc_data.zc_serd_io[0] != '\0')
				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
				fmd_serd_reset(hdl,
				    zcp->zc_data.zc_serd_checksum);
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
			uint64_t state = 0;

			if (zcp != NULL &&
			    nvlist_lookup_uint64(nvl,
			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
			    state == VDEV_STATE_HEALTHY) {
				fmd_hdl_debug(hdl, "closing case after a "
				    "device statechange to healthy");
				fmd_case_close(hdl, zcp->zc_case);
			}
		}
		zfs_stats.resource_drops.fmds_value.ui64++;
		return;
	}

	/*
	 * Associate the ereport with this case.
	 */
	fmd_case_add_ereport(hdl, zcp->zc_case, ep);

	/*
	 * Don't do anything else if this case is already solved.
	 */
	if (fmd_case_solved(hdl, zcp->zc_case))
		return;

	fmd_hdl_debug(hdl, "error event '%s'", class);

	/*
	 * Determine if we should solve the case and generate a fault.  We
	 * solve a case if:
	 *
	 *	a. A pool failed to open (ereport.fs.zfs.pool)
	 *	b. A device failed to open (ereport.fs.zfs.vdev.*) while a
	 *	   pool was up and running.
	 *
	 * We may see a series of ereports associated with a pool open, all
	 * chained together by the same ENA.  If the pool open succeeds, then
	 * we'll see no further ereports.  To detect when a pool open has
	 * succeeded, we associate a timer with the event.  When it expires, we
	 * close the case.
	 */
	if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
		/*
		 * Pool level fault.  Before solving the case, go through and
		 * close any open device cases that may be pending.
		 */
		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
		    dcp = uu_list_next(zfs_cases, dcp)) {
			if (dcp->zc_data.zc_pool_guid ==
			    zcp->zc_data.zc_pool_guid &&
			    dcp->zc_data.zc_vdev_guid != 0)
				fmd_case_close(hdl, dcp->zc_case);
		}

		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
		/*
		 * Pool level fault for reading the intent logs.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay");
	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
		/*
		 * Device fault.
		 */
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device");
	} else if (fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
	    fmd_nvl_class_match(hdl, nvl,
	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
		char *failmode = NULL;
		boolean_t checkremove = B_FALSE;

		/*
		 * If this is a checksum or I/O error, then toss it into the
		 * appropriate SERD engine and check to see if it has fired.
		 * Ideally, we want to do something more sophisticated
		 * (persistent errors for a single data block, etc).  For now,
		 * a single SERD engine is sufficient.
		 */
		if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
			if (zcp->zc_data.zc_serd_io[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_io,
				    pool_guid, vdev_guid, "io");
				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
				    fmd_prop_get_int32(hdl, "io_N"),
				    fmd_prop_get_int64(hdl, "io_T"));
				zfs_case_serialize(zcp);
			}
			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
				checkremove = B_TRUE;
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
				    pool_guid, vdev_guid, "checksum");
				fmd_serd_create(hdl,
				    zcp->zc_data.zc_serd_checksum,
				    fmd_prop_get_int32(hdl, "checksum_N"),
				    fmd_prop_get_int64(hdl, "checksum_T"));
				zfs_case_serialize(zcp);
			}
			if (fmd_serd_record(hdl,
			    zcp->zc_data.zc_serd_checksum, ep)) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.vdev.checksum");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
		    (nvlist_lookup_string(nvl,
		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
		    failmode != NULL) {
			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_continue");
			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
				zfs_case_solve(hdl, zcp,
				    "fault.fs.zfs.io_failure_wait");
			}
		} else if (fmd_nvl_class_match(hdl, nvl,
		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
#ifndef __linux__
			/* This causes an unexpected fault diagnosis on linux */
			checkremove = B_TRUE;
#endif
		}

		/*
		 * Because I/O errors may be due to device removal, we postpone
		 * any diagnosis until we're sure that we aren't about to
		 * receive a 'resource.fs.zfs.removed' event.
		 */
		if (checkremove) {
			if (zcp->zc_data.zc_has_remove_timer)
				fmd_timer_remove(hdl, zcp->zc_remove_timer);
			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
			    NULL, zfs_remove_timeout);
			if (!zcp->zc_data.zc_has_remove_timer) {
				zcp->zc_data.zc_has_remove_timer = 1;
				zfs_case_serialize(zcp);
			}
		}
	}
}

/*
 * The timeout is fired when we diagnosed an I/O error, and it was not due to
 * device removal (which would cause the timeout to be cancelled).
 */
static void
zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
{
	zfs_case_t *zcp = data;

	if (id == zcp->zc_remove_timer)
		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io");
}

/*
 * The specified case has been closed and any case-specific
 * data structures should be deallocated.
 */
static void
zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
{
	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);

	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
	if (zcp->zc_data.zc_serd_io[0] != '\0')
		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
	if (zcp->zc_data.zc_has_remove_timer)
		fmd_timer_remove(hdl, zcp->zc_remove_timer);

	uu_list_remove(zfs_cases, zcp);
	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}

/*
 * We use the fmd gc entry point to look for old cases that no longer apply.
 * This allows us to keep our set of case data small in a long running system.
 */
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
	zfs_purge_cases(hdl);
}

static const fmd_hdl_ops_t fmd_ops = {
	zfs_fm_recv,	/* fmdo_recv */
	zfs_fm_timeout,	/* fmdo_timeout */
	zfs_fm_close,	/* fmdo_close */
	NULL,		/* fmdo_stats */
	zfs_fm_gc,	/* fmdo_gc */
};

static const fmd_prop_t fmd_props[] = {
	{ "checksum_N", FMD_TYPE_UINT32, "10" },
	{ "checksum_T", FMD_TYPE_TIME, "10min" },
	{ "io_N", FMD_TYPE_UINT32, "10" },
	{ "io_T", FMD_TYPE_TIME, "10min" },
	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
	{ NULL, 0, NULL }
};
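
/*
 * With the defaults above, the "io" and "checksum" SERD engines are created
 * with N = 10 and T = 10min, i.e. (roughly) an engine fires once 10 matching
 * ereports accumulate within a 10-minute window; remove_timeout (15sec)
 * bounds how long an I/O diagnosis is postponed while waiting for a possible
 * 'resource.fs.zfs.removed' event.
 */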

static const fmd_hdl_info_t fmd_info = {
	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
};

void
_zfs_diagnosis_init(fmd_hdl_t *hdl)
{
	libzfs_handle_t *zhdl;

	if ((zhdl = libzfs_init()) == NULL)
		return;

	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
		libzfs_fini(zhdl);
		return;
	}

	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
	    UU_LIST_DEBUG)) == NULL) {
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
		uu_list_destroy(zfs_cases);
		uu_list_pool_destroy(zfs_case_pool);
		libzfs_fini(zhdl);
		return;
	}

	fmd_hdl_setspecific(hdl, zhdl);

	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);

	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}

void
_zfs_diagnosis_fini(fmd_hdl_t *hdl)
{
	zfs_case_t *zcp;
	uu_list_walk_t *walk;
	libzfs_handle_t *zhdl;

	/*
	 * Remove all active cases.
	 */
	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
	while ((zcp = uu_list_walk_next(walk)) != NULL) {
		fmd_hdl_debug(hdl, "removing case ena %llu",
		    (long long unsigned)zcp->zc_data.zc_ena);
		uu_list_remove(zfs_cases, zcp);
		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
	}
	uu_list_walk_end(walk);

	uu_list_destroy(zfs_cases);
	uu_list_pool_destroy(zfs_case_pool);

	zhdl = fmd_hdl_getspecific(hdl);
	libzfs_fini(zhdl);
}