]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/spa.h> |
27 | #include <sys/spa_impl.h> | |
28 | #include <sys/vdev.h> | |
29 | #include <sys/vdev_impl.h> | |
30 | #include <sys/zio.h> | |
31 | ||
32 | #include <sys/fm/fs/zfs.h> | |
33 | #include <sys/fm/protocol.h> | |
34 | #include <sys/fm/util.h> | |
35 | #include <sys/sysevent.h> | |
36 | ||
37 | /* | |
38 | * This general routine is responsible for generating all the different ZFS | |
39 | * ereports. The payload is dependent on the class, and which arguments are | |
40 | * supplied to the function: | |
41 | * | |
42 | * EREPORT POOL VDEV IO | |
43 | * block X X X | |
44 | * data X X | |
45 | * device X X | |
46 | * pool X | |
47 | * | |
48 | * If we are in a loading state, all errors are chained together by the same | |
b128c09f | 49 | * SPA-wide ENA (Error Numeric Association). |
34dc7c2f BB |
50 | * |
51 | * For isolated I/O requests, we get the ENA from the zio_t. The propagation | |
52 | * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want | |
53 | * to chain together all ereports associated with a logical piece of data. For | |
54 | * read I/Os, there are basically three 'types' of I/O, which form a roughly | |
55 | * layered diagram: | |
56 | * | |
57 | * +---------------+ | |
58 | * | Aggregate I/O | No associated logical data or device | |
59 | * +---------------+ | |
60 | * | | |
61 | * V | |
62 | * +---------------+ Reads associated with a piece of logical data. | |
63 | * | Read I/O | This includes reads on behalf of RAID-Z, | |
64 | * +---------------+ mirrors, gang blocks, retries, etc. | |
65 | * | | |
66 | * V | |
67 | * +---------------+ Reads associated with a particular device, but | |
68 | * | Physical I/O | no logical data. Issued as part of vdev caching | |
69 | * +---------------+ and I/O aggregation. | |
70 | * | |
71 | * Note that 'physical I/O' here is not the same terminology as used in the rest | |
72 | * of ZIO. Typically, 'physical I/O' simply means that there is no attached | |
73 | * blockpointer. But I/O with no associated block pointer can still be related | |
74 | * to a logical piece of data (i.e. RAID-Z requests). | |
75 | * | |
76 | * Purely physical I/O always have unique ENAs. They are not related to a | |
77 | * particular piece of logical data, and therefore cannot be chained together. | |
78 | * We still generate an ereport, but the DE doesn't correlate it with any | |
79 | * logical piece of data. When such an I/O fails, the delegated I/O requests | |
80 | * will issue a retry, which will trigger the 'real' ereport with the correct | |
81 | * ENA. | |
82 | * | |
83 | * We keep track of the ENA for a ZIO chain through the 'io_logical' member. | |
84 | * When a new logical I/O is issued, we set this to point to itself. Child I/Os | |
85 | * then inherit this pointer, so that when it is first set subsequent failures | |
b128c09f BB |
86 | * will use the same ENA. For vdev cache fill and queue aggregation I/O, |
87 | * this pointer is set to NULL, and no ereport will be generated (since it | |
88 | * doesn't actually correspond to any particular device or piece of data, | |
89 | * and the caller will always retry without caching or queueing anyway). | |
34dc7c2f BB |
90 | */ |
91 | void | |
92 | zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, | |
93 | uint64_t stateoroffset, uint64_t size) | |
94 | { | |
95 | #ifdef _KERNEL | |
96 | nvlist_t *ereport, *detector; | |
97 | uint64_t ena; | |
98 | char class[64]; | |
99 | ||
100 | /* | |
101 | * If we are doing a spa_tryimport(), ignore errors. | |
102 | */ | |
103 | if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) | |
104 | return; | |
105 | ||
106 | /* | |
107 | * If we are in the middle of opening a pool, and the previous attempt | |
108 | * failed, don't bother logging any new ereports - we're just going to | |
109 | * get the same diagnosis anyway. | |
110 | */ | |
111 | if (spa->spa_load_state != SPA_LOAD_NONE && | |
112 | spa->spa_last_open_failed) | |
113 | return; | |
114 | ||
b128c09f BB |
115 | if (zio != NULL) { |
116 | /* | |
117 | * If this is not a read or write zio, ignore the error. This | |
118 | * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. | |
119 | */ | |
120 | if (zio->io_type != ZIO_TYPE_READ && | |
121 | zio->io_type != ZIO_TYPE_WRITE) | |
122 | return; | |
34dc7c2f | 123 | |
b128c09f BB |
124 | /* |
125 | * Ignore any errors from speculative I/Os, as failure is an | |
126 | * expected result. | |
127 | */ | |
128 | if (zio->io_flags & ZIO_FLAG_SPECULATIVE) | |
129 | return; | |
130 | ||
131 | /* | |
9babb374 BB |
132 | * If this I/O is not a retry I/O, don't post an ereport. |
133 | * Otherwise, we risk making bad diagnoses based on B_FAILFAST | |
134 | * I/Os. | |
b128c09f | 135 | */ |
9babb374 BB |
136 | if (zio->io_error == EIO && |
137 | !(zio->io_flags & ZIO_FLAG_IO_RETRY)) | |
b128c09f | 138 | return; |
9babb374 BB |
139 | |
140 | if (vd != NULL) { | |
141 | /* | |
142 | * If the vdev has already been marked as failing due | |
143 | * to a failed probe, then ignore any subsequent I/O | |
144 | * errors, as the DE will automatically fault the vdev | |
145 | * on the first such failure. This also catches cases | |
146 | * where vdev_remove_wanted is set and the device has | |
147 | * not yet been asynchronously placed into the REMOVED | |
148 | * state. | |
149 | */ | |
150 | if (zio->io_vd == vd && | |
151 | !vdev_accessible(vd, zio) && | |
152 | strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) | |
153 | return; | |
154 | ||
155 | /* | |
156 | * Ignore checksum errors for reads from DTL regions of | |
157 | * leaf vdevs. | |
158 | */ | |
159 | if (zio->io_type == ZIO_TYPE_READ && | |
160 | zio->io_error == ECKSUM && | |
161 | vd->vdev_ops->vdev_op_leaf && | |
162 | vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) | |
163 | return; | |
164 | } | |
b128c09f | 165 | } |
34dc7c2f BB |
166 | |
167 | if ((ereport = fm_nvlist_create(NULL)) == NULL) | |
168 | return; | |
169 | ||
170 | if ((detector = fm_nvlist_create(NULL)) == NULL) { | |
171 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
172 | return; | |
173 | } | |
174 | ||
175 | /* | |
176 | * Serialize ereport generation | |
177 | */ | |
178 | mutex_enter(&spa->spa_errlist_lock); | |
179 | ||
180 | /* | |
181 | * Determine the ENA to use for this event. If we are in a loading | |
182 | * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use | |
183 | * a root zio-wide ENA. Otherwise, simply use a unique ENA. | |
184 | */ | |
185 | if (spa->spa_load_state != SPA_LOAD_NONE) { | |
186 | if (spa->spa_ena == 0) | |
187 | spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); | |
188 | ena = spa->spa_ena; | |
189 | } else if (zio != NULL && zio->io_logical != NULL) { | |
190 | if (zio->io_logical->io_ena == 0) | |
191 | zio->io_logical->io_ena = | |
192 | fm_ena_generate(0, FM_ENA_FMT1); | |
193 | ena = zio->io_logical->io_ena; | |
194 | } else { | |
195 | ena = fm_ena_generate(0, FM_ENA_FMT1); | |
196 | } | |
197 | ||
198 | /* | |
199 | * Construct the full class, detector, and other standard FMA fields. | |
200 | */ | |
201 | (void) snprintf(class, sizeof (class), "%s.%s", | |
202 | ZFS_ERROR_CLASS, subclass); | |
203 | ||
204 | fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), | |
205 | vd != NULL ? vd->vdev_guid : 0); | |
206 | ||
207 | fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); | |
208 | ||
209 | /* | |
210 | * Construct the per-ereport payload, depending on which parameters are | |
211 | * passed in. | |
212 | */ | |
213 | ||
214 | /* | |
215 | * Generic payload members common to all ereports. | |
34dc7c2f BB |
216 | */ |
217 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, | |
b128c09f | 218 | DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, |
34dc7c2f BB |
219 | DATA_TYPE_UINT64, spa_guid(spa), |
220 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, | |
9babb374 | 221 | spa->spa_load_state, NULL); |
b128c09f BB |
222 | |
223 | if (spa != NULL) { | |
224 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, | |
225 | DATA_TYPE_STRING, | |
226 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? | |
227 | FM_EREPORT_FAILMODE_WAIT : | |
228 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? | |
229 | FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, | |
230 | NULL); | |
231 | } | |
34dc7c2f BB |
232 | |
233 | if (vd != NULL) { | |
234 | vdev_t *pvd = vd->vdev_parent; | |
235 | ||
236 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, | |
237 | DATA_TYPE_UINT64, vd->vdev_guid, | |
238 | FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, | |
239 | DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); | |
9babb374 | 240 | if (vd->vdev_path != NULL) |
34dc7c2f BB |
241 | fm_payload_set(ereport, |
242 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, | |
243 | DATA_TYPE_STRING, vd->vdev_path, NULL); | |
9babb374 | 244 | if (vd->vdev_devid != NULL) |
34dc7c2f BB |
245 | fm_payload_set(ereport, |
246 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, | |
247 | DATA_TYPE_STRING, vd->vdev_devid, NULL); | |
9babb374 BB |
248 | if (vd->vdev_fru != NULL) |
249 | fm_payload_set(ereport, | |
250 | FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, | |
251 | DATA_TYPE_STRING, vd->vdev_fru, NULL); | |
34dc7c2f BB |
252 | |
253 | if (pvd != NULL) { | |
254 | fm_payload_set(ereport, | |
255 | FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, | |
256 | DATA_TYPE_UINT64, pvd->vdev_guid, | |
257 | FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, | |
258 | DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, | |
259 | NULL); | |
260 | if (pvd->vdev_path) | |
261 | fm_payload_set(ereport, | |
262 | FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, | |
263 | DATA_TYPE_STRING, pvd->vdev_path, NULL); | |
264 | if (pvd->vdev_devid) | |
265 | fm_payload_set(ereport, | |
266 | FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, | |
267 | DATA_TYPE_STRING, pvd->vdev_devid, NULL); | |
268 | } | |
269 | } | |
270 | ||
271 | if (zio != NULL) { | |
272 | /* | |
273 | * Payload common to all I/Os. | |
274 | */ | |
275 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, | |
276 | DATA_TYPE_INT32, zio->io_error, NULL); | |
277 | ||
278 | /* | |
279 | * If the 'size' parameter is non-zero, it indicates this is a | |
280 | * RAID-Z or other I/O where the physical offset and length are | |
281 | * provided for us, instead of within the zio_t. | |
282 | */ | |
283 | if (vd != NULL) { | |
284 | if (size) | |
285 | fm_payload_set(ereport, | |
286 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
287 | DATA_TYPE_UINT64, stateoroffset, | |
288 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
289 | DATA_TYPE_UINT64, size, NULL); | |
290 | else | |
291 | fm_payload_set(ereport, | |
292 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
293 | DATA_TYPE_UINT64, zio->io_offset, | |
294 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
295 | DATA_TYPE_UINT64, zio->io_size, NULL); | |
296 | } | |
297 | ||
298 | /* | |
299 | * Payload for I/Os with corresponding logical information. | |
300 | */ | |
301 | if (zio->io_logical != NULL) | |
302 | fm_payload_set(ereport, | |
303 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, | |
304 | DATA_TYPE_UINT64, | |
305 | zio->io_logical->io_bookmark.zb_objset, | |
306 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, | |
307 | DATA_TYPE_UINT64, | |
308 | zio->io_logical->io_bookmark.zb_object, | |
309 | FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, | |
310 | DATA_TYPE_INT64, | |
311 | zio->io_logical->io_bookmark.zb_level, | |
312 | FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, | |
313 | DATA_TYPE_UINT64, | |
314 | zio->io_logical->io_bookmark.zb_blkid, NULL); | |
315 | } else if (vd != NULL) { | |
316 | /* | |
317 | * If we have a vdev but no zio, this is a device fault, and the | |
318 | * 'stateoroffset' parameter indicates the previous state of the | |
319 | * vdev. | |
320 | */ | |
321 | fm_payload_set(ereport, | |
322 | FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, | |
323 | DATA_TYPE_UINT64, stateoroffset, NULL); | |
324 | } | |
325 | mutex_exit(&spa->spa_errlist_lock); | |
326 | ||
327 | fm_ereport_post(ereport, EVCH_SLEEP); | |
328 | ||
329 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
330 | fm_nvlist_destroy(detector, FM_NVA_FREE); | |
331 | #endif | |
332 | } | |
333 | ||
334 | static void | |
335 | zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) | |
336 | { | |
337 | #ifdef _KERNEL | |
338 | nvlist_t *resource; | |
339 | char class[64]; | |
340 | ||
341 | if ((resource = fm_nvlist_create(NULL)) == NULL) | |
342 | return; | |
343 | ||
344 | (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, | |
345 | ZFS_ERROR_CLASS, name); | |
346 | VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); | |
347 | VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); | |
348 | VERIFY(nvlist_add_uint64(resource, | |
349 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); | |
350 | if (vd) | |
351 | VERIFY(nvlist_add_uint64(resource, | |
352 | FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); | |
353 | ||
354 | fm_ereport_post(resource, EVCH_SLEEP); | |
355 | ||
356 | fm_nvlist_destroy(resource, FM_NVA_FREE); | |
357 | #endif | |
358 | } | |
359 | ||
34dc7c2f BB |
360 | /* |
361 | * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev | |
362 | * has been removed from the system. This will cause the DE to ignore any | |
363 | * recent I/O errors, inferring that they are due to the asynchronous device | |
364 | * removal. | |
365 | */ | |
366 | void | |
367 | zfs_post_remove(spa_t *spa, vdev_t *vd) | |
368 | { | |
369 | zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); | |
370 | } | |
371 | ||
372 | /* | |
373 | * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool | |
374 | * has the 'autoreplace' property set, and therefore any broken vdevs will be | |
375 | * handled by higher level logic, and no vdev fault should be generated. | |
376 | */ | |
377 | void | |
378 | zfs_post_autoreplace(spa_t *spa, vdev_t *vd) | |
379 | { | |
380 | zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); | |
381 | } |