]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/spa.h> |
27 | #include <sys/spa_impl.h> | |
28 | #include <sys/vdev.h> | |
29 | #include <sys/vdev_impl.h> | |
30 | #include <sys/zio.h> | |
31 | ||
32 | #include <sys/fm/fs/zfs.h> | |
33 | #include <sys/fm/protocol.h> | |
34 | #include <sys/fm/util.h> | |
35 | #include <sys/sysevent.h> | |
36 | ||
37 | /* | |
38 | * This general routine is responsible for generating all the different ZFS | |
39 | * ereports. The payload is dependent on the class, and which arguments are | |
40 | * supplied to the function: | |
41 | * | |
42 | * EREPORT POOL VDEV IO | |
43 | * block X X X | |
44 | * data X X | |
45 | * device X X | |
46 | * pool X | |
47 | * | |
48 | * If we are in a loading state, all errors are chained together by the same | |
b128c09f | 49 | * SPA-wide ENA (Error Numeric Association). |
34dc7c2f BB |
50 | * |
51 | * For isolated I/O requests, we get the ENA from the zio_t. The propagation | |
52 | * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want | |
53 | * to chain together all ereports associated with a logical piece of data. For | |
54 | * read I/Os, there are basically three 'types' of I/O, which form a roughly | |
55 | * layered diagram: | |
56 | * | |
57 | * +---------------+ | |
58 | * | Aggregate I/O | No associated logical data or device | |
59 | * +---------------+ | |
60 | * | | |
61 | * V | |
62 | * +---------------+ Reads associated with a piece of logical data. | |
63 | * | Read I/O | This includes reads on behalf of RAID-Z, | |
64 | * +---------------+ mirrors, gang blocks, retries, etc. | |
65 | * | | |
66 | * V | |
67 | * +---------------+ Reads associated with a particular device, but | |
68 | * | Physical I/O | no logical data. Issued as part of vdev caching | |
69 | * +---------------+ and I/O aggregation. | |
70 | * | |
71 | * Note that 'physical I/O' here is not the same terminology as used in the rest | |
72 | * of ZIO. Typically, 'physical I/O' simply means that there is no attached | |
73 | * blockpointer. But I/O with no associated block pointer can still be related | |
74 | * to a logical piece of data (i.e. RAID-Z requests). | |
75 | * | |
76 | * Purely physical I/O always have unique ENAs. They are not related to a | |
77 | * particular piece of logical data, and therefore cannot be chained together. | |
78 | * We still generate an ereport, but the DE doesn't correlate it with any | |
79 | * logical piece of data. When such an I/O fails, the delegated I/O requests | |
80 | * will issue a retry, which will trigger the 'real' ereport with the correct | |
81 | * ENA. | |
82 | * | |
83 | * We keep track of the ENA for a ZIO chain through the 'io_logical' member. | |
84 | * When a new logical I/O is issued, we set this to point to itself. Child I/Os | |
85 | * then inherit this pointer, so that when it is first set subsequent failures | |
b128c09f BB |
86 | * will use the same ENA. For vdev cache fill and queue aggregation I/O, |
87 | * this pointer is set to NULL, and no ereport will be generated (since it | |
88 | * doesn't actually correspond to any particular device or piece of data, | |
89 | * and the caller will always retry without caching or queueing anyway). | |
34dc7c2f BB |
90 | */ |
91 | void | |
92 | zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, | |
93 | uint64_t stateoroffset, uint64_t size) | |
94 | { | |
95 | #ifdef _KERNEL | |
96 | nvlist_t *ereport, *detector; | |
97 | uint64_t ena; | |
98 | char class[64]; | |
b128c09f | 99 | int state; |
34dc7c2f BB |
100 | |
101 | /* | |
102 | * If we are doing a spa_tryimport(), ignore errors. | |
103 | */ | |
104 | if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) | |
105 | return; | |
106 | ||
107 | /* | |
108 | * If we are in the middle of opening a pool, and the previous attempt | |
109 | * failed, don't bother logging any new ereports - we're just going to | |
110 | * get the same diagnosis anyway. | |
111 | */ | |
112 | if (spa->spa_load_state != SPA_LOAD_NONE && | |
113 | spa->spa_last_open_failed) | |
114 | return; | |
115 | ||
b128c09f BB |
116 | if (zio != NULL) { |
117 | /* | |
118 | * If this is not a read or write zio, ignore the error. This | |
119 | * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. | |
120 | */ | |
121 | if (zio->io_type != ZIO_TYPE_READ && | |
122 | zio->io_type != ZIO_TYPE_WRITE) | |
123 | return; | |
34dc7c2f | 124 | |
b128c09f BB |
125 | /* |
126 | * Ignore any errors from speculative I/Os, as failure is an | |
127 | * expected result. | |
128 | */ | |
129 | if (zio->io_flags & ZIO_FLAG_SPECULATIVE) | |
130 | return; | |
131 | ||
132 | /* | |
133 | * If the vdev has already been marked as failing due to a | |
134 | * failed probe, then ignore any subsequent I/O errors, as the | |
135 | * DE will automatically fault the vdev on the first such | |
136 | * failure. | |
137 | */ | |
138 | if (vd != NULL && | |
139 | (!vdev_readable(vd) || !vdev_writeable(vd)) && | |
140 | strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) | |
141 | return; | |
142 | } | |
34dc7c2f BB |
143 | |
144 | if ((ereport = fm_nvlist_create(NULL)) == NULL) | |
145 | return; | |
146 | ||
147 | if ((detector = fm_nvlist_create(NULL)) == NULL) { | |
148 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
149 | return; | |
150 | } | |
151 | ||
152 | /* | |
153 | * Serialize ereport generation | |
154 | */ | |
155 | mutex_enter(&spa->spa_errlist_lock); | |
156 | ||
157 | /* | |
158 | * Determine the ENA to use for this event. If we are in a loading | |
159 | * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use | |
160 | * a root zio-wide ENA. Otherwise, simply use a unique ENA. | |
161 | */ | |
162 | if (spa->spa_load_state != SPA_LOAD_NONE) { | |
163 | if (spa->spa_ena == 0) | |
164 | spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); | |
165 | ena = spa->spa_ena; | |
166 | } else if (zio != NULL && zio->io_logical != NULL) { | |
167 | if (zio->io_logical->io_ena == 0) | |
168 | zio->io_logical->io_ena = | |
169 | fm_ena_generate(0, FM_ENA_FMT1); | |
170 | ena = zio->io_logical->io_ena; | |
171 | } else { | |
172 | ena = fm_ena_generate(0, FM_ENA_FMT1); | |
173 | } | |
174 | ||
175 | /* | |
176 | * Construct the full class, detector, and other standard FMA fields. | |
177 | */ | |
178 | (void) snprintf(class, sizeof (class), "%s.%s", | |
179 | ZFS_ERROR_CLASS, subclass); | |
180 | ||
181 | fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), | |
182 | vd != NULL ? vd->vdev_guid : 0); | |
183 | ||
184 | fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); | |
185 | ||
186 | /* | |
187 | * Construct the per-ereport payload, depending on which parameters are | |
188 | * passed in. | |
189 | */ | |
190 | ||
b128c09f BB |
191 | /* |
192 | * If we are importing a faulted pool, then we treat it like an open, | |
193 | * not an import. Otherwise, the DE will ignore all faults during | |
194 | * import, since the default behavior is to mark the devices as | |
195 | * persistently unavailable, not leave them in the faulted state. | |
196 | */ | |
197 | state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; | |
198 | ||
34dc7c2f BB |
199 | /* |
200 | * Generic payload members common to all ereports. | |
34dc7c2f BB |
201 | */ |
202 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, | |
b128c09f | 203 | DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, |
34dc7c2f BB |
204 | DATA_TYPE_UINT64, spa_guid(spa), |
205 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, | |
b128c09f BB |
206 | state, NULL); |
207 | ||
208 | if (spa != NULL) { | |
209 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, | |
210 | DATA_TYPE_STRING, | |
211 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? | |
212 | FM_EREPORT_FAILMODE_WAIT : | |
213 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? | |
214 | FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, | |
215 | NULL); | |
216 | } | |
34dc7c2f BB |
217 | |
218 | if (vd != NULL) { | |
219 | vdev_t *pvd = vd->vdev_parent; | |
220 | ||
221 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, | |
222 | DATA_TYPE_UINT64, vd->vdev_guid, | |
223 | FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, | |
224 | DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); | |
225 | if (vd->vdev_path) | |
226 | fm_payload_set(ereport, | |
227 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, | |
228 | DATA_TYPE_STRING, vd->vdev_path, NULL); | |
229 | if (vd->vdev_devid) | |
230 | fm_payload_set(ereport, | |
231 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, | |
232 | DATA_TYPE_STRING, vd->vdev_devid, NULL); | |
233 | ||
234 | if (pvd != NULL) { | |
235 | fm_payload_set(ereport, | |
236 | FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, | |
237 | DATA_TYPE_UINT64, pvd->vdev_guid, | |
238 | FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, | |
239 | DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, | |
240 | NULL); | |
241 | if (pvd->vdev_path) | |
242 | fm_payload_set(ereport, | |
243 | FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, | |
244 | DATA_TYPE_STRING, pvd->vdev_path, NULL); | |
245 | if (pvd->vdev_devid) | |
246 | fm_payload_set(ereport, | |
247 | FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, | |
248 | DATA_TYPE_STRING, pvd->vdev_devid, NULL); | |
249 | } | |
250 | } | |
251 | ||
252 | if (zio != NULL) { | |
253 | /* | |
254 | * Payload common to all I/Os. | |
255 | */ | |
256 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, | |
257 | DATA_TYPE_INT32, zio->io_error, NULL); | |
258 | ||
259 | /* | |
260 | * If the 'size' parameter is non-zero, it indicates this is a | |
261 | * RAID-Z or other I/O where the physical offset and length are | |
262 | * provided for us, instead of within the zio_t. | |
263 | */ | |
264 | if (vd != NULL) { | |
265 | if (size) | |
266 | fm_payload_set(ereport, | |
267 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
268 | DATA_TYPE_UINT64, stateoroffset, | |
269 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
270 | DATA_TYPE_UINT64, size, NULL); | |
271 | else | |
272 | fm_payload_set(ereport, | |
273 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
274 | DATA_TYPE_UINT64, zio->io_offset, | |
275 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
276 | DATA_TYPE_UINT64, zio->io_size, NULL); | |
277 | } | |
278 | ||
279 | /* | |
280 | * Payload for I/Os with corresponding logical information. | |
281 | */ | |
282 | if (zio->io_logical != NULL) | |
283 | fm_payload_set(ereport, | |
284 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, | |
285 | DATA_TYPE_UINT64, | |
286 | zio->io_logical->io_bookmark.zb_objset, | |
287 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, | |
288 | DATA_TYPE_UINT64, | |
289 | zio->io_logical->io_bookmark.zb_object, | |
290 | FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, | |
291 | DATA_TYPE_INT64, | |
292 | zio->io_logical->io_bookmark.zb_level, | |
293 | FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, | |
294 | DATA_TYPE_UINT64, | |
295 | zio->io_logical->io_bookmark.zb_blkid, NULL); | |
296 | } else if (vd != NULL) { | |
297 | /* | |
298 | * If we have a vdev but no zio, this is a device fault, and the | |
299 | * 'stateoroffset' parameter indicates the previous state of the | |
300 | * vdev. | |
301 | */ | |
302 | fm_payload_set(ereport, | |
303 | FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, | |
304 | DATA_TYPE_UINT64, stateoroffset, NULL); | |
305 | } | |
306 | mutex_exit(&spa->spa_errlist_lock); | |
307 | ||
308 | fm_ereport_post(ereport, EVCH_SLEEP); | |
309 | ||
310 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
311 | fm_nvlist_destroy(detector, FM_NVA_FREE); | |
312 | #endif | |
313 | } | |
314 | ||
315 | static void | |
316 | zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) | |
317 | { | |
318 | #ifdef _KERNEL | |
319 | nvlist_t *resource; | |
320 | char class[64]; | |
321 | ||
322 | if ((resource = fm_nvlist_create(NULL)) == NULL) | |
323 | return; | |
324 | ||
325 | (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, | |
326 | ZFS_ERROR_CLASS, name); | |
327 | VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); | |
328 | VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); | |
329 | VERIFY(nvlist_add_uint64(resource, | |
330 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); | |
331 | if (vd) | |
332 | VERIFY(nvlist_add_uint64(resource, | |
333 | FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); | |
334 | ||
335 | fm_ereport_post(resource, EVCH_SLEEP); | |
336 | ||
337 | fm_nvlist_destroy(resource, FM_NVA_FREE); | |
338 | #endif | |
339 | } | |
340 | ||
34dc7c2f BB |
341 | /* |
342 | * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev | |
343 | * has been removed from the system. This will cause the DE to ignore any | |
344 | * recent I/O errors, inferring that they are due to the asynchronous device | |
345 | * removal. | |
346 | */ | |
347 | void | |
348 | zfs_post_remove(spa_t *spa, vdev_t *vd) | |
349 | { | |
350 | zfs_post_common(spa, vd, FM_RESOURCE_REMOVED); | |
351 | } | |
352 | ||
353 | /* | |
354 | * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool | |
355 | * has the 'autoreplace' property set, and therefore any broken vdevs will be | |
356 | * handled by higher level logic, and no vdev fault should be generated. | |
357 | */ | |
358 | void | |
359 | zfs_post_autoreplace(spa_t *spa, vdev_t *vd) | |
360 | { | |
361 | zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); | |
362 | } |