]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/spa.h> |
27 | #include <sys/spa_impl.h> | |
28 | #include <sys/vdev.h> | |
29 | #include <sys/vdev_impl.h> | |
30 | #include <sys/zio.h> | |
428870ff | 31 | #include <sys/zio_checksum.h> |
34dc7c2f BB |
32 | |
33 | #include <sys/fm/fs/zfs.h> | |
34 | #include <sys/fm/protocol.h> | |
35 | #include <sys/fm/util.h> | |
36 | #include <sys/sysevent.h> | |
37 | ||
38 | /* | |
39 | * This general routine is responsible for generating all the different ZFS | |
40 | * ereports. The payload is dependent on the class, and which arguments are | |
41 | * supplied to the function: | |
42 | * | |
43 | * EREPORT POOL VDEV IO | |
44 | * block X X X | |
45 | * data X X | |
46 | * device X X | |
47 | * pool X | |
48 | * | |
49 | * If we are in a loading state, all errors are chained together by the same | |
b128c09f | 50 | * SPA-wide ENA (Error Numeric Association). |
34dc7c2f BB |
51 | * |
52 | * For isolated I/O requests, we get the ENA from the zio_t. The propagation | |
53 | * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want | |
54 | * to chain together all ereports associated with a logical piece of data. For | |
55 | * read I/Os, there are basically three 'types' of I/O, which form a roughly | |
56 | * layered diagram: | |
57 | * | |
58 | * +---------------+ | |
59 | * | Aggregate I/O | No associated logical data or device | |
60 | * +---------------+ | |
61 | * | | |
62 | * V | |
63 | * +---------------+ Reads associated with a piece of logical data. | |
64 | * | Read I/O | This includes reads on behalf of RAID-Z, | |
65 | * +---------------+ mirrors, gang blocks, retries, etc. | |
66 | * | | |
67 | * V | |
68 | * +---------------+ Reads associated with a particular device, but | |
69 | * | Physical I/O | no logical data. Issued as part of vdev caching | |
70 | * +---------------+ and I/O aggregation. | |
71 | * | |
72 | * Note that 'physical I/O' here is not the same terminology as used in the rest | |
73 | * of ZIO. Typically, 'physical I/O' simply means that there is no attached | |
74 | * blockpointer. But I/O with no associated block pointer can still be related | |
75 | * to a logical piece of data (i.e. RAID-Z requests). | |
76 | * | |
77 | * Purely physical I/O always have unique ENAs. They are not related to a | |
78 | * particular piece of logical data, and therefore cannot be chained together. | |
79 | * We still generate an ereport, but the DE doesn't correlate it with any | |
80 | * logical piece of data. When such an I/O fails, the delegated I/O requests | |
81 | * will issue a retry, which will trigger the 'real' ereport with the correct | |
82 | * ENA. | |
83 | * | |
84 | * We keep track of the ENA for a ZIO chain through the 'io_logical' member. | |
85 | * When a new logical I/O is issued, we set this to point to itself. Child I/Os | |
86 | * then inherit this pointer, so that when it is first set subsequent failures | |
b128c09f BB |
87 | * will use the same ENA. For vdev cache fill and queue aggregation I/O, |
88 | * this pointer is set to NULL, and no ereport will be generated (since it | |
89 | * doesn't actually correspond to any particular device or piece of data, | |
90 | * and the caller will always retry without caching or queueing anyway). | |
428870ff BB |
91 | * |
92 | * For checksum errors, we want to include more information about the actual | |
93 | * error which occurs. Accordingly, we build an ereport when the error is | |
94 | * noticed, but instead of sending it in immediately, we hang it off of the | |
95 | * io_cksum_report field of the logical IO. When the logical IO completes | |
96 | * (successfully or not), zfs_ereport_finish_checksum() is called with the | |
97 | * good and bad versions of the buffer (if available), and we annotate the | |
98 | * ereport with information about the differences. | |
34dc7c2f | 99 | */ |
428870ff | 100 | #ifdef _KERNEL |
26685276 BB |
101 | static void |
102 | zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) | |
103 | { | |
104 | if (nvl) | |
105 | fm_nvlist_destroy(nvl, FM_NVA_FREE); | |
106 | ||
107 | if (detector) | |
108 | fm_nvlist_destroy(detector, FM_NVA_FREE); | |
109 | } | |
110 | ||
428870ff BB |
111 | static void |
112 | zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, | |
113 | const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, | |
34dc7c2f BB |
114 | uint64_t stateoroffset, uint64_t size) |
115 | { | |
34dc7c2f | 116 | nvlist_t *ereport, *detector; |
428870ff | 117 | |
34dc7c2f BB |
118 | uint64_t ena; |
119 | char class[64]; | |
120 | ||
121 | /* | |
428870ff BB |
122 | * If we are doing a spa_tryimport() or in recovery mode, |
123 | * ignore errors. | |
34dc7c2f | 124 | */ |
428870ff BB |
125 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || |
126 | spa_load_state(spa) == SPA_LOAD_RECOVER) | |
34dc7c2f BB |
127 | return; |
128 | ||
129 | /* | |
130 | * If we are in the middle of opening a pool, and the previous attempt | |
131 | * failed, don't bother logging any new ereports - we're just going to | |
132 | * get the same diagnosis anyway. | |
133 | */ | |
428870ff | 134 | if (spa_load_state(spa) != SPA_LOAD_NONE && |
34dc7c2f BB |
135 | spa->spa_last_open_failed) |
136 | return; | |
137 | ||
b128c09f BB |
138 | if (zio != NULL) { |
139 | /* | |
140 | * If this is not a read or write zio, ignore the error. This | |
141 | * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. | |
142 | */ | |
143 | if (zio->io_type != ZIO_TYPE_READ && | |
144 | zio->io_type != ZIO_TYPE_WRITE) | |
145 | return; | |
34dc7c2f | 146 | |
9babb374 BB |
147 | if (vd != NULL) { |
148 | /* | |
149 | * If the vdev has already been marked as failing due | |
150 | * to a failed probe, then ignore any subsequent I/O | |
151 | * errors, as the DE will automatically fault the vdev | |
152 | * on the first such failure. This also catches cases | |
153 | * where vdev_remove_wanted is set and the device has | |
154 | * not yet been asynchronously placed into the REMOVED | |
155 | * state. | |
156 | */ | |
428870ff | 157 | if (zio->io_vd == vd && !vdev_accessible(vd, zio)) |
9babb374 BB |
158 | return; |
159 | ||
160 | /* | |
161 | * Ignore checksum errors for reads from DTL regions of | |
162 | * leaf vdevs. | |
163 | */ | |
164 | if (zio->io_type == ZIO_TYPE_READ && | |
165 | zio->io_error == ECKSUM && | |
166 | vd->vdev_ops->vdev_op_leaf && | |
167 | vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) | |
168 | return; | |
169 | } | |
b128c09f | 170 | } |
34dc7c2f | 171 | |
428870ff BB |
172 | /* |
173 | * For probe failure, we want to avoid posting ereports if we've | |
174 | * already removed the device in the meantime. | |
175 | */ | |
176 | if (vd != NULL && | |
177 | strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && | |
178 | (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) | |
179 | return; | |
180 | ||
34dc7c2f BB |
181 | if ((ereport = fm_nvlist_create(NULL)) == NULL) |
182 | return; | |
183 | ||
184 | if ((detector = fm_nvlist_create(NULL)) == NULL) { | |
185 | fm_nvlist_destroy(ereport, FM_NVA_FREE); | |
186 | return; | |
187 | } | |
188 | ||
189 | /* | |
190 | * Serialize ereport generation | |
191 | */ | |
192 | mutex_enter(&spa->spa_errlist_lock); | |
193 | ||
194 | /* | |
195 | * Determine the ENA to use for this event. If we are in a loading | |
196 | * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use | |
197 | * a root zio-wide ENA. Otherwise, simply use a unique ENA. | |
198 | */ | |
428870ff | 199 | if (spa_load_state(spa) != SPA_LOAD_NONE) { |
34dc7c2f BB |
200 | if (spa->spa_ena == 0) |
201 | spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); | |
202 | ena = spa->spa_ena; | |
203 | } else if (zio != NULL && zio->io_logical != NULL) { | |
204 | if (zio->io_logical->io_ena == 0) | |
205 | zio->io_logical->io_ena = | |
206 | fm_ena_generate(0, FM_ENA_FMT1); | |
207 | ena = zio->io_logical->io_ena; | |
208 | } else { | |
209 | ena = fm_ena_generate(0, FM_ENA_FMT1); | |
210 | } | |
211 | ||
212 | /* | |
213 | * Construct the full class, detector, and other standard FMA fields. | |
214 | */ | |
215 | (void) snprintf(class, sizeof (class), "%s.%s", | |
216 | ZFS_ERROR_CLASS, subclass); | |
217 | ||
218 | fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), | |
219 | vd != NULL ? vd->vdev_guid : 0); | |
220 | ||
221 | fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); | |
222 | ||
223 | /* | |
224 | * Construct the per-ereport payload, depending on which parameters are | |
225 | * passed in. | |
226 | */ | |
227 | ||
228 | /* | |
229 | * Generic payload members common to all ereports. | |
34dc7c2f BB |
230 | */ |
231 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, | |
b128c09f | 232 | DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, |
34dc7c2f BB |
233 | DATA_TYPE_UINT64, spa_guid(spa), |
234 | FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, | |
428870ff | 235 | spa_load_state(spa), NULL); |
b128c09f BB |
236 | |
237 | if (spa != NULL) { | |
238 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, | |
239 | DATA_TYPE_STRING, | |
240 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? | |
241 | FM_EREPORT_FAILMODE_WAIT : | |
242 | spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? | |
243 | FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, | |
244 | NULL); | |
245 | } | |
34dc7c2f BB |
246 | |
247 | if (vd != NULL) { | |
248 | vdev_t *pvd = vd->vdev_parent; | |
249 | ||
250 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, | |
251 | DATA_TYPE_UINT64, vd->vdev_guid, | |
252 | FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, | |
253 | DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); | |
9babb374 | 254 | if (vd->vdev_path != NULL) |
34dc7c2f BB |
255 | fm_payload_set(ereport, |
256 | FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, | |
257 | DATA_TYPE_STRING, vd->vdev_path, NULL); | |
9babb374 | 258 | if (vd->vdev_devid != NULL) |
34dc7c2f BB |
259 | fm_payload_set(ereport, |
260 | FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, | |
261 | DATA_TYPE_STRING, vd->vdev_devid, NULL); | |
9babb374 BB |
262 | if (vd->vdev_fru != NULL) |
263 | fm_payload_set(ereport, | |
264 | FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, | |
265 | DATA_TYPE_STRING, vd->vdev_fru, NULL); | |
34dc7c2f BB |
266 | |
267 | if (pvd != NULL) { | |
268 | fm_payload_set(ereport, | |
269 | FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, | |
270 | DATA_TYPE_UINT64, pvd->vdev_guid, | |
271 | FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, | |
272 | DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, | |
273 | NULL); | |
274 | if (pvd->vdev_path) | |
275 | fm_payload_set(ereport, | |
276 | FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, | |
277 | DATA_TYPE_STRING, pvd->vdev_path, NULL); | |
278 | if (pvd->vdev_devid) | |
279 | fm_payload_set(ereport, | |
280 | FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, | |
281 | DATA_TYPE_STRING, pvd->vdev_devid, NULL); | |
282 | } | |
283 | } | |
284 | ||
285 | if (zio != NULL) { | |
286 | /* | |
287 | * Payload common to all I/Os. | |
288 | */ | |
289 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, | |
290 | DATA_TYPE_INT32, zio->io_error, NULL); | |
312c07ed BB |
291 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, |
292 | DATA_TYPE_INT32, zio->io_flags, NULL); | |
a69052be BB |
293 | fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, |
294 | DATA_TYPE_UINT64, zio->io_delay, NULL); | |
34dc7c2f BB |
295 | |
296 | /* | |
297 | * If the 'size' parameter is non-zero, it indicates this is a | |
298 | * RAID-Z or other I/O where the physical offset and length are | |
299 | * provided for us, instead of within the zio_t. | |
300 | */ | |
301 | if (vd != NULL) { | |
302 | if (size) | |
303 | fm_payload_set(ereport, | |
304 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
305 | DATA_TYPE_UINT64, stateoroffset, | |
306 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
307 | DATA_TYPE_UINT64, size, NULL); | |
308 | else | |
309 | fm_payload_set(ereport, | |
310 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, | |
311 | DATA_TYPE_UINT64, zio->io_offset, | |
312 | FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, | |
313 | DATA_TYPE_UINT64, zio->io_size, NULL); | |
314 | } | |
315 | ||
316 | /* | |
317 | * Payload for I/Os with corresponding logical information. | |
318 | */ | |
319 | if (zio->io_logical != NULL) | |
320 | fm_payload_set(ereport, | |
321 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, | |
322 | DATA_TYPE_UINT64, | |
323 | zio->io_logical->io_bookmark.zb_objset, | |
324 | FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, | |
325 | DATA_TYPE_UINT64, | |
326 | zio->io_logical->io_bookmark.zb_object, | |
327 | FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, | |
328 | DATA_TYPE_INT64, | |
329 | zio->io_logical->io_bookmark.zb_level, | |
330 | FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, | |
331 | DATA_TYPE_UINT64, | |
332 | zio->io_logical->io_bookmark.zb_blkid, NULL); | |
333 | } else if (vd != NULL) { | |
334 | /* | |
335 | * If we have a vdev but no zio, this is a device fault, and the | |
336 | * 'stateoroffset' parameter indicates the previous state of the | |
337 | * vdev. | |
338 | */ | |
339 | fm_payload_set(ereport, | |
340 | FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, | |
341 | DATA_TYPE_UINT64, stateoroffset, NULL); | |
342 | } | |
428870ff | 343 | |
34dc7c2f BB |
344 | mutex_exit(&spa->spa_errlist_lock); |
345 | ||
428870ff BB |
346 | *ereport_out = ereport; |
347 | *detector_out = detector; | |
348 | } | |
349 | ||
/* if it's <= 128 bytes, save the corruption directly */
#define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))

/* maximum number of distinct bad ranges tracked per checksum ereport */
#define	MAX_RANGES		16

/*
 * Working state used while comparing the good and bad copies of a buffer
 * for a checksum ereport.
 */
typedef struct zfs_ecksum_info {
	/* per-bit-position counts of set and cleared bits in 64-bit words */
	uint16_t zei_histogram_set[NBBY * sizeof (uint64_t)];
	uint16_t zei_histogram_cleared[NBBY * sizeof (uint64_t)];

	/* the set and cleared bit masks themselves, stored inline */
	uint64_t zei_bits_set[ZFM_MAX_INLINE];
	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];

	/*
	 * Bits set and bits cleared, counted per range.  Summing every
	 * entry gives the Hamming distance between the good and bad buffers.
	 */
	uint32_t zei_range_sets[MAX_RANGES];
	uint32_t zei_range_clears[MAX_RANGES];

	/* start/end pair for each range of differing data */
	struct zei_ranges {
		uint32_t	zr_start;
		uint32_t	zr_end;
	} zei_ranges[MAX_RANGES];

	size_t		zei_range_count;	/* entries used in zei_ranges */
	uint32_t	zei_mingap;		/* smallest gap between ranges */
	uint32_t	zei_allowed_mingap;	/* smaller gaps get merged */

} zfs_ecksum_info_t;
381 | ||
/*
 * Fold one 64-bit mask into a per-bit histogram.
 *
 *	value_arg	mask whose set bits are to be counted
 *	hist		64-entry histogram; one slot is bumped per set bit
 *	count		running total, increased by the number of set bits
 */
static void
update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
{
	uint64_t be_value = BE_64(value_arg);
	size_t nbits = 0;
	size_t bit;

	/* We store the bits in big-endian (largest-first) order */
	for (bit = 0; bit < 64; bit++) {
		if ((be_value & (1ull << bit)) == 0)
			continue;

		hist[63 - bit]++;
		nbits++;
	}

	/* update the count of bits changed */
	*count += nbits;
}
399 | ||
400 | /* | |
401 | * We've now filled up the range array, and need to increase "mingap" and | |
402 | * shrink the range list accordingly. zei_mingap is always the smallest | |
403 | * distance between array entries, so we set the new_allowed_gap to be | |
404 | * one greater than that. We then go through the list, joining together | |
405 | * any ranges which are closer than the new_allowed_gap. | |
406 | * | |
407 | * By construction, there will be at least one. We also update zei_mingap | |
408 | * to the new smallest gap, to prepare for our next invocation. | |
409 | */ | |
410 | static void | |
26685276 | 411 | zei_shrink_ranges(zfs_ecksum_info_t *eip) |
428870ff BB |
412 | { |
413 | uint32_t mingap = UINT32_MAX; | |
414 | uint32_t new_allowed_gap = eip->zei_mingap + 1; | |
415 | ||
416 | size_t idx, output; | |
417 | size_t max = eip->zei_range_count; | |
418 | ||
419 | struct zei_ranges *r = eip->zei_ranges; | |
420 | ||
421 | ASSERT3U(eip->zei_range_count, >, 0); | |
422 | ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); | |
423 | ||
424 | output = idx = 0; | |
425 | while (idx < max - 1) { | |
426 | uint32_t start = r[idx].zr_start; | |
427 | uint32_t end = r[idx].zr_end; | |
428 | ||
429 | while (idx < max - 1) { | |
26685276 | 430 | uint32_t nstart, nend, gap; |
428870ff | 431 | |
26685276 BB |
432 | idx++; |
433 | nstart = r[idx].zr_start; | |
434 | nend = r[idx].zr_end; | |
428870ff | 435 | |
26685276 | 436 | gap = nstart - end; |
428870ff BB |
437 | if (gap < new_allowed_gap) { |
438 | end = nend; | |
439 | continue; | |
440 | } | |
441 | if (gap < mingap) | |
442 | mingap = gap; | |
443 | break; | |
444 | } | |
445 | r[output].zr_start = start; | |
446 | r[output].zr_end = end; | |
447 | output++; | |
448 | } | |
449 | ASSERT3U(output, <, eip->zei_range_count); | |
450 | eip->zei_range_count = output; | |
451 | eip->zei_mingap = mingap; | |
452 | eip->zei_allowed_mingap = new_allowed_gap; | |
453 | } | |
454 | ||
455 | static void | |
26685276 | 456 | zei_add_range(zfs_ecksum_info_t *eip, int start, int end) |
428870ff BB |
457 | { |
458 | struct zei_ranges *r = eip->zei_ranges; | |
459 | size_t count = eip->zei_range_count; | |
460 | ||
461 | if (count >= MAX_RANGES) { | |
26685276 | 462 | zei_shrink_ranges(eip); |
428870ff BB |
463 | count = eip->zei_range_count; |
464 | } | |
465 | if (count == 0) { | |
466 | eip->zei_mingap = UINT32_MAX; | |
467 | eip->zei_allowed_mingap = 1; | |
468 | } else { | |
469 | int gap = start - r[count - 1].zr_end; | |
470 | ||
471 | if (gap < eip->zei_allowed_mingap) { | |
472 | r[count - 1].zr_end = end; | |
473 | return; | |
474 | } | |
475 | if (gap < eip->zei_mingap) | |
476 | eip->zei_mingap = gap; | |
477 | } | |
478 | r[count].zr_start = start; | |
479 | r[count].zr_end = end; | |
480 | eip->zei_range_count++; | |
481 | } | |
482 | ||
483 | static size_t | |
26685276 | 484 | zei_range_total_size(zfs_ecksum_info_t *eip) |
428870ff BB |
485 | { |
486 | struct zei_ranges *r = eip->zei_ranges; | |
487 | size_t count = eip->zei_range_count; | |
488 | size_t result = 0; | |
489 | size_t idx; | |
490 | ||
491 | for (idx = 0; idx < count; idx++) | |
492 | result += (r[idx].zr_end - r[idx].zr_start); | |
493 | ||
494 | return (result); | |
495 | } | |
496 | ||
497 | static zfs_ecksum_info_t * | |
498 | annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, | |
499 | const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, | |
500 | boolean_t drop_if_identical) | |
501 | { | |
502 | const uint64_t *good = (const uint64_t *)goodbuf; | |
503 | const uint64_t *bad = (const uint64_t *)badbuf; | |
504 | ||
505 | uint64_t allset = 0; | |
506 | uint64_t allcleared = 0; | |
507 | ||
508 | size_t nui64s = size / sizeof (uint64_t); | |
509 | ||
510 | size_t inline_size; | |
511 | int no_inline = 0; | |
512 | size_t idx; | |
513 | size_t range; | |
514 | ||
515 | size_t offset = 0; | |
516 | ssize_t start = -1; | |
517 | ||
518 | zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); | |
519 | ||
520 | /* don't do any annotation for injected checksum errors */ | |
521 | if (info != NULL && info->zbc_injected) | |
522 | return (eip); | |
523 | ||
524 | if (info != NULL && info->zbc_has_cksum) { | |
525 | fm_payload_set(ereport, | |
526 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, | |
527 | DATA_TYPE_UINT64_ARRAY, | |
528 | sizeof (info->zbc_expected) / sizeof (uint64_t), | |
529 | (uint64_t *)&info->zbc_expected, | |
530 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, | |
531 | DATA_TYPE_UINT64_ARRAY, | |
532 | sizeof (info->zbc_actual) / sizeof (uint64_t), | |
533 | (uint64_t *)&info->zbc_actual, | |
534 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, | |
535 | DATA_TYPE_STRING, | |
536 | info->zbc_checksum_name, | |
537 | NULL); | |
538 | ||
539 | if (info->zbc_byteswapped) { | |
540 | fm_payload_set(ereport, | |
541 | FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, | |
542 | DATA_TYPE_BOOLEAN, 1, | |
543 | NULL); | |
544 | } | |
545 | } | |
546 | ||
547 | if (badbuf == NULL || goodbuf == NULL) | |
548 | return (eip); | |
549 | ||
550 | ASSERT3U(nui64s, <=, UINT16_MAX); | |
551 | ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); | |
552 | ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); | |
553 | ASSERT3U(size, <=, UINT32_MAX); | |
554 | ||
555 | /* build up the range list by comparing the two buffers. */ | |
556 | for (idx = 0; idx < nui64s; idx++) { | |
557 | if (good[idx] == bad[idx]) { | |
558 | if (start == -1) | |
559 | continue; | |
560 | ||
26685276 | 561 | zei_add_range(eip, start, idx); |
428870ff BB |
562 | start = -1; |
563 | } else { | |
564 | if (start != -1) | |
565 | continue; | |
566 | ||
567 | start = idx; | |
568 | } | |
569 | } | |
570 | if (start != -1) | |
26685276 | 571 | zei_add_range(eip, start, idx); |
428870ff BB |
572 | |
573 | /* See if it will fit in our inline buffers */ | |
26685276 | 574 | inline_size = zei_range_total_size(eip); |
428870ff BB |
575 | if (inline_size > ZFM_MAX_INLINE) |
576 | no_inline = 1; | |
577 | ||
578 | /* | |
579 | * If there is no change and we want to drop if the buffers are | |
580 | * identical, do so. | |
581 | */ | |
582 | if (inline_size == 0 && drop_if_identical) { | |
583 | kmem_free(eip, sizeof (*eip)); | |
584 | return (NULL); | |
585 | } | |
586 | ||
587 | /* | |
588 | * Now walk through the ranges, filling in the details of the | |
589 | * differences. Also convert our uint64_t-array offsets to byte | |
590 | * offsets. | |
591 | */ | |
592 | for (range = 0; range < eip->zei_range_count; range++) { | |
593 | size_t start = eip->zei_ranges[range].zr_start; | |
594 | size_t end = eip->zei_ranges[range].zr_end; | |
595 | ||
596 | for (idx = start; idx < end; idx++) { | |
597 | uint64_t set, cleared; | |
598 | ||
599 | // bits set in bad, but not in good | |
600 | set = ((~good[idx]) & bad[idx]); | |
601 | // bits set in good, but not in bad | |
602 | cleared = (good[idx] & (~bad[idx])); | |
603 | ||
604 | allset |= set; | |
605 | allcleared |= cleared; | |
606 | ||
607 | if (!no_inline) { | |
608 | ASSERT3U(offset, <, inline_size); | |
609 | eip->zei_bits_set[offset] = set; | |
610 | eip->zei_bits_cleared[offset] = cleared; | |
611 | offset++; | |
612 | } | |
613 | ||
614 | update_histogram(set, eip->zei_histogram_set, | |
615 | &eip->zei_range_sets[range]); | |
616 | update_histogram(cleared, eip->zei_histogram_cleared, | |
617 | &eip->zei_range_clears[range]); | |
618 | } | |
619 | ||
620 | /* convert to byte offsets */ | |
621 | eip->zei_ranges[range].zr_start *= sizeof (uint64_t); | |
622 | eip->zei_ranges[range].zr_end *= sizeof (uint64_t); | |
623 | } | |
624 | eip->zei_allowed_mingap *= sizeof (uint64_t); | |
625 | inline_size *= sizeof (uint64_t); | |
626 | ||
627 | /* fill in ereport */ | |
628 | fm_payload_set(ereport, | |
629 | FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, | |
630 | DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, | |
631 | (uint32_t *)eip->zei_ranges, | |
632 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, | |
633 | DATA_TYPE_UINT32, eip->zei_allowed_mingap, | |
634 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, | |
635 | DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, | |
636 | FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, | |
637 | DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, | |
638 | NULL); | |
639 | ||
640 | if (!no_inline) { | |
641 | fm_payload_set(ereport, | |
642 | FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, | |
643 | DATA_TYPE_UINT8_ARRAY, | |
644 | inline_size, (uint8_t *)eip->zei_bits_set, | |
645 | FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, | |
646 | DATA_TYPE_UINT8_ARRAY, | |
647 | inline_size, (uint8_t *)eip->zei_bits_cleared, | |
648 | NULL); | |
649 | } else { | |
650 | fm_payload_set(ereport, | |
651 | FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, | |
652 | DATA_TYPE_UINT16_ARRAY, | |
653 | NBBY * sizeof (uint64_t), eip->zei_histogram_set, | |
654 | FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, | |
655 | DATA_TYPE_UINT16_ARRAY, | |
656 | NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, | |
657 | NULL); | |
658 | } | |
659 | return (eip); | |
660 | } | |
661 | #endif | |
662 | ||
663 | void | |
664 | zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, | |
665 | uint64_t stateoroffset, uint64_t size) | |
666 | { | |
667 | #ifdef _KERNEL | |
668 | nvlist_t *ereport = NULL; | |
669 | nvlist_t *detector = NULL; | |
670 | ||
671 | zfs_ereport_start(&ereport, &detector, | |
672 | subclass, spa, vd, zio, stateoroffset, size); | |
673 | ||
674 | if (ereport == NULL) | |
675 | return; | |
676 | ||
26685276 BB |
677 | /* Cleanup is handled by the callback function */ |
678 | zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); | |
34dc7c2f BB |
679 | #endif |
680 | } | |
681 | ||
428870ff BB |
682 | void |
683 | zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, | |
684 | struct zio *zio, uint64_t offset, uint64_t length, void *arg, | |
685 | zio_bad_cksum_t *info) | |
686 | { | |
687 | zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); | |
688 | ||
689 | if (zio->io_vsd != NULL) | |
690 | zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); | |
691 | else | |
692 | zio_vsd_default_cksum_report(zio, report, arg); | |
693 | ||
694 | /* copy the checksum failure information if it was provided */ | |
695 | if (info != NULL) { | |
696 | report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); | |
697 | bcopy(info, report->zcr_ckinfo, sizeof (*info)); | |
698 | } | |
699 | ||
700 | report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; | |
701 | report->zcr_length = length; | |
702 | ||
703 | #ifdef _KERNEL | |
704 | zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, | |
705 | FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); | |
706 | ||
707 | if (report->zcr_ereport == NULL) { | |
708 | report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); | |
709 | kmem_free(report, sizeof (*report)); | |
710 | return; | |
711 | } | |
712 | #endif | |
713 | ||
714 | mutex_enter(&spa->spa_errlist_lock); | |
715 | report->zcr_next = zio->io_logical->io_cksum_report; | |
716 | zio->io_logical->io_cksum_report = report; | |
717 | mutex_exit(&spa->spa_errlist_lock); | |
718 | } | |
719 | ||
720 | void | |
721 | zfs_ereport_finish_checksum(zio_cksum_report_t *report, | |
722 | const void *good_data, const void *bad_data, boolean_t drop_if_identical) | |
723 | { | |
724 | #ifdef _KERNEL | |
725 | zfs_ecksum_info_t *info = NULL; | |
726 | info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, | |
727 | good_data, bad_data, report->zcr_length, drop_if_identical); | |
728 | ||
729 | if (info != NULL) | |
26685276 BB |
730 | zfs_zevent_post(report->zcr_ereport, |
731 | report->zcr_detector, zfs_zevent_post_cb); | |
428870ff | 732 | |
428870ff | 733 | report->zcr_ereport = report->zcr_detector = NULL; |
428870ff BB |
734 | if (info != NULL) |
735 | kmem_free(info, sizeof (*info)); | |
736 | #endif | |
737 | } | |
738 | ||
739 | void | |
740 | zfs_ereport_free_checksum(zio_cksum_report_t *rpt) | |
741 | { | |
742 | #ifdef _KERNEL | |
743 | if (rpt->zcr_ereport != NULL) { | |
744 | fm_nvlist_destroy(rpt->zcr_ereport, | |
745 | FM_NVA_FREE); | |
746 | fm_nvlist_destroy(rpt->zcr_detector, | |
747 | FM_NVA_FREE); | |
748 | } | |
749 | #endif | |
750 | rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); | |
751 | ||
752 | if (rpt->zcr_ckinfo != NULL) | |
753 | kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); | |
754 | ||
755 | kmem_free(rpt, sizeof (*rpt)); | |
756 | } | |
757 | ||
758 | void | |
759 | zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) | |
760 | { | |
761 | #ifdef _KERNEL | |
26685276 | 762 | zfs_zevent_post(report->zcr_ereport, report->zcr_detector, NULL); |
428870ff BB |
763 | #endif |
764 | } | |
765 | ||
766 | void | |
767 | zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, | |
768 | struct zio *zio, uint64_t offset, uint64_t length, | |
769 | const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) | |
770 | { | |
771 | #ifdef _KERNEL | |
772 | nvlist_t *ereport = NULL; | |
773 | nvlist_t *detector = NULL; | |
774 | zfs_ecksum_info_t *info; | |
775 | ||
776 | zfs_ereport_start(&ereport, &detector, | |
777 | FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); | |
778 | ||
779 | if (ereport == NULL) | |
780 | return; | |
781 | ||
782 | info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, | |
783 | B_FALSE); | |
784 | ||
26685276 BB |
785 | if (info != NULL) { |
786 | zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); | |
428870ff | 787 | kmem_free(info, sizeof (*info)); |
26685276 | 788 | } |
428870ff BB |
789 | #endif |
790 | } | |
791 | ||
34dc7c2f BB |
792 | static void |
793 | zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) | |
794 | { | |
795 | #ifdef _KERNEL | |
796 | nvlist_t *resource; | |
797 | char class[64]; | |
798 | ||
428870ff BB |
799 | if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) |
800 | return; | |
801 | ||
34dc7c2f BB |
802 | if ((resource = fm_nvlist_create(NULL)) == NULL) |
803 | return; | |
804 | ||
805 | (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, | |
806 | ZFS_ERROR_CLASS, name); | |
807 | VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); | |
808 | VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); | |
809 | VERIFY(nvlist_add_uint64(resource, | |
810 | FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); | |
26685276 | 811 | if (vd) { |
34dc7c2f BB |
812 | VERIFY(nvlist_add_uint64(resource, |
813 | FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); | |
26685276 BB |
814 | VERIFY(nvlist_add_uint64(resource, |
815 | FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state) == 0); | |
816 | } | |
34dc7c2f | 817 | |
26685276 | 818 | zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); |
34dc7c2f BB |
819 | #endif |
820 | } | |
821 | ||
34dc7c2f BB |
822 | /* |
823 | * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev | |
824 | * has been removed from the system. This will cause the DE to ignore any | |
825 | * recent I/O errors, inferring that they are due to the asynchronous device | |
826 | * removal. | |
827 | */ | |
828 | void | |
829 | zfs_post_remove(spa_t *spa, vdev_t *vd) | |
830 | { | |
26685276 | 831 | zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_REMOVED); |
34dc7c2f BB |
832 | } |
833 | ||
834 | /* | |
835 | * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool | |
836 | * has the 'autoreplace' property set, and therefore any broken vdevs will be | |
837 | * handled by higher level logic, and no vdev fault should be generated. | |
838 | */ | |
839 | void | |
840 | zfs_post_autoreplace(spa_t *spa, vdev_t *vd) | |
841 | { | |
26685276 | 842 | zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_AUTOREPLACE); |
34dc7c2f | 843 | } |
428870ff BB |
844 | |
845 | /* | |
846 | * The 'resource.fs.zfs.statechange' event is an internal signal that the | |
847 | * given vdev has transitioned its state to DEGRADED or HEALTHY. This will | |
848 | * cause the retire agent to repair any outstanding fault management cases | |
849 | * open because the device was not found (fault.fs.zfs.device). | |
850 | */ | |
851 | void | |
852 | zfs_post_state_change(spa_t *spa, vdev_t *vd) | |
853 | { | |
26685276 | 854 | zfs_post_common(spa, vd, FM_EREPORT_RESOURCE_STATECHANGE); |
428870ff | 855 | } |
26685276 BB |
856 | |
857 | #if defined(_KERNEL) && defined(HAVE_SPL) | |
858 | EXPORT_SYMBOL(zfs_ereport_post); | |
859 | EXPORT_SYMBOL(zfs_ereport_post_checksum); | |
860 | EXPORT_SYMBOL(zfs_post_remove); | |
861 | EXPORT_SYMBOL(zfs_post_autoreplace); | |
862 | EXPORT_SYMBOL(zfs_post_state_change); | |
863 | #endif /* _KERNEL */ |