/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, 2017, Intel Corporation.
 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 */

/*
 * ZFS syseventd module.
 *
 * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
 *
 * The purpose of this module is to identify when devices are added to the
 * system, and appropriately online or replace the affected vdevs.
 *
 * When a device is added to the system:
 *
 * 1. Search for any vdevs whose devid matches that of the newly added
 *    device.
 *
 * 2. If no vdevs are found, then search for any vdevs whose udev path
 *    matches that of the new device.
 *
 * 3. If no vdevs match by either method, then ignore the event.
 *
 * 4. Attempt to online the device with a flag to indicate that it should
 *    be unspared when resilvering completes.  If this succeeds, then the
 *    same device was inserted and we should continue normally.
 *
 * 5. If the pool does not have the 'autoreplace' property set, attempt to
 *    online the device again without the unspare flag, which will
 *    generate a FMA fault.
 *
 * 6. If the pool has the 'autoreplace' property set, and the matching vdev
 *    is a whole disk, then label the new disk and attempt a 'zpool
 *    replace'.
 *
 * The module responds to EC_DEV_ADD events.  The special ESC_ZFS_VDEV_CHECK
 * event indicates that a device failed to open during pool load, but the
 * autoreplace property was set.  In this case, we deferred the associated
 * FMA fault until our module had a chance to process the autoreplace logic.
 * If the device could not be replaced, then the second online attempt will
 * trigger the FMA fault that we skipped earlier.
 *
 * On Linux udev provides a disk insert for both the disk and the partition.
 */

#include <ctype.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libzfs.h>
#include <libzutil.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/list.h>
#include <sys/sunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <thread_pool.h>
#include <pthread.h>
#include <unistd.h>
#include <errno.h>
#include "zfs_agents.h"
#include "../zed_log.h"

#define	DEV_BYID_PATH	"/dev/disk/by-id/"
#define	DEV_BYPATH_PATH	"/dev/disk/by-path/"
#define	DEV_BYVDEV_PATH	"/dev/disk/by-vdev/"

typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

libzfs_handle_t *g_zfshdl;
list_t g_pool_list;	/* list of unavailable pools at initialization */
list_t g_device_list;	/* list of disks with asynchronous label request */
tpool_t *g_tpool;
boolean_t g_enumeration_done;
pthread_t g_zfs_tid;	/* zfs_enum_pools() thread */

typedef struct unavailpool {
	zpool_handle_t	*uap_zhp;
	list_node_t	uap_node;
} unavailpool_t;

typedef struct pendingdev {
	char		pd_physpath[128];
	list_node_t	pd_node;
} pendingdev_t;

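/*
 * Return the state of the pool's top-level vdev, as recorded in the
 * ZPOOL_CONFIG_VDEV_STATS array at the root of its vdev tree.
 */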
static int
zfs_toplevel_state(zpool_handle_t *zhp)
{
	nvlist_t *nvroot;
	vdev_stat_t *vs;
	unsigned int c;

	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	return (vs->vs_state);
}

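/*
 * zpool_iter() callback used during startup enumeration: if the pool's
 * top-level vdev is below VDEV_STATE_DEGRADED (i.e. it cannot be opened),
 * queue the pool on the list passed in 'data'; otherwise close the handle.
 */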
static int
zfs_unavail_pool(zpool_handle_t *zhp, void *data)
{
	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));

	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
		unavailpool_t *uap;
		uap = malloc(sizeof (unavailpool_t));
		if (uap == NULL) {
			perror("malloc");
			exit(EXIT_FAILURE);
		}

		uap->uap_zhp = zhp;
		list_insert_tail((list_t *)data, uap);
	} else {
		zpool_close(zhp);
	}
	return (0);
}

/*
 * Write an array of strings to the zed log
 */
static void lines_to_zed_log_msg(char **lines, int lines_cnt)
{
	int i;
	for (i = 0; i < lines_cnt; i++) {
		zed_log_msg(LOG_INFO, "%s", lines[i]);
	}
}

/*
 * Two-stage replace on Linux.
 *
 * Since we get a separate disk notification for the partition, we can wait
 * for the partitioned disk slice to show up.
 *
 * The first stage tags the disk, initiates async partitioning, and returns.
 * The second stage finds the tag and proceeds to the ZFS labeling/replace.
 *
 * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
 *
 * 1. physical match with no fs, no partition
 *	tag it top, partition disk
 *
 * 2. physical match again, see partition and tag
 */

/*
 * The device associated with the given vdev (either by devid or physical path)
 * has been added to the system.  If 'isdisk' is set, then we only attempt a
 * replacement if it's a whole disk.  This also implies that we should label
 * the disk first.
 *
 * First, we attempt to online the device (making sure to undo any spare
 * operation when finished).  If this succeeds, then we're done.  If it fails,
 * and the new state is VDEV_CANT_OPEN, it indicates that the device was
 * opened, but the label was not what we expected.  If the 'autoreplace'
 * property is enabled, then we relabel the disk (if specified), and attempt
 * a 'zpool replace'.  If the online is successful, but the new state is
 * something else (REMOVED or FAULTED), it indicates that we're out of sync
 * or in some sort of race, and we should avoid attempting to relabel the disk.
 *
 * We can also arrive here from an ESC_ZFS_VDEV_CHECK event.
 */
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
{
	const char *path;
	vdev_state_t newstate;
	nvlist_t *nvroot, *newvd;
	pendingdev_t *device;
	uint64_t wholedisk = 0ULL;
	uint64_t offline = 0ULL, faulted = 0ULL;
	uint64_t guid = 0ULL;
	uint64_t is_spare = 0;
	const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
	char rawpath[PATH_MAX], fullpath[PATH_MAX];
	char devpath[PATH_MAX];
	int ret;
	int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
	boolean_t is_sd = B_FALSE;
	boolean_t is_mpath_wholedisk = B_FALSE;
	uint_t c;
	vdev_stat_t *vs;
	char **lines = NULL;
	int lines_cnt = 0;

	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
		return;

	/* Skip healthy disks */
	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	if (vs->vs_state == VDEV_STATE_HEALTHY) {
		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
		    __func__, path);
		return;
	}

	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &enc_sysfs_path);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);

	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_IS_SPARE, &is_spare);

	/*
	 * Special case:
	 *
	 * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
	 * entry in its config.  For example, on this force-faulted disk:
	 *
	 *	children[0]:
	 *	   type: 'disk'
	 *	   id: 0
	 *	   guid: 14309659774640089719
	 *	   path: '/dev/disk/by-vdev/L28'
	 *	   whole_disk: 0
	 *	   DTL: 654
	 *	   create_txg: 4
	 *	   com.delphix:vdev_zap_leaf: 1161
	 *	   faulted: 1
	 *	   aux_state: 'external'
	 *	children[1]:
	 *	   type: 'disk'
	 *	   id: 1
	 *	   guid: 16002508084177980912
	 *	   path: '/dev/disk/by-vdev/L29'
	 *	   devid: 'dm-uuid-mpath-35000c500a61d68a3'
	 *	   phys_path: 'L29'
	 *	   vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
	 *	   whole_disk: 0
	 *	   DTL: 1028
	 *	   create_txg: 4
	 *	   com.delphix:vdev_zap_leaf: 131
	 *
	 * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
	 * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
	 */
	if (physpath == NULL && path != NULL) {
		/* If path begins with "/dev/disk/by-vdev/" ... */
		if (strncmp(path, DEV_BYVDEV_PATH,
		    strlen(DEV_BYVDEV_PATH)) == 0) {
			/* Set physpath to the char after "/dev/disk/by-vdev" */
			physpath = &path[strlen(DEV_BYVDEV_PATH)];
		}
	}

	/*
	 * We don't want to autoreplace offlined disks.  However, we do want to
	 * replace force-faulted disks (`zpool offline -f`).  Force-faulted
	 * disks have both offline=1 and faulted=1 in the nvlist.
	 */
	if (offline && !faulted) {
		zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
		    __func__, path);
		return;
	}

	is_mpath_wholedisk = is_mpath_whole_disk(path);
	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
	    " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
	    "(guid %llu)",
	    zpool_get_name(zhp), path,
	    physpath ? physpath : "NULL",
	    wholedisk ? "is" : "not",
	    is_mpath_wholedisk ? "is" : "not",
	    labeled ? "is" : "not",
	    enc_sysfs_path,
	    (long long unsigned int)guid);

	/*
	 * The VDEV guid is preferred for identification (gets passed in path)
	 */
	if (guid != 0) {
		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
		    (long long unsigned int)guid);
	} else {
		/*
		 * otherwise use path sans partition suffix for whole disks
		 */
		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return;
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);
		}
	}

	if (is_spare)
		online_flag |= ZFS_ONLINE_SPARE;

	/*
	 * Attempt to online the device.
	 */
	if (zpool_vdev_online(zhp, fullpath, online_flag, &newstate) == 0 &&
	    (newstate == VDEV_STATE_HEALTHY ||
	    newstate == VDEV_STATE_DEGRADED)) {
		zed_log_msg(LOG_INFO,
		    "  zpool_vdev_online: vdev '%s' ('%s') is "
		    "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
		    "HEALTHY" : "DEGRADED");
		return;
	}

	/*
	 * vdev_id alias rule for using scsi_debug devices (FMA automated
	 * testing)
	 */
	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
		is_sd = B_TRUE;

	/*
	 * If the pool doesn't have the autoreplace property set, then use
	 * vdev online to trigger a FMA fault by posting an ereport.
	 */
	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
	    !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);
		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
		    "not a blank disk for '%s' ('%s')", fullpath,
		    physpath);
		return;
	}

	/*
	 * Convert physical path into its current device node.  Rawpath
	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
	 * /dev/disk/by-path will not be present.
	 */
	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);

	if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
		    rawpath, strerror(errno));

		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);

		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
		    fullpath, libzfs_error_description(g_zfshdl));
		return;
	}

	/* Only autoreplace bad disks */
	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
	    (vs->vs_state != VDEV_STATE_FAULTED) &&
	    (vs->vs_state != VDEV_STATE_REMOVED) &&
	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
		zed_log_msg(LOG_INFO, "  not autoreplacing since disk isn't in "
		    "a bad state (currently %llu)", vs->vs_state);
		return;
	}

	nvlist_lookup_string(vdev, "new_devid", &new_devid);

f2f6c18f | 397 | if (is_mpath_wholedisk) { |
6078881a | 398 | /* Don't label device mapper or multipath disks. */ |
b53077a9 TH |
399 | zed_log_msg(LOG_INFO, |
400 | " it's a multipath wholedisk, don't label"); | |
401 | if (zpool_prepare_disk(zhp, vdev, "autoreplace", &lines, | |
402 | &lines_cnt) != 0) { | |
403 | zed_log_msg(LOG_INFO, | |
404 | " zpool_prepare_disk: could not " | |
405 | "prepare '%s' (%s)", fullpath, | |
406 | libzfs_error_description(g_zfshdl)); | |
407 | if (lines_cnt > 0) { | |
408 | zed_log_msg(LOG_INFO, | |
409 | " zfs_prepare_disk output:"); | |
410 | lines_to_zed_log_msg(lines, lines_cnt); | |
411 | } | |
412 | libzfs_free_str_array(lines, lines_cnt); | |
413 | return; | |
414 | } | |
6078881a TH |
415 | } else if (!labeled) { |
416 | /* | |
417 | * we're auto-replacing a raw disk, so label it first | |
418 | */ | |
d02ca379 DB |
419 | char *leafname; |
420 | ||
421 | /* | |
422 | * If this is a request to label a whole disk, then attempt to | |
423 | * write out the label. Before we can label the disk, we need | |
424 | * to map the physical string that was matched on to the under | |
425 | * lying device node. | |
426 | * | |
427 | * If any part of this process fails, then do a force online | |
428 | * to trigger a ZFS fault for the device (and any hot spare | |
429 | * replacement). | |
430 | */ | |
431 | leafname = strrchr(devpath, '/') + 1; | |
432 | ||
433 | /* | |
434 | * If this is a request to label a whole disk, then attempt to | |
435 | * write out the label. | |
436 | */ | |
b53077a9 TH |
437 | if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname, |
438 | vdev, "autoreplace", &lines, &lines_cnt) != 0) { | |
439 | zed_log_msg(LOG_INFO, | |
440 | " zpool_prepare_and_label_disk: could not " | |
d02ca379 DB |
441 | "label '%s' (%s)", leafname, |
442 | libzfs_error_description(g_zfshdl)); | |
b53077a9 TH |
443 | if (lines_cnt > 0) { |
444 | zed_log_msg(LOG_INFO, | |
445 | " zfs_prepare_disk output:"); | |
446 | lines_to_zed_log_msg(lines, lines_cnt); | |
447 | } | |
448 | libzfs_free_str_array(lines, lines_cnt); | |
d02ca379 DB |
449 | |
450 | (void) zpool_vdev_online(zhp, fullpath, | |
451 | ZFS_ONLINE_FORCEFAULT, &newstate); | |
452 | return; | |
453 | } | |
454 | ||
455 | /* | |
456 | * The disk labeling is asynchronous on Linux. Just record | |
457 | * this label request and return as there will be another | |
458 | * disk add event for the partition after the labeling is | |
459 | * completed. | |
460 | */ | |
461 | device = malloc(sizeof (pendingdev_t)); | |
72c99dc9 RY |
462 | if (device == NULL) { |
463 | perror("malloc"); | |
464 | exit(EXIT_FAILURE); | |
465 | } | |
466 | ||
d02ca379 DB |
467 | (void) strlcpy(device->pd_physpath, physpath, |
468 | sizeof (device->pd_physpath)); | |
469 | list_insert_tail(&g_device_list, device); | |
470 | ||
471 | zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", | |
02730c33 | 472 | leafname, (u_longlong_t)guid); |
d02ca379 DB |
473 | |
474 | return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ | |
475 | ||
	} else /* labeled */ {
		boolean_t found = B_FALSE;
		/*
		 * match up with request above to label the disk
		 */
		for (device = list_head(&g_device_list); device != NULL;
		    device = list_next(&g_device_list, device)) {
			if (strcmp(physpath, device->pd_physpath) == 0) {
				list_remove(&g_device_list, device);
				free(device);
				found = B_TRUE;
				break;
			}
			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
			    physpath, device->pd_physpath);
		}
		if (!found) {
			/* unexpected partition slice encountered */
			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
			    fullpath);
			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
		    physpath, (u_longlong_t)guid);

		(void) snprintf(devpath, sizeof (devpath), "%s%s",
		    DEV_BYID_PATH, new_devid);
	}

	libzfs_free_str_array(lines, lines_cnt);

	/*
	 * Construct the root vdev to pass to zpool_vdev_attach(). While adding
	 * the entire vdev structure is harmless, we construct a reduced set of
	 * path/physpath/wholedisk to keep it simple.
	 */
	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		return;
	}
	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		nvlist_free(nvroot);
		return;
	}

	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
	    (physpath != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    (const nvlist_t **)&newvd, 1) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
		nvlist_free(newvd);
		nvlist_free(nvroot);
		return;
	}

	nvlist_free(newvd);

	/*
	 * Wait for udev to verify the links exist, then auto-replace
	 * the leaf disk at same physical location.
	 */
	if (zpool_label_disk_wait(path, 3000) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
		    "disk %s is missing", path);
		nvlist_free(nvroot);
		return;
	}

	/*
	 * Prefer sequential resilvering when supported (mirrors and dRAID),
	 * otherwise fallback to a traditional healing resilver.
	 */
	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
	if (ret != 0) {
		ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
		    B_TRUE, B_FALSE);
	}

	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
	    fullpath, path, (ret == 0) ? "no errors" :
	    libzfs_error_description(g_zfshdl));

	nvlist_free(nvroot);
}

/*
 * Utility functions to find a vdev matching given criteria.
 */
typedef struct dev_data {
	const char		*dd_compare;
	const char		*dd_prop;
	zfs_process_func_t	dd_func;
	boolean_t		dd_found;
	boolean_t		dd_islabeled;
	uint64_t		dd_pool_guid;
	uint64_t		dd_vdev_guid;
	uint64_t		dd_new_vdev_guid;
	const char		*dd_new_devid;
	uint64_t		dd_num_spares;
} dev_data_t;

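/*
 * Recursively walk a vdev tree (including any spares and L2ARC devices),
 * looking for a vdev that matches either dd_vdev_guid or the
 * dd_prop/dd_compare pair.  On a match, record the new devid in the vdev's
 * nvlist and invoke the dd_func callback.
 */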
static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
	dev_data_t *dp = data;
	const char *path = NULL;
	uint_t c, children;
	nvlist_t **child;
	uint64_t guid = 0;
	uint64_t isspare = 0;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
	}

	/*
	 * Iterate over any spares and cache devices
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
	}
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
	}

	/* once a vdev was matched and processed there is nothing left to do */
	if (dp->dd_found && dp->dd_num_spares == 0)
		return;
	(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &guid);

	/*
	 * Match by GUID if available otherwise fallback to devid or physical
	 */
	if (dp->dd_vdev_guid != 0) {
		if (guid != dp->dd_vdev_guid)
			return;
		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu", guid);
		dp->dd_found = B_TRUE;

	} else if (dp->dd_compare != NULL) {
		/*
		 * NOTE: On Linux there is an event for partition, so unlike
		 * illumos, substring matching is not required to accommodate
		 * the partition suffix.  An exact match will be present in
		 * the dp->dd_compare value.
		 * If the attached disk already contains a vdev GUID, it means
		 * the disk is not clean.  In such a scenario, the physical
		 * path would be a match that makes the disk faulted when
		 * trying to online it.  So, we would only want to proceed if
		 * either the GUID matches the last attached disk or the disk
		 * is in a clean state.
		 */
		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
		    strcmp(dp->dd_compare, path) != 0) {
			return;
		}
		if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) {
			zed_log_msg(LOG_INFO, "  %s: no match (GUID:%llu"
			    " != vdev GUID:%llu)", __func__,
			    dp->dd_new_vdev_guid, guid);
			return;
		}

		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
		    dp->dd_prop, path);
		dp->dd_found = B_TRUE;

		/* pass the new devid for use by replacing code */
		if (dp->dd_new_devid != NULL) {
			(void) nvlist_add_string(nvl, "new_devid",
			    dp->dd_new_devid);
		}
	}

	if (dp->dd_found == B_TRUE && nvlist_lookup_uint64(nvl,
	    ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare)
		dp->dd_num_spares++;

	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
}

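/*
 * Thread pool task: mount and share the datasets of a pool that has become
 * available, then release the pool handle and the queued entry.
 */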
static void
zfs_enable_ds(void *arg)
{
	unavailpool_t *pool = (unavailpool_t *)arg;

	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
	zpool_close(pool->uap_zhp);
	free(pool);
}

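/*
 * zpool_iter() callback: apply the dev_data_t matching criteria to every
 * vdev in this pool and, once startup enumeration has completed, dispatch
 * zfs_enable_ds() for queued pools that have become available again.
 */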
static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
	nvlist_t *config, *nvl;
	dev_data_t *dp = data;
	uint64_t pool_guid;
	unavailpool_t *pool;

	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);

	/*
	 * For each vdev in this pool, look for a match to apply dd_func
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (dp->dd_pool_guid == 0 ||
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
			(void) nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
			zfs_iter_vdev(zhp, nvl, data);
		}
	} else {
		zed_log_msg(LOG_INFO, "%s: no config\n", __func__);
	}

	/*
	 * if this pool was originally unavailable,
	 * then enable its datasets asynchronously
	 */
	if (g_enumeration_done) {
		for (pool = list_head(&g_pool_list); pool != NULL;
		    pool = list_next(&g_pool_list, pool)) {

			if (strcmp(zpool_get_name(zhp),
			    zpool_get_name(pool->uap_zhp)))
				continue;
			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
				list_remove(&g_pool_list, pool);
				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
				    pool);
				break;
			}
		}
	}

	zpool_close(zhp);

	/* cease iteration after a match */
	return (dp->dd_found && dp->dd_num_spares == 0);
}

/*
 * Given a physical device location, iterate over all
 * (pool, vdev) pairs which correspond to that location.
 */
static boolean_t
devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
    boolean_t is_slice, uint64_t new_vdev_guid)
{
	dev_data_t data = { 0 };

	data.dd_compare = physical;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;	/* used by auto replace code */
	data.dd_new_vdev_guid = new_vdev_guid;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Given a device identifier, find any vdevs with a matching by-vdev
 * path.  Normally we shouldn't need this as the comparison would be
 * made earlier in the devphys_iter().  For example, if we were replacing
 * /dev/disk/by-vdev/L28, normally devphys_iter() would match the
 * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
 * of the new disk config.  However, we've seen cases where
 * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk.  Here's
 * an example of a real 2-disk mirror pool where one disk was force
 * faulted:
 *
 *	com.delphix:vdev_zap_top: 129
 *	children[0]:
 *	   type: 'disk'
 *	   id: 0
 *	   guid: 14309659774640089719
 *	   path: '/dev/disk/by-vdev/L28'
 *	   whole_disk: 0
 *	   DTL: 654
 *	   create_txg: 4
 *	   com.delphix:vdev_zap_leaf: 1161
 *	   faulted: 1
 *	   aux_state: 'external'
 *	children[1]:
 *	   type: 'disk'
 *	   id: 1
 *	   guid: 16002508084177980912
 *	   path: '/dev/disk/by-vdev/L29'
 *	   devid: 'dm-uuid-mpath-35000c500a61d68a3'
 *	   phys_path: 'L29'
 *	   vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
 *	   whole_disk: 0
 *	   DTL: 1028
 *	   create_txg: 4
 *	   com.delphix:vdev_zap_leaf: 131
 *
 * So in the case above, the only thing we could compare is the path.
 *
 * We can do this because we assume by-vdev paths are authoritative as physical
 * paths.  We could not assume this for normal paths like /dev/sda since the
 * physical location /dev/sda points to could change over time.
 */
static boolean_t
by_vdev_path_iter(const char *by_vdev_path, const char *devid,
    zfs_process_func_t func, boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = by_vdev_path;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_PATH;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;

	if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
	    strlen(DEV_BYVDEV_PATH)) != 0) {
		/* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
		return (B_FALSE);
	}

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Given a device identifier, find any vdevs with a matching devid.
 * On Linux we can match devid directly which is always a whole disk.
 */
static boolean_t
devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = devid;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_DEVID;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Given a device guid, find any vdevs with a matching guid.
 */
static boolean_t
guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
    zfs_process_func_t func, boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_func = func;
	data.dd_found = B_FALSE;
	data.dd_pool_guid = pool_guid;
	data.dd_vdev_guid = vdev_guid;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Handle an EC_DEV_ADD.ESC_DISK event.
 *
 * illumos
 *	Expects: DEV_PHYS_PATH string in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/dsk/c0t1d0s0' (persistent)
 *	devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
 *	phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
 *
 * linux
 *	provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/sdc1' (not persistent)
 *	devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
 *	phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
 */
static int
zfs_deliver_add(nvlist_t *nvl)
{
	const char *devpath = NULL, *devid = NULL;
	uint64_t pool_guid = 0, vdev_guid = 0;
	boolean_t is_slice;

	/*
	 * Expecting a devid string and an optional physical location and guid
	 */
	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
		zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
		return (-1);
	}

	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
	(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
	(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);

	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);

	zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
	    devid, devpath ? devpath : "NULL", is_slice);

	/*
	 * Iterate over all vdevs looking for a match in the following order:
	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
	 * 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
	 * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
	 *    by-vdev paths represent physical paths).
	 */
	if (devid_iter(devid, zfs_process_add, is_slice))
		return (0);
	if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add,
	    is_slice, vdev_guid))
		return (0);
	if (vdev_guid != 0)
		(void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
		    is_slice);

	if (devpath != NULL) {
		/* Can we match a /dev/disk/by-vdev/ path? */
		char by_vdev_path[MAXPATHLEN];
		snprintf(by_vdev_path, sizeof (by_vdev_path),
		    "/dev/disk/by-vdev/%s", devpath);
		if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
		    is_slice))
			return (0);
	}

	return (0);
}

/*
 * Called when we receive a VDEV_CHECK event, which indicates a device could not
 * be opened during initial pool open, but the autoreplace property was set on
 * the pool.  In this case, we treat it as if it were an add event.
 */
static int
zfs_deliver_check(nvlist_t *nvl)
{
	dev_data_t data = { 0 };

	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
	    &data.dd_pool_guid) != 0 ||
	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
	    &data.dd_vdev_guid) != 0 ||
	    data.dd_vdev_guid == 0)
		return (0);

	zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
	    data.dd_pool_guid, data.dd_vdev_guid);

	data.dd_func = zfs_process_add;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (0);
}

/*
 * Given a path to a vdev, lookup the vdev's physical size from its
 * config nvlist.
 *
 * Returns the vdev's physical size in bytes on success, 0 on error.
 */
static uint64_t
vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path)
{
	nvlist_t *nvl = NULL;
	boolean_t avail_spare, l2cache, log;
	vdev_stat_t *vs = NULL;
	uint_t c;

	nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
	if (!nvl)
		return (0);

	verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	if (!vs) {
		zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__,
		    vdev_path);
		return (0);
	}

	return (vs->vs_pspace);
}

/*
 * Given a path to a vdev, lookup if the vdev is a "whole disk" in the
 * config nvlist.  "whole disk" means that ZFS was passed a whole disk
 * at pool creation time, which it partitioned up and has full control over.
 * Thus a partition with wholedisk=1 set tells us that zfs created the
 * partition at creation time.  A partition without whole disk set would have
 * been created externally (like with fdisk) and passed to ZFS.
 *
 * Returns the whole disk value (either 0 or 1).
 */
static uint64_t
vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path)
{
	nvlist_t *nvl = NULL;
	boolean_t avail_spare, l2cache, log;
	uint64_t wholedisk = 0;

	nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
	if (!nvl)
		return (0);

	(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);

	return (wholedisk);
}

/*
 * If the device size grew more than 1% then return true.
 */
#define	DEVICE_GREW(oldsize, newsize) \
	((newsize > oldsize) && \
	((newsize / (newsize - oldsize)) <= 100))
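/*
 * Example: with oldsize = 100 GiB and newsize = 102 GiB,
 * newsize / (newsize - oldsize) = 51, which is <= 100, so DEVICE_GREW()
 * is true; a growth of less than 1% makes the quotient exceed 100 and
 * the macro evaluates to false.
 */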
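/*
 * zpool_iter() callback for ESC_DEV_DLE events: locate the vdev named by
 * the udev event (by guid or physical path), reopen the pool for whole
 * disks so the kernel picks up the new device size, and online the vdev
 * to trigger an autoexpand when the pool has autoexpand=on and the size
 * has grown.
 */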
static int
zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
{
	boolean_t avail_spare, l2cache;
	nvlist_t *udev_nvl = data;
	nvlist_t *tgt;
	int error;

	const char *tmp_devname;
	char devname[MAXPATHLEN] = "";
	uint64_t guid;

	if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
		sprintf(devname, "%llu", (u_longlong_t)guid);
	} else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH,
	    &tmp_devname) == 0) {
		strlcpy(devname, tmp_devname, MAXPATHLEN);
		zfs_append_partition(devname, MAXPATHLEN);
	} else {
		zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__);
	}

	zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
	    devname, zpool_get_name(zhp));

	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
	    &avail_spare, &l2cache, NULL)) != NULL) {
		const char *path;
		char fullpath[MAXPATHLEN];
		uint64_t wholedisk = 0;

		error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
		if (error) {
			zpool_close(zhp);
			return (0);
		}

		(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk);

		if (wholedisk) {
			char *tmp;
			path = strrchr(path, '/');
			if (path != NULL) {
				tmp = zfs_strip_partition(path + 1);
				if (tmp == NULL) {
					zpool_close(zhp);
					return (0);
				}
			} else {
				zpool_close(zhp);
				return (0);
			}

			(void) strlcpy(fullpath, tmp, sizeof (fullpath));
			free(tmp);

			/*
			 * We need to reopen the pool associated with this
			 * device so that the kernel can update the size of
			 * the expanded device.  When expanding there is no
			 * need to restart the scrub from the beginning.
			 */
			boolean_t scrub_restart = B_FALSE;
			(void) zpool_reopen_one(zhp, &scrub_restart);
		} else {
			(void) strlcpy(fullpath, path, sizeof (fullpath));
		}

		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
			vdev_state_t newstate;

			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
				/*
				 * If this disk size has not changed, then
				 * there's no need to do an autoexpand.  To
				 * check we look at the disk's size in its
				 * config, and compare it to the disk size
				 * that udev is reporting.
				 */
				uint64_t udev_size = 0, conf_size = 0,
				    wholedisk = 0, udev_parent_size = 0;

				/*
				 * Get the size of our disk that udev is
				 * reporting.
				 */
				if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE,
				    &udev_size) != 0) {
					udev_size = 0;
				}

				/*
				 * Get the size of our disk's parent device
				 * from udev (where sda1's parent is sda).
				 */
				if (nvlist_lookup_uint64(udev_nvl,
				    DEV_PARENT_SIZE, &udev_parent_size) != 0) {
					udev_parent_size = 0;
				}

				conf_size = vdev_size_from_config(zhp,
				    fullpath);

				wholedisk = vdev_whole_disk_from_config(zhp,
				    fullpath);

				/*
				 * Only attempt an autoexpand if the vdev size
				 * changed.  There are two different cases
				 * to consider.
				 *
				 * 1. wholedisk=1
				 * If you do a 'zpool create' on a whole disk
				 * (like /dev/sda), then zfs will create
				 * partitions on the disk (like /dev/sda1).  In
				 * that case, wholedisk=1 will be set in the
				 * partition's nvlist config.  So zed will need
				 * to see if your parent device (/dev/sda)
				 * expanded in size, and if so, then attempt
				 * the autoexpand.
				 *
				 * 2. wholedisk=0
				 * If you do a 'zpool create' on an existing
				 * partition, or a device that doesn't allow
				 * partitions, then wholedisk=0, and you will
				 * simply need to check if the device itself
				 * expanded in size.
				 */
				if (DEVICE_GREW(conf_size, udev_size) ||
				    (wholedisk && DEVICE_GREW(conf_size,
				    udev_parent_size))) {
					error = zpool_vdev_online(zhp, fullpath,
					    0, &newstate);

					zed_log_msg(LOG_INFO,
					    "%s: autoexpanding '%s' from %llu"
					    " to %llu bytes in pool '%s': %d",
					    __func__, fullpath, conf_size,
					    MAX(udev_size, udev_parent_size),
					    zpool_get_name(zhp), error);
				}
			}
		}
		zpool_close(zhp);
		return (1);
	}
	zpool_close(zhp);
	return (0);
}

/*
 * This function handles the ESC_DEV_DLE device change event.  Use the
 * provided vdev guid when looking up a disk or partition, when the guid
 * is not present assume the entire disk is owned by ZFS and append the
 * expected -part1 partition information then lookup by physical path.
 */
static int
zfs_deliver_dle(nvlist_t *nvl)
{
	const char *devname;
	char name[MAXPATHLEN];
	uint64_t guid;

	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
		sprintf(name, "%llu", (u_longlong_t)guid);
	} else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
		strlcpy(name, devname, MAXPATHLEN);
		zfs_append_partition(name, MAXPATHLEN);
	} else {
		sprintf(name, "unknown");
		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
	}

	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) {
		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
		    "found", name);
		return (1);
	}

	return (0);
}

/*
 * syseventd daemon module event handler
 *
 * Handles syseventd daemon zfs device related events:
 *
 *	EC_DEV_ADD.ESC_DISK
 *	EC_DEV_STATUS.ESC_DEV_DLE
 *	EC_ZFS.ESC_ZFS_VDEV_CHECK
 *
 * Note: assumes only one thread active at a time (not thread safe)
 */
static int
zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	int ret;
	boolean_t is_check = B_FALSE, is_dle = B_FALSE;

	if (strcmp(class, EC_DEV_ADD) == 0) {
		/*
		 * We're mainly interested in disk additions, but we also listen
		 * for new loop devices, to allow for simplified testing.
		 */
		if (strcmp(subclass, ESC_DISK) != 0 &&
		    strcmp(subclass, ESC_LOFI) != 0)
			return (0);

		is_check = B_FALSE;
	} else if (strcmp(class, EC_ZFS) == 0 &&
	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
		/*
		 * This event signifies that a device failed to open
		 * during pool load, but the 'autoreplace' property was
		 * set, so we should pretend it's just been added.
		 */
		is_check = B_TRUE;
	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
	    strcmp(subclass, ESC_DEV_DLE) == 0) {
		is_dle = B_TRUE;
	} else {
		return (0);
	}

	if (is_dle)
		ret = zfs_deliver_dle(nvl);
	else if (is_check)
		ret = zfs_deliver_check(nvl);
	else
		ret = zfs_deliver_add(nvl);

	return (ret);
}

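/*
 * Startup thread: enumerate all imported pools and queue the ones that are
 * currently unavailable (see zfs_unavail_pool()) so their datasets can be
 * enabled later, then flag that enumeration has finished.
 */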
static void *
zfs_enum_pools(void *arg)
{
	(void) arg;

	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
	/*
	 * Linux - instead of using a thread pool, each list entry
	 * will spawn a thread when an unavailable pool transitions
	 * to available.  zfs_slm_fini will wait for these threads.
	 */
	g_enumeration_done = B_TRUE;
	return (NULL);
}


/*
 * Called from the zed daemon at startup.
 *
 * Messages are delivered from zevents or the udev monitor.
 *
 * For now, each agent has its own libzfs instance.
 */
int
zfs_slm_init(void)
{
	if ((g_zfshdl = libzfs_init()) == NULL)
		return (-1);

	/*
	 * collect a list of unavailable pools (asynchronously,
	 * since this can take a while)
	 */
	list_create(&g_pool_list, sizeof (struct unavailpool),
	    offsetof(struct unavailpool, uap_node));

	if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
		list_destroy(&g_pool_list);
		libzfs_fini(g_zfshdl);
		return (-1);
	}

	pthread_setname_np(g_zfs_tid, "enum-pools");
	list_create(&g_device_list, sizeof (struct pendingdev),
	    offsetof(struct pendingdev, pd_node));

	return (0);
}

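/*
 * Module teardown: wait for the pool enumeration thread and the thread
 * pool, release any queued pool handles and pending device records, and
 * close the libzfs handle.
 */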
void
zfs_slm_fini(void)
{
	unavailpool_t *pool;
	pendingdev_t *device;

	/* wait for zfs_enum_pools thread to complete */
	(void) pthread_join(g_zfs_tid, NULL);
	/* destroy the thread pool */
	if (g_tpool != NULL) {
		tpool_wait(g_tpool);
		tpool_destroy(g_tpool);
	}

	while ((pool = list_remove_head(&g_pool_list)) != NULL) {
		zpool_close(pool->uap_zhp);
		free(pool);
	}
	list_destroy(&g_pool_list);

	while ((device = list_remove_head(&g_device_list)) != NULL)
		free(device);
	list_destroy(&g_device_list);

	libzfs_fini(g_zfshdl);
}

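/*
 * Entry point called by zed for each event: log the class/subclass pair
 * and hand the event to zfs_slm_deliver_event().
 */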
void
zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
	(void) zfs_slm_deliver_event(class, subclass, nvl);
}