/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, 2017, Intel Corporation.
 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 */

/*
 * ZFS syseventd module.
 *
 * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
 *
 * The purpose of this module is to identify when devices are added to the
 * system, and appropriately online or replace the affected vdevs.
 *
 * When a device is added to the system:
 *
 * 1. Search for any vdevs whose devid matches that of the newly added
 *    device.
 *
 * 2. If no vdevs are found, then search for any vdevs whose udev path
 *    matches that of the new device.
 *
 * 3. If no vdevs match by either method, then ignore the event.
 *
 * 4. Attempt to online the device with a flag to indicate that it should
 *    be unspared when resilvering completes. If this succeeds, then the
 *    same device was inserted and we should continue normally.
 *
 * 5. If the pool does not have the 'autoreplace' property set, attempt to
 *    online the device again without the unspare flag, which will
 *    generate a FMA fault.
 *
 * 6. If the pool has the 'autoreplace' property set, and the matching vdev
 *    is a whole disk, then label the new disk and attempt a 'zpool
 *    replace'.
 *
 * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK
 * event indicates that a device failed to open during pool load, but the
 * autoreplace property was set. In this case, we deferred the associated
 * FMA fault until our module had a chance to process the autoreplace logic.
 * If the device could not be replaced, then the second online attempt will
 * trigger the FMA fault that we skipped earlier.
 *
 * ZFS on Linux porting notes:
 * Linux udev provides a disk insert for both the disk and the partition
 *
 */

#include <ctype.h>
#include <devid.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libzfs.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/list.h>
#include <sys/sunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <thread_pool.h>
#include <pthread.h>
#include <unistd.h>
#include <errno.h>
#include "zfs_agents.h"
#include "../zed_log.h"

#define	DEV_BYID_PATH	"/dev/disk/by-id/"
#define	DEV_BYPATH_PATH	"/dev/disk/by-path/"
#define	DEV_BYVDEV_PATH	"/dev/disk/by-vdev/"

typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

libzfs_handle_t *g_zfshdl;
list_t g_pool_list;	/* list of unavailable pools at initialization */
list_t g_device_list;	/* list of disks with asynchronous label request */
tpool_t *g_tpool;
boolean_t g_enumeration_done;
pthread_t g_zfs_tid;	/* zfs_enum_pools() thread */

typedef struct unavailpool {
	zpool_handle_t	*uap_zhp;
	list_node_t	uap_node;
} unavailpool_t;

typedef struct pendingdev {
	char		pd_physpath[128];
	list_node_t	pd_node;
} pendingdev_t;

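/*
 * Return the state of a pool's top-level vdev, as reported in the
 * ZPOOL_CONFIG_VDEV_STATS array of its cached configuration.
 */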
static int
zfs_toplevel_state(zpool_handle_t *zhp)
{
	nvlist_t *nvroot;
	vdev_stat_t *vs;
	unsigned int c;

	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	return (vs->vs_state);
}

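/*
 * zpool_iter() callback invoked at startup: pools whose top-level vdev is
 * below VDEV_STATE_DEGRADED are queued on the passed-in list (g_pool_list)
 * so their datasets can be enabled once the missing devices show up.
 */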
static int
zfs_unavail_pool(zpool_handle_t *zhp, void *data)
{
	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));

	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
		unavailpool_t *uap;
		uap = malloc(sizeof (unavailpool_t));
		uap->uap_zhp = zhp;
		list_insert_tail((list_t *)data, uap);
	} else {
		zpool_close(zhp);
	}
	return (0);
}

/*
 * Two-stage replace on Linux: since we get separate disk notifications for
 * the disk and for its partitions, we can wait for the partitioned disk
 * slice to show up.
 *
 * First stage tags the disk, initiates async partitioning, and returns.
 * Second stage finds the tag and proceeds to ZFS labeling/replace.
 *
 * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
 *
 * 1. physical match with no fs, no partition
 *	tag it top, partition disk
 *
 * 2. physical match again, see partition and tag
 */

/*
 * The device associated with the given vdev (either by devid or physical path)
 * has been added to the system. If 'isdisk' is set, then we only attempt a
 * replacement if it's a whole disk. This also implies that we should label the
 * disk first.
 *
 * First, we attempt to online the device (making sure to undo any spare
 * operation when finished). If this succeeds, then we're done. If it fails,
 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
 * but that the label was not what we expected. If the 'autoreplace' property
 * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
 * replace'. If the online is successful, but the new state is something else
 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
 * race, and we should avoid attempting to relabel the disk.
 *
 * We can also arrive here from an ESC_ZFS_VDEV_CHECK event.
 */
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
{
	char *path;
	vdev_state_t newstate;
	nvlist_t *nvroot, *newvd;
	pendingdev_t *device;
	uint64_t wholedisk = 0ULL;
	uint64_t offline = 0ULL;
	uint64_t guid = 0ULL;
	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
	char rawpath[PATH_MAX], fullpath[PATH_MAX];
	char devpath[PATH_MAX];
	int ret;
	int is_dm = 0;
	int is_sd = 0;
	uint_t c;
	vdev_stat_t *vs;

	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
		return;

	/* Skip healthy disks */
	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	if (vs->vs_state == VDEV_STATE_HEALTHY) {
		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
		    __func__, path);
		return;
	}

	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &enc_sysfs_path);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);

	if (offline)
		return; /* don't intervene if it was taken offline */

	is_dm = zfs_dev_is_dm(path);
	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
	    " wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path,
	    physpath ? physpath : "NULL", wholedisk, is_dm,
	    (long long unsigned int)guid);

	/*
	 * The VDEV guid is preferred for identification (gets passed in path)
	 */
	if (guid != 0) {
		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
		    (long long unsigned int)guid);
	} else {
		/*
		 * otherwise use path sans partition suffix for whole disks
		 */
		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return;
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);
		}
	}

	/*
	 * Attempt to online the device.
	 */
	if (zpool_vdev_online(zhp, fullpath,
	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
	    (newstate == VDEV_STATE_HEALTHY ||
	    newstate == VDEV_STATE_DEGRADED)) {
		zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
		    "HEALTHY" : "DEGRADED");
		return;
	}

	/*
	 * vdev_id alias rule for using scsi_debug devices (FMA automated
	 * testing)
	 */
	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
		is_sd = 1;

	/*
	 * If the pool doesn't have the autoreplace property set, then use
	 * vdev online to trigger a FMA fault by posting an ereport.
	 */
	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
	    !(wholedisk || is_dm) || (physpath == NULL)) {
		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);
		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
		    "not a whole disk for '%s'", fullpath);
		return;
	}

	/*
	 * Convert physical path into its current device node. Rawpath
	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
	 * /dev/disk/by-path will not be present.
	 */
	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);

	if (realpath(rawpath, devpath) == NULL && !is_dm) {
		zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
		    rawpath, strerror(errno));

		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);

		zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
		    fullpath, libzfs_error_description(g_zfshdl));
		return;
	}

	/* Only autoreplace bad disks */
	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
	    (vs->vs_state != VDEV_STATE_FAULTED) &&
	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
		return;
	}

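	/* the matching devid, stashed in the vdev nvlist by zfs_iter_vdev() */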
	nvlist_lookup_string(vdev, "new_devid", &new_devid);

	if (is_dm) {
		/* Don't label device mapper or multipath disks. */
	} else if (!labeled) {
		/*
		 * we're auto-replacing a raw disk, so label it first
		 */
		char *leafname;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label. Before we can label the disk, we need
		 * to map the physical string that was matched on to the
		 * underlying device node.
		 *
		 * If any part of this process fails, then do a force online
		 * to trigger a ZFS fault for the device (and any hot spare
		 * replacement).
		 */
		leafname = strrchr(devpath, '/') + 1;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.
		 */
		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
			zed_log_msg(LOG_INFO, " zpool_label_disk: could not "
			    "label '%s' (%s)", leafname,
			    libzfs_error_description(g_zfshdl));

			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		/*
		 * The disk labeling is asynchronous on Linux. Just record
		 * this label request and return as there will be another
		 * disk add event for the partition after the labeling is
		 * completed.
		 */
		device = malloc(sizeof (pendingdev_t));
		(void) strlcpy(device->pd_physpath, physpath,
		    sizeof (device->pd_physpath));
		list_insert_tail(&g_device_list, device);

		zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
		    leafname, (u_longlong_t)guid);

		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */

	} else /* labeled */ {
		boolean_t found = B_FALSE;
		/*
		 * match up with request above to label the disk
		 */
		for (device = list_head(&g_device_list); device != NULL;
		    device = list_next(&g_device_list, device)) {
			if (strcmp(physpath, device->pd_physpath) == 0) {
				list_remove(&g_device_list, device);
				free(device);
				found = B_TRUE;
				break;
			}
			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
			    physpath, device->pd_physpath);
		}
		if (!found) {
			/* unexpected partition slice encountered */
			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
			    fullpath);
			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
		    physpath, (u_longlong_t)guid);

		(void) snprintf(devpath, sizeof (devpath), "%s%s",
		    DEV_BYID_PATH, new_devid);
	}

	/*
	 * Construct the root vdev to pass to zpool_vdev_attach(). While adding
	 * the entire vdev structure is harmless, we construct a reduced set of
	 * path/physpath/wholedisk to keep it simple.
	 */
	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		return;
	}
	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		nvlist_free(nvroot);
		return;
	}

	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
	    (physpath != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
	    1) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
		nvlist_free(newvd);
		nvlist_free(nvroot);
		return;
	}

	nvlist_free(newvd);

	/*
	 * Wait for udev to verify the links exist, then auto-replace
	 * the leaf disk at same physical location.
	 */
	if (zpool_label_disk_wait(path, 3000) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
		    "disk %s is missing", path);
		nvlist_free(nvroot);
		return;
	}

	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);

	zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
	    fullpath, path, (ret == 0) ? "no errors" :
	    libzfs_error_description(g_zfshdl));

	nvlist_free(nvroot);
}

/*
 * Utility functions to find a vdev matching given criteria.
 */
typedef struct dev_data {
	const char		*dd_compare;
	const char		*dd_prop;
	zfs_process_func_t	dd_func;
	boolean_t		dd_found;
	boolean_t		dd_islabeled;
	uint64_t		dd_pool_guid;
	uint64_t		dd_vdev_guid;
	const char		*dd_new_devid;
} dev_data_t;

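/*
 * Recursively walk a vdev tree, calling dd_func on the first leaf vdev that
 * matches the criteria in the dev_data_t (vdev GUID, or the devid/physical
 * path named by dd_prop).
 */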
static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
	dev_data_t *dp = data;
	char *path = NULL;
	uint_t c, children;
	nvlist_t **child;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
		return;
	}

	/* once a vdev was matched and processed there is nothing left to do */
	if (dp->dd_found)
		return;

	/*
	 * Match by GUID if available, otherwise fall back to devid or
	 * physical path.
	 */
	if (dp->dd_vdev_guid != 0) {
		uint64_t guid;

		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != dp->dd_vdev_guid) {
			return;
		}
		zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid);
		dp->dd_found = B_TRUE;

	} else if (dp->dd_compare != NULL) {
		/*
		 * NOTE: On Linux there is an event for the partition, so unlike
		 * illumos, substring matching is not required to accommodate
		 * the partition suffix. An exact match will be present in
		 * the dp->dd_compare value.
		 */
		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
		    strcmp(dp->dd_compare, path) != 0)
			return;

		zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
		    dp->dd_prop, path);
		dp->dd_found = B_TRUE;

		/* pass the new devid for use by replacing code */
		if (dp->dd_new_devid != NULL) {
			(void) nvlist_add_string(nvl, "new_devid",
			    dp->dd_new_devid);
		}
	}

	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
}

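/*
 * Thread-pool callback: enable (mount/share) the datasets of a pool that has
 * transitioned from unavailable to available, then release the handle.
 */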
void
zfs_enable_ds(void *arg)
{
	unavailpool_t *pool = (unavailpool_t *)arg;

	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
	zpool_close(pool->uap_zhp);
	free(pool);
}

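/*
 * zpool_iter() callback: search one pool's vdev tree for a match (see
 * zfs_iter_vdev()), and dispatch any previously unavailable pool that has
 * since become available to the thread pool for dataset enabling.
 */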
static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
	nvlist_t *config, *nvl;
	dev_data_t *dp = data;
	uint64_t pool_guid;
	unavailpool_t *pool;

	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);

	/*
	 * For each vdev in this pool, look for a match to apply dd_func
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (dp->dd_pool_guid == 0 ||
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
			(void) nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
			zfs_iter_vdev(zhp, nvl, data);
		}
	}

	/*
	 * if this pool was originally unavailable,
	 * then enable its datasets asynchronously
	 */
	if (g_enumeration_done) {
		for (pool = list_head(&g_pool_list); pool != NULL;
		    pool = list_next(&g_pool_list, pool)) {

			if (strcmp(zpool_get_name(zhp),
			    zpool_get_name(pool->uap_zhp)))
				continue;
			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
				list_remove(&g_pool_list, pool);
				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
				    pool);
				break;
			}
		}
	}

	zpool_close(zhp);
	return (dp->dd_found);	/* cease iteration after a match */
}

/*
 * Given a physical device location, iterate over all
 * (pool, vdev) pairs which correspond to that location.
 */
static boolean_t
devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
    boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = physical;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;	/* used by auto replace code */

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Given a device identifier, find any vdevs with a matching devid.
 * On Linux we can match devid directly which is always a whole disk.
 */
static boolean_t
devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = devid;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_DEVID;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Handle an EC_DEV_ADD.ESC_DISK event.
 *
 * illumos
 *	Expects: DEV_PHYS_PATH string in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/dsk/c0t1d0s0' (persistent)
 *	devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
 *	phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
 *
 * linux
 *	provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/sdc1' (not persistent)
 *	devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
 *	phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
 */
static int
zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
{
	char *devpath = NULL, *devid;
	boolean_t is_slice;

	/*
	 * Expecting a devid string and an optional physical location
	 */
	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
		return (-1);

	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);

	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);

	zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
	    devid, devpath ? devpath : "NULL", is_slice);

	/*
	 * Iterate over all vdevs looking for a match in the following order:
	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
	 *
	 * For disks, we only want to pay attention to vdevs marked as whole
	 * disks or multipath devices.
	 */
	if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL)
		(void) devphys_iter(devpath, devid, zfs_process_add, is_slice);

	return (0);
}

/*
 * Called when we receive a VDEV_CHECK event, which indicates a device could not
 * be opened during initial pool open, but the autoreplace property was set on
 * the pool. In this case, we treat it as if it were an add event.
 */
static int
zfs_deliver_check(nvlist_t *nvl)
{
	dev_data_t data = { 0 };

	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
	    &data.dd_pool_guid) != 0 ||
	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
	    &data.dd_vdev_guid) != 0 ||
	    data.dd_vdev_guid == 0)
		return (0);

	zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
	    data.dd_pool_guid, data.dd_vdev_guid);

	data.dd_func = zfs_process_add;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (0);
}

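/*
 * zpool_iter() callback for ESC_DEV_DLE events: if the named device belongs
 * to this pool, reopen the pool (for whole disks) so the kernel picks up the
 * new size, and online the vdev when the pool's autoexpand property is set.
 */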
static int
zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
{
	char *devname = data;
	boolean_t avail_spare, l2cache;
	nvlist_t *tgt;
	int error;

	zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
	    devname, zpool_get_name(zhp));

	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
	    &avail_spare, &l2cache, NULL)) != NULL) {
		char *path, fullpath[MAXPATHLEN];
		uint64_t wholedisk;

		error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
		if (error) {
			zpool_close(zhp);
			return (0);
		}

		error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk);
		if (error)
			wholedisk = 0;

		if (wholedisk) {
			path = strrchr(path, '/');
			if (path != NULL) {
				path = zfs_strip_partition(path + 1);
				if (path == NULL) {
					zpool_close(zhp);
					return (0);
				}
			} else {
				zpool_close(zhp);
				return (0);
			}

			(void) strlcpy(fullpath, path, sizeof (fullpath));
			free(path);

			/*
			 * We need to reopen the pool associated with this
			 * device so that the kernel can update the size of
			 * the expanded device. When expanding there is no
			 * need to restart the scrub from the beginning.
			 */
			boolean_t scrub_restart = B_FALSE;
			(void) zpool_reopen_one(zhp, &scrub_restart);
		} else {
			(void) strlcpy(fullpath, path, sizeof (fullpath));
		}

		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
			vdev_state_t newstate;

			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
				error = zpool_vdev_online(zhp, fullpath, 0,
				    &newstate);
				zed_log_msg(LOG_INFO, "zfsdle_vdev_online: "
				    "setting device '%s' to ONLINE state "
				    "in pool '%s': %d", fullpath,
				    zpool_get_name(zhp), error);
			}
		}
		zpool_close(zhp);
		return (1);
	}
	zpool_close(zhp);
	return (0);
}

/*
 * This function handles the ESC_DEV_DLE device change event. Use the
 * provided vdev guid when looking up a disk or partition; when the guid
 * is not present, assume the entire disk is owned by ZFS, append the
 * expected -part1 partition suffix, and look up by physical path.
 */
static int
zfs_deliver_dle(nvlist_t *nvl)
{
	char *devname, name[MAXPATHLEN];
	uint64_t guid;

	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
		sprintf(name, "%llu", (u_longlong_t)guid);
	} else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
		strlcpy(name, devname, MAXPATHLEN);
		zfs_append_partition(name, MAXPATHLEN);
	} else {
		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
		return (-1);	/* 'name' would be uninitialized below */
	}

	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) {
		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
		    "found", name);
		return (1);
	}

	return (0);
}

/*
 * syseventd daemon module event handler
 *
 * Handles syseventd daemon zfs device related events:
 *
 *	EC_DEV_ADD.ESC_DISK
 *	EC_DEV_STATUS.ESC_DEV_DLE
 *	EC_ZFS.ESC_ZFS_VDEV_CHECK
 *
 * Note: assumes only one thread active at a time (not thread safe)
 */
static int
zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	int ret;
	boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE;

	if (strcmp(class, EC_DEV_ADD) == 0) {
		/*
		 * We're mainly interested in disk additions, but we also listen
		 * for new loop devices, to allow for simplified testing.
		 */
		if (strcmp(subclass, ESC_DISK) == 0)
			is_lofi = B_FALSE;
		else if (strcmp(subclass, ESC_LOFI) == 0)
			is_lofi = B_TRUE;
		else
			return (0);

		is_check = B_FALSE;
	} else if (strcmp(class, EC_ZFS) == 0 &&
	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
		/*
		 * This event signifies that a device failed to open
		 * during pool load, but the 'autoreplace' property was
		 * set, so we should pretend it's just been added.
		 */
		is_check = B_TRUE;
	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
	    strcmp(subclass, ESC_DEV_DLE) == 0) {
		is_dle = B_TRUE;
	} else {
		return (0);
	}

	if (is_dle)
		ret = zfs_deliver_dle(nvl);
	else if (is_check)
		ret = zfs_deliver_check(nvl);
	else
		ret = zfs_deliver_add(nvl, is_lofi);

	return (ret);
}

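/*
 * Startup thread: iterate all imported pools and collect the ones that are
 * currently unavailable onto g_pool_list (see zfs_unavail_pool()).
 */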
/*ARGSUSED*/
static void *
zfs_enum_pools(void *arg)
{
	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
	/*
	 * Linux - instead of using a thread pool, each list entry
	 * will spawn a thread when an unavailable pool transitions
	 * to available. zfs_slm_fini will wait for these threads.
	 */
	g_enumeration_done = B_TRUE;
	return (NULL);
}

/*
 * called from zed daemon at startup
 *
 * messages are sent from zevents or the udev monitor
 *
 * For now, each agent has its own libzfs instance
 */
int
zfs_slm_init()
{
	if ((g_zfshdl = libzfs_init()) == NULL)
		return (-1);

	/*
	 * collect a list of unavailable pools (asynchronously,
	 * since this can take a while)
	 */
	list_create(&g_pool_list, sizeof (struct unavailpool),
	    offsetof(struct unavailpool, uap_node));

	if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
		list_destroy(&g_pool_list);
		libzfs_fini(g_zfshdl);
		return (-1);
	}

	list_create(&g_device_list, sizeof (struct pendingdev),
	    offsetof(struct pendingdev, pd_node));

	return (0);
}

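/*
 * Called from the zed daemon at shutdown: wait for the enumeration thread
 * and any dispatched work, then release the pool/device lists and libzfs.
 */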
void
zfs_slm_fini()
{
	unavailpool_t *pool;
	pendingdev_t *device;

	/* wait for zfs_enum_pools thread to complete */
	(void) pthread_join(g_zfs_tid, NULL);
	/* destroy the thread pool */
	if (g_tpool != NULL) {
		tpool_wait(g_tpool);
		tpool_destroy(g_tpool);
	}

	while ((pool = (list_head(&g_pool_list))) != NULL) {
		list_remove(&g_pool_list, pool);
		zpool_close(pool->uap_zhp);
		free(pool);
	}
	list_destroy(&g_pool_list);

	while ((device = (list_head(&g_device_list))) != NULL) {
		list_remove(&g_device_list, device);
		free(device);
	}
	list_destroy(&g_device_list);

	libzfs_fini(g_zfshdl);
}

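/*
 * Entry point from the zed daemon: log the event and hand it to the
 * class/subclass dispatcher above.
 */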
void
zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
	(void) zfs_slm_deliver_event(class, subclass, nvl);
}