/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, 2017, Intel Corporation.
 * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 */

/*
 * ZFS syseventd module.
 *
 * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
 *
 * The purpose of this module is to identify when devices are added to the
 * system, and appropriately online or replace the affected vdevs.
 *
 * When a device is added to the system:
 *
 * 1. Search for any vdevs whose devid matches that of the newly added
 *    device.
 *
 * 2. If no vdevs are found, then search for any vdevs whose udev path
 *    matches that of the new device.
 *
 * 3. If no vdevs match by either method, then ignore the event.
 *
 * 4. Attempt to online the device with a flag to indicate that it should
 *    be unspared when resilvering completes.  If this succeeds, then the
 *    same device was inserted and we should continue normally.
 *
 * 5. If the pool does not have the 'autoreplace' property set, attempt to
 *    online the device again without the unspare flag, which will
 *    generate a FMA fault.
 *
 * 6. If the pool has the 'autoreplace' property set, and the matching vdev
 *    is a whole disk, then label the new disk and attempt a 'zpool
 *    replace'.
 *
 * The module responds to EC_DEV_ADD events.  The special ESC_ZFS_VDEV_CHECK
 * event indicates that a device failed to open during pool load, but the
 * autoreplace property was set.  In this case, we deferred the associated
 * FMA fault until our module had a chance to process the autoreplace logic.
 * If the device could not be replaced, then the second online attempt will
 * trigger the FMA fault that we skipped earlier.
 *
 * ZFS on Linux porting notes:
 *	Linux udev provides a disk insert for both the disk and the partition
 *
 */

#include <ctype.h>
#include <devid.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libzfs.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/list.h>
#include <sys/sunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <thread_pool.h>
#include <pthread.h>
#include <unistd.h>
#include "zfs_agents.h"
#include "../zed_log.h"

#define	DEV_BYID_PATH	"/dev/disk/by-id/"
#define	DEV_BYPATH_PATH	"/dev/disk/by-path/"
#define	DEV_BYVDEV_PATH	"/dev/disk/by-vdev/"

typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

libzfs_handle_t *g_zfshdl;
list_t g_pool_list;	/* list of unavailable pools at initialization */
list_t g_device_list;	/* list of disks with asynchronous label request */
tpool_t *g_tpool;
boolean_t g_enumeration_done;
pthread_t g_zfs_tid;	/* zfs_enum_pools() thread */

typedef struct unavailpool {
	zpool_handle_t	*uap_zhp;
	list_node_t	uap_node;
} unavailpool_t;

typedef struct pendingdev {
	char		pd_physpath[128];
	list_node_t	pd_node;
} pendingdev_t;

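/*
 * Return the state of the root (top-level) vdev for the given pool, as
 * recorded in the pool's current configuration.
 */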
static int
zfs_toplevel_state(zpool_handle_t *zhp)
{
	nvlist_t *nvroot;
	vdev_stat_t *vs;
	unsigned int c;

	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	return (vs->vs_state);
}

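/*
 * zpool_iter() callback used at startup: pools whose top-level vdev state is
 * below DEGRADED are queued on the unavailable-pool list for later handling;
 * all others are closed immediately.
 */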
static int
zfs_unavail_pool(zpool_handle_t *zhp, void *data)
{
	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));

	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
		unavailpool_t *uap;
		uap = malloc(sizeof (unavailpool_t));
		uap->uap_zhp = zhp;
		list_insert_tail((list_t *)data, uap);
	} else {
		zpool_close(zhp);
	}
	return (0);
}

/*
 * Two stage replace on Linux
 * since we get disk notifications
 * we can wait for partitioned disk slice to show up!
 *
 * First stage tags the disk, initiates async partitioning, and returns
 * Second stage finds the tag and proceeds to ZFS labeling/replace
 *
 * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
 *
 * 1. physical match with no fs, no partition
 *	tag it top, partition disk
 *
 * 2. physical match again, see partition and tag
 *
 */
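
/*
 * For example (hypothetical device names): a bare disk arriving as /dev/sdc
 * is labeled and its physpath recorded on g_device_list; the follow-up
 * partition-add event for /dev/sdc1 then matches that record and proceeds
 * to the zpool_vdev_attach() in zfs_process_add() below.
 */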

/*
 * The device associated with the given vdev (either by devid or physical path)
 * has been added to the system.  If 'isdisk' is set, then we only attempt a
 * replacement if it's a whole disk.  This also implies that we should label the
 * disk first.
 *
 * First, we attempt to online the device (making sure to undo any spare
 * operation when finished).  If this succeeds, then we're done.  If it fails,
 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
 * but that the label was not what we expected.  If the 'autoreplace' property
 * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
 * replace'.  If the online is successful, but the new state is something else
 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
 * race, and we should avoid attempting to relabel the disk.
 *
 * Also can arrive here from an ESC_ZFS_VDEV_CHECK event
 */
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
{
	char *path;
	vdev_state_t newstate;
	nvlist_t *nvroot, *newvd;
	pendingdev_t *device;
	uint64_t wholedisk = 0ULL;
	uint64_t offline = 0ULL;
	uint64_t guid = 0ULL;
	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
	char rawpath[PATH_MAX], fullpath[PATH_MAX];
	char devpath[PATH_MAX];
	int ret;
	int is_dm = 0;
	int is_sd = 0;
	uint_t c;
	vdev_stat_t *vs;

	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
		return;

	/* Skip healthy disks */
	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	if (vs->vs_state == VDEV_STATE_HEALTHY) {
		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
		    __func__, path);
		return;
	}

	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &enc_sysfs_path);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);

	if (offline)
		return;	/* don't intervene if it was taken offline */

	is_dm = zfs_dev_is_dm(path);
	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
	    " wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path,
	    physpath ? physpath : "NULL", wholedisk, is_dm,
	    (long long unsigned int)guid);

	/*
	 * The VDEV guid is preferred for identification (gets passed in path)
	 */
	if (guid != 0) {
		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
		    (long long unsigned int)guid);
	} else {
		/*
		 * otherwise use path sans partition suffix for whole disks
		 */
		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return;
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);
		}
	}

	/*
	 * Attempt to online the device.
	 */
	if (zpool_vdev_online(zhp, fullpath,
	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
	    (newstate == VDEV_STATE_HEALTHY ||
	    newstate == VDEV_STATE_DEGRADED)) {
		zed_log_msg(LOG_INFO, "  zpool_vdev_online: vdev %s is %s",
		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
		    "HEALTHY" : "DEGRADED");
		return;
	}

	/*
	 * vdev_id alias rule for using scsi_debug devices (FMA automated
	 * testing)
	 */
	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
		is_sd = 1;

	/*
	 * If the pool doesn't have the autoreplace property set, then use
	 * vdev online to trigger a FMA fault by posting an ereport.
	 */
	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
	    !(wholedisk || is_dm) || (physpath == NULL)) {
		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);
		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
		    "not a whole disk for '%s'", fullpath);
		return;
	}

	/*
	 * Convert physical path into its current device node.  Rawpath
	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
	 * /dev/disk/by-path will not be present.
	 */
	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);

	if (realpath(rawpath, devpath) == NULL && !is_dm) {
		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
		    rawpath, strerror(errno));

		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);

		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
		    fullpath, libzfs_error_description(g_zfshdl));
		return;
	}

	/* Only autoreplace bad disks */
	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
	    (vs->vs_state != VDEV_STATE_FAULTED) &&
	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
		return;
	}

	nvlist_lookup_string(vdev, "new_devid", &new_devid);

	if (is_dm) {
		/* Don't label device mapper or multipath disks. */
	} else if (!labeled) {
		/*
		 * we're auto-replacing a raw disk, so label it first
		 */
		char *leafname;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.  Before we can label the disk, we need
		 * to map the physical string that was matched on to the
		 * underlying device node.
		 *
		 * If any part of this process fails, then do a force online
		 * to trigger a ZFS fault for the device (and any hot spare
		 * replacement).
		 */
		leafname = strrchr(devpath, '/') + 1;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.
		 */
		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
			zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
			    "label '%s' (%s)", leafname,
			    libzfs_error_description(g_zfshdl));

			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		/*
		 * The disk labeling is asynchronous on Linux.  Just record
		 * this label request and return as there will be another
		 * disk add event for the partition after the labeling is
		 * completed.
		 */
		device = malloc(sizeof (pendingdev_t));
		(void) strlcpy(device->pd_physpath, physpath,
		    sizeof (device->pd_physpath));
		list_insert_tail(&g_device_list, device);

		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
		    leafname, (u_longlong_t)guid);

		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */

	} else /* labeled */ {
		boolean_t found = B_FALSE;
		/*
		 * match up with request above to label the disk
		 */
		for (device = list_head(&g_device_list); device != NULL;
		    device = list_next(&g_device_list, device)) {
			if (strcmp(physpath, device->pd_physpath) == 0) {
				list_remove(&g_device_list, device);
				free(device);
				found = B_TRUE;
				break;
			}
			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
			    physpath, device->pd_physpath);
		}
		if (!found) {
			/* unexpected partition slice encountered */
			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
			    fullpath);
			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
		    physpath, (u_longlong_t)guid);

		(void) snprintf(devpath, sizeof (devpath), "%s%s",
		    DEV_BYID_PATH, new_devid);
	}

	/*
	 * Construct the root vdev to pass to zpool_vdev_attach().  While adding
	 * the entire vdev structure is harmless, we construct a reduced set of
	 * path/physpath/wholedisk to keep it simple.
	 */
	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		return;
	}
	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		nvlist_free(nvroot);
		return;
	}

	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
	    (physpath != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
	    1) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
		nvlist_free(newvd);
		nvlist_free(nvroot);
		return;
	}

	nvlist_free(newvd);

	/*
	 * auto replace a leaf disk at same physical location
	 */
	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);

	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
	    fullpath, path, (ret == 0) ? "no errors" :
	    libzfs_error_description(g_zfshdl));

	nvlist_free(nvroot);
}

/*
 * Utility functions to find a vdev matching given criteria.
 */
typedef struct dev_data {
	const char		*dd_compare;
	const char		*dd_prop;
	zfs_process_func_t	dd_func;
	boolean_t		dd_found;
	boolean_t		dd_islabeled;
	uint64_t		dd_pool_guid;
	uint64_t		dd_vdev_guid;
	const char		*dd_new_devid;
} dev_data_t;

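/*
 * Recursively walk a vdev tree looking for the vdev described by the
 * dev_data_t criteria (vdev GUID, or a devid/physical-path property).
 * When the criteria match (or none are supplied), the new devid is stashed
 * in the vdev nvlist and dd_func is invoked on the leaf vdev.
 */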
static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
	dev_data_t *dp = data;
	char *path = NULL;
	uint_t c, children;
	nvlist_t **child;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
		return;
	}

	/* once a vdev was matched and processed there is nothing left to do */
	if (dp->dd_found)
		return;

	/*
	 * Match by GUID if available otherwise fallback to devid or physical
	 */
	if (dp->dd_vdev_guid != 0) {
		uint64_t guid;

		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != dp->dd_vdev_guid) {
			return;
		}
		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu", guid);
		dp->dd_found = B_TRUE;

	} else if (dp->dd_compare != NULL) {
		/*
		 * NOTE: On Linux there is an event for partition, so unlike
		 * illumos, substring matching is not required to accommodate
		 * the partition suffix.  An exact match will be present in
		 * the dp->dd_compare value.
		 */
		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
		    strcmp(dp->dd_compare, path) != 0)
			return;

		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
		    dp->dd_prop, path);
		dp->dd_found = B_TRUE;

		/* pass the new devid for use by replacing code */
		if (dp->dd_new_devid != NULL) {
			(void) nvlist_add_string(nvl, "new_devid",
			    dp->dd_new_devid);
		}
	}

	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
}

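/*
 * Thread pool worker: mount and share the datasets of a pool that has just
 * become available again, then close the pool handle and free the entry.
 */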
void
zfs_enable_ds(void *arg)
{
	unavailpool_t *pool = (unavailpool_t *)arg;

	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
	zpool_close(pool->uap_zhp);
	free(pool);
}

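/*
 * zpool_iter() callback: search this pool's vdev tree for a match against
 * the dev_data_t criteria and, once enumeration has finished, dispatch a
 * zfs_enable_ds() task for any previously unavailable pool that has come
 * back to at least DEGRADED.
 */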
static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
	nvlist_t *config, *nvl;
	dev_data_t *dp = data;
	uint64_t pool_guid;
	unavailpool_t *pool;

	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);

	/*
	 * For each vdev in this pool, look for a match to apply dd_func
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (dp->dd_pool_guid == 0 ||
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
			(void) nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
			zfs_iter_vdev(zhp, nvl, data);
		}
	}

	/*
	 * if this pool was originally unavailable,
	 * then enable its datasets asynchronously
	 */
	if (g_enumeration_done) {
		for (pool = list_head(&g_pool_list); pool != NULL;
		    pool = list_next(&g_pool_list, pool)) {

			if (strcmp(zpool_get_name(zhp),
			    zpool_get_name(pool->uap_zhp)))
				continue;
			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
				list_remove(&g_pool_list, pool);
				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
				    pool);
				break;
			}
		}
	}

	zpool_close(zhp);
	return (dp->dd_found);	/* cease iteration after a match */
}

/*
 * Given a physical device location, iterate over all
 * (pool, vdev) pairs which correspond to that location.
 */
static boolean_t
devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
    boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = physical;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;	/* used by auto replace code */

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Given a device identifier, find any vdevs with a matching devid.
 * On Linux we can match devid directly which is always a whole disk.
 */
static boolean_t
devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = devid;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_DEVID;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Handle an EC_DEV_ADD.ESC_DISK event.
 *
 * illumos
 *	Expects: DEV_PHYS_PATH string in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/dsk/c0t1d0s0' (persistent)
 *	devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
 *	phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
 *
 * linux
 *	provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/sdc1' (not persistent)
 *	devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
 *	phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
 */
static int
zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
{
	char *devpath = NULL, *devid;
	boolean_t is_slice;

	/*
	 * Expecting a devid string and an optional physical location
	 */
	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
		return (-1);

	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);

	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);

	zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
	    devid, devpath ? devpath : "NULL", is_slice);

	/*
	 * Iterate over all vdevs looking for a match in the following order:
	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
	 *
	 * For disks, we only want to pay attention to vdevs marked as whole
	 * disks or that are multipath devices.
	 */
	if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL)
		(void) devphys_iter(devpath, devid, zfs_process_add, is_slice);

	return (0);
}

/*
 * Called when we receive a VDEV_CHECK event, which indicates a device could not
 * be opened during initial pool open, but the autoreplace property was set on
 * the pool.  In this case, we treat it as if it were an add event.
 */
static int
zfs_deliver_check(nvlist_t *nvl)
{
	dev_data_t data = { 0 };

	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
	    &data.dd_pool_guid) != 0 ||
	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
	    &data.dd_vdev_guid) != 0 ||
	    data.dd_vdev_guid == 0)
		return (0);

	zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
	    data.dd_pool_guid, data.dd_vdev_guid);

	data.dd_func = zfs_process_add;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (0);
}

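/*
 * zpool_iter() callback for ESC_DEV_DLE: if this pool contains a vdev at the
 * given physical path, reopen the pool (for whole-disk vdevs) so the kernel
 * can see the expanded size, and online the vdev when the pool's autoexpand
 * property is set.  Returns 1 to stop iteration once a match is handled.
 */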
static int
zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
{
	char *devname = data;
	boolean_t avail_spare, l2cache;
	vdev_state_t newstate;
	nvlist_t *tgt;

	zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
	    devname, zpool_get_name(zhp));

	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
	    &avail_spare, &l2cache, NULL)) != NULL) {
		char *path, fullpath[MAXPATHLEN];
		uint64_t wholedisk = 0ULL;

		verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
		    &path) == 0);
		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk) == 0);

		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			boolean_t scrub_restart = B_TRUE;

			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return (0);
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);

			/*
			 * We need to reopen the pool associated with this
			 * device so that the kernel can update the size
			 * of the expanded device.
			 */
			(void) zpool_reopen_one(zhp, &scrub_restart);
		}

		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
			zed_log_msg(LOG_INFO, "zfsdle_vdev_online: setting "
			    "device '%s' to ONLINE state in pool '%s'",
			    fullpath, zpool_get_name(zhp));
			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
				(void) zpool_vdev_online(zhp, fullpath, 0,
				    &newstate);
		}
		zpool_close(zhp);
		return (1);
	}
	zpool_close(zhp);
	return (0);
}

/*
 * This function handles the ESC_DEV_DLE event.
 */
static int
zfs_deliver_dle(nvlist_t *nvl)
{
	char *devname;

	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no physpath");
		return (-1);
	}

	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
		    "found", devname);
		return (1);
	}
	return (0);
}

/*
 * syseventd daemon module event handler
 *
 * Handles syseventd daemon zfs device related events:
 *
 *	EC_DEV_ADD.ESC_DISK
 *	EC_DEV_STATUS.ESC_DEV_DLE
 *	EC_ZFS.ESC_ZFS_VDEV_CHECK
 *
 * Note: assumes only one thread active at a time (not thread safe)
 */
static int
zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	int ret;
	boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE;

	if (strcmp(class, EC_DEV_ADD) == 0) {
		/*
		 * We're mainly interested in disk additions, but we also listen
		 * for new loop devices, to allow for simplified testing.
		 */
		if (strcmp(subclass, ESC_DISK) == 0)
			is_lofi = B_FALSE;
		else if (strcmp(subclass, ESC_LOFI) == 0)
			is_lofi = B_TRUE;
		else
			return (0);

		is_check = B_FALSE;
	} else if (strcmp(class, EC_ZFS) == 0 &&
	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
		/*
		 * This event signifies that a device failed to open
		 * during pool load, but the 'autoreplace' property was
		 * set, so we should pretend it's just been added.
		 */
		is_check = B_TRUE;
	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
	    strcmp(subclass, ESC_DEV_DLE) == 0) {
		is_dle = B_TRUE;
	} else {
		return (0);
	}

	if (is_dle)
		ret = zfs_deliver_dle(nvl);
	else if (is_check)
		ret = zfs_deliver_check(nvl);
	else
		ret = zfs_deliver_add(nvl, is_lofi);

	return (ret);
}

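/*
 * Startup thread body: build the list of pools that are currently
 * unavailable (see zfs_unavail_pool()) and flag enumeration as complete so
 * zfs_iter_pool() can start re-enabling datasets as those pools recover.
 */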
/*ARGSUSED*/
static void *
zfs_enum_pools(void *arg)
{
	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
	/*
	 * Linux - unavailable pools collected here are later handed to the
	 * g_tpool thread pool by zfs_iter_pool() as they transition to
	 * available; zfs_slm_fini will wait for that work to complete.
	 */
	g_enumeration_done = B_TRUE;
	return (NULL);
}

/*
 * called from zed daemon at startup
 *
 * sent messages from zevents or udev monitor
 *
 * For now, each agent has its own libzfs instance
 */
int
zfs_slm_init()
{
	if ((g_zfshdl = libzfs_init()) == NULL)
		return (-1);

	/*
	 * collect a list of unavailable pools (asynchronously,
	 * since this can take a while)
	 */
	list_create(&g_pool_list, sizeof (struct unavailpool),
	    offsetof(struct unavailpool, uap_node));

	if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
		list_destroy(&g_pool_list);
		libzfs_fini(g_zfshdl);
		return (-1);
	}

	list_create(&g_device_list, sizeof (struct pendingdev),
	    offsetof(struct pendingdev, pd_node));

	return (0);
}

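/*
 * called from zed daemon at exit
 *
 * wait for the pool enumeration thread and any outstanding thread pool
 * work, then release the remaining pool handles, pending device records,
 * and the libzfs handle
 */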
void
zfs_slm_fini()
{
	unavailpool_t *pool;
	pendingdev_t *device;

	/* wait for zfs_enum_pools thread to complete */
	(void) pthread_join(g_zfs_tid, NULL);
	/* destroy the thread pool */
	if (g_tpool != NULL) {
		tpool_wait(g_tpool);
		tpool_destroy(g_tpool);
	}

	while ((pool = (list_head(&g_pool_list))) != NULL) {
		list_remove(&g_pool_list, pool);
		zpool_close(pool->uap_zhp);
		free(pool);
	}
	list_destroy(&g_pool_list);

	while ((device = (list_head(&g_device_list))) != NULL) {
		list_remove(&g_device_list, device);
		free(device);
	}
	list_destroy(&g_device_list);

	libzfs_fini(g_zfshdl);
}

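/*
 * Entry point called by zed for each incoming event; logs the class and
 * subclass and hands the nvlist to zfs_slm_deliver_event() for dispatch.
 */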
void
zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
	(void) zfs_slm_deliver_event(class, subclass, nvl);
}