/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, Intel Corporation.
 */

/*
 * ZFS syseventd module.
 *
 * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
 *
 * The purpose of this module is to identify when devices are added to the
 * system, and appropriately online or replace the affected vdevs.
 *
 * When a device is added to the system:
 *
 * 1. Search for any vdevs whose devid matches that of the newly added
 *    device.
 *
 * 2. If no vdevs are found, then search for any vdevs whose udev path
 *    matches that of the new device.
 *
 * 3. If no vdevs match by either method, then ignore the event.
 *
 * 4. Attempt to online the device with a flag to indicate that it should
 *    be unspared when resilvering completes.  If this succeeds, then the
 *    same device was inserted and we should continue normally.
 *
 * 5. If the pool does not have the 'autoreplace' property set, attempt to
 *    online the device again without the unspare flag, which will
 *    generate a FMA fault.
 *
 * 6. If the pool has the 'autoreplace' property set, and the matching vdev
 *    is a whole disk, then label the new disk and attempt a 'zpool
 *    replace'.
 *
 * The module responds to EC_DEV_ADD events.  The special ESC_ZFS_VDEV_CHECK
 * event indicates that a device failed to open during pool load, but the
 * autoreplace property was set.  In this case, we deferred the associated
 * FMA fault until our module had a chance to process the autoreplace logic.
 * If the device could not be replaced, then the second online attempt will
 * trigger the FMA fault that we skipped earlier.
 *
 * ZFS on Linux porting notes:
 * In lieu of a thread pool, just spawn a thread on demand.
 * Linux udev provides a disk insert event for both the disk and each of
 * its partitions.
 */

#include <assert.h>
#include <ctype.h>
#include <devid.h>
#include <errno.h>
#include <fcntl.h>
#include <libnvpair.h>
#include <libzfs.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <sys/list.h>
#include <sys/sunddi.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/dev.h>
#include <pthread.h>
#include <unistd.h>
#include "zfs_agents.h"
#include "../zed_log.h"

#define	DEV_BYID_PATH	"/dev/disk/by-id/"
#define	DEV_BYPATH_PATH	"/dev/disk/by-path/"

typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

libzfs_handle_t *g_zfshdl;
list_t g_pool_list;	/* list of unavailable pools at initialization */
list_t g_device_list;	/* list of disks with asynchronous label request */
boolean_t g_enumeration_done;
pthread_t g_zfs_tid;

typedef struct unavailpool {
	zpool_handle_t *uap_zhp;
	pthread_t uap_enable_tid;	/* dataset enable thread if activated */
	list_node_t uap_node;
} unavailpool_t;

typedef struct pendingdev {
	char pd_physpath[128];
	list_node_t pd_node;
} pendingdev_t;

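/*
 * Return the state of the pool's top-level vdev, as recorded in the
 * ZPOOL_CONFIG_VDEV_STATS array of the pool's cached configuration.
 */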
static int
zfs_toplevel_state(zpool_handle_t *zhp)
{
	nvlist_t *nvroot;
	vdev_stat_t *vs;
	unsigned int c;

	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	return (vs->vs_state);
}

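/*
 * zpool_iter() callback used at startup: pools whose top-level vdev state is
 * below VDEV_STATE_DEGRADED (i.e. currently unavailable) are kept open and
 * queued on the caller-supplied list; all other pools are closed right away.
 */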
static int
zfs_unavail_pool(zpool_handle_t *zhp, void *data)
{
	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));

	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
		unavailpool_t *uap;
		uap = malloc(sizeof (unavailpool_t));
		uap->uap_zhp = zhp;
		uap->uap_enable_tid = 0;
		list_insert_tail((list_t *)data, uap);
	} else {
		zpool_close(zhp);
	}
	return (0);
}

/*
 * Two-stage replace on Linux.  Since we get disk notifications for both the
 * whole disk and its partitions, we can wait for the partitioned disk slice
 * to show up.
 *
 * The first stage tags the disk, initiates asynchronous partitioning, and
 * returns.  The second stage finds the tag and proceeds to the ZFS
 * labeling/replace:
 *
 * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
 *
 * 1. physical match with no fs, no partition
 *	tag it top, partition disk
 *
 * 2. physical match again, see partition and tag
 *
 */

/*
 * The device associated with the given vdev (either by devid or physical path)
 * has been added to the system.  If 'isdisk' is set, then we only attempt a
 * replacement if it's a whole disk.  This also implies that we should label the
 * disk first.
 *
 * First, we attempt to online the device (making sure to undo any spare
 * operation when finished).  If this succeeds, then we're done.  If it fails,
 * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
 * but that the label was not what we expected.  If the 'autoreplace' property
 * is set, then we relabel the disk (if specified), and attempt a 'zpool
 * replace'.  If the online is successful, but the new state is something else
 * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
 * race, and we should avoid attempting to relabel the disk.
 *
 * Also can arrive here from an ESC_ZFS_VDEV_CHECK event.
 */
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
{
	char *path;
	vdev_state_t newstate;
	nvlist_t *nvroot, *newvd;
	pendingdev_t *device;
	uint64_t wholedisk = 0ULL;
	uint64_t offline = 0ULL;
	uint64_t guid = 0ULL;
	char *physpath = NULL, *new_devid = NULL;
	char rawpath[PATH_MAX], fullpath[PATH_MAX];
	char devpath[PATH_MAX];
	int ret;
	int is_dm = 0;
	uint_t c;
	vdev_stat_t *vs;

	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
		return;

	/* Skip healthy disks */
	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	if (vs->vs_state == VDEV_STATE_HEALTHY) {
		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
		    __func__, path);
		return;
	}

	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);

	if (offline)
		return;	/* don't intervene if it was taken offline */

#ifdef HAVE_LIBDEVMAPPER
	is_dm = dev_is_dm(path);
#endif
	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
	    " wholedisk %d, dm %d (%llu)", zpool_get_name(zhp), path,
	    physpath ? physpath : "NULL", wholedisk, is_dm,
	    (long long unsigned int)guid);

	/*
	 * The VDEV guid is preferred for identification (gets passed in path)
	 */
	if (guid != 0) {
		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
		    (long long unsigned int)guid);
	} else {
		/*
		 * otherwise use path sans partition suffix for whole disks
		 */
		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return;
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);
		}
	}

	/*
	 * Attempt to online the device.
	 */
	if (zpool_vdev_online(zhp, fullpath,
	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
	    (newstate == VDEV_STATE_HEALTHY ||
	    newstate == VDEV_STATE_DEGRADED)) {
		zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
		    "HEALTHY" : "DEGRADED");
		return;
	}

	/*
	 * If the pool doesn't have the autoreplace property set, then attempt
	 * a true online (without the unspare flag), which will trigger a FMA
	 * fault.
	 */
	if (!is_dm && (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
	    !wholedisk || physpath == NULL)) {
		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);
		zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
		    fullpath, libzfs_error_description(g_zfshdl));
		return;
	}

	/*
	 * convert physical path into its current device node
	 */
	(void) snprintf(rawpath, sizeof (rawpath), "%s%s", DEV_BYPATH_PATH,
	    physpath);
	if (realpath(rawpath, devpath) == NULL && !is_dm) {
		zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
		    rawpath, strerror(errno));

		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);

		zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
		    fullpath, libzfs_error_description(g_zfshdl));
		return;
	}

	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL)) {
		zed_log_msg(LOG_INFO, "%s: Autoreplace is not enabled on this"
		    " pool, ignore disk.", __func__);
		return;
	}

	/* Only autoreplace bad disks */
	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
	    (vs->vs_state != VDEV_STATE_FAULTED) &&
	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
		return;
	}

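	/*
	 * When we were matched by devid or physical path, zfs_iter_vdev()
	 * stashed the devid of the newly added device in this vdev's nvlist
	 * under "new_devid".  When we were matched by GUID (the
	 * ESC_ZFS_VDEV_CHECK path) no such pair exists and new_devid stays
	 * NULL.
	 */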
	nvlist_lookup_string(vdev, "new_devid", &new_devid);

	if (is_dm) {
		/* Don't label device mapper or multipath disks. */
	} else if (!labeled) {
		/*
		 * we're auto-replacing a raw disk, so label it first
		 */
		char *leafname;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.  Before we can label the disk, we need
		 * to map the physical string that was matched on to the
		 * underlying device node.
		 *
		 * If any part of this process fails, then do a force online
		 * to trigger a ZFS fault for the device (and any hot spare
		 * replacement).
		 */
		leafname = strrchr(devpath, '/') + 1;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.
		 */
		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
			zed_log_msg(LOG_INFO, " zpool_label_disk: could not "
			    "label '%s' (%s)", leafname,
			    libzfs_error_description(g_zfshdl));

			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		/*
		 * The disk labeling is asynchronous on Linux. Just record
		 * this label request and return as there will be another
		 * disk add event for the partition after the labeling is
		 * completed.
		 */
		device = malloc(sizeof (pendingdev_t));
		(void) strlcpy(device->pd_physpath, physpath,
		    sizeof (device->pd_physpath));
		list_insert_tail(&g_device_list, device);

		zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
		    leafname, (u_longlong_t)guid);

		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */

	} else /* labeled */ {
		boolean_t found = B_FALSE;
		/*
		 * match up with request above to label the disk
		 */
		for (device = list_head(&g_device_list); device != NULL;
		    device = list_next(&g_device_list, device)) {
			if (strcmp(physpath, device->pd_physpath) == 0) {
				list_remove(&g_device_list, device);
				free(device);
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			/* unexpected partition slice encountered */
			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
		    physpath, (u_longlong_t)guid);

		(void) snprintf(devpath, sizeof (devpath), "%s%s",
		    DEV_BYID_PATH, new_devid);
	}

	/*
	 * Construct the root vdev to pass to zpool_vdev_attach().  While adding
	 * the entire vdev structure is harmless, we construct a reduced set of
	 * path/physpath/wholedisk to keep it simple.
	 */
	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		return;
	}
	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		nvlist_free(nvroot);
		return;
	}

	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
	    (physpath != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
	    1) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
		nvlist_free(newvd);
		nvlist_free(nvroot);
		return;
	}

	nvlist_free(newvd);

	/*
	 * auto replace a leaf disk at same physical location
	 */
	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);

	zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
	    fullpath, path, (ret == 0) ? "no errors" :
	    libzfs_error_description(g_zfshdl));

	nvlist_free(nvroot);
}

/*
 * Utility functions to find a vdev matching given criteria.
 */
typedef struct dev_data {
	const char *dd_compare;
	const char *dd_prop;
	zfs_process_func_t dd_func;
	boolean_t dd_found;
	boolean_t dd_islabeled;
	uint64_t dd_pool_guid;
	uint64_t dd_vdev_guid;
	const char *dd_new_devid;
} dev_data_t;

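/*
 * Recursively walk a vdev tree looking for the vdev that matches the criteria
 * in the dev_data_t (vdev GUID, devid, or physical path).  When a leaf vdev
 * matches, the dd_func callback is invoked with that vdev's nvlist.
 */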
static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
	dev_data_t *dp = data;
	char *path = NULL;
	uint_t c, children;
	nvlist_t **child;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
		return;
	}

	/* once a vdev was matched and processed there is nothing left to do */
	if (dp->dd_found)
		return;

	/*
	 * Match by GUID if available otherwise fallback to devid or physical
	 */
	if (dp->dd_vdev_guid != 0) {
		uint64_t guid;

		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != dp->dd_vdev_guid) {
			return;
		}
		zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid);
		dp->dd_found = B_TRUE;

	} else if (dp->dd_compare != NULL) {
		/*
		 * NOTE: On Linux there is an event for the partition, so unlike
		 * illumos, substring matching is not required to accommodate
		 * the partition suffix. An exact match will be present in
		 * the dp->dd_compare value.
		 */
		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
		    strcmp(dp->dd_compare, path) != 0)
			return;

		zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
		    dp->dd_prop, path);
		dp->dd_found = B_TRUE;

		/* pass the new devid for use by replacing code */
		if (dp->dd_new_devid != NULL) {
			(void) nvlist_add_string(nvl, "new_devid",
			    dp->dd_new_devid);
		}
	}

	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
}

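/*
 * Thread entry spawned from zfs_iter_pool() when a previously unavailable
 * pool becomes usable again: enable (mount/share) its datasets, then close
 * and clear the pool handle.  The list entry itself is reaped by
 * zfs_slm_fini().
 */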
static void *
zfs_enable_ds(void *arg)
{
	unavailpool_t *pool = (unavailpool_t *)arg;

	assert(pool->uap_enable_tid == pthread_self());

	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
	zpool_close(pool->uap_zhp);
	pool->uap_zhp = NULL;

	/* Note: zfs_slm_fini() will cleanup this pool entry on exit */
	return (NULL);
}

static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
	nvlist_t *config, *nvl;
	dev_data_t *dp = data;
	uint64_t pool_guid;
	unavailpool_t *pool;

	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);

	/*
	 * For each vdev in this pool, look for a match to apply dd_func
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (dp->dd_pool_guid == 0 ||
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
			(void) nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
			zfs_iter_vdev(zhp, nvl, data);
		}
	}

	/*
	 * if this pool was originally unavailable,
	 * then enable its datasets asynchronously
	 */
	if (g_enumeration_done) {
		for (pool = list_head(&g_pool_list); pool != NULL;
		    pool = list_next(&g_pool_list, pool)) {

			if (pool->uap_enable_tid != 0)
				continue;	/* entry already processed */
			if (strcmp(zpool_get_name(zhp),
			    zpool_get_name(pool->uap_zhp)))
				continue;
			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
				/* send to a background thread; keep on list */
				(void) pthread_create(&pool->uap_enable_tid,
				    NULL, zfs_enable_ds, pool);
				break;
			}
		}
	}

	zpool_close(zhp);
	return (dp->dd_found);	/* cease iteration after a match */
}

/*
 * Given a physical device location, iterate over all
 * (pool, vdev) pairs which correspond to that location.
 */
static boolean_t
devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
    boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = physical;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;	/* used by auto replace code */

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Given a device identifier, find any vdevs with a matching devid.
 * On Linux we can match devid directly which is always a whole disk.
 */
static boolean_t
devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
{
	dev_data_t data = { 0 };

	data.dd_compare = devid;
	data.dd_func = func;
	data.dd_prop = ZPOOL_CONFIG_DEVID;
	data.dd_found = B_FALSE;
	data.dd_islabeled = is_slice;
	data.dd_new_devid = devid;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (data.dd_found);
}

/*
 * Handle an EC_DEV_ADD.ESC_DISK event.
 *
 * illumos
 *	Expects: DEV_PHYS_PATH string in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/dsk/c0t1d0s0' (persistent)
 *	devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
 *	phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
 *
 * linux
 *	provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
 *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 *
 *	path: '/dev/sdc1' (not persistent)
 *	devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
 *	phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
 */
static int
zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
{
	char *devpath = NULL, *devid;
	boolean_t is_slice;

	/*
	 * Expecting a devid string and an optional physical location
	 */
	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
		return (-1);

	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);

	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);

	zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
	    devid, devpath ? devpath : "NULL", is_slice);

	/*
	 * Iterate over all vdevs looking for a match in the following order:
	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
	 *
	 * For disks, we only want to pay attention to vdevs marked as whole
	 * disks.  For multipath devices does whole disk apply? (TBD).
	 */
	if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) {
		if (!is_slice) {
			(void) devphys_iter(devpath, devid, zfs_process_add,
			    is_slice);
		}
	}

	return (0);
}

/*
 * Called when we receive a VDEV_CHECK event, which indicates a device could not
 * be opened during initial pool open, but the autoreplace property was set on
 * the pool.  In this case, we treat it as if it were an add event.
 */
static int
zfs_deliver_check(nvlist_t *nvl)
{
	dev_data_t data = { 0 };

	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
	    &data.dd_pool_guid) != 0 ||
	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
	    &data.dd_vdev_guid) != 0 ||
	    data.dd_vdev_guid == 0)
		return (0);

	zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
	    data.dd_pool_guid, data.dd_vdev_guid);

	data.dd_func = zfs_process_add;

	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

	return (0);
}

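/*
 * zpool_iter() callback for ESC_DEV_DLE (device size change) events: locate
 * the vdev whose physical path matches 'data', strip any partition suffix for
 * whole disks and reopen the pool so the kernel picks up the new size, then
 * bring the device back ONLINE when the pool has 'autoexpand' enabled.
 * Returns 1 once a matching vdev has been handled, which stops the iteration.
 */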
static int
zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
{
	char *devname = data;
	boolean_t avail_spare, l2cache;
	vdev_state_t newstate;
	nvlist_t *tgt;

	zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
	    devname, zpool_get_name(zhp));

	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
	    &avail_spare, &l2cache, NULL)) != NULL) {
		char *path, fullpath[MAXPATHLEN];
		uint64_t wholedisk = 0ULL;

		verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
		    &path) == 0);
		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk) == 0);

		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return (0);
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);

			/*
			 * We need to reopen the pool associated with this
			 * device so that the kernel can update the size
			 * of the expanded device.
			 */
			(void) zpool_reopen(zhp);
		}

		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
			zed_log_msg(LOG_INFO, "zfsdle_vdev_online: setting "
			    "device '%s' to ONLINE state in pool '%s'",
			    fullpath, zpool_get_name(zhp));
			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
				(void) zpool_vdev_online(zhp, fullpath, 0,
				    &newstate);
		}
		zpool_close(zhp);
		return (1);
	}
	zpool_close(zhp);
	return (0);
}

/*
 * This function handles the ESC_DEV_DLE event.
 */
static int
zfs_deliver_dle(nvlist_t *nvl)
{
	char *devname;

	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
		zed_log_msg(LOG_INFO, "zfs_deliver_event: no physpath");
		return (-1);
	}

	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
		zed_log_msg(LOG_INFO, "zfs_deliver_event: device '%s' not "
		    "found", devname);
		return (1);
	}
	return (0);
}

/*
 * syseventd daemon module event handler
 *
 * Handles syseventd daemon zfs device related events:
 *
 *	EC_DEV_ADD.ESC_DISK
 *	EC_DEV_STATUS.ESC_DEV_DLE
 *	EC_ZFS.ESC_ZFS_VDEV_CHECK
 *
 * Note: assumes only one thread active at a time (not thread safe)
 */
static int
zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	int ret;
	boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE;

	if (strcmp(class, EC_DEV_ADD) == 0) {
		/*
		 * We're mainly interested in disk additions, but we also listen
		 * for new loop devices, to allow for simplified testing.
		 */
		if (strcmp(subclass, ESC_DISK) == 0)
			is_lofi = B_FALSE;
		else if (strcmp(subclass, ESC_LOFI) == 0)
			is_lofi = B_TRUE;
		else
			return (0);

		is_check = B_FALSE;
	} else if (strcmp(class, EC_ZFS) == 0 &&
	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
		/*
		 * This event signifies that a device failed to open
		 * during pool load, but the 'autoreplace' property was
		 * set, so we should pretend it's just been added.
		 */
		is_check = B_TRUE;
	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
	    strcmp(subclass, ESC_DEV_DLE) == 0) {
		is_dle = B_TRUE;
	} else {
		return (0);
	}

	if (is_dle)
		ret = zfs_deliver_dle(nvl);
	else if (is_check)
		ret = zfs_deliver_check(nvl);
	else
		ret = zfs_deliver_add(nvl, is_lofi);

	return (ret);
}

/*ARGSUSED*/
static void *
zfs_enum_pools(void *arg)
{
	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
	/*
	 * Linux - instead of using a thread pool, each list entry
	 * will spawn a thread when an unavailable pool transitions
	 * to available. zfs_slm_fini will wait for these threads.
	 */
	g_enumeration_done = B_TRUE;
	return (NULL);
}

/*
 * Called from the zed daemon at startup.
 *
 * Events are delivered as messages from zevents or the udev monitor.
 *
 * For now, each agent has its own libzfs instance.
 */
int
zfs_slm_init(libzfs_handle_t *zfs_hdl)
{
	if ((g_zfshdl = libzfs_init()) == NULL)
		return (-1);

	/*
	 * collect a list of unavailable pools (asynchronously,
	 * since this can take a while)
	 */
	list_create(&g_pool_list, sizeof (struct unavailpool),
	    offsetof(struct unavailpool, uap_node));

	if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
		list_destroy(&g_pool_list);
		return (-1);
	}

	list_create(&g_device_list, sizeof (struct pendingdev),
	    offsetof(struct pendingdev, pd_node));

	return (0);
}

void
zfs_slm_fini()
{
	unavailpool_t *pool;
	pendingdev_t *device;

	/* wait for zfs_enum_pools thread to complete */
	(void) pthread_join(g_zfs_tid, NULL);

	while ((pool = (list_head(&g_pool_list))) != NULL) {
		/*
		 * each pool entry has two possibilities
		 * 1. was made available (so wait for zfs_enable_ds thread)
		 * 2. still unavailable (just close the pool)
		 */
		if (pool->uap_enable_tid)
			(void) pthread_join(pool->uap_enable_tid, NULL);
		else if (pool->uap_zhp != NULL)
			zpool_close(pool->uap_zhp);

		list_remove(&g_pool_list, pool);
		free(pool);
	}
	list_destroy(&g_pool_list);

	while ((device = (list_head(&g_device_list))) != NULL) {
		list_remove(&g_device_list, device);
		free(device);
	}
	list_destroy(&g_device_list);

	libzfs_fini(g_zfshdl);
}

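/*
 * External entry point for delivering an event to this module.  Incoming
 * events may originate from either the zfs zevent source or the libudev
 * monitor, so delivery to zfs_slm_deliver_event() is serialized with a
 * local mutex.
 */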
void
zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
{
	static pthread_mutex_t serialize = PTHREAD_MUTEX_INITIALIZER;

	/*
	 * Serialize incoming events from zfs or libudev sources
	 */
	(void) pthread_mutex_lock(&serialize);
	zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
	(void) zfs_slm_deliver_event(class, subclass, nvl);
	(void) pthread_mutex_unlock(&serialize);
}