cmd/zed/agents/zfs_mod.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2016, 2017, Intel Corporation.
  26  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
  27  */
  28
  29 /*
  30  * ZFS syseventd module.
  31  *
  32  * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
  33  *
  34  * The purpose of this module is to identify when devices are added to the
  35  * system, and appropriately online or replace the affected vdevs.
  36  *
  37  * When a device is added to the system:
  38  *
  39  *      1. Search for any vdevs whose devid matches that of the newly added
  40  *         device.
  41  *
  42  *      2. If no vdevs are found, then search for any vdevs whose udev path
  43  *         matches that of the new device.
  44  *
  45  *      3. If no vdevs match by either method, then ignore the event.
  46  *
  47  *      4. Attempt to online the device with a flag to indicate that it should
  48  *         be unspared when resilvering completes.  If this succeeds, then the
  49  *         same device was inserted and we should continue normally.
  50  *
  51  *      5. If the pool does not have the 'autoreplace' property set, attempt to
  52  *         online the device again without the unspare flag, which will
  53  *         generate an FMA fault.
  54  *
  55  *      6. If the pool has the 'autoreplace' property set, and the matching vdev
  56  *         is a whole disk, then label the new disk and attempt a 'zpool
  57  *         replace'.
  58  *
  59  * The module responds to EC_DEV_ADD events.  The special ESC_ZFS_VDEV_CHECK
  60  * event indicates that a device failed to open during pool load, but the
  61  * autoreplace property was set.  In this case, we deferred the associated
  62  * FMA fault until our module had a chance to process the autoreplace logic.
  63  * If the device could not be replaced, then the second online attempt will
  64  * trigger the FMA fault that we skipped earlier.
  65  *
  66  * On Linux, udev provides a disk insert event for both the disk and the partition.
  67  */
  68
  69 #include <ctype.h>
  70 #include <fcntl.h>
  71 #include <libnvpair.h>
  72 #include <libzfs.h>
  73 #include <libzutil.h>
  74 #include <limits.h>
  75 #include <stddef.h>
  76 #include <stdlib.h>
  77 #include <string.h>
  78 #include <syslog.h>
  79 #include <sys/list.h>
  80 #include <sys/sunddi.h>
  81 #include <sys/sysevent/eventdefs.h>
  82 #include <sys/sysevent/dev.h>
  83 #include <thread_pool.h>
  84 #include <pthread.h>
  85 #include <unistd.h>
  86 #include <errno.h>
  87 #include "zfs_agents.h"
  88 #include "../zed_log.h"
  89
  90 #define DEV_BYID_PATH   "/dev/disk/by-id/"
  91 #define DEV_BYPATH_PATH "/dev/disk/by-path/"
  92 #define DEV_BYVDEV_PATH "/dev/disk/by-vdev/"
  93
  /*
   * Callback applied to a matched (pool, vdev) pair by zfs_iter_vdev(); the
   * boolean argument receives the iterator's dd_islabeled flag.
   */
  94 typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
  95
  96 libzfs_handle_t *g_zfshdl;      /* libzfs handle used for zpool_iter(), labeling, error text */
  97 list_t g_pool_list;     /* list of unavailable pools at initialization */
  98 list_t g_device_list;   /* list of disks with asynchronous label request */
  99 tpool_t *g_tpool;       /* thread pool on which zfs_enable_ds() is dispatched */
  100 boolean_t g_enumeration_done;  /* gates the g_pool_list scan in zfs_iter_pool() */
  101 pthread_t g_zfs_tid;    /* zfs_enum_pools() thread */
 102
  103 typedef struct unavailpool {    /* pool whose top-level state was below DEGRADED */
  104         zpool_handle_t  *uap_zhp;       /* open pool handle; zfs_enable_ds() closes it */
  105         list_node_t     uap_node;       /* list linkage */
  106 } unavailpool_t;
 107
  108 typedef struct pendingdev {     /* label request waiting for its partition event */
  109         char            pd_physpath[128];       /* copy of the vdev's phys_path (strlcpy-bounded) */
  110         list_node_t     pd_node;        /* linkage on g_device_list */
  111 } pendingdev_t;
 112
 113 static int
 114 zfs_toplevel_state(zpool_handle_t *zhp)
 115 {
 116         nvlist_t *nvroot;
 117         vdev_stat_t *vs;
 118         unsigned int c;
 119
 120         verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
 121             ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
 122         verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
 123             (uint64_t **)&vs, &c) == 0);
 124         return (vs->vs_state);
 125 }
 126
 127 static int
 128 zfs_unavail_pool(zpool_handle_t *zhp, void *data)
 129 {
 130         zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
 131             zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));
 132
 133         if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
 134                 unavailpool_t *uap;
 135                 uap = malloc(sizeof (unavailpool_t));
 136                 if (uap == NULL) {
 137                         perror("malloc");
 138                         exit(EXIT_FAILURE);
 139                 }
 140
 141                 uap->uap_zhp = zhp;
 142                 list_insert_tail((list_t *)data, uap);
 143         } else {
 144                 zpool_close(zhp);
 145         }
 146         return (0);
 147 }
 148
 149 /*
  150  * Two-stage replace on Linux: because we receive a notification for the
  151  * disk and another one for its partition, we can wait for the partitioned
  152  * disk slice to show up.
 153  *
 154  * First stage tags the disk, initiates async partitioning, and returns
 155  * Second stage finds the tag and proceeds to ZFS labeling/replace
 156  *
 157  * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
 158  *
 159  * 1. physical match with no fs, no partition
 160  *      tag it top, partition disk
 161  *
 162  * 2. physical match again, see partition and tag
 163  *
 164  */
 165
 166 /*
 167  * The device associated with the given vdev (either by devid or physical path)
 168  * has been added to the system.  If 'isdisk' is set, then we only attempt a
 169  * replacement if it's a whole disk.  This also implies that we should label the
 170  * disk first.
 171  *
 172  * First, we attempt to online the device (making sure to undo any spare
 173  * operation when finished).  If this succeeds, then we're done.  If it fails,
 174  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
 175  * but that the label was not what we expected.  If the 'autoreplace' property
 176  * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
 177  * replace'.  If the online is successful, but the new state is something else
 178  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
 179  * race, and we should avoid attempting to relabel the disk.
 180  *
 181  * Also can arrive here from a ESC_ZFS_VDEV_CHECK event
 182  */
 183 static void
 184 zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
 185 {
 186         char *path;
 187         vdev_state_t newstate;
 188         nvlist_t *nvroot, *newvd;
 189         pendingdev_t *device;
 190         uint64_t wholedisk = 0ULL;
 191         uint64_t offline = 0ULL, faulted = 0ULL;
 192         uint64_t guid = 0ULL;
 193         char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
 194         char rawpath[PATH_MAX], fullpath[PATH_MAX];
 195         char devpath[PATH_MAX];
 196         int ret;
 197         boolean_t is_sd = B_FALSE;
 198         boolean_t is_mpath_wholedisk = B_FALSE;
 199         uint_t c;
 200         vdev_stat_t *vs;
 201
 202         if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
 203                 return;
 204
 205         /* Skip healthy disks */
 206         verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
 207             (uint64_t **)&vs, &c) == 0);
 208         if (vs->vs_state == VDEV_STATE_HEALTHY) {
 209                 zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
 210                     __func__, path);
 211                 return;
 212         }
 213
 214         (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
 215         (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 216             &enc_sysfs_path);
 217         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
 218         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
 219         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);
 220
 221         (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
 222
 223         /*
 224          * Special case:
 225          *
 226          * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
 227          * entry in their config. For example, on this force-faulted disk:
 228          *
 229          *      children[0]:
 230          *         type: 'disk'
 231          *         id: 0
 232          *         guid: 14309659774640089719
 233          *        path: '/dev/disk/by-vdev/L28'
 234          *        whole_disk: 0
 235          *        DTL: 654
 236          *        create_txg: 4
 237          *        com.delphix:vdev_zap_leaf: 1161
 238          *        faulted: 1
 239          *        aux_state: 'external'
 240          *      children[1]:
 241          *        type: 'disk'
 242          *        id: 1
 243          *        guid: 16002508084177980912
 244          *        path: '/dev/disk/by-vdev/L29'
 245          *        devid: 'dm-uuid-mpath-35000c500a61d68a3'
 246          *        phys_path: 'L29'
 247          *        vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
 248          *        whole_disk: 0
 249          *        DTL: 1028
 250          *        create_txg: 4
 251          *        com.delphix:vdev_zap_leaf: 131
 252          *
 253          * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
 254          * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
 255          */
 256         if (physpath == NULL && path != NULL) {
 257                 /* If path begins with "/dev/disk/by-vdev/" ... */
 258                 if (strncmp(path, DEV_BYVDEV_PATH,
 259                     strlen(DEV_BYVDEV_PATH)) == 0) {
 260                         /* Set physpath to the char after "/dev/disk/by-vdev" */
 261                         physpath = &path[strlen(DEV_BYVDEV_PATH)];
 262                 }
 263         }
 264
 265         /*
 266          * We don't want to autoreplace offlined disks.  However, we do want to
 267          * replace force-faulted disks (`zpool offline -f`).  Force-faulted
 268          * disks have both offline=1 and faulted=1 in the nvlist.
 269          */
 270         if (offline && !faulted) {
 271                 zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
 272                     __func__, path);
 273                 return;
 274         }
 275
 276         is_mpath_wholedisk = is_mpath_whole_disk(path);
 277         zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
 278             " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
 279             "(guid %llu)",
 280             zpool_get_name(zhp), path,
 281             physpath ? physpath : "NULL",
 282             wholedisk ? "is" : "not",
 283             is_mpath_wholedisk? "is" : "not",
 284             labeled ? "is" : "not",
 285             enc_sysfs_path,
 286             (long long unsigned int)guid);
 287
 288         /*
 289          * The VDEV guid is preferred for identification (gets passed in path)
 290          */
 291         if (guid != 0) {
 292                 (void) snprintf(fullpath, sizeof (fullpath), "%llu",
 293                     (long long unsigned int)guid);
 294         } else {
 295                 /*
 296                  * otherwise use path sans partition suffix for whole disks
 297                  */
 298                 (void) strlcpy(fullpath, path, sizeof (fullpath));
 299                 if (wholedisk) {
 300                         char *spath = zfs_strip_partition(fullpath);
 301                         if (!spath) {
 302                                 zed_log_msg(LOG_INFO, "%s: Can't alloc",
 303                                     __func__);
 304                                 return;
 305                         }
 306
 307                         (void) strlcpy(fullpath, spath, sizeof (fullpath));
 308                         free(spath);
 309                 }
 310         }
 311
 312         /*
 313          * Attempt to online the device.
 314          */
 315         if (zpool_vdev_online(zhp, fullpath,
 316             ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
 317             (newstate == VDEV_STATE_HEALTHY ||
 318             newstate == VDEV_STATE_DEGRADED)) {
 319                 zed_log_msg(LOG_INFO,
 320                     "  zpool_vdev_online: vdev '%s' ('%s') is "
 321                     "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
 322                     "HEALTHY" : "DEGRADED");
 323                 return;
 324         }
 325
 326         /*
 327          * vdev_id alias rule for using scsi_debug devices (FMA automated
 328          * testing)
 329          */
 330         if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
 331                 is_sd = B_TRUE;
 332
 333         /*
 334          * If the pool doesn't have the autoreplace property set, then use
 335          * vdev online to trigger a FMA fault by posting an ereport.
 336          */
 337         if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
 338             !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
 339                 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
 340                     &newstate);
 341                 zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
 342                     "not a blank disk for '%s' ('%s')", fullpath,
 343                     physpath);
 344                 return;
 345         }
 346
 347         /*
 348          * Convert physical path into its current device node.  Rawpath
 349          * needs to be /dev/disk/by-vdev for a scsi_debug device since
 350          * /dev/disk/by-path will not be present.
 351          */
 352         (void) snprintf(rawpath, sizeof (rawpath), "%s%s",
 353             is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
 354
 355         if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
 356                 zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
 357                     rawpath, strerror(errno));
 358
 359                 (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
 360                     &newstate);
 361
 362                 zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
 363                     fullpath, libzfs_error_description(g_zfshdl));
 364                 return;
 365         }
 366
 367         /* Only autoreplace bad disks */
 368         if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
 369             (vs->vs_state != VDEV_STATE_FAULTED) &&
 370             (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
 371                 zed_log_msg(LOG_INFO, "  not autoreplacing since disk isn't in "
 372                     "a bad state (currently %llu)", vs->vs_state);
 373                 return;
 374         }
 375
 376         nvlist_lookup_string(vdev, "new_devid", &new_devid);
 377
 378         if (is_mpath_wholedisk) {
 379                 /* Don't label device mapper or multipath disks. */
 380         } else if (!labeled) {
 381                 /*
 382                  * we're auto-replacing a raw disk, so label it first
 383                  */
 384                 char *leafname;
 385
 386                 /*
 387                  * If this is a request to label a whole disk, then attempt to
 388                  * write out the label.  Before we can label the disk, we need
 389                  * to map the physical string that was matched on to the under
 390                  * lying device node.
 391                  *
 392                  * If any part of this process fails, then do a force online
 393                  * to trigger a ZFS fault for the device (and any hot spare
 394                  * replacement).
 395                  */
 396                 leafname = strrchr(devpath, '/') + 1;
 397
 398                 /*
 399                  * If this is a request to label a whole disk, then attempt to
 400                  * write out the label.
 401                  */
 402                 if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
 403                         zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
 404                             "label '%s' (%s)", leafname,
 405                             libzfs_error_description(g_zfshdl));
 406
 407                         (void) zpool_vdev_online(zhp, fullpath,
 408                             ZFS_ONLINE_FORCEFAULT, &newstate);
 409                         return;
 410                 }
 411
 412                 /*
 413                  * The disk labeling is asynchronous on Linux. Just record
 414                  * this label request and return as there will be another
 415                  * disk add event for the partition after the labeling is
 416                  * completed.
 417                  */
 418                 device = malloc(sizeof (pendingdev_t));
 419                 if (device == NULL) {
 420                         perror("malloc");
 421                         exit(EXIT_FAILURE);
 422                 }
 423
 424                 (void) strlcpy(device->pd_physpath, physpath,
 425                     sizeof (device->pd_physpath));
 426                 list_insert_tail(&g_device_list, device);
 427
 428                 zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
 429                     leafname, (u_longlong_t)guid);
 430
 431                 return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
 432
 433         } else /* labeled */ {
 434                 boolean_t found = B_FALSE;
 435                 /*
 436                  * match up with request above to label the disk
 437                  */
 438                 for (device = list_head(&g_device_list); device != NULL;
 439                     device = list_next(&g_device_list, device)) {
 440                         if (strcmp(physpath, device->pd_physpath) == 0) {
 441                                 list_remove(&g_device_list, device);
 442                                 free(device);
 443                                 found = B_TRUE;
 444                                 break;
 445                         }
 446                         zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
 447                             physpath, device->pd_physpath);
 448                 }
 449                 if (!found) {
 450                         /* unexpected partition slice encountered */
 451                         zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
 452                             fullpath);
 453                         (void) zpool_vdev_online(zhp, fullpath,
 454                             ZFS_ONLINE_FORCEFAULT, &newstate);
 455                         return;
 456                 }
 457
 458                 zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
 459                     physpath, (u_longlong_t)guid);
 460
 461                 (void) snprintf(devpath, sizeof (devpath), "%s%s",
 462                     DEV_BYID_PATH, new_devid);
 463         }
 464
 465         /*
 466          * Construct the root vdev to pass to zpool_vdev_attach().  While adding
 467          * the entire vdev structure is harmless, we construct a reduced set of
 468          * path/physpath/wholedisk to keep it simple.
 469          */
 470         if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
 471                 zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
 472                 return;
 473         }
 474         if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
 475                 zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
 476                 nvlist_free(nvroot);
 477                 return;
 478         }
 479
 480         if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
 481             nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
 482             nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
 483             (physpath != NULL && nvlist_add_string(newvd,
 484             ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
 485             (enc_sysfs_path != NULL && nvlist_add_string(newvd,
 486             ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
 487             nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
 488             nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
 489             nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 490             (const nvlist_t **)&newvd, 1) != 0) {
 491                 zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
 492                 nvlist_free(newvd);
 493                 nvlist_free(nvroot);
 494                 return;
 495         }
 496
 497         nvlist_free(newvd);
 498
 499         /*
 500          * Wait for udev to verify the links exist, then auto-replace
 501          * the leaf disk at same physical location.
 502          */
 503         if (zpool_label_disk_wait(path, 3000) != 0) {
 504                 zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
 505                     "disk %s is missing", path);
 506                 nvlist_free(nvroot);
 507                 return;
 508         }
 509
 510         /*
 511          * Prefer sequential resilvering when supported (mirrors and dRAID),
 512          * otherwise fallback to a traditional healing resilver.
 513          */
 514         ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE);
 515         if (ret != 0) {
 516                 ret = zpool_vdev_attach(zhp, fullpath, path, nvroot,
 517                     B_TRUE, B_FALSE);
 518         }
 519
 520         zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
 521             fullpath, path, (ret == 0) ? "no errors" :
 522             libzfs_error_description(g_zfshdl));
 523
 524         nvlist_free(nvroot);
 525 }
 526
 527 /*
 528  * Utility functions to find a vdev matching given criteria.
 529  */
  530 typedef struct dev_data {       /* search criteria + result for zfs_iter_pool/vdev */
  531         const char              *dd_compare;    /* value strcmp'd against dd_prop */
  532         const char              *dd_prop;       /* vdev nvlist key to compare */
  533         zfs_process_func_t      dd_func;        /* called on the matched vdev */
  534         boolean_t               dd_found;       /* set once a vdev matched */
  535         boolean_t               dd_islabeled;   /* forwarded to dd_func */
  536         uint64_t                dd_pool_guid;   /* if non-zero, only this pool */
  537         uint64_t                dd_vdev_guid;   /* if non-zero, match by vdev guid */
  538         const char              *dd_new_devid;  /* stored as "new_devid" on match */
  539 } dev_data_t;
 540
 541 static void
 542 zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
 543 {
 544         dev_data_t *dp = data;
 545         char *path = NULL;
 546         uint_t c, children;
 547         nvlist_t **child;
 548
 549         /*
 550          * First iterate over any children.
 551          */
 552         if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
 553             &child, &children) == 0) {
 554                 for (c = 0; c < children; c++)
 555                         zfs_iter_vdev(zhp, child[c], data);
 556         }
 557
 558         /*
 559          * Iterate over any spares and cache devices
 560          */
 561         if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
 562             &child, &children) == 0) {
 563                 for (c = 0; c < children; c++)
 564                         zfs_iter_vdev(zhp, child[c], data);
 565         }
 566         if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
 567             &child, &children) == 0) {
 568                 for (c = 0; c < children; c++)
 569                         zfs_iter_vdev(zhp, child[c], data);
 570         }
 571
 572         /* once a vdev was matched and processed there is nothing left to do */
 573         if (dp->dd_found)
 574                 return;
 575
 576         /*
 577          * Match by GUID if available otherwise fallback to devid or physical
 578          */
 579         if (dp->dd_vdev_guid != 0) {
 580                 uint64_t guid;
 581
 582                 if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
 583                     &guid) != 0 || guid != dp->dd_vdev_guid) {
 584                         return;
 585                 }
 586                 zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu", guid);
 587                 dp->dd_found = B_TRUE;
 588
 589         } else if (dp->dd_compare != NULL) {
 590                 /*
 591                  * NOTE: On Linux there is an event for partition, so unlike
 592                  * illumos, substring matching is not required to accommodate
 593                  * the partition suffix. An exact match will be present in
 594                  * the dp->dd_compare value.
 595                  */
 596                 if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
 597                     strcmp(dp->dd_compare, path) != 0) {
 598                         zed_log_msg(LOG_INFO, "  %s: no match (%s != vdev %s)",
 599                             __func__, dp->dd_compare, path);
 600                         return;
 601                 }
 602
 603                 zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
 604                     dp->dd_prop, path);
 605                 dp->dd_found = B_TRUE;
 606
 607                 /* pass the new devid for use by replacing code */
 608                 if (dp->dd_new_devid != NULL) {
 609                         (void) nvlist_add_string(nvl, "new_devid",
 610                             dp->dd_new_devid);
 611                 }
 612         }
 613
 614         (dp->dd_func)(zhp, nvl, dp->dd_islabeled);
 615 }
 616
 617 static void
 618 zfs_enable_ds(void *arg)
 619 {
 620         unavailpool_t *pool = (unavailpool_t *)arg;
 621
 622         (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
 623         zpool_close(pool->uap_zhp);
 624         free(pool);
 625 }
 626
 627 static int
 628 zfs_iter_pool(zpool_handle_t *zhp, void *data)
 629 {
 630         nvlist_t *config, *nvl;
 631         dev_data_t *dp = data;
 632         uint64_t pool_guid;
 633         unavailpool_t *pool;
 634
 635         zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
 636             zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);
 637
 638         /*
 639          * For each vdev in this pool, look for a match to apply dd_func
 640          */
 641         if ((config = zpool_get_config(zhp, NULL)) != NULL) {
 642                 if (dp->dd_pool_guid == 0 ||
 643                     (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 644                     &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
 645                         (void) nvlist_lookup_nvlist(config,
 646                             ZPOOL_CONFIG_VDEV_TREE, &nvl);
 647                         zfs_iter_vdev(zhp, nvl, data);
 648                 }
 649         } else {
 650                 zed_log_msg(LOG_INFO, "%s: no config\n", __func__);
 651         }
 652
 653         /*
 654          * if this pool was originally unavailable,
 655          * then enable its datasets asynchronously
 656          */
 657         if (g_enumeration_done)  {
 658                 for (pool = list_head(&g_pool_list); pool != NULL;
 659                     pool = list_next(&g_pool_list, pool)) {
 660
 661                         if (strcmp(zpool_get_name(zhp),
 662                             zpool_get_name(pool->uap_zhp)))
 663                                 continue;
 664                         if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
 665                                 list_remove(&g_pool_list, pool);
 666                                 (void) tpool_dispatch(g_tpool, zfs_enable_ds,
 667                                     pool);
 668                                 break;
 669                         }
 670                 }
 671         }
 672
 673         zpool_close(zhp);
 674         return (dp->dd_found);  /* cease iteration after a match */
 675 }
 676
 677 /*
 678  * Given a physical device location, iterate over all
 679  * (pool, vdev) pairs which correspond to that location.
 680  */
 681 static boolean_t
 682 devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
 683     boolean_t is_slice)
 684 {
 685         dev_data_t data = { 0 };
 686
 687         data.dd_compare = physical;
 688         data.dd_func = func;
 689         data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
 690         data.dd_found = B_FALSE;
 691         data.dd_islabeled = is_slice;
 692         data.dd_new_devid = devid;      /* used by auto replace code */
 693
 694         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
 695
 696         return (data.dd_found);
 697 }
 698
 699 /*
 700  * Given a device identifier, find any vdevs with a matching by-vdev
 701  * path.  Normally we shouldn't need this as the comparison would be
 702  * made earlier in the devphys_iter().  For example, if we were replacing
 703  * /dev/disk/by-vdev/L28, normally devphys_iter() would match the
 704  * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
 705  * of the new disk config.  However, we've seen cases where
 706  * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk.  Here's
 707  * an example of a real 2-disk mirror pool where one disk was force
 708  * faulted:
 709  *
 710  *       com.delphix:vdev_zap_top: 129
 711  *           children[0]:
 712  *               type: 'disk'
 713  *               id: 0
 714  *               guid: 14309659774640089719
 715  *               path: '/dev/disk/by-vdev/L28'
 716  *               whole_disk: 0
 717  *               DTL: 654
 718  *               create_txg: 4
 719  *               com.delphix:vdev_zap_leaf: 1161
 720  *               faulted: 1
 721  *               aux_state: 'external'
 722  *           children[1]:
 723  *               type: 'disk'
 724  *               id: 1
 725  *               guid: 16002508084177980912
 726  *               path: '/dev/disk/by-vdev/L29'
 727  *               devid: 'dm-uuid-mpath-35000c500a61d68a3'
 728  *               phys_path: 'L29'
 729  *               vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
 730  *               whole_disk: 0
 731  *               DTL: 1028
 732  *               create_txg: 4
 733  *               com.delphix:vdev_zap_leaf: 131
 734  *
 735  * So in the case above, the only thing we could compare is the path.
 736  *
 737  * We can do this because we assume by-vdev paths are authoritative as physical
 738  * paths.  We could not assume this for normal paths like /dev/sda since the
 739  * physical location /dev/sda points to could change over time.
 740  */
 741 static boolean_t
 742 by_vdev_path_iter(const char *by_vdev_path, const char *devid,
 743     zfs_process_func_t func, boolean_t is_slice)
 744 {
 745         dev_data_t data = { 0 };
 746
 747         data.dd_compare = by_vdev_path;
 748         data.dd_func = func;
 749         data.dd_prop = ZPOOL_CONFIG_PATH;
 750         data.dd_found = B_FALSE;
 751         data.dd_islabeled = is_slice;
 752         data.dd_new_devid = devid;
 753
 754         if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
 755             strlen(DEV_BYVDEV_PATH)) != 0) {
 756                 /* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
 757                 return (B_FALSE);
 758         }
 759
 760         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
 761
 762         return (data.dd_found);
 763 }
 764
 765 /*
 766  * Given a device identifier, find any vdevs with a matching devid.
 767  * On Linux we can match devid directly which is always a whole disk.
 768  */
 769 static boolean_t
 770 devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
 771 {
 772         dev_data_t data = { 0 };
 773
 774         data.dd_compare = devid;
 775         data.dd_func = func;
 776         data.dd_prop = ZPOOL_CONFIG_DEVID;
 777         data.dd_found = B_FALSE;
 778         data.dd_islabeled = is_slice;
 779         data.dd_new_devid = devid;
 780
 781         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
 782
 783         return (data.dd_found);
 784 }
 785
 786 /*
 787  * Given a device guid, find any vdevs with a matching guid.
 788  */
 789 static boolean_t
 790 guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
 791     zfs_process_func_t func, boolean_t is_slice)
 792 {
 793         dev_data_t data = { 0 };
 794
 795         data.dd_func = func;
 796         data.dd_found = B_FALSE;
 797         data.dd_pool_guid = pool_guid;
 798         data.dd_vdev_guid = vdev_guid;
 799         data.dd_islabeled = is_slice;
 800         data.dd_new_devid = devid;
 801
 802         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
 803
 804         return (data.dd_found);
 805 }
 806
 807 /*
 808  * Handle a EC_DEV_ADD.ESC_DISK event.
 809  *
 810  * illumos
 811  *      Expects: DEV_PHYS_PATH string in schema
 812  *      Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 813  *
 814  *      path: '/dev/dsk/c0t1d0s0' (persistent)
 815  *     devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
 816  * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
 817  *
 818  * linux
 819  *      provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
 820  *      Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
 821  *
 822  *      path: '/dev/sdc1' (not persistent)
 823  *     devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
 824  * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
 825  */
 826 static int
 827 zfs_deliver_add(nvlist_t *nvl)
 828 {
 829         char *devpath = NULL, *devid = NULL;
 830         uint64_t pool_guid = 0, vdev_guid = 0;
 831         boolean_t is_slice;
 832
 833         /*
 834          * Expecting a devid string and an optional physical location and guid
 835          */
 836         if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
 837                 zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
 838                 return (-1);
 839         }
 840
 841         (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
 842         (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
 843         (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
 844
 845         is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
 846
 847         zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
 848             devid, devpath ? devpath : "NULL", is_slice);
 849
 850         /*
 851          * Iterate over all vdevs looking for a match in the following order:
 852          * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
 853          * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
 854          * 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
 855          * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
 856          *    by-vdev paths represent physical paths).
 857          */
 858         if (devid_iter(devid, zfs_process_add, is_slice))
 859                 return (0);
 860         if (devpath != NULL && devphys_iter(devpath, devid, zfs_process_add,
 861             is_slice))
 862                 return (0);
 863         if (vdev_guid != 0)
 864                 (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
 865                     is_slice);
 866
 867         if (devpath != NULL) {
 868                 /* Can we match a /dev/disk/by-vdev/ path? */
 869                 char by_vdev_path[MAXPATHLEN];
 870                 snprintf(by_vdev_path, sizeof (by_vdev_path),
 871                     "/dev/disk/by-vdev/%s", devpath);
 872                 if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
 873                     is_slice))
 874                         return (0);
 875         }
 876
 877         return (0);
 878 }
 879
 880 /*
 881  * Called when we receive a VDEV_CHECK event, which indicates a device could not
 882  * be opened during initial pool open, but the autoreplace property was set on
 883  * the pool.  In this case, we treat it as if it were an add event.
 884  */
 885 static int
 886 zfs_deliver_check(nvlist_t *nvl)
 887 {
 888         dev_data_t data = { 0 };
 889
 890         if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
 891             &data.dd_pool_guid) != 0 ||
 892             nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
 893             &data.dd_vdev_guid) != 0 ||
 894             data.dd_vdev_guid == 0)
 895                 return (0);
 896
 897         zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
 898             data.dd_pool_guid, data.dd_vdev_guid);
 899
 900         data.dd_func = zfs_process_add;
 901
 902         (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
 903
 904         return (0);
 905 }
 906
 907 /*
 908  * Given a path to a vdev, lookup the vdev's physical size from its
 909  * config nvlist.
 910  *
 911  * Returns the vdev's physical size in bytes on success, 0 on error.
 912  */
 913 static uint64_t
 914 vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path)
 915 {
 916         nvlist_t *nvl = NULL;
 917         boolean_t avail_spare, l2cache, log;
 918         vdev_stat_t *vs = NULL;
 919         uint_t c;
 920
 921         nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
 922         if (!nvl)
 923                 return (0);
 924
 925         verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
 926             (uint64_t **)&vs, &c) == 0);
 927         if (!vs) {
 928                 zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__,
 929                     vdev_path);
 930                 return (0);
 931         }
 932
 933         return (vs->vs_pspace);
 934 }
 935
 936 /*
 937  * Given a path to a vdev, lookup if the vdev is a "whole disk" in the
 938  * config nvlist.  "whole disk" means that ZFS was passed a whole disk
 939  * at pool creation time, which it partitioned up and has full control over.
 940  * Thus a partition with wholedisk=1 set tells us that zfs created the
 941  * partition at creation time.  A partition without whole disk set would have
 942  * been created by externally (like with fdisk) and passed to ZFS.
 943  *
 944  * Returns the whole disk value (either 0 or 1).
 945  */
 946 static uint64_t
 947 vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path)
 948 {
 949         nvlist_t *nvl = NULL;
 950         boolean_t avail_spare, l2cache, log;
 951         uint64_t wholedisk;
 952
 953         nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
 954         if (!nvl)
 955                 return (0);
 956
 957         verify(nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK,
 958             &wholedisk) == 0);
 959
 960         return (wholedisk);
 961 }
 962
 963 /*
 964  * If the device size grew more than 1% then return true.
 965  */
 966 #define DEVICE_GREW(oldsize, newsize) \
 967                     ((newsize > oldsize) && \
 968                     ((newsize / (newsize - oldsize)) <= 100))
 969
 970 static int
 971 zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
 972 {
 973         boolean_t avail_spare, l2cache;
 974         nvlist_t *udev_nvl = data;
 975         nvlist_t *tgt;
 976         int error;
 977
 978         char *tmp_devname, devname[MAXPATHLEN] = "";
 979         uint64_t guid;
 980
 981         if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
 982                 sprintf(devname, "%llu", (u_longlong_t)guid);
 983         } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH,
 984             &tmp_devname) == 0) {
 985                 strlcpy(devname, tmp_devname, MAXPATHLEN);
 986                 zfs_append_partition(devname, MAXPATHLEN);
 987         } else {
 988                 zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__);
 989         }
 990
 991         zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
 992             devname, zpool_get_name(zhp));
 993
 994         if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
 995             &avail_spare, &l2cache, NULL)) != NULL) {
 996                 char *path, fullpath[MAXPATHLEN];
 997                 uint64_t wholedisk;
 998
 999                 error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
1000                 if (error) {
1001                         zpool_close(zhp);
1002                         return (0);
1003                 }
1004
1005                 error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
1006                     &wholedisk);
1007                 if (error)
1008                         wholedisk = 0;
1009
1010                 if (wholedisk) {
1011                         path = strrchr(path, '/');
1012                         if (path != NULL) {
1013                                 path = zfs_strip_partition(path + 1);
1014                                 if (path == NULL) {
1015                                         zpool_close(zhp);
1016                                         return (0);
1017                                 }
1018                         } else {
1019                                 zpool_close(zhp);
1020                                 return (0);
1021                         }
1022
1023                         (void) strlcpy(fullpath, path, sizeof (fullpath));
1024                         free(path);
1025
1026                         /*
1027                          * We need to reopen the pool associated with this
1028                          * device so that the kernel can update the size of
1029                          * the expanded device.  When expanding there is no
1030                          * need to restart the scrub from the beginning.
1031                          */
1032                         boolean_t scrub_restart = B_FALSE;
1033                         (void) zpool_reopen_one(zhp, &scrub_restart);
1034                 } else {
1035                         (void) strlcpy(fullpath, path, sizeof (fullpath));
1036                 }
1037
1038                 if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
1039                         vdev_state_t newstate;
1040
1041                         if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
1042                                 /*
1043                                  * If this disk size has not changed, then
1044                                  * there's no need to do an autoexpand.  To
1045                                  * check we look at the disk's size in its
1046                                  * config, and compare it to the disk size
1047                                  * that udev is reporting.
1048                                  */
1049                                 uint64_t udev_size = 0, conf_size = 0,
1050                                     wholedisk = 0, udev_parent_size = 0;
1051
1052                                 /*
1053                                  * Get the size of our disk that udev is
1054                                  * reporting.
1055                                  */
1056                                 if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE,
1057                                     &udev_size) != 0) {
1058                                         udev_size = 0;
1059                                 }
1060
1061                                 /*
1062                                  * Get the size of our disk's parent device
1063                                  * from udev (where sda1's parent is sda).
1064                                  */
1065                                 if (nvlist_lookup_uint64(udev_nvl,
1066                                     DEV_PARENT_SIZE, &udev_parent_size) != 0) {
1067                                         udev_parent_size = 0;
1068                                 }
1069
1070                                 conf_size = vdev_size_from_config(zhp,
1071                                     fullpath);
1072
1073                                 wholedisk = vdev_whole_disk_from_config(zhp,
1074                                     fullpath);
1075
1076                                 /*
1077                                  * Only attempt an autoexpand if the vdev size
1078                                  * changed.  There are two different cases
1079                                  * to consider.
1080                                  *
1081                                  * 1. wholedisk=1
1082                                  * If you do a 'zpool create' on a whole disk
1083                                  * (like /dev/sda), then zfs will create
1084                                  * partitions on the disk (like /dev/sda1).  In
1085                                  * that case, wholedisk=1 will be set in the
1086                                  * partition's nvlist config.  So zed will need
1087                                  * to see if your parent device (/dev/sda)
1088                                  * expanded in size, and if so, then attempt
1089                                  * the autoexpand.
1090                                  *
1091                                  * 2. wholedisk=0
1092                                  * If you do a 'zpool create' on an existing
1093                                  * partition, or a device that doesn't allow
1094                                  * partitions, then wholedisk=0, and you will
1095                                  * simply need to check if the device itself
1096                                  * expanded in size.
1097                                  */
1098                                 if (DEVICE_GREW(conf_size, udev_size) ||
1099                                     (wholedisk && DEVICE_GREW(conf_size,
1100                                     udev_parent_size))) {
1101                                         error = zpool_vdev_online(zhp, fullpath,
1102                                             0, &newstate);
1103
1104                                         zed_log_msg(LOG_INFO,
1105                                             "%s: autoexpanding '%s' from %llu"
1106                                             " to %llu bytes in pool '%s': %d",
1107                                             __func__, fullpath, conf_size,
1108                                             MAX(udev_size, udev_parent_size),
1109                                             zpool_get_name(zhp), error);
1110                                 }
1111                         }
1112                 }
1113                 zpool_close(zhp);
1114                 return (1);
1115         }
1116         zpool_close(zhp);
1117         return (0);
1118 }
1119
1120 /*
1121  * This function handles the ESC_DEV_DLE device change event.  Use the
1122  * provided vdev guid when looking up a disk or partition, when the guid
1123  * is not present assume the entire disk is owned by ZFS and append the
1124  * expected -part1 partition information then lookup by physical path.
1125  */
1126 static int
1127 zfs_deliver_dle(nvlist_t *nvl)
1128 {
1129         char *devname, name[MAXPATHLEN];
1130         uint64_t guid;
1131
1132         if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
1133                 sprintf(name, "%llu", (u_longlong_t)guid);
1134         } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
1135                 strlcpy(name, devname, MAXPATHLEN);
1136                 zfs_append_partition(name, MAXPATHLEN);
1137         } else {
1138                 sprintf(name, "unknown");
1139                 zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
1140         }
1141
1142         if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) {
1143                 zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
1144                     "found", name);
1145                 return (1);
1146         }
1147
1148         return (0);
1149 }
1150
1151 /*
1152  * syseventd daemon module event handler
1153  *
1154  * Handles syseventd daemon zfs device related events:
1155  *
1156  *      EC_DEV_ADD.ESC_DISK
1157  *      EC_DEV_STATUS.ESC_DEV_DLE
1158  *      EC_ZFS.ESC_ZFS_VDEV_CHECK
1159  *
1160  * Note: assumes only one thread active at a time (not thread safe)
1161  */
1162 static int
1163 zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
1164 {
1165         int ret;
1166         boolean_t is_check = B_FALSE, is_dle = B_FALSE;
1167
1168         if (strcmp(class, EC_DEV_ADD) == 0) {
1169                 /*
1170                  * We're mainly interested in disk additions, but we also listen
1171                  * for new loop devices, to allow for simplified testing.
1172                  */
1173                 if (strcmp(subclass, ESC_DISK) != 0 &&
1174                     strcmp(subclass, ESC_LOFI) != 0)
1175                         return (0);
1176
1177                 is_check = B_FALSE;
1178         } else if (strcmp(class, EC_ZFS) == 0 &&
1179             strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
1180                 /*
1181                  * This event signifies that a device failed to open
1182                  * during pool load, but the 'autoreplace' property was
1183                  * set, so we should pretend it's just been added.
1184                  */
1185                 is_check = B_TRUE;
1186         } else if (strcmp(class, EC_DEV_STATUS) == 0 &&
1187             strcmp(subclass, ESC_DEV_DLE) == 0) {
1188                 is_dle = B_TRUE;
1189         } else {
1190                 return (0);
1191         }
1192
1193         if (is_dle)
1194                 ret = zfs_deliver_dle(nvl);
1195         else if (is_check)
1196                 ret = zfs_deliver_check(nvl);
1197         else
1198                 ret = zfs_deliver_add(nvl);
1199
1200         return (ret);
1201 }
1202
1203 static void *
1204 zfs_enum_pools(void *arg)
1205 {
1206         (void) arg;
1207
1208         (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
1209         /*
1210          * Linux - instead of using a thread pool, each list entry
1211          * will spawn a thread when an unavailable pool transitions
1212          * to available. zfs_slm_fini will wait for these threads.
1213          */
1214         g_enumeration_done = B_TRUE;
1215         return (NULL);
1216 }
1217
1218 /*
1219  * called from zed daemon at startup
1220  *
1221  * sent messages from zevents or udev monitor
1222  *
1223  * For now, each agent has its own libzfs instance
1224  */
1225 int
1226 zfs_slm_init(void)
1227 {
1228         if ((g_zfshdl = libzfs_init()) == NULL)
1229                 return (-1);
1230
1231         /*
1232          * collect a list of unavailable pools (asynchronously,
1233          * since this can take a while)
1234          */
1235         list_create(&g_pool_list, sizeof (struct unavailpool),
1236             offsetof(struct unavailpool, uap_node));
1237
1238         if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
1239                 list_destroy(&g_pool_list);
1240                 libzfs_fini(g_zfshdl);
1241                 return (-1);
1242         }
1243
1244         pthread_setname_np(g_zfs_tid, "enum-pools");
1245         list_create(&g_device_list, sizeof (struct pendingdev),
1246             offsetof(struct pendingdev, pd_node));
1247
1248         return (0);
1249 }
1250
1251 void
1252 zfs_slm_fini(void)
1253 {
1254         unavailpool_t *pool;
1255         pendingdev_t *device;
1256
1257         /* wait for zfs_enum_pools thread to complete */
1258         (void) pthread_join(g_zfs_tid, NULL);
1259         /* destroy the thread pool */
1260         if (g_tpool != NULL) {
1261                 tpool_wait(g_tpool);
1262                 tpool_destroy(g_tpool);
1263         }
1264
1265         while ((pool = (list_head(&g_pool_list))) != NULL) {
1266                 list_remove(&g_pool_list, pool);
1267                 zpool_close(pool->uap_zhp);
1268                 free(pool);
1269         }
1270         list_destroy(&g_pool_list);
1271
1272         while ((device = (list_head(&g_device_list))) != NULL) {
1273                 list_remove(&g_device_list, device);
1274                 free(device);
1275         }
1276         list_destroy(&g_device_list);
1277
1278         libzfs_fini(g_zfshdl);
1279 }
1280
1281 void
1282 zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
1283 {
1284         zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
1285         (void) zfs_slm_deliver_event(class, subclass, nvl);
1286 }