]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Use udev for partition detection
authorBrian Behlendorf <behlendorf1@llnl.gov>
Tue, 19 Apr 2016 18:19:12 +0000 (11:19 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Mon, 25 Apr 2016 18:13:20 +0000 (11:13 -0700)
When ZFS partitions a block device it must wait for udev to create
both a device node and all the device symlinks.  This process takes
a variable length of time and depends on factors such as how many links
must be created, the complexity of the rules, etc.  Complicating
the situation further it is not uncommon for udev to create and
then remove a link multiple times while processing the udev rules.

Given the above, the existing scheme of waiting for an expected
partition to appear by name isn't 100% reliable.  At this point
udev may still remove and recreate the link resulting in the
kernel modules being unable to open the device.

In order to address this the zpool_label_disk_wait() function
has been updated to use libudev.  Until the registered system
device acknowledges that it is fully initialized the function
will wait.  Once fully initialized all device links are checked
and allowed to settle for 50ms.  This makes it far more likely
that all the device nodes will exist when the kernel modules
need to open them.

For systems without libudev an alternate zpool_label_disk_wait()
was updated to include a settle time.  In addition, the kernel
modules were updated to include retry logic for this ENOENT case.
Due to the improved checks in the utilities it is unlikely this
logic will be invoked.  However, in the rare event it is needed
it will prevent a failure.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Richard Laager <rlaager@wiktel.com>
Closes #4523
Closes #3708
Closes #4077
Closes #4144
Closes #4214
Closes #4517

cmd/zpool/zpool_vdev.c
lib/libzfs/libzfs_import.c
lib/libzfs/libzfs_pool.c
module/zfs/vdev_disk.c

index 8bbbf6615695f5ddbb7566fb1ed7b436e202995e..cf87554d5d8bf4ad93fd0c53950a9eb1b90c34ae 100644 (file)
@@ -1198,12 +1198,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
 
                /*
                 * Remove any previously existing symlink from a udev path to
-                * the device before labeling the disk.  This makes
-                * zpool_label_disk_wait() truly wait for the new link to show
-                * up instead of returning if it finds an old link still in
-                * place.  Otherwise there is a window between when udev
-                * deletes and recreates the link during which access attempts
-                * will fail with ENOENT.
+                * the device before labeling the disk.  This ensures that
+                * only newly created links are used.  Otherwise there is a
+                * window between when udev deletes and recreates the link
+                * during which access attempts will fail with ENOENT.
                 */
                strncpy(udevpath, path, MAXPATHLEN);
                (void) zfs_append_partition(udevpath, MAXPATHLEN);
@@ -1227,6 +1225,8 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
                 * and then block until udev creates the new link.
                 */
                if (!is_exclusive || !is_spare(NULL, udevpath)) {
+                       char *devnode = strrchr(devpath, '/') + 1;
+
                        ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
                        if (ret == 0) {
                                ret = lstat64(udevpath, &statbuf);
@@ -1234,18 +1234,29 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
                                        (void) unlink(udevpath);
                        }
 
-                       if (zpool_label_disk(g_zfs, zhp,
-                           strrchr(devpath, '/') + 1) == -1)
+                       /*
+                        * When labeling a pool the raw device node name
+                        * is provided as it appears under /dev/.
+                        */
+                       if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
                                return (-1);
 
+                       /*
+                        * Wait for udev to signal the device is available
+                        * by the provided path.
+                        */
                        ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
                        if (ret) {
-                               (void) fprintf(stderr, gettext("cannot "
-                                   "resolve path '%s': %d\n"), udevpath, ret);
-                               return (-1);
+                               (void) fprintf(stderr,
+                                   gettext("missing link: %s was "
+                                   "partitioned but %s is missing\n"),
+                                   devnode, udevpath);
+                               return (ret);
                        }
 
-                       (void) zero_label(udevpath);
+                       ret = zero_label(udevpath);
+                       if (ret)
+                               return (ret);
                }
 
                /*
@@ -1259,8 +1270,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
                /*
                 * Update device id strings for whole disks (Linux only)
                 */
-               if (wholedisk)
-                       update_vdev_config_dev_strs(nv);
+               update_vdev_config_dev_strs(nv);
 
                return (0);
        }
index 8f27ed58c8dfc0d2fd73bed2dd06b85b7df0bd03..2776ed29cd00af8aef94b90f11ac0ce99adf8f88 100644 (file)
@@ -259,6 +259,86 @@ udev_device_is_ready(struct udev_device *dev)
 #endif
 }
 
+/*
+ * Wait up to timeout_ms for udev to set up the device node.  The device is
+ * considered ready when libudev determines it has been initialized, all of
+ * the device links have been verified to exist, and it has been allowed to
+ * settle.  At this point the device can be accessed reliably.
+ * Depending on the complexity of the udev rules this process could take
+ * several seconds.
+ *
+ * The path is first resolved with realpath() and the last component of the
+ * result is used as the sysname for the "block" subsystem lookup; until the
+ * path resolves the loop just sleeps and retries.
+ *
+ * Returns ENXIO if a udev context cannot be created, 0 once every device
+ * link has been stat'ed successfully on passes at least settle_ms apart,
+ * and otherwise the last recorded status (initially ENODEV).
+ *
+ * NOTE(review): ret is only updated on passes where the device is ready, so
+ * if the timeout expires after the links were seen but before the settle
+ * interval elapsed the function appears to return 0 as well -- confirm this
+ * is intended.
+ */
+int
+zpool_label_disk_wait(char *path, int timeout_ms)
+{
+       struct udev *udev;
+       struct udev_device *dev = NULL;
+       char nodepath[MAXPATHLEN];
+       char *sysname = NULL;
+       int ret = ENODEV;
+       int settle_ms = 50; /* links must remain present this long */
+       long sleep_ms = 10; /* delay between polling passes */
+       hrtime_t start, settle;
+
+       if ((udev = udev_new()) == NULL)
+               return (ENXIO);
+
+       start = gethrtime();
+       settle = 0; /* 0 means no settle period is in progress */
+
+       do {
+               if (sysname == NULL) {
+                       if (realpath(path, nodepath) != NULL) {
+                               sysname = strrchr(nodepath, '/') + 1;
+                       } else {
+                               (void) usleep(sleep_ms * MILLISEC);
+                               continue; /* re-tests the timeout condition */
+                       }
+               }
+
+               dev = udev_device_new_from_subsystem_sysname(udev,
+                   "block", sysname);
+               if ((dev != NULL) && udev_device_is_ready(dev)) {
+                       struct udev_list_entry *links, *link;
+
+                       ret = 0;
+                       links = udev_device_get_devlinks_list_entry(dev);
+
+                       udev_list_entry_foreach(link, links) {
+                               struct stat64 statbuf;
+                               const char *name;
+
+                               name = udev_list_entry_get_name(link);
+                               errno = 0;
+                               if (stat64(name, &statbuf) == 0 && errno == 0)
+                                       continue;
+
+                               settle = 0; /* link missing; restart settling */
+                               ret = ENODEV;
+                               break;
+                       }
+
+                       if (ret == 0) {
+                               if (settle == 0) {
+                                       settle = gethrtime();
+                               } else if (NSEC2MSEC(gethrtime() - settle) >=
+                                   settle_ms) {
+                                       udev_device_unref(dev);
+                                       break; /* settled; ret is 0 */
+                               }
+                       }
+               }
+
+               udev_device_unref(dev); /* dev may be NULL; TODO confirm ok */
+               (void) usleep(sleep_ms * MILLISEC);
+
+       } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+       udev_unref(udev);
+
+       return (ret);
+}
+
+
 /*
  * Encode the persistent devices strings
  * used for the vdev disk label
@@ -414,6 +494,41 @@ is_mpath_whole_disk(const char *path)
        return (B_FALSE);
 }
 
+/*
+ * Wait up to timeout_ms for udev to set up the device node.  The device is
+ * considered ready when the provided path has been verified to exist and
+ * it has been allowed to settle.  At this point the device can be accessed
+ * reliably.  Depending on the complexity of the udev rules this
+ * process could take several seconds.
+ *
+ * Returns 0 once stat64() on the path has succeeded on passes at least
+ * settle_ms apart, the stat64() errno for any failure other than ENOENT,
+ * and ENODEV if the timeout expires first.
+ *
+ * NOTE(review): the settle timestamp is not cleared when the path goes
+ * missing again (ENOENT) after having been seen -- confirm that a
+ * disappearing and reappearing link should not restart the settle period.
+ */
+int
+zpool_label_disk_wait(char *path, int timeout_ms)
+{
+       int settle_ms = 50; /* path must have been visible this long */
+       long sleep_ms = 10; /* delay between polling passes */
+       hrtime_t start, settle;
+       struct stat64 statbuf;
+
+       start = gethrtime();
+       settle = 0; /* 0 means the path has not been seen yet */
+
+       do {
+               errno = 0;
+               if ((stat64(path, &statbuf) == 0) && (errno == 0)) {
+                       if (settle == 0)
+                               settle = gethrtime();
+                       else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms)
+                               return (0);
+               } else if (errno != ENOENT) {
+                       return (errno); /* only ENOENT is retried */
+               }
+
+               usleep(sleep_ms * MILLISEC);
+       } while (NSEC2MSEC(gethrtime() - start) < timeout_ms);
+
+       return (ENODEV);
+}
+
 void
 update_vdev_config_dev_strs(nvlist_t *nv)
 {
index 214c57ab4b46b783c1642dd9aeaf369cc0584211..c405abe3edf13c9d3b349715ab2d8ac223eb1460 100644 (file)
@@ -4122,30 +4122,7 @@ find_start_block(nvlist_t *config)
        return (MAXOFFSET_T);
 }
 
-int
-zpool_label_disk_wait(char *path, int timeout)
-{
-       struct stat64 statbuf;
-       int i;
-
-       /*
-        * Wait timeout miliseconds for a newly created device to be available
-        * from the given path.  There is a small window when a /dev/ device
-        * will exist and the udev link will not, so we must wait for the
-        * symlink.  Depending on the udev rules this may take a few seconds.
-        */
-       for (i = 0; i < timeout; i++) {
-               usleep(1000);
-
-               errno = 0;
-               if ((stat64(path, &statbuf) == 0) && (errno == 0))
-                       return (0);
-       }
-
-       return (ENOENT);
-}
-
-int
+static int
 zpool_label_disk_check(char *path)
 {
        struct dk_gpt *vtoc;
@@ -4310,12 +4287,11 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
        (void) close(fd);
        efi_free(vtoc);
 
-       /* Wait for the first expected partition to appear. */
-
        (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name);
        (void) zfs_append_partition(path, MAXPATHLEN);
 
-       rval = zpool_label_disk_wait(path, 3000);
+       /* Wait for udev to signal that the device has settled. */
+       rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT);
        if (rval) {
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to "
                    "detect device partitions on '%s': %d"), path, rval);
index cdb8f78e27884d1d363746291a06c3026f99bd5e..9b51ecc1d9687cd8bd805cfa2fb3de27c02adbfa 100644 (file)
@@ -244,12 +244,12 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 {
        struct block_device *bdev = ERR_PTR(-ENXIO);
        vdev_disk_t *vd;
-       int mode, block_size;
+       int count = 0, mode, block_size;
 
        /* Must have a pathname and it must be absolute. */
        if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
                v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-               return (EINVAL);
+               return (SET_ERROR(EINVAL));
        }
 
        /*
@@ -264,7 +264,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 
        vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
        if (vd == NULL)
-               return (ENOMEM);
+               return (SET_ERROR(ENOMEM));
 
        /*
         * Devices are always opened by the path provided at configuration
@@ -279,16 +279,35 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
         * /dev/[hd]d devices which may be reordered due to probing order.
         * Devices in the wrong locations will be detected by the higher
         * level vdev validation.
+        *
+        * The specified paths may be briefly removed and recreated in
+        * response to udev events.  This should be exceptionally unlikely
+        * because the zpool command makes every effort to verify these paths
+        * have already settled prior to reaching this point.  Therefore,
+        * an ENOENT failure at this point is highly likely to be transient
+        * and it is reasonable to sleep and retry before giving up.  In
+        * practice delays have been observed to be on the order of 100ms.
         */
        mode = spa_mode(v->vdev_spa);
        if (v->vdev_wholedisk && v->vdev_expanding)
                bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
-       if (IS_ERR(bdev))
+
+       while (IS_ERR(bdev) && count < 50) {
                bdev = vdev_bdev_open(v->vdev_path,
                    vdev_bdev_mode(mode), zfs_vdev_holder);
+               if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+                       msleep(10);
+                       count++;
+               } else if (IS_ERR(bdev)) {
+                       break;
+               }
+       }
+
        if (IS_ERR(bdev)) {
+               dprintf("failed open v->vdev_path=%s, error=%d count=%d\n",
+                   v->vdev_path, -PTR_ERR(bdev), count);
                kmem_free(vd, sizeof (vdev_disk_t));
-               return (-PTR_ERR(bdev));
+               return (SET_ERROR(-PTR_ERR(bdev)));
        }
 
        v->vdev_tsd = vd;