git.proxmox.com Git - ceph.git/blobdiff - ceph/src/spdk/lib/env_dpdk/init.c
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / lib / env_dpdk / init.c
index e087e14790a85fa01a49e475cba3421168a3e582..0376dbe7b6de3adefe786577e624736e55a5616d 100644 (file)
@@ -40,6 +40,8 @@
 
 #include <rte_config.h>
 #include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_vfio.h>
 
 #define SPDK_ENV_DPDK_DEFAULT_NAME             "spdk"
 #define SPDK_ENV_DPDK_DEFAULT_SHM_ID           -1
@@ -47,6 +49,7 @@
 #define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE      -1
 #define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL      -1
 #define SPDK_ENV_DPDK_DEFAULT_CORE_MASK                "0x1"
+#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR    0x200000000000
 
 static char **g_eal_cmdline;
 static int g_eal_cmdline_argcount;
@@ -102,23 +105,6 @@ _sprintf_alloc(const char *format, ...)
        return NULL;
 }
 
-static void
-spdk_env_unlink_shared_files(void)
-{
-       /* Starting with DPDK 18.05, there are more files with unpredictable paths
-        * and filenames. The --no-shconf option prevents from creating them, but
-        * only for DPDK 18.08+. For DPDK 18.05 we just leave them be.
-        */
-#if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0)
-       char buffer[PATH_MAX];
-
-       snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid());
-       if (unlink(buffer)) {
-               fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno);
-       }
-#endif
-}
-
 void
 spdk_env_opts_init(struct spdk_env_opts *opts)
 {
@@ -134,13 +120,18 @@ spdk_env_opts_init(struct spdk_env_opts *opts)
        opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
        opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
        opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
+       opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
 }
 
 static void
-spdk_free_args(char **args, int argcount)
+free_args(char **args, int argcount)
 {
        int i;
 
+       if (args == NULL) {
+               return;
+       }
+
        for (i = 0; i < argcount; i++) {
                free(args[i]);
        }
@@ -151,20 +142,20 @@ spdk_free_args(char **args, int argcount)
 }
 
 static char **
-spdk_push_arg(char *args[], int *argcount, char *arg)
+push_arg(char *args[], int *argcount, char *arg)
 {
        char **tmp;
 
        if (arg == NULL) {
                fprintf(stderr, "%s: NULL arg supplied\n", __func__);
-               spdk_free_args(args, *argcount);
+               free_args(args, *argcount);
                return NULL;
        }
 
        tmp = realloc(args, sizeof(char *) * (*argcount + 1));
        if (tmp == NULL) {
                free(arg);
-               spdk_free_args(args, *argcount);
+               free_args(args, *argcount);
                return NULL;
        }
 
@@ -174,8 +165,77 @@ spdk_push_arg(char *args[], int *argcount, char *arg)
        return tmp;
 }
 
+#if defined(__linux__) && defined(__x86_64__)
+
+/* TODO: Can likely get this value from rlimits in the future */
+#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
+#define VTD_CAP_MGAW_SHIFT 16
+#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)
+
+static int
+get_iommu_width(void)
+{
+       DIR *dir;
+       FILE *file;
+       struct dirent *entry;
+       char mgaw_path[64];
+       char buf[64];
+       char *end;
+       long long int val;
+       int width, tmp;
+
+       dir = opendir("/sys/devices/virtual/iommu/");
+       if (dir == NULL) {
+               return -EINVAL;
+       }
+
+       width = 0;
+
+       while ((entry = readdir(dir)) != NULL) {
+               /* Find directories named "dmar0", "dmar1", etc */
+               if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
+                       continue;
+               }
+
+               tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
+                              entry->d_name);
+               if ((unsigned)tmp >= sizeof(mgaw_path)) {
+                       continue;
+               }
+
+               file = fopen(mgaw_path, "r");
+               if (file == NULL) {
+                       continue;
+               }
+
+               if (fgets(buf, sizeof(buf), file) == NULL) {
+                       fclose(file);
+                       continue;
+               }
+
+               val = strtoll(buf, &end, 16);
+               if (val == LLONG_MIN || val == LLONG_MAX) {
+                       fclose(file);
+                       continue;
+               }
+
+               tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
+               if (width == 0 || tmp < width) {
+                       width = tmp;
+               }
+
+               fclose(file);
+       }
+
+       closedir(dir);
+
+       return width;
+}
+
+#endif
+
 static int
-spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
+build_eal_cmdline(const struct spdk_env_opts *opts)
 {
        int argcount = 0;
        char **args;
@@ -183,14 +243,14 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
        args = NULL;
 
        /* set the program name */
-       args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
+       args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
        if (args == NULL) {
                return -1;
        }
 
        /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
        if (opts->shm_id < 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
+               args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
                if (args == NULL) {
                        return -1;
                }
@@ -201,13 +261,17 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
         */
        if (opts->core_mask[0] == '[') {
                char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
-               int len = strlen(l_arg);
-               if (l_arg[len - 1] == ']') {
-                       l_arg[len - 1] = '\0';
+
+               if (l_arg != NULL) {
+                       int len = strlen(l_arg);
+
+                       if (l_arg[len - 1] == ']') {
+                               l_arg[len - 1] = '\0';
+                       }
                }
-               args = spdk_push_arg(args, &argcount, l_arg);
+               args = push_arg(args, &argcount, l_arg);
        } else {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
+               args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
        }
 
        if (args == NULL) {
@@ -216,7 +280,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* set the memory channel number */
        if (opts->mem_channel > 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
+               args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
                if (args == NULL) {
                        return -1;
                }
@@ -224,7 +288,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* set the memory size */
        if (opts->mem_size >= 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
+               args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
                if (args == NULL) {
                        return -1;
                }
@@ -232,8 +296,8 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* set the master core */
        if (opts->master_core > 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
-                                    opts->master_core));
+               args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
+                               opts->master_core));
                if (args == NULL) {
                        return -1;
                }
@@ -241,7 +305,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* set no pci  if enabled */
        if (opts->no_pci) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
+               args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
                if (args == NULL) {
                        return -1;
                }
@@ -249,7 +313,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* create just one hugetlbfs file */
        if (opts->hugepage_single_segments) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
+               args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
                if (args == NULL) {
                        return -1;
                }
@@ -257,7 +321,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* unlink hugepages after initialization */
        if (opts->unlink_hugepage) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
+               args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
                if (args == NULL) {
                        return -1;
                }
@@ -265,22 +329,12 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
        /* use a specific hugetlbfs mount */
        if (opts->hugedir) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
+               args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
                if (args == NULL) {
                        return -1;
                }
        }
 
-#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0)
-       /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */
-       if (!opts->env_context || strcmp(opts->env_context, "--legacy-mem") != 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem"));
-               if (args == NULL) {
-                       return -1;
-               }
-       }
-#endif
-
        if (opts->num_pci_addr) {
                size_t i;
                char bdf[32];
@@ -289,9 +343,9 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 
                for (i = 0; i < opts->num_pci_addr; i++) {
                        spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
-                       args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s",
-                                            (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
-                                            bdf));
+                       args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
+                                       (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
+                                       bdf));
                        if (args == NULL) {
                                return -1;
                        }
@@ -301,7 +355,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
        /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
         * This can be overridden by specifying the same option in opts->env_context
         */
-       args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
+       args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
        if (args == NULL) {
                return -1;
        }
@@ -309,7 +363,7 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
        /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
         * This can be overridden by specifying the same option in opts->env_context
         */
-       args = spdk_push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
+       args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
        if (args == NULL) {
                return -1;
        }
@@ -319,26 +373,66 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
         * of other DPDK libs, but none of which we make use right now. If necessary, this can
         * be overridden via opts->env_context.
         */
-       args = spdk_push_arg(args, &argcount, strdup("--log-level=user1:6"));
+       args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
        if (args == NULL) {
                return -1;
        }
 
        if (opts->env_context) {
-               args = spdk_push_arg(args, &argcount, strdup(opts->env_context));
+               args = push_arg(args, &argcount, strdup(opts->env_context));
                if (args == NULL) {
                        return -1;
                }
        }
 
 #ifdef __linux__
+
+       if (opts->iova_mode) {
+               args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
+               if (args == NULL) {
+                       return -1;
+               }
+       } else {
+               /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
+                * but DPDK guesses it should be iova-mode=va. Add a check and force
+                * iova-mode=pa here. */
+               if (rte_vfio_noiommu_is_enabled()) {
+                       args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+                       if (args == NULL) {
+                               return -1;
+                       }
+               }
+
+#if defined(__x86_64__)
+               /* DPDK by default guesses that it should be using iova-mode=va so that it can
+                * support running as an unprivileged user. However, some systems (especially
+                * virtual machines) don't have an IOMMU capable of handling the full virtual
+                * address space and DPDK doesn't currently catch that. Add a check in SPDK
+                * and force iova-mode=pa here. */
+               if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
+                       args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+                       if (args == NULL) {
+                               return -1;
+                       }
+               }
+#elif defined(__PPC64__)
+               /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
+                * auto-detect at the moment, so we'll just force it here. */
+               args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+               if (args == NULL) {
+                       return -1;
+               }
+#endif
+       }
+
+
        /* Set the base virtual address - it must be an address that is not in the
         * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
         * mmap hint.
         *
         * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
         */
-       args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000"));
+       args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
        if (args == NULL) {
                return -1;
        }
@@ -349,8 +443,8 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
         * the memory for a buffer over two allocations meaning the buffer will be split over a memory region.
         */
 #if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
-       if (!opts->env_context || strcmp(opts->env_context, "--legacy-mem") != 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
+       if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
+               args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
                if (args == NULL) {
                        return -1;
                }
@@ -358,20 +452,20 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 #endif
 
        if (opts->shm_id < 0) {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
-                                    getpid()));
+               args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
+                               getpid()));
                if (args == NULL) {
                        return -1;
                }
        } else {
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
-                                    opts->shm_id));
+               args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
+                               opts->shm_id));
                if (args == NULL) {
                        return -1;
                }
 
                /* set the process type */
-               args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
+               args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
                if (args == NULL) {
                        return -1;
                }
@@ -384,17 +478,22 @@ spdk_build_eal_cmdline(const struct spdk_env_opts *opts)
 }
 
 int
-spdk_env_dpdk_post_init(void)
+spdk_env_dpdk_post_init(bool legacy_mem)
 {
-       spdk_pci_init();
+       int rc;
 
-       if (spdk_mem_map_init() < 0) {
+       pci_env_init();
+
+       rc = mem_map_init(legacy_mem);
+       if (rc < 0) {
                fprintf(stderr, "Failed to allocate mem_map\n");
-               return -1;
+               return rc;
        }
-       if (spdk_vtophys_init() < 0) {
+
+       rc = vtophys_init();
+       if (rc < 0) {
                fprintf(stderr, "Failed to initialize vtophys\n");
-               return -1;
+               return rc;
        }
 
        return 0;
@@ -403,9 +502,11 @@ spdk_env_dpdk_post_init(void)
 void
 spdk_env_dpdk_post_fini(void)
 {
-       spdk_pci_fini();
+       pci_env_fini();
 
-       spdk_free_args(g_eal_cmdline, g_eal_cmdline_argcount);
+       free_args(g_eal_cmdline, g_eal_cmdline_argcount);
+       g_eal_cmdline = NULL;
+       g_eal_cmdline_argcount = 0;
 }
 
 int
@@ -414,13 +515,32 @@ spdk_env_init(const struct spdk_env_opts *opts)
        char **dpdk_args = NULL;
        int i, rc;
        int orig_optind;
+       bool legacy_mem;
 
-       g_external_init = false;
+       /* If SPDK env has been initialized before, then only pci env requires
+        * reinitialization.
+        */
+       if (g_external_init == false) {
+               if (opts != NULL) {
+                       fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
+                       return -EINVAL;
+               }
+
+               printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
+               pci_env_reinit();
+
+               return 0;
+       }
 
-       rc = spdk_build_eal_cmdline(opts);
+       if (opts == NULL) {
+               fprintf(stderr, "NULL arguments to initialize DPDK\n");
+               return -EINVAL;
+       }
+
+       rc = build_eal_cmdline(opts);
        if (rc < 0) {
                fprintf(stderr, "Invalid arguments to initialize DPDK\n");
-               return -1;
+               return -EINVAL;
        }
 
        printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
@@ -437,7 +557,7 @@ spdk_env_init(const struct spdk_env_opts *opts)
        dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
        if (dpdk_args == NULL) {
                fprintf(stderr, "Failed to allocate dpdk_args\n");
-               return -1;
+               return -ENOMEM;
        }
        memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
 
@@ -450,22 +570,25 @@ spdk_env_init(const struct spdk_env_opts *opts)
        free(dpdk_args);
 
        if (rc < 0) {
-               fprintf(stderr, "Failed to initialize DPDK\n");
-               return -1;
+               if (rte_errno == EALREADY) {
+                       fprintf(stderr, "DPDK already initialized\n");
+               } else {
+                       fprintf(stderr, "Failed to initialize DPDK\n");
+               }
+               return -rte_errno;
        }
 
-       if (opts->shm_id < 0 && !opts->hugepage_single_segments) {
-               /*
-                * Unlink hugepage and config info files after init.  This will ensure they get
-                *  deleted on app exit, even if the app crashes and does not exit normally.
-                *  Only do this when not in multi-process mode, since for multi-process other
-                *  apps will need to open these files. These files are not created for
-                *  "single file segments".
-                */
-               spdk_env_unlink_shared_files();
+       legacy_mem = false;
+       if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
+               legacy_mem = true;
+       }
+
+       rc = spdk_env_dpdk_post_init(legacy_mem);
+       if (rc == 0) {
+               g_external_init = false;
        }
 
-       return spdk_env_dpdk_post_init();
+       return rc;
 }
 
 void