2 * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
21 #include <sys/types.h>
25 #include <rte_cpuflags.h>
26 #include <rte_errno.h>
28 #include <rte_memzone.h>
29 #include <rte_version.h>
32 #include "fatal-signal.h"
33 #include "netdev-dpdk.h"
34 #include "netdev-offload-provider.h"
35 #include "openvswitch/dynamic-string.h"
36 #include "openvswitch/vlog.h"
42 #include "vswitch-idl.h"
44 VLOG_DEFINE_THIS_MODULE(dpdk
);
46 static FILE *log_stream
= NULL
; /* Stream for DPDK log redirection */
48 static char *vhost_sock_dir
= NULL
; /* Location of vhost-user sockets */
49 static bool vhost_iommu_enabled
= false; /* Status of vHost IOMMU support */
50 static bool vhost_postcopy_enabled
= false; /* Status of vHost POSTCOPY
52 static bool dpdk_initialized
= false; /* Indicates successful initialization
54 static bool per_port_memory
= false; /* Status of per port memory support */
57 process_vhost_flags(char *flag
, const char *default_val
, int size
,
58 const struct smap
*ovs_other_config
,
64 val
= smap_get(ovs_other_config
, flag
);
66 /* Process the vhost-sock-dir flag if it is provided, otherwise resort to
69 if (val
&& (strlen(val
) <= size
)) {
71 *new_val
= xstrdup(val
);
72 VLOG_INFO("User-provided %s in use: %s", flag
, *new_val
);
74 VLOG_INFO("No %s provided - defaulting to %s", flag
, default_val
);
75 *new_val
= xstrdup(default_val
);
82 args_contains(const struct svec
*args
, const char *value
)
87 /* We can't just use 'svec_contains' because args are not sorted. */
88 SVEC_FOR_EACH (i
, arg
, args
) {
89 if (!strcmp(arg
, value
)) {
97 construct_dpdk_options(const struct smap
*ovs_other_config
, struct svec
*args
)
99 struct dpdk_options_map
{
100 const char *ovs_configuration
;
101 const char *dpdk_option
;
102 bool default_enabled
;
103 const char *default_value
;
105 {"dpdk-lcore-mask", "-c", false, NULL
},
106 {"dpdk-hugepage-dir", "--huge-dir", false, NULL
},
107 {"dpdk-socket-limit", "--socket-limit", false, NULL
},
112 /* First, construct from the flat-options (non-mutex). */
113 for (i
= 0; i
< ARRAY_SIZE(opts
); ++i
) {
114 const char *value
= smap_get(ovs_other_config
,
115 opts
[i
].ovs_configuration
);
116 if (!value
&& opts
[i
].default_enabled
) {
117 value
= opts
[i
].default_value
;
121 if (!args_contains(args
, opts
[i
].dpdk_option
)) {
122 svec_add(args
, opts
[i
].dpdk_option
);
123 svec_add(args
, value
);
125 VLOG_WARN("Ignoring database defined option '%s' due to "
126 "dpdk-extra config", opts
[i
].dpdk_option
);
133 construct_dpdk_socket_mem(void)
135 const char *def_value
= "1024";
136 int numa
, numa_nodes
= ovs_numa_get_n_numas();
137 struct ds dpdk_socket_mem
= DS_EMPTY_INITIALIZER
;
139 if (numa_nodes
== 0 || numa_nodes
== OVS_NUMA_UNSPEC
) {
143 ds_put_cstr(&dpdk_socket_mem
, def_value
);
144 for (numa
= 1; numa
< numa_nodes
; ++numa
) {
145 ds_put_format(&dpdk_socket_mem
, ",%s", def_value
);
148 return ds_cstr(&dpdk_socket_mem
);
151 #define MAX_DPDK_EXCL_OPTS 10
154 construct_dpdk_mutex_options(const struct smap
*ovs_other_config
,
157 char *default_dpdk_socket_mem
= construct_dpdk_socket_mem();
159 struct dpdk_exclusive_options_map
{
160 const char *category
;
161 const char *ovs_dpdk_options
[MAX_DPDK_EXCL_OPTS
];
162 const char *eal_dpdk_options
[MAX_DPDK_EXCL_OPTS
];
163 const char *default_value
;
167 {"dpdk-alloc-mem", "dpdk-socket-mem", NULL
,},
168 {"-m", "--socket-mem", NULL
,},
169 default_dpdk_socket_mem
, 1
174 for (i
= 0; i
< ARRAY_SIZE(excl_opts
); ++i
) {
175 int found_opts
= 0, scan
, found_pos
= -1;
176 const char *found_value
;
177 struct dpdk_exclusive_options_map
*popt
= &excl_opts
[i
];
179 for (scan
= 0; scan
< MAX_DPDK_EXCL_OPTS
180 && popt
->ovs_dpdk_options
[scan
]; ++scan
) {
181 const char *value
= smap_get(ovs_other_config
,
182 popt
->ovs_dpdk_options
[scan
]);
183 if (value
&& strlen(value
)) {
191 if (popt
->default_option
) {
192 found_pos
= popt
->default_option
;
193 found_value
= popt
->default_value
;
199 if (found_opts
> 1) {
200 VLOG_ERR("Multiple defined options for %s. Please check your"
201 " database settings and reconfigure if necessary.",
205 if (!args_contains(args
, popt
->eal_dpdk_options
[found_pos
])) {
206 svec_add(args
, popt
->eal_dpdk_options
[found_pos
]);
207 svec_add(args
, found_value
);
209 VLOG_WARN("Ignoring database defined option '%s' due to "
210 "dpdk-extra config", popt
->eal_dpdk_options
[found_pos
]);
214 free(default_dpdk_socket_mem
);
218 construct_dpdk_args(const struct smap
*ovs_other_config
, struct svec
*args
)
220 const char *extra_configuration
= smap_get(ovs_other_config
, "dpdk-extra");
222 if (extra_configuration
) {
223 svec_parse_words(args
, extra_configuration
);
226 construct_dpdk_options(ovs_other_config
, args
);
227 construct_dpdk_mutex_options(ovs_other_config
, args
);
231 dpdk_log_write(void *c OVS_UNUSED
, const char *buf
, size_t size
)
233 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(600, 600);
234 static struct vlog_rate_limit dbg_rl
= VLOG_RATE_LIMIT_INIT(600, 600);
236 switch (rte_log_cur_msg_loglevel()) {
238 VLOG_DBG_RL(&dbg_rl
, "%.*s", (int) size
, buf
);
242 VLOG_INFO_RL(&rl
, "%.*s", (int) size
, buf
);
244 case RTE_LOG_WARNING
:
245 VLOG_WARN_RL(&rl
, "%.*s", (int) size
, buf
);
248 VLOG_ERR_RL(&rl
, "%.*s", (int) size
, buf
);
253 VLOG_EMER("%.*s", (int) size
, buf
);
262 static cookie_io_functions_t dpdk_log_func
= {
263 .write
= dpdk_log_write
,
267 dpdk_unixctl_mem_stream(struct unixctl_conn
*conn
, int argc OVS_UNUSED
,
268 const char *argv
[] OVS_UNUSED
, void *aux
)
270 void (*callback
)(FILE *) = aux
;
271 char *response
= NULL
;
275 stream
= open_memstream(&response
, &size
);
277 response
= xasprintf("Unable to open memstream: %s.",
278 ovs_strerror(errno
));
279 unixctl_command_reply_error(conn
, response
);
285 unixctl_command_reply(conn
, response
);
291 dpdk_parse_log_level(const char *s
)
293 static const char * const levels
[] = {
294 [RTE_LOG_EMERG
] = "emergency",
295 [RTE_LOG_ALERT
] = "alert",
296 [RTE_LOG_CRIT
] = "critical",
297 [RTE_LOG_ERR
] = "error",
298 [RTE_LOG_WARNING
] = "warning",
299 [RTE_LOG_NOTICE
] = "notice",
300 [RTE_LOG_INFO
] = "info",
301 [RTE_LOG_DEBUG
] = "debug",
305 for (i
= 1; i
< ARRAY_SIZE(levels
); ++i
) {
306 if (!strcmp(s
, levels
[i
])) {
314 dpdk_unixctl_log_set(struct unixctl_conn
*conn
, int argc
, const char *argv
[],
315 void *aux OVS_UNUSED
)
319 /* With no argument, set the level of all components to 'debug'. */
321 rte_log_set_level_pattern("*", RTE_LOG_DEBUG
);
323 for (i
= 1; i
< argc
; i
++) {
324 char *err_msg
= NULL
;
330 s
= xstrdup(argv
[i
]);
331 level_string
= strchr(s
, ':');
332 if (level_string
== NULL
) {
337 level_string
[0] = '\0';
341 level
= dpdk_parse_log_level(level_string
);
343 err_msg
= xasprintf("invalid log level: '%s'", level_string
);
344 } else if (rte_log_set_level_pattern(pattern
, level
) < 0) {
345 err_msg
= xasprintf("cannot set log level for '%s'", argv
[i
]);
349 unixctl_command_reply_error(conn
, err_msg
);
356 unixctl_command_reply(conn
, NULL
);
360 dpdk_init__(const struct smap
*ovs_other_config
)
362 char *sock_dir_subcomponent
;
365 bool auto_determine
= true;
367 struct ovs_numa_dump
*affinity
= NULL
;
368 struct svec args
= SVEC_EMPTY_INITIALIZER
;
370 log_stream
= fopencookie(NULL
, "w+", dpdk_log_func
);
371 if (log_stream
== NULL
) {
372 VLOG_ERR("Can't redirect DPDK log: %s.", ovs_strerror(errno
));
374 setbuf(log_stream
, NULL
);
375 rte_openlog_stream(log_stream
);
378 if (process_vhost_flags("vhost-sock-dir", ovs_rundir(),
379 NAME_MAX
, ovs_other_config
,
380 &sock_dir_subcomponent
)) {
382 if (!strstr(sock_dir_subcomponent
, "..")) {
383 vhost_sock_dir
= xasprintf("%s/%s", ovs_rundir(),
384 sock_dir_subcomponent
);
386 err
= stat(vhost_sock_dir
, &s
);
388 VLOG_ERR("vhost-user sock directory '%s' does not exist.",
392 vhost_sock_dir
= xstrdup(ovs_rundir());
393 VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid"
394 "characters '..' - using %s instead.",
395 ovs_rundir(), sock_dir_subcomponent
, ovs_rundir());
397 free(sock_dir_subcomponent
);
399 vhost_sock_dir
= sock_dir_subcomponent
;
402 vhost_iommu_enabled
= smap_get_bool(ovs_other_config
,
403 "vhost-iommu-support", false);
404 VLOG_INFO("IOMMU support for vhost-user-client %s.",
405 vhost_iommu_enabled
? "enabled" : "disabled");
407 vhost_postcopy_enabled
= smap_get_bool(ovs_other_config
,
408 "vhost-postcopy-support", false);
409 if (vhost_postcopy_enabled
&& memory_locked()) {
410 VLOG_WARN("vhost-postcopy-support and mlockall are not compatible.");
411 vhost_postcopy_enabled
= false;
413 VLOG_INFO("POSTCOPY support for vhost-user-client %s.",
414 vhost_postcopy_enabled
? "enabled" : "disabled");
416 per_port_memory
= smap_get_bool(ovs_other_config
,
417 "per-port-memory", false);
418 VLOG_INFO("Per port memory for DPDK devices %s.",
419 per_port_memory
? "enabled" : "disabled");
421 svec_add(&args
, ovs_get_program_name());
422 construct_dpdk_args(ovs_other_config
, &args
);
424 if (!args_contains(&args
, "--legacy-mem")
425 && !args_contains(&args
, "--socket-limit")) {
429 SVEC_FOR_EACH (i
, arg
, &args
) {
430 if (!strcmp(arg
, "--socket-mem")) {
434 if (i
< args
.n
- 1) {
435 svec_add(&args
, "--socket-limit");
436 svec_add(&args
, args
.names
[i
+ 1]);
440 if (args_contains(&args
, "-c") || args_contains(&args
, "-l")) {
441 auto_determine
= false;
445 * NOTE: This is an unsophisticated mechanism for determining the DPDK
448 if (auto_determine
) {
449 const struct ovs_numa_info_core
*core
;
452 /* Get the main thread affinity */
453 affinity
= ovs_numa_thread_getaffinity_dump();
456 FOR_EACH_CORE_ON_DUMP (core
, affinity
) {
457 if (cpu
> core
->core_id
) {
462 /* User did not set dpdk-lcore-mask and we were unable to get the
463 * current thread affinity - default to core #0. */
464 VLOG_ERR("Thread getaffinity failed. Using core #0");
466 svec_add(&args
, "-l");
467 svec_add_nocopy(&args
, xasprintf("%d", cpu
));
470 svec_terminate(&args
);
474 if (VLOG_IS_INFO_ENABLED()) {
475 struct ds eal_args
= DS_EMPTY_INITIALIZER
;
476 char *joined_args
= svec_join(&args
, " ", ".");
478 ds_put_format(&eal_args
, "EAL ARGS: %s", joined_args
);
479 VLOG_INFO("%s", ds_cstr_ro(&eal_args
));
480 ds_destroy(&eal_args
);
484 /* Copy because 'rte_eal_init' will change the argv, i.e. it will remove
485 * some arguments from it. '+1' to copy the terminating NULL. */
486 argv
= xmemdup(args
.names
, (args
.n
+ 1) * sizeof args
.names
[0]);
488 /* Make sure things are initialized ... */
489 result
= rte_eal_init(args
.n
, argv
);
494 /* Set the main thread affinity back to pre rte_eal_init() value */
496 ovs_numa_thread_setaffinity_dump(affinity
);
497 ovs_numa_dump_destroy(affinity
);
501 VLOG_EMER("Unable to initialize DPDK: %s", ovs_strerror(rte_errno
));
505 if (VLOG_IS_DBG_ENABLED()) {
507 char *response
= NULL
;
508 FILE *stream
= open_memstream(&response
, &size
);
511 fprintf(stream
, "rte_memzone_dump:\n");
512 rte_memzone_dump(stream
);
513 fprintf(stream
, "rte_log_dump:\n");
514 rte_log_dump(stream
);
516 VLOG_DBG("%s", response
);
519 VLOG_DBG("Could not dump memzone and log levels. "
520 "Unable to open memstream: %s.", ovs_strerror(errno
));
524 unixctl_command_register("dpdk/log-list", "", 0, 0,
525 dpdk_unixctl_mem_stream
, rte_log_dump
);
526 unixctl_command_register("dpdk/log-set", "{level | pattern:level}", 0,
527 INT_MAX
, dpdk_unixctl_log_set
, NULL
);
529 /* We are called from the main thread here */
530 RTE_PER_LCORE(_lcore_id
) = NON_PMD_CORE_ID
;
532 /* Finally, register the dpdk classes */
533 netdev_dpdk_register();
534 netdev_register_flow_api_provider(&netdev_offload_dpdk
);
539 dpdk_init(const struct smap
*ovs_other_config
)
541 static bool enabled
= false;
543 if (enabled
|| !ovs_other_config
) {
547 const char *dpdk_init_val
= smap_get_def(ovs_other_config
, "dpdk-init",
550 bool try_only
= !strcasecmp(dpdk_init_val
, "try");
551 if (!strcasecmp(dpdk_init_val
, "true") || try_only
) {
552 static struct ovsthread_once once_enable
= OVSTHREAD_ONCE_INITIALIZER
;
554 if (ovsthread_once_start(&once_enable
)) {
555 VLOG_INFO("Using %s", rte_version());
556 VLOG_INFO("DPDK Enabled - initializing...");
557 enabled
= dpdk_init__(ovs_other_config
);
559 VLOG_INFO("DPDK Enabled - initialized");
560 } else if (!try_only
) {
561 ovs_abort(rte_errno
, "Cannot init EAL");
563 ovsthread_once_done(&once_enable
);
565 VLOG_ERR_ONCE("DPDK Initialization Failed.");
568 VLOG_INFO_ONCE("DPDK Disabled - Use other_config:dpdk-init to enable");
570 dpdk_initialized
= enabled
;
574 dpdk_get_vhost_sock_dir(void)
576 return vhost_sock_dir
;
580 dpdk_vhost_iommu_enabled(void)
582 return vhost_iommu_enabled
;
586 dpdk_vhost_postcopy_enabled(void)
588 return vhost_postcopy_enabled
;
592 dpdk_per_port_memory(void)
594 return per_port_memory
;
600 return dpdk_initialized
;
604 dpdk_set_lcore_id(unsigned cpu
)
606 /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
607 ovs_assert(cpu
!= NON_PMD_CORE_ID
);
608 RTE_PER_LCORE(_lcore_id
) = cpu
;
612 print_dpdk_version(void)
617 #define CHECK_CPU_FEATURE(feature, name_str, RTE_CPUFLAG) \
619 if (strncmp(feature, name_str, strlen(name_str)) == 0) { \
620 int has_isa = rte_cpu_get_flag_enabled(RTE_CPUFLAG); \
621 VLOG_DBG("CPU flag %s, available %s\n", name_str, \
622 has_isa ? "yes" : "no"); \
628 dpdk_get_cpu_has_isa(const char *arch
, const char *feature
)
630 /* Ensure Arch is x86_64. */
631 if (strncmp(arch
, "x86_64", 6) != 0) {
636 /* CPU flags are only defined for the architectures that support them. */
637 CHECK_CPU_FEATURE(feature
, "avx512f", RTE_CPUFLAG_AVX512F
);
638 CHECK_CPU_FEATURE(feature
, "bmi2", RTE_CPUFLAG_BMI2
);
641 VLOG_WARN("Unknown CPU arch,feature: %s,%s. Returning not supported.\n",
647 dpdk_status(const struct ovsrec_open_vswitch
*cfg
)
650 ovsrec_open_vswitch_set_dpdk_initialized(cfg
, dpdk_initialized
);
651 ovsrec_open_vswitch_set_dpdk_version(cfg
, rte_version());