/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2015 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <dlfcn.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <sys/mman.h>
#include <linux/rtnetlink.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_config.h>
#include <rte_eal_memconfig.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>

#include "mlx5.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_defs.h"
#include "mlx5_glue.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"

/* Device parameter to enable RX completion queue compression. */
#define MLX5_RXQ_CQE_COMP_EN "rxq_cqe_comp_en"

/* Device parameter to enable RX completion entry padding to 128B. */
#define MLX5_RXQ_CQE_PAD_EN "rxq_cqe_pad_en"

/* Device parameter to enable padding Rx packet to cacheline size. */
#define MLX5_RXQ_PKT_PAD_EN "rxq_pkt_pad_en"

/* Device parameter to enable Multi-Packet Rx queue. */
#define MLX5_RX_MPRQ_EN "mprq_en"

/* Device parameter to configure log 2 of the number of strides for MPRQ. */
#define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num"

/* Device parameter to limit the size of memcpy'd packet for MPRQ. */
#define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len"

/* Device parameter to set the minimum number of Rx queues to enable MPRQ. */
#define MLX5_RXQS_MIN_MPRQ "rxqs_min_mprq"

/* Device parameter to configure inline send. */
#define MLX5_TXQ_INLINE "txq_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling inline send.
 */
#define MLX5_TXQS_MIN_INLINE "txqs_min_inline"

/*
 * Device parameter to configure the number of TX queues threshold for
 * enabling vectorized Tx.
 */
#define MLX5_TXQS_MAX_VEC "txqs_max_vec"

/* Device parameter to enable multi-packet send WQEs. */
#define MLX5_TXQ_MPW_EN "txq_mpw_en"

/* Device parameter to include 2 dsegs in the title WQEBB. */
#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"

/* Device parameter to limit the size of inlining packet. */
#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"

/* Device parameter to enable hardware Tx vector. */
#define MLX5_TX_VEC_EN "tx_vec_en"

/* Device parameter to enable hardware Rx vector. */
#define MLX5_RX_VEC_EN "rx_vec_en"

/* Allow L3 VXLAN flow creation. */
#define MLX5_L3_VXLAN_EN "l3_vxlan_en"

/* Activate DV E-Switch flow steering. */
#define MLX5_DV_ESW_EN "dv_esw_en"

/* Activate DV flow steering. */
#define MLX5_DV_FLOW_EN "dv_flow_en"

/* Activate Netlink support in VF mode. */
#define MLX5_VF_NL_EN "vf_nl_en"

/* Enable extending memsegs when creating a MR. */
#define MLX5_MR_EXT_MEMSEG_EN "mr_ext_memseg_en"

/* Select port representors to instantiate. */
#define MLX5_REPRESENTOR "representor"

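/*
 * Note: the keys above are consumed as device arguments ("devargs")
 * appended to the PCI address on the EAL command line and parsed below
 * by mlx5_args()/mlx5_args_check(). A minimal illustrative invocation
 * (the PCI address is hypothetical):
 *
 *   testpmd -w 0000:03:00.0,rxq_cqe_comp_en=0,mprq_en=1 -- -i
 */
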
#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx5_shared_data *mlx5_shared_data;

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/** Driver-specific log messages type. */
int mlx5_logtype;

/** Data associated with devices to spawn. */
struct mlx5_dev_spawn_data {
	uint32_t ifindex; /**< Network interface index. */
	uint32_t max_port; /**< IB device maximal port index. */
	uint32_t ibv_port; /**< IB device physical port index. */
	struct mlx5_switch_info info; /**< Switch information. */
	struct ibv_device *ibv_dev; /**< Associated IB device. */
	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
	struct rte_pci_device *pci_dev; /**< Backend PCI device. */
};

static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER();
static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER;

/**
 * Allocate the shared IB device context. If there is a multiport device,
 * the master and representors will share this context; if there is a
 * single-port dedicated IB device, the context will be used only by the
 * given port, for the sake of unification.
 *
 * The routine first searches the list of contexts for the specified IB
 * device name; if found, the shared context is assumed and its reference
 * counter is incremented. If no context is found, a new one is created
 * and initialized with the specified IB device context and parameters.
 *
 * @param[in] spawn
 *   Pointer to the IB device attributes (name, port, etc).
 *
 * @return
 *   Pointer to mlx5_ibv_shared object on success,
 *   otherwise NULL and rte_errno is set.
 */
static struct mlx5_ibv_shared *
mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn)
{
	struct mlx5_ibv_shared *sh;
	int err = 0;
	uint32_t i;

	assert(spawn);
	/* Secondary process should not create the shared context. */
	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
	/* Search for IB context by device name. */
	LIST_FOREACH(sh, &mlx5_ibv_list, next) {
		if (!strcmp(sh->ibdev_name, spawn->ibv_dev->name)) {
			sh->refcnt++;
			goto exit;
		}
	}
	/* No device found, we have to create new shared context. */
	assert(spawn->max_port);
	sh = rte_zmalloc("ethdev shared ib context",
			 sizeof(struct mlx5_ibv_shared) +
			 spawn->max_port *
			 sizeof(struct mlx5_ibv_shared_port),
			 RTE_CACHE_LINE_SIZE);
	if (!sh) {
		DRV_LOG(ERR, "shared context allocation failure");
		rte_errno = ENOMEM;
		goto exit;
	}
	/* Try to open IB device with DV first, then usual Verbs. */
	errno = 0;
	sh->ctx = mlx5_glue->dv_open_device(spawn->ibv_dev);
	if (sh->ctx) {
		sh->devx = 1;
		DRV_LOG(DEBUG, "DevX is supported");
	} else {
		sh->ctx = mlx5_glue->open_device(spawn->ibv_dev);
		if (!sh->ctx) {
			err = errno ? errno : ENODEV;
			goto error;
		}
		DRV_LOG(DEBUG, "DevX is NOT supported");
	}
	err = mlx5_glue->query_device_ex(sh->ctx, NULL, &sh->device_attr);
	if (err) {
		DRV_LOG(DEBUG, "ibv_query_device_ex() failed");
		goto error;
	}
	sh->refcnt = 1;
	sh->max_port = spawn->max_port;
	strncpy(sh->ibdev_name, sh->ctx->device->name,
		sizeof(sh->ibdev_name));
	strncpy(sh->ibdev_path, sh->ctx->device->ibdev_path,
		sizeof(sh->ibdev_path));
	sh->pci_dev = spawn->pci_dev;
	pthread_mutex_init(&sh->intr_mutex, NULL);
	/*
	 * Setting port_id to the maximum disallowed value means
	 * there is no interrupt subhandler installed for
	 * the given port index i.
	 */
	for (i = 0; i < sh->max_port; i++)
		sh->port[i].ih_port_id = RTE_MAX_ETHPORTS;
	sh->pd = mlx5_glue->alloc_pd(sh->ctx);
	if (sh->pd == NULL) {
		DRV_LOG(ERR, "PD allocation failure");
		err = ENOMEM;
		goto error;
	}
	/*
	 * Once the device is added to the list of memory event
	 * callbacks, its global MR cache table cannot be expanded
	 * on the fly because of a deadlock. If it overflows, lookup
	 * should be done by searching the MR list linearly, which is slow.
	 *
	 * At this point the device is not added to the memory
	 * event list yet, the context is just being created.
	 */
	err = mlx5_mr_btree_init(&sh->mr.cache,
				 MLX5_MR_BTREE_CACHE_N * 2,
				 sh->pci_dev->device.numa_node);
	if (err) {
		err = rte_errno;
		goto error;
	}
	LIST_INSERT_HEAD(&mlx5_ibv_list, sh, next);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	return sh;
error:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
	assert(sh);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	rte_free(sh);
	assert(err > 0);
	rte_errno = err;
	return NULL;
}

/**
 * Free the shared IB device context. Decrement the reference counter and,
 * if it reaches zero, free all allocated resources and close handles.
 *
 * @param[in] sh
 *   Pointer to mlx5_ibv_shared object to free
 */
static void
mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh)
{
	pthread_mutex_lock(&mlx5_ibv_list_mutex);
#ifndef NDEBUG
	/* Check the object presence in the list. */
	struct mlx5_ibv_shared *lctx;

	LIST_FOREACH(lctx, &mlx5_ibv_list, next)
		if (lctx == sh)
			break;
	assert(lctx);
	if (lctx != sh) {
		DRV_LOG(ERR, "Freeing non-existing shared IB context");
		goto exit;
	}
#endif
	assert(sh);
	assert(sh->refcnt);
	/* Secondary process should not free the shared context. */
	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
	if (--sh->refcnt)
		goto exit;
	/* Release created Memory Regions. */
	mlx5_mr_release(sh);
	LIST_REMOVE(sh, next);
	/*
	 * Ensure there is no async event handler installed.
	 * Only primary process handles async device events.
	 */
	assert(!sh->intr_cnt);
	if (sh->intr_cnt)
		rte_intr_callback_unregister
			(&sh->intr_handle, mlx5_dev_interrupt_handler, sh);
	pthread_mutex_destroy(&sh->intr_mutex);
	if (sh->pd)
		claim_zero(mlx5_glue->dealloc_pd(sh->pd));
	if (sh->ctx)
		claim_zero(mlx5_glue->close_device(sh->ctx));
	rte_free(sh);
exit:
	pthread_mutex_unlock(&mlx5_ibv_list_mutex);
}

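/*
 * Usage sketch (derived from the two routines above): each port spawned
 * on the same IB device calls mlx5_alloc_shared_ibctx() once, and
 * mlx5_free_shared_ibctx() on close; only the release that drops the
 * reference counter to zero actually destroys the shared context.
 */
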
/**
 * Initialize DR related data within the private structure.
 * The routine checks the reference counter and does the actual
 * resource creation/initialization only if the counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
#ifdef HAVE_MLX5DV_DR
	struct mlx5_ibv_shared *sh = priv->sh;
	int err = 0;
	void *domain;

	assert(sh);
	if (sh->dv_refcnt) {
		/* Shared DV/DR structures are already initialized. */
		sh->dv_refcnt++;
		priv->dr_shared = 1;
		return 0;
	}
	/* Reference counter is zero, we should initialize structures. */
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
	if (!domain) {
		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	sh->rx_domain = domain;
	domain = mlx5_glue->dr_create_domain(sh->ctx,
					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
	if (!domain) {
		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
		err = errno;
		goto error;
	}
	pthread_mutex_init(&sh->dv_mutex, NULL);
	sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (priv->config.dv_esw_en) {
		domain = mlx5_glue->dr_create_domain
			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
		if (!domain) {
			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
			err = errno;
			goto error;
		}
		sh->fdb_domain = domain;
		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
	}
#endif
	sh->dv_refcnt++;
	priv->dr_shared = 1;
	return 0;

error:
	/* Rollback the created objects. */
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
	return err;
#else
	(void)priv;
	return 0;
#endif
}

/**
 * Destroy DR related data within the private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
static void
mlx5_free_shared_dr(struct mlx5_priv *priv)
{
#ifdef HAVE_MLX5DV_DR
	struct mlx5_ibv_shared *sh;

	if (!priv->dr_shared)
		return;
	priv->dr_shared = 0;
	sh = priv->sh;
	assert(sh);
	assert(sh->dv_refcnt);
	if (sh->dv_refcnt && --sh->dv_refcnt)
		return;
	if (sh->rx_domain) {
		mlx5_glue->dr_destroy_domain(sh->rx_domain);
		sh->rx_domain = NULL;
	}
	if (sh->tx_domain) {
		mlx5_glue->dr_destroy_domain(sh->tx_domain);
		sh->tx_domain = NULL;
	}
#ifdef HAVE_MLX5DV_DR_ESWITCH
	if (sh->fdb_domain) {
		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
		sh->fdb_domain = NULL;
	}
	if (sh->esw_drop_action) {
		mlx5_glue->destroy_flow_action(sh->esw_drop_action);
		sh->esw_drop_action = NULL;
	}
#endif
	pthread_mutex_destroy(&sh->dv_mutex);
#else
	(void)priv;
#endif
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by the primary process and secondary processes
 * attach to the memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
	const struct rte_memzone *mz;
	int ret = 0;

	rte_spinlock_lock(&mlx5_shared_data_lock);
	if (mlx5_shared_data == NULL) {
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			/* Allocate shared memory. */
			mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
						 sizeof(*mlx5_shared_data),
						 SOCKET_ID_ANY, 0);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot allocate mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
			rte_spinlock_init(&mlx5_shared_data->lock);
		} else {
			/* Lookup allocated shared memory. */
			mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
			if (mz == NULL) {
				DRV_LOG(ERR,
					"Cannot attach mlx5 shared data");
				ret = -rte_errno;
				goto error;
			}
			mlx5_shared_data = mz->addr;
			memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
		}
	}
error:
	rte_spinlock_unlock(&mlx5_shared_data_lock);
	return ret;
}

/**
 * Retrieve an integer value from an environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
int
mlx5_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

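/*
 * Usage sketch for mlx5_getenv_int() (illustrative only; the variable
 * name below is hypothetical and not defined by this driver):
 *
 *   if (mlx5_getenv_int("MLX5_EXAMPLE_KNOB"))
 *           DRV_LOG(DEBUG, "example knob is set");
 */
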
/**
 * Verbs callback to allocate memory. This function should allocate the
 * space according to the size provided, residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently sysconf(_SC_PAGESIZE)).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
	struct mlx5_priv *priv = data;
	void *ret;
	size_t alignment = sysconf(_SC_PAGESIZE);
	unsigned int socket = SOCKET_ID_ANY;

	if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
		const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	} else if (priv->verbs_alloc_ctx.type ==
		   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
		const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

		socket = ctrl->socket;
	}
	assert(data != NULL);
	ret = rte_malloc_socket(__func__, size, alignment, socket);
	if (!ret && size)
		rte_errno = ENOMEM;
	return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
	assert(data != NULL);
	rte_free(ptr);
}

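/*
 * Note: the two callbacks above are meant to be handed to rdma-core so
 * that Verbs queue buffers come from DPDK hugepage memory; in this driver
 * the registration is expected to happen elsewhere (not in this excerpt),
 * typically through mlx5_glue->dv_set_context_attr() with
 * MLX5DV_CTX_ATTR_BUF_ALLOCATORS.
 */
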
/**
 * Initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
int
mlx5_proc_priv_init(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	struct mlx5_proc_priv *ppriv;
	size_t ppriv_size;

	/*
	 * UAR register table follows the process private structure. BlueFlame
	 * registers for Tx queues are stored in the table.
	 */
	ppriv_size =
		sizeof(struct mlx5_proc_priv) + priv->txqs_n * sizeof(void *);
	ppriv = rte_malloc_socket("mlx5_proc_priv", ppriv_size,
				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
	if (!ppriv) {
		rte_errno = ENOMEM;
		return -rte_errno;
	}
	ppriv->uar_table_sz = ppriv_size;
	dev->process_private = ppriv;
	return 0;
}

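/*
 * Sizing example for the routine above (illustrative): on a 64-bit
 * target with priv->txqs_n == 4, ppriv_size is
 * sizeof(struct mlx5_proc_priv) + 4 * sizeof(void *), i.e. the fixed
 * header plus 4 * 8 = 32 bytes of UAR register table entries.
 */
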
/**
 * Un-initialize process private data structure.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_proc_priv_uninit(struct rte_eth_dev *dev)
{
	if (!dev->process_private)
		return;
	rte_free(dev->process_private);
	dev->process_private = NULL;
}

/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx5_dev_close(struct rte_eth_dev *dev)
{
	struct mlx5_priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	DRV_LOG(DEBUG, "port %u closing device \"%s\"",
		dev->data->port_id,
		((priv->sh->ctx != NULL) ? priv->sh->ctx->device->name : ""));
	/* In case mlx5_dev_stop() has not been called. */
	mlx5_dev_interrupt_handler_uninstall(dev);
	mlx5_traffic_disable(dev);
	mlx5_flow_flush(dev, NULL);
	/* Prevent crashes when queues are still in use. */
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
	rte_wmb();
	/* Disable datapath on secondary process. */
	mlx5_mp_req_stop_rxtx(dev);
	if (priv->rxqs != NULL) {
		/* XXX race condition if mlx5_rx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->rxqs_n); ++i)
			mlx5_rxq_release(dev, i);
		priv->rxqs_n = 0;
		priv->rxqs = NULL;
	}
	if (priv->txqs != NULL) {
		/* XXX race condition if mlx5_tx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->txqs_n); ++i)
			mlx5_txq_release(dev, i);
		priv->txqs_n = 0;
		priv->txqs = NULL;
	}
	mlx5_proc_priv_uninit(dev);
	mlx5_mprq_free_mp(dev);
	/* Remove from memory callback device list. */
	rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock);
	assert(priv->sh);
	LIST_REMOVE(priv->sh, mem_event_cb);
	rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
	mlx5_free_shared_dr(priv);
	if (priv->rss_conf.rss_key != NULL)
		rte_free(priv->rss_conf.rss_key);
	if (priv->reta_idx != NULL)
		rte_free(priv->reta_idx);
	if (priv->config.vf)
		mlx5_nl_mac_addr_flush(dev);
	if (priv->nl_socket_route >= 0)
		close(priv->nl_socket_route);
	if (priv->nl_socket_rdma >= 0)
		close(priv->nl_socket_rdma);
	if (priv->tcf_context)
		mlx5_flow_tcf_context_destroy(priv->tcf_context);
	if (priv->sh) {
		/*
		 * Free the shared context in the last turn, because the
		 * cleanup routines above may use some shared fields, like
		 * mlx5_nl_mac_addr_flush() uses ibdev_path for retrieving
		 * the ifindex if Netlink fails.
		 */
		mlx5_free_shared_ibctx(priv->sh);
		priv->sh = NULL;
	}
	ret = mlx5_hrxq_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some hash Rx queues still remain",
			dev->data->port_id);
	ret = mlx5_ind_table_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some indirection tables still remain",
			dev->data->port_id);
	ret = mlx5_rxq_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Verbs Rx queues still remain",
			dev->data->port_id);
	ret = mlx5_rxq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Rx queues still remain",
			dev->data->port_id);
	ret = mlx5_txq_ibv_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Verbs Tx queues still remain",
			dev->data->port_id);
	ret = mlx5_txq_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some Tx queues still remain",
			dev->data->port_id);
	ret = mlx5_flow_verify(dev);
	if (ret)
		DRV_LOG(WARNING, "port %u some flows still remain",
			dev->data->port_id);
	if (priv->domain_id != RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
		unsigned int c = 0;
		uint16_t port_id;

		RTE_ETH_FOREACH_DEV_OF(port_id, dev->device) {
			struct mlx5_priv *opriv =
				rte_eth_devices[port_id].data->dev_private;

			if (!opriv ||
			    opriv->domain_id != priv->domain_id ||
			    &rte_eth_devices[port_id] == dev)
				continue;
			++c;
		}
		if (!c)
			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
	}
	memset(priv, 0, sizeof(*priv));
	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
	/*
	 * Reset mac_addrs to NULL such that it is not freed as part of
	 * rte_eth_dev_release_port(). mac_addrs is part of dev_private so
	 * it is freed when dev_private is freed.
	 */
	dev->data->mac_addrs = NULL;
}

const struct eth_dev_ops mlx5_dev_ops = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.reta_update = mlx5_dev_rss_reta_update,
	.reta_query = mlx5_dev_rss_reta_query,
	.rss_hash_update = mlx5_rss_hash_update,
	.rss_hash_conf_get = mlx5_rss_hash_conf_get,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rx_queue_count = mlx5_rx_queue_count,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
};

/* Available operations from secondary process. */
static const struct eth_dev_ops mlx5_dev_sec_ops = {
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
};

/* Available operations in flow isolated mode. */
const struct eth_dev_ops mlx5_dev_ops_isolate = {
	.dev_configure = mlx5_dev_configure,
	.dev_start = mlx5_dev_start,
	.dev_stop = mlx5_dev_stop,
	.dev_set_link_down = mlx5_set_link_down,
	.dev_set_link_up = mlx5_set_link_up,
	.dev_close = mlx5_dev_close,
	.promiscuous_enable = mlx5_promiscuous_enable,
	.promiscuous_disable = mlx5_promiscuous_disable,
	.allmulticast_enable = mlx5_allmulticast_enable,
	.allmulticast_disable = mlx5_allmulticast_disable,
	.link_update = mlx5_link_update,
	.stats_get = mlx5_stats_get,
	.stats_reset = mlx5_stats_reset,
	.xstats_get = mlx5_xstats_get,
	.xstats_reset = mlx5_xstats_reset,
	.xstats_get_names = mlx5_xstats_get_names,
	.fw_version_get = mlx5_fw_version_get,
	.dev_infos_get = mlx5_dev_infos_get,
	.dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
	.vlan_filter_set = mlx5_vlan_filter_set,
	.rx_queue_setup = mlx5_rx_queue_setup,
	.tx_queue_setup = mlx5_tx_queue_setup,
	.rx_queue_release = mlx5_rx_queue_release,
	.tx_queue_release = mlx5_tx_queue_release,
	.flow_ctrl_get = mlx5_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx5_dev_set_flow_ctrl,
	.mac_addr_remove = mlx5_mac_addr_remove,
	.mac_addr_add = mlx5_mac_addr_add,
	.mac_addr_set = mlx5_mac_addr_set,
	.set_mc_addr_list = mlx5_set_mc_addr_list,
	.mtu_set = mlx5_dev_set_mtu,
	.vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
	.vlan_offload_set = mlx5_vlan_offload_set,
	.filter_ctrl = mlx5_dev_filter_ctrl,
	.rx_descriptor_status = mlx5_rx_descriptor_status,
	.tx_descriptor_status = mlx5_tx_descriptor_status,
	.rx_queue_intr_enable = mlx5_rx_intr_enable,
	.rx_queue_intr_disable = mlx5_rx_intr_disable,
	.is_removed = mlx5_is_removed,
};

/**
 * Verify and store value for device argument.
 *
 * @param[in] key
 *   Key argument to verify.
 * @param[in] val
 *   Value associated with key.
 * @param opaque
 *   User data.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args_check(const char *key, const char *val, void *opaque)
{
	struct mlx5_dev_config *config = opaque;
	unsigned long tmp;

	/* No-op, port representors are processed in mlx5_dev_spawn(). */
	if (!strcmp(MLX5_REPRESENTOR, key))
		return 0;
	errno = 0;
	tmp = strtoul(val, NULL, 0);
	if (errno) {
		rte_errno = errno;
		DRV_LOG(WARNING, "%s: \"%s\" is not a valid integer", key, val);
		return -rte_errno;
	}
	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
		config->cqe_comp = !!tmp;
	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
		config->cqe_pad = !!tmp;
	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
		config->hw_padding = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_EN, key) == 0) {
		config->mprq.enabled = !!tmp;
	} else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) {
		config->mprq.stride_num_n = tmp;
	} else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) {
		config->mprq.max_memcpy_len = tmp;
	} else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) {
		config->mprq.min_rxqs_num = tmp;
	} else if (strcmp(MLX5_TXQ_INLINE, key) == 0) {
		config->txq_inline = tmp;
	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
		config->txqs_inline = tmp;
	} else if (strcmp(MLX5_TXQS_MAX_VEC, key) == 0) {
		config->txqs_vec = tmp;
	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
		config->mps = !!tmp;
	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
		config->mpw_hdr_dseg = !!tmp;
	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
		config->inline_max_packet_sz = tmp;
	} else if (strcmp(MLX5_TX_VEC_EN, key) == 0) {
		config->tx_vec_en = !!tmp;
	} else if (strcmp(MLX5_RX_VEC_EN, key) == 0) {
		config->rx_vec_en = !!tmp;
	} else if (strcmp(MLX5_L3_VXLAN_EN, key) == 0) {
		config->l3_vxlan_en = !!tmp;
	} else if (strcmp(MLX5_VF_NL_EN, key) == 0) {
		config->vf_nl_en = !!tmp;
	} else if (strcmp(MLX5_DV_ESW_EN, key) == 0) {
		config->dv_esw_en = !!tmp;
	} else if (strcmp(MLX5_DV_FLOW_EN, key) == 0) {
		config->dv_flow_en = !!tmp;
	} else if (strcmp(MLX5_MR_EXT_MEMSEG_EN, key) == 0) {
		config->mr_ext_memseg_en = !!tmp;
	} else {
		DRV_LOG(WARNING, "%s: unknown parameter", key);
		rte_errno = EINVAL;
		return -rte_errno;
	}
	return 0;
}

/**
 * Parse device parameters.
 *
 * @param config
 *   Pointer to device configuration structure.
 * @param devargs
 *   Device arguments structure.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
{
	const char **params = (const char *[]){
		MLX5_RXQ_CQE_COMP_EN,
		MLX5_RXQ_CQE_PAD_EN,
		MLX5_RXQ_PKT_PAD_EN,
		MLX5_RX_MPRQ_EN,
		MLX5_RX_MPRQ_LOG_STRIDE_NUM,
		MLX5_RX_MPRQ_MAX_MEMCPY_LEN,
		MLX5_RXQS_MIN_MPRQ,
		MLX5_TXQ_INLINE,
		MLX5_TXQS_MIN_INLINE,
		MLX5_TXQS_MAX_VEC,
		MLX5_TXQ_MPW_EN,
		MLX5_TXQ_MPW_HDR_DSEG_EN,
		MLX5_TXQ_MAX_INLINE_LEN,
		MLX5_TX_VEC_EN,
		MLX5_RX_VEC_EN,
		MLX5_L3_VXLAN_EN,
		MLX5_VF_NL_EN,
		MLX5_DV_ESW_EN,
		MLX5_DV_FLOW_EN,
		MLX5_MR_EXT_MEMSEG_EN,
		MLX5_REPRESENTOR,
		NULL,
	};
	struct rte_kvargs *kvlist;
	int ret = 0;
	int i;

	if (devargs == NULL)
		return 0;
	/* Following UGLY cast is done to pass checkpatch. */
	kvlist = rte_kvargs_parse(devargs->args, params);
	if (kvlist == NULL)
		return 0;
	/* Process parameters. */
	for (i = 0; (params[i] != NULL); ++i) {
		if (rte_kvargs_count(kvlist, params[i])) {
			ret = rte_kvargs_process(kvlist, params[i],
						 mlx5_args_check, config);
			if (ret) {
				rte_errno = EINVAL;
				rte_kvargs_free(kvlist);
				return -rte_errno;
			}
		}
	}
	rte_kvargs_free(kvlist);
	return 0;
}

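/*
 * Parsing sketch for mlx5_args() (illustrative): given the devargs
 * string "rxq_cqe_comp_en=0,mprq_en=1", rte_kvargs_parse() splits it
 * into key/value pairs, then rte_kvargs_process() invokes
 * mlx5_args_check("rxq_cqe_comp_en", "0", config) and
 * mlx5_args_check("mprq_en", "1", config), which store the values in
 * struct mlx5_dev_config.
 */
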
static struct rte_pci_driver mlx5_driver;

/**
 * PMD global initialization.
 *
 * Independent of any individual device, this function initializes global
 * per-PMD data structures distinguishing primary and secondary processes.
 * Hence, it is called once per process.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_once(void)
{
	struct mlx5_shared_data *sd;
	struct mlx5_local_data *ld = &mlx5_local_data;

	if (mlx5_init_shared_data())
		return -rte_errno;
	sd = mlx5_shared_data;
	assert(sd);
	rte_spinlock_lock(&sd->lock);
	switch (rte_eal_process_type()) {
	case RTE_PROC_PRIMARY:
		if (sd->init_done)
			break;
		LIST_INIT(&sd->mem_event_cb_list);
		rte_rwlock_init(&sd->mem_event_rwlock);
		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
						mlx5_mr_mem_event_cb, NULL);
		mlx5_mp_init_primary();
		sd->init_done = true;
		break;
	case RTE_PROC_SECONDARY:
		if (ld->init_done)
			break;
		mlx5_mp_init_secondary();
		++sd->secondary_cnt;
		ld->init_done = true;
		break;
	default:
		break;
	}
	rte_spinlock_unlock(&sd->lock);
	return 0;
}

1049 | /** | |
1050 | * Spawn an Ethernet device from Verbs information. | |
1051 | * | |
1052 | * @param dpdk_dev | |
1053 | * Backing DPDK device. | |
9f95a23c TL |
1054 | * @param spawn |
1055 | * Verbs device parameters (name, port, switch_info) to spawn. | |
1056 | * @param config | |
1057 | * Device configuration parameters. | |
11fdf7f2 TL |
1058 | * |
1059 | * @return | |
1060 | * A valid Ethernet device object on success, NULL otherwise and rte_errno | |
9f95a23c | 1061 | * is set. The following errors are defined: |
11fdf7f2 TL |
1062 | * |
1063 | * EBUSY: device is not supposed to be spawned. | |
9f95a23c | 1064 | * EEXIST: device is already spawned |
11fdf7f2 TL |
1065 | */ |
1066 | static struct rte_eth_dev * | |
1067 | mlx5_dev_spawn(struct rte_device *dpdk_dev, | |
9f95a23c TL |
1068 | struct mlx5_dev_spawn_data *spawn, |
1069 | struct mlx5_dev_config config) | |
11fdf7f2 | 1070 | { |
9f95a23c TL |
1071 | const struct mlx5_switch_info *switch_info = &spawn->info; |
1072 | struct mlx5_ibv_shared *sh = NULL; | |
11fdf7f2 | 1073 | struct ibv_port_attr port_attr; |
11fdf7f2 | 1074 | struct mlx5dv_context dv_attr = { .comp_mask = 0 }; |
11fdf7f2 | 1075 | struct rte_eth_dev *eth_dev = NULL; |
9f95a23c | 1076 | struct mlx5_priv *priv = NULL; |
11fdf7f2 | 1077 | int err = 0; |
9f95a23c | 1078 | unsigned int hw_padding = 0; |
11fdf7f2 TL |
1079 | unsigned int mps; |
1080 | unsigned int cqe_comp; | |
9f95a23c | 1081 | unsigned int cqe_pad = 0; |
11fdf7f2 TL |
1082 | unsigned int tunnel_en = 0; |
1083 | unsigned int mpls_en = 0; | |
1084 | unsigned int swp = 0; | |
1085 | unsigned int mprq = 0; | |
1086 | unsigned int mprq_min_stride_size_n = 0; | |
1087 | unsigned int mprq_max_stride_size_n = 0; | |
1088 | unsigned int mprq_min_stride_num_n = 0; | |
1089 | unsigned int mprq_max_stride_num_n = 0; | |
11fdf7f2 TL |
1090 | struct ether_addr mac; |
1091 | char name[RTE_ETH_NAME_MAX_LEN]; | |
1092 | int own_domain_id = 0; | |
9f95a23c | 1093 | uint16_t port_id; |
11fdf7f2 TL |
1094 | unsigned int i; |
1095 | ||
1096 | /* Determine if this port representor is supposed to be spawned. */ | |
1097 | if (switch_info->representor && dpdk_dev->devargs) { | |
1098 | struct rte_eth_devargs eth_da; | |
1099 | ||
1100 | err = rte_eth_devargs_parse(dpdk_dev->devargs->args, ð_da); | |
1101 | if (err) { | |
1102 | rte_errno = -err; | |
1103 | DRV_LOG(ERR, "failed to process device arguments: %s", | |
1104 | strerror(rte_errno)); | |
1105 | return NULL; | |
1106 | } | |
1107 | for (i = 0; i < eth_da.nb_representor_ports; ++i) | |
1108 | if (eth_da.representor_ports[i] == | |
1109 | (uint16_t)switch_info->port_name) | |
1110 | break; | |
1111 | if (i == eth_da.nb_representor_ports) { | |
1112 | rte_errno = EBUSY; | |
1113 | return NULL; | |
1114 | } | |
1115 | } | |
9f95a23c TL |
1116 | /* Build device name. */ |
1117 | if (!switch_info->representor) | |
1118 | strlcpy(name, dpdk_dev->name, sizeof(name)); | |
1119 | else | |
1120 | snprintf(name, sizeof(name), "%s_representor_%u", | |
1121 | dpdk_dev->name, switch_info->port_name); | |
1122 | /* check if the device is already spawned */ | |
1123 | if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) { | |
1124 | rte_errno = EEXIST; | |
11fdf7f2 TL |
1125 | return NULL; |
1126 | } | |
9f95a23c TL |
1127 | DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name); |
1128 | if (rte_eal_process_type() == RTE_PROC_SECONDARY) { | |
1129 | eth_dev = rte_eth_dev_attach_secondary(name); | |
1130 | if (eth_dev == NULL) { | |
1131 | DRV_LOG(ERR, "can not attach rte ethdev"); | |
1132 | rte_errno = ENOMEM; | |
1133 | return NULL; | |
1134 | } | |
1135 | eth_dev->device = dpdk_dev; | |
1136 | eth_dev->dev_ops = &mlx5_dev_sec_ops; | |
1137 | err = mlx5_proc_priv_init(eth_dev); | |
1138 | if (err) | |
1139 | return NULL; | |
1140 | /* Receive command fd from primary process */ | |
1141 | err = mlx5_mp_req_verbs_cmd_fd(eth_dev); | |
1142 | if (err < 0) | |
1143 | return NULL; | |
1144 | /* Remap UAR for Tx queues. */ | |
1145 | err = mlx5_tx_uar_init_secondary(eth_dev, err); | |
1146 | if (err) | |
1147 | return NULL; | |
1148 | /* | |
1149 | * Ethdev pointer is still required as input since | |
1150 | * the primary device is not accessible from the | |
1151 | * secondary process. | |
1152 | */ | |
1153 | eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev); | |
1154 | eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev); | |
1155 | return eth_dev; | |
1156 | } | |
1157 | sh = mlx5_alloc_shared_ibctx(spawn); | |
1158 | if (!sh) | |
1159 | return NULL; | |
1160 | config.devx = sh->devx; | |
11fdf7f2 TL |
1161 | #ifdef HAVE_IBV_MLX5_MOD_SWP |
1162 | dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; | |
1163 | #endif | |
1164 | /* | |
1165 | * Multi-packet send is supported by ConnectX-4 Lx PF as well | |
1166 | * as all ConnectX-5 devices. | |
1167 | */ | |
1168 | #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT | |
1169 | dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS; | |
1170 | #endif | |
1171 | #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT | |
1172 | dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ; | |
1173 | #endif | |
9f95a23c | 1174 | mlx5_glue->dv_query_device(sh->ctx, &dv_attr); |
11fdf7f2 TL |
1175 | if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) { |
1176 | if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) { | |
1177 | DRV_LOG(DEBUG, "enhanced MPW is supported"); | |
1178 | mps = MLX5_MPW_ENHANCED; | |
1179 | } else { | |
1180 | DRV_LOG(DEBUG, "MPW is supported"); | |
1181 | mps = MLX5_MPW; | |
1182 | } | |
1183 | } else { | |
1184 | DRV_LOG(DEBUG, "MPW isn't supported"); | |
1185 | mps = MLX5_MPW_DISABLED; | |
1186 | } | |
11fdf7f2 TL |
1187 | #ifdef HAVE_IBV_MLX5_MOD_SWP |
1188 | if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP) | |
1189 | swp = dv_attr.sw_parsing_caps.sw_parsing_offloads; | |
1190 | DRV_LOG(DEBUG, "SWP support: %u", swp); | |
1191 | #endif | |
1192 | config.swp = !!swp; | |
1193 | #ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT | |
1194 | if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) { | |
1195 | struct mlx5dv_striding_rq_caps mprq_caps = | |
1196 | dv_attr.striding_rq_caps; | |
1197 | ||
1198 | DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d", | |
1199 | mprq_caps.min_single_stride_log_num_of_bytes); | |
1200 | DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d", | |
1201 | mprq_caps.max_single_stride_log_num_of_bytes); | |
1202 | DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d", | |
1203 | mprq_caps.min_single_wqe_log_num_of_strides); | |
1204 | DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d", | |
1205 | mprq_caps.max_single_wqe_log_num_of_strides); | |
1206 | DRV_LOG(DEBUG, "\tsupported_qpts: %d", | |
1207 | mprq_caps.supported_qpts); | |
1208 | DRV_LOG(DEBUG, "device supports Multi-Packet RQ"); | |
1209 | mprq = 1; | |
1210 | mprq_min_stride_size_n = | |
1211 | mprq_caps.min_single_stride_log_num_of_bytes; | |
1212 | mprq_max_stride_size_n = | |
1213 | mprq_caps.max_single_stride_log_num_of_bytes; | |
1214 | mprq_min_stride_num_n = | |
1215 | mprq_caps.min_single_wqe_log_num_of_strides; | |
1216 | mprq_max_stride_num_n = | |
1217 | mprq_caps.max_single_wqe_log_num_of_strides; | |
1218 | config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, | |
1219 | mprq_min_stride_num_n); | |
1220 | } | |
1221 | #endif | |
1222 | if (RTE_CACHE_LINE_SIZE == 128 && | |
1223 | !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) | |
1224 | cqe_comp = 0; | |
1225 | else | |
1226 | cqe_comp = 1; | |
1227 | config.cqe_comp = cqe_comp; | |
9f95a23c TL |
1228 | #ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD |
1229 | /* Whether device supports 128B Rx CQE padding. */ | |
1230 | cqe_pad = RTE_CACHE_LINE_SIZE == 128 && | |
1231 | (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD); | |
1232 | #endif | |
11fdf7f2 TL |
1233 | #ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT |
1234 | if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) { | |
1235 | tunnel_en = ((dv_attr.tunnel_offloads_caps & | |
1236 | MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) && | |
1237 | (dv_attr.tunnel_offloads_caps & | |
1238 | MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE)); | |
1239 | } | |
1240 | DRV_LOG(DEBUG, "tunnel offloading is %ssupported", | |
1241 | tunnel_en ? "" : "not "); | |
1242 | #else | |
1243 | DRV_LOG(WARNING, | |
1244 | "tunnel offloading disabled due to old OFED/rdma-core version"); | |
1245 | #endif | |
1246 | config.tunnel_en = tunnel_en; | |
1247 | #ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT | |
1248 | mpls_en = ((dv_attr.tunnel_offloads_caps & | |
1249 | MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) && | |
1250 | (dv_attr.tunnel_offloads_caps & | |
1251 | MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP)); | |
1252 | DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported", | |
1253 | mpls_en ? "" : "not "); | |
1254 | #else | |
1255 | DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to" | |
1256 | " old OFED/rdma-core version or firmware configuration"); | |
1257 | #endif | |
1258 | config.mpls_en = mpls_en; | |
11fdf7f2 | 1259 | /* Check port status. */ |
9f95a23c | 1260 | err = mlx5_glue->query_port(sh->ctx, spawn->ibv_port, &port_attr); |
11fdf7f2 TL |
1261 | if (err) { |
1262 | DRV_LOG(ERR, "port query failed: %s", strerror(err)); | |
1263 | goto error; | |
1264 | } | |
1265 | if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) { | |
1266 | DRV_LOG(ERR, "port is not configured in Ethernet mode"); | |
1267 | err = EINVAL; | |
1268 | goto error; | |
1269 | } | |
1270 | if (port_attr.state != IBV_PORT_ACTIVE) | |
1271 | DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)", | |
1272 | mlx5_glue->port_state_str(port_attr.state), | |
1273 | port_attr.state); | |
9f95a23c | 1274 | /* Allocate private eth device data. */ |
11fdf7f2 TL |
1275 | priv = rte_zmalloc("ethdev private structure", |
1276 | sizeof(*priv), | |
1277 | RTE_CACHE_LINE_SIZE); | |
1278 | if (priv == NULL) { | |
1279 | DRV_LOG(ERR, "priv allocation failure"); | |
1280 | err = ENOMEM; | |
1281 | goto error; | |
1282 | } | |
9f95a23c TL |
1283 | priv->sh = sh; |
1284 | priv->ibv_port = spawn->ibv_port; | |
11fdf7f2 TL |
1285 | priv->mtu = ETHER_MTU; |
1286 | #ifndef RTE_ARCH_64 | |
1287 | /* Initialize UAR access locks for 32bit implementations. */ | |
1288 | rte_spinlock_init(&priv->uar_lock_cq); | |
1289 | for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++) | |
1290 | rte_spinlock_init(&priv->uar_lock[i]); | |
1291 | #endif | |
1292 | /* Some internal functions rely on Netlink sockets, open them now. */ | |
1293 | priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA); | |
1294 | priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE); | |
1295 | priv->nl_sn = 0; | |
1296 | priv->representor = !!switch_info->representor; | |
9f95a23c | 1297 | priv->master = !!switch_info->master; |
11fdf7f2 | 1298 | priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; |
9f95a23c TL |
1299 | /* |
1300 | * Currently we support single E-Switch per PF configurations | |
1301 | * only and vport_id field contains the vport index for | |
1302 | * associated VF, which is deduced from representor port name. | |
1303 | * For example, let's have the IB device port 10, it has | |
1304 | * attached network device eth0, which has port name attribute | |
1305 | * pf0vf2, we can deduce the VF number as 2, and set vport index | |
1306 | * as 3 (2+1). This assigning schema should be changed if the | |
1307 | * multiple E-Switch instances per PF configurations or/and PCI | |
1308 | * subfunctions are added. | |
1309 | */ | |
1310 | priv->vport_id = switch_info->representor ? | |
1311 | switch_info->port_name + 1 : -1; | |
1312 | /* representor_id field keeps the unmodified port/VF index. */ | |
1313 | priv->representor_id = switch_info->representor ? | |
1314 | switch_info->port_name : -1; | |
11fdf7f2 TL |
1315 | /* |
1316 | * Look for sibling devices in order to reuse their switch domain | |
1317 | * if any, otherwise allocate one. | |
1318 | */ | |
9f95a23c TL |
1319 | RTE_ETH_FOREACH_DEV_OF(port_id, dpdk_dev) { |
1320 | const struct mlx5_priv *opriv = | |
1321 | rte_eth_devices[port_id].data->dev_private; | |
11fdf7f2 | 1322 | |
9f95a23c TL |
1323 | if (!opriv || |
1324 | opriv->domain_id == | |
1325 | RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) | |
1326 | continue; | |
1327 | priv->domain_id = opriv->domain_id; | |
1328 | break; | |
11fdf7f2 TL |
1329 | } |
1330 | if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) { | |
1331 | err = rte_eth_switch_domain_alloc(&priv->domain_id); | |
1332 | if (err) { | |
1333 | err = rte_errno; | |
1334 | DRV_LOG(ERR, "unable to allocate switch domain: %s", | |
1335 | strerror(rte_errno)); | |
1336 | goto error; | |
1337 | } | |
1338 | own_domain_id = 1; | |
1339 | } | |
1340 | err = mlx5_args(&config, dpdk_dev->devargs); | |
1341 | if (err) { | |
1342 | err = rte_errno; | |
1343 | DRV_LOG(ERR, "failed to process device arguments: %s", | |
1344 | strerror(rte_errno)); | |
1345 | goto error; | |
1346 | } | |
9f95a23c TL |
1347 | config.hw_csum = !!(sh->device_attr.device_cap_flags_ex & |
1348 | IBV_DEVICE_RAW_IP_CSUM); | |
11fdf7f2 TL |
1349 | DRV_LOG(DEBUG, "checksum offloading is %ssupported", |
1350 | (config.hw_csum ? "" : "not ")); | |
9f95a23c TL |
1351 | #if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \ |
1352 | !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45) | |
1353 | DRV_LOG(DEBUG, "counters are not supported"); | |
1354 | #endif | |
1355 | #ifndef HAVE_IBV_FLOW_DV_SUPPORT | |
1356 | if (config.dv_flow_en) { | |
1357 | DRV_LOG(WARNING, "DV flow is not supported"); | |
1358 | config.dv_flow_en = 0; | |
1359 | } | |
11fdf7f2 TL |
1360 | #endif |
1361 | config.ind_table_max_size = | |
9f95a23c | 1362 | sh->device_attr.rss_caps.max_rwq_indirection_table_size; |
11fdf7f2 TL |
1363 | /* |
1364 | * Remove this check once DPDK supports larger/variable | |
1365 | * indirection tables. | |
1366 | */ | |
1367 | if (config.ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512) | |
1368 | config.ind_table_max_size = ETH_RSS_RETA_SIZE_512; | |
1369 | DRV_LOG(DEBUG, "maximum Rx indirection table size is %u", | |
1370 | config.ind_table_max_size); | |
9f95a23c | 1371 | config.hw_vlan_strip = !!(sh->device_attr.raw_packet_caps & |
11fdf7f2 TL |
1372 | IBV_RAW_PACKET_CAP_CVLAN_STRIPPING); |
1373 | DRV_LOG(DEBUG, "VLAN stripping is %ssupported", | |
1374 | (config.hw_vlan_strip ? "" : "not ")); | |
9f95a23c | 1375 | config.hw_fcs_strip = !!(sh->device_attr.raw_packet_caps & |
11fdf7f2 TL |
1376 | IBV_RAW_PACKET_CAP_SCATTER_FCS); |
1377 | DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported", | |
1378 | (config.hw_fcs_strip ? "" : "not ")); | |
9f95a23c TL |
1379 | #if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING) |
1380 | hw_padding = !!sh->device_attr.rx_pad_end_addr_align; | |
1381 | #elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING) | |
1382 | hw_padding = !!(sh->device_attr.device_cap_flags_ex & | |
1383 | IBV_DEVICE_PCI_WRITE_END_PADDING); | |
11fdf7f2 | 1384 | #endif |
9f95a23c TL |
1385 | if (config.hw_padding && !hw_padding) { |
1386 | DRV_LOG(DEBUG, "Rx end alignment padding isn't supported"); | |
1387 | config.hw_padding = 0; | |
1388 | } else if (config.hw_padding) { | |
1389 | DRV_LOG(DEBUG, "Rx end alignment padding is enabled"); | |
1390 | } | |
1391 | config.tso = (sh->device_attr.tso_caps.max_tso > 0 && | |
1392 | (sh->device_attr.tso_caps.supported_qpts & | |
11fdf7f2 TL |
1393 | (1 << IBV_QPT_RAW_PACKET))); |
1394 | if (config.tso) | |
9f95a23c TL |
1395 | config.tso_max_payload_sz = sh->device_attr.tso_caps.max_tso; |
1396 | /* | |
1397 | * MPW is disabled by default, while the Enhanced MPW is enabled | |
1398 | * by default. | |
1399 | */ | |
1400 | if (config.mps == MLX5_ARG_UNSET) | |
1401 | config.mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED : | |
1402 | MLX5_MPW_DISABLED; | |
1403 | else | |
1404 | config.mps = config.mps ? mps : MLX5_MPW_DISABLED; | |
11fdf7f2 TL |
1405 | DRV_LOG(INFO, "%sMPS is %s", |
1406 | config.mps == MLX5_MPW_ENHANCED ? "enhanced " : "", | |
1407 | config.mps != MLX5_MPW_DISABLED ? "enabled" : "disabled"); | |
1408 | if (config.cqe_comp && !cqe_comp) { | |
1409 | DRV_LOG(WARNING, "Rx CQE compression isn't supported"); | |
1410 | config.cqe_comp = 0; | |
1411 | } | |
9f95a23c TL |
1412 | if (config.cqe_pad && !cqe_pad) { |
1413 | DRV_LOG(WARNING, "Rx CQE padding isn't supported"); | |
1414 | config.cqe_pad = 0; | |
1415 | } else if (config.cqe_pad) { | |
1416 | DRV_LOG(INFO, "Rx CQE padding is enabled"); | |
1417 | } | |
11fdf7f2 TL |
1418 | if (config.mprq.enabled && mprq) { |
1419 | if (config.mprq.stride_num_n > mprq_max_stride_num_n || | |
1420 | config.mprq.stride_num_n < mprq_min_stride_num_n) { | |
1421 | config.mprq.stride_num_n = | |
1422 | RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, | |
1423 | mprq_min_stride_num_n); | |
1424 | DRV_LOG(WARNING, | |
1425 | "the number of strides" | |
1426 | " for Multi-Packet RQ is out of range," | |
1427 | " setting default value (%u)", | |
1428 | 1 << config.mprq.stride_num_n); | |
1429 | } | |
1430 | config.mprq.min_stride_size_n = mprq_min_stride_size_n; | |
1431 | config.mprq.max_stride_size_n = mprq_max_stride_size_n; | |
1432 | } else if (config.mprq.enabled && !mprq) { | |
1433 | DRV_LOG(WARNING, "Multi-Packet RQ isn't supported"); | |
1434 | config.mprq.enabled = 0; | |
1435 | } | |
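| /* | 
| * Illustration (hypothetical values): requesting | 
| * "mprq_en=1,mprq_log_stride_num=10" on hardware whose supported | 
| * range is, say, [3, 9] trips the check above, so stride_num_n | 
| * falls back to RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, | 
| * mprq_min_stride_num_n) and the warning reports the default | 
| * actually applied. | 
| */ | 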
1436 | eth_dev = rte_eth_dev_allocate(name); | |
1437 | if (eth_dev == NULL) { | |
1438 | DRV_LOG(ERR, "can not allocate rte ethdev"); | |
1439 | err = ENOMEM; | |
1440 | goto error; | |
1441 | } | |
9f95a23c TL |
1442 | /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */ |
1443 | eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; | |
1444 | if (priv->representor) { | |
11fdf7f2 | 1445 | eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR; |
9f95a23c TL |
1446 | eth_dev->data->representor_id = priv->representor_id; |
1447 | } | |
11fdf7f2 TL |
1448 | eth_dev->data->dev_private = priv; |
1449 | priv->dev_data = eth_dev->data; | |
1450 | eth_dev->data->mac_addrs = priv->mac; | |
1451 | eth_dev->device = dpdk_dev; | |
11fdf7f2 TL |
1452 | /* Configure the first MAC address by default. */ |
1453 | if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) { | |
1454 | DRV_LOG(ERR, | |
1455 | "port %u cannot get MAC address, is mlx5_en" | |
1456 | " loaded? (errno: %s)", | |
1457 | eth_dev->data->port_id, strerror(rte_errno)); | |
1458 | err = ENODEV; | |
1459 | goto error; | |
1460 | } | |
1461 | DRV_LOG(INFO, | |
1462 | "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", | |
1463 | eth_dev->data->port_id, | |
1464 | mac.addr_bytes[0], mac.addr_bytes[1], | |
1465 | mac.addr_bytes[2], mac.addr_bytes[3], | |
1466 | mac.addr_bytes[4], mac.addr_bytes[5]); | |
1467 | #ifndef NDEBUG | |
1468 | { | |
1469 | char ifname[IF_NAMESIZE]; | |
1470 | ||
1471 | if (mlx5_get_ifname(eth_dev, &ifname) == 0) | |
1472 | DRV_LOG(DEBUG, "port %u ifname is \"%s\"", | |
1473 | eth_dev->data->port_id, ifname); | |
1474 | else | |
1475 | DRV_LOG(DEBUG, "port %u ifname is unknown", | |
1476 | eth_dev->data->port_id); | |
1477 | } | |
1478 | #endif | |
1479 | /* Get actual MTU if possible. */ | |
1480 | err = mlx5_get_mtu(eth_dev, &priv->mtu); | |
1481 | if (err) { | |
1482 | err = rte_errno; | |
1483 | goto error; | |
1484 | } | |
1485 | DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id, | |
1486 | priv->mtu); | |
1487 | /* Initialize burst functions to prevent crashes before link-up. */ | |
1488 | eth_dev->rx_pkt_burst = removed_rx_burst; | |
1489 | eth_dev->tx_pkt_burst = removed_tx_burst; | |
1490 | eth_dev->dev_ops = &mlx5_dev_ops; | |
1491 | /* Register MAC address. */ | |
1492 | claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0)); | |
9f95a23c | 1493 | if (config.vf && config.vf_nl_en) |
11fdf7f2 | 1494 | mlx5_nl_mac_addr_sync(eth_dev); |
9f95a23c TL |
1495 | priv->tcf_context = mlx5_flow_tcf_context_create(); |
1496 | if (!priv->tcf_context) { | |
11fdf7f2 TL |
1497 | err = -rte_errno; |
1498 | DRV_LOG(WARNING, | |
1499 | "flow rules relying on switch offloads will not be" | |
1500 | " supported: cannot open libmnl socket: %s", | |
1501 | strerror(rte_errno)); | |
1502 | } else { | |
1503 | struct rte_flow_error error; | |
1504 | unsigned int ifindex = mlx5_ifindex(eth_dev); | |
1505 | ||
1506 | if (!ifindex) { | |
1507 | err = -rte_errno; | |
1508 | error.message = | |
1509 | "cannot retrieve network interface index"; | |
1510 | } else { | |
9f95a23c TL |
1511 | err = mlx5_flow_tcf_init(priv->tcf_context, |
1512 | ifindex, &error); | |
11fdf7f2 TL |
1513 | } |
1514 | if (err) { | |
1515 | DRV_LOG(WARNING, | |
1516 | "flow rules relying on switch offloads will" | |
1517 | " not be supported: %s: %s", | |
1518 | error.message, strerror(rte_errno)); | |
9f95a23c TL |
1519 | mlx5_flow_tcf_context_destroy(priv->tcf_context); |
1520 | priv->tcf_context = NULL; | |
11fdf7f2 TL |
1521 | } |
1522 | } | |
1523 | TAILQ_INIT(&priv->flows); | |
1524 | TAILQ_INIT(&priv->ctrl_flows); | |
1525 | /* Hint libmlx5 to use PMD allocator for data plane resources */ | |
1526 | struct mlx5dv_ctx_allocators alctr = { | |
1527 | .alloc = &mlx5_alloc_verbs_buf, | |
1528 | .free = &mlx5_free_verbs_buf, | |
1529 | .data = priv, | |
1530 | }; | |
9f95a23c TL |
1531 | mlx5_glue->dv_set_context_attr(sh->ctx, |
1532 | MLX5DV_CTX_ATTR_BUF_ALLOCATORS, | |
11fdf7f2 TL |
1533 | (void *)((uintptr_t)&alctr)); |
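| /* | 
| * With this attribute set, libmlx5 routes its data-plane buffer | 
| * allocations through the callbacks above, which are expected to | 
| * use the rte_malloc family so queue memory lands on the proper | 
| * NUMA socket. | 
| */ | 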
1534 | /* Bring Ethernet device up. */ | |
1535 | DRV_LOG(DEBUG, "port %u forcing Ethernet interface up", | |
1536 | eth_dev->data->port_id); | |
1537 | mlx5_set_link_up(eth_dev); | |
1538 | /* | |
1539 | * Even though the interrupt handler is not installed yet, | |
9f95a23c | 1540 | * interrupts will still trigger on the async_fd of the | 
11fdf7f2 TL |
1541 | * Verbs context returned by ibv_open_device(). |
1542 | */ | |
1543 | mlx5_link_update(eth_dev, 0); | |
9f95a23c TL |
1544 | #ifdef HAVE_IBV_DEVX_OBJ |
1545 | if (config.devx) { | |
1546 | err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config.hca_attr); | |
1547 | if (err) { | |
1548 | err = -err; | |
1549 | goto error; | |
1550 | } | |
1551 | } | |
1552 | #endif | |
1553 | #ifdef HAVE_MLX5DV_DR_ESWITCH | |
1554 | if (!(config.hca_attr.eswitch_manager && config.dv_flow_en && | |
1555 | (switch_info->representor || switch_info->master))) | |
1556 | config.dv_esw_en = 0; | |
1557 | #else | |
1558 | config.dv_esw_en = 0; | |
1559 | #endif | |
11fdf7f2 TL |
1560 | /* Store device configuration on private structure. */ |
1561 | priv->config = config; | |
9f95a23c TL |
1562 | if (config.dv_flow_en) { |
1563 | err = mlx5_alloc_shared_dr(priv); | |
1564 | if (err) | |
1565 | goto error; | |
1566 | } | |
11fdf7f2 TL |
1567 | /* Supported Verbs flow priority number detection. */ |
1568 | err = mlx5_flow_discover_priorities(eth_dev); | |
9f95a23c TL |
1569 | if (err < 0) { |
1570 | err = -err; | |
11fdf7f2 TL |
1571 | goto error; |
1572 | } | |
9f95a23c | 1573 | priv->config.flow_prio = err; |
11fdf7f2 TL |
1574 | /* Add device to memory callback list. */ |
1575 | rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); | |
1576 | LIST_INSERT_HEAD(&mlx5_shared_data->mem_event_cb_list, | |
9f95a23c | 1577 | sh, mem_event_cb); |
11fdf7f2 TL |
1578 | rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); |
1579 | return eth_dev; | |
1580 | error: | |
1581 | if (priv) { | |
9f95a23c TL |
1582 | if (priv->sh) |
1583 | mlx5_free_shared_dr(priv); | |
11fdf7f2 TL |
1584 | if (priv->nl_socket_route >= 0) |
1585 | close(priv->nl_socket_route); | |
1586 | if (priv->nl_socket_rdma >= 0) | |
1587 | close(priv->nl_socket_rdma); | |
9f95a23c TL |
1588 | if (priv->tcf_context) |
1589 | mlx5_flow_tcf_context_destroy(priv->tcf_context); | |
11fdf7f2 TL |
1590 | if (own_domain_id) |
1591 | claim_zero(rte_eth_switch_domain_free(priv->domain_id)); | |
1592 | rte_free(priv); | |
9f95a23c TL |
1593 | if (eth_dev != NULL) |
1594 | eth_dev->data->dev_private = NULL; | |
11fdf7f2 | 1595 | } |
9f95a23c TL |
1596 | if (eth_dev != NULL) { |
1597 | /* mac_addrs must not be freed alone because it is part of dev_private. */ | 
1598 | eth_dev->data->mac_addrs = NULL; | |
11fdf7f2 | 1599 | rte_eth_dev_release_port(eth_dev); |
9f95a23c TL |
1600 | } |
1601 | if (sh) | |
1602 | mlx5_free_shared_ibctx(sh); | |
11fdf7f2 TL |
1603 | assert(err > 0); |
1604 | rte_errno = err; | |
1605 | return NULL; | |
1606 | } | |
1607 | ||
11fdf7f2 TL |
1608 | /** |
1609 | * Comparison callback to sort device data. | |
1610 | * | |
1611 | * This is meant to be used with qsort(). | |
1612 | * | |
1613 | * @param[in] a | 
1614 | * Pointer to a pointer to the first data object. | 
1615 | * @param[in] b | 
1616 | * Pointer to a pointer to the second data object. | 
1617 | * | |
1618 | * @return | |
1619 | * 0 if both objects are equal, less than 0 if the first argument is less | |
1620 | * than the second, greater than 0 otherwise. | |
1621 | */ | |
1622 | static int | |
1623 | mlx5_dev_spawn_data_cmp(const void *a, const void *b) | |
1624 | { | |
1625 | const struct mlx5_switch_info *si_a = | |
1626 | &((const struct mlx5_dev_spawn_data *)a)->info; | |
1627 | const struct mlx5_switch_info *si_b = | |
1628 | &((const struct mlx5_dev_spawn_data *)b)->info; | |
1629 | int ret; | |
1630 | ||
1631 | /* Master device first. */ | |
1632 | ret = si_b->master - si_a->master; | |
1633 | if (ret) | |
1634 | return ret; | |
1635 | /* Then representor devices. */ | |
1636 | ret = si_b->representor - si_a->representor; | |
1637 | if (ret) | |
1638 | return ret; | |
1639 | /* Unidentified devices come last in no specific order. */ | |
1640 | if (!si_a->representor) | |
1641 | return 0; | |
1642 | /* Order representors by name. */ | |
1643 | return si_a->port_name - si_b->port_name; | |
1644 | } | |
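| /* | 
| * Example (illustrative): qsort()ing spawn entries {representor | 
| * #2, master, representor #0} with the callback above yields | 
| * {master, representor #0, representor #2}; unidentified entries | 
| * sort after these in no particular order. | 
| */ | 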
1645 | ||
1646 | /** | |
1647 | * DPDK callback to register a PCI device. | |
1648 | * | |
1649 | * This function spawns Ethernet devices out of a given PCI device. | |
1650 | * | |
1651 | * @param[in] pci_drv | |
1652 | * PCI driver structure (mlx5_driver). | |
1653 | * @param[in] pci_dev | |
1654 | * PCI device information. | |
1655 | * | |
1656 | * @return | |
1657 | * 0 on success, a negative errno value otherwise and rte_errno is set. | |
1658 | */ | |
1659 | static int | |
1660 | mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, | |
1661 | struct rte_pci_device *pci_dev) | |
1662 | { | |
1663 | struct ibv_device **ibv_list; | |
9f95a23c TL |
1664 | /* |
1665 | * Number of IB devices found that match the requested PCI BDF. | 
1666 | * nd != 1 means there are multiple IB devices over the same | 
1667 | * PCI device, i.e. a master and its representors. | 
1668 | */ | |
1669 | unsigned int nd = 0; | |
1670 | /* | |
1671 | * Number of ports found on the IB device. nd = 1 and np = 1..n | 
1672 | * means we have a single multiport IB device, and representors | 
1673 | * may be attached to some of the found ports. | 
1674 | */ | |
1675 | unsigned int np = 0; | |
1676 | /* | |
1677 | * Number of DPDK Ethernet devices to spawn, either over | 
1678 | * multiple IB devices or over multiple ports of a single | 
1679 | * IB device. In effect, this is the number of spawn iterations. | 
1680 | */ | |
1681 | unsigned int ns = 0; | |
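| /* | 
| * Rough map of the cases handled below (summary only, no new | 
| * logic): nd == 1 && np <= 1 - a plain single-port device, | 
| * possibly without E-Switch support; nd == 1 && np > 1 - one | 
| * multiport IB device whose master/representors are identified | 
| * per port; nd > 1 - several IB devices over one PCI function, | 
| * identified per device via Netlink or sysfs. | 
| */ | 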
1682 | struct mlx5_dev_config dev_config; | |
11fdf7f2 TL |
1683 | int ret; |
1684 | ||
9f95a23c TL |
1685 | ret = mlx5_init_once(); |
1686 | if (ret) { | |
1687 | DRV_LOG(ERR, "unable to init PMD global data: %s", | |
1688 | strerror(rte_errno)); | |
1689 | return -rte_errno; | |
1690 | } | |
11fdf7f2 TL |
1691 | assert(pci_drv == &mlx5_driver); |
1692 | errno = 0; | |
1693 | ibv_list = mlx5_glue->get_device_list(&ret); | |
1694 | if (!ibv_list) { | |
1695 | rte_errno = errno ? errno : ENOSYS; | |
1696 | DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?"); | |
1697 | return -rte_errno; | |
1698 | } | |
9f95a23c TL |
1699 | /* |
1700 | * First scan the list of all InfiniBand devices to find | 
1701 | * matching ones and gather them into the match list. | 
1702 | */ | |
11fdf7f2 | 1703 | struct ibv_device *ibv_match[ret + 1]; |
9f95a23c TL |
1704 | int nl_route = -1; |
1705 | int nl_rdma = -1; | |
1706 | unsigned int i; | |
11fdf7f2 TL |
1707 | |
1708 | while (ret-- > 0) { | |
1709 | struct rte_pci_addr pci_addr; | |
1710 | ||
1711 | DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name); | |
1712 | if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr)) | |
1713 | continue; | |
1714 | if (pci_dev->addr.domain != pci_addr.domain || | |
1715 | pci_dev->addr.bus != pci_addr.bus || | |
1716 | pci_dev->addr.devid != pci_addr.devid || | |
1717 | pci_dev->addr.function != pci_addr.function) | |
1718 | continue; | |
1719 | DRV_LOG(INFO, "PCI information matches for device \"%s\"", | |
1720 | ibv_list[ret]->name); | |
9f95a23c TL |
1721 | ibv_match[nd++] = ibv_list[ret]; |
1722 | } | |
1723 | ibv_match[nd] = NULL; | |
1724 | if (!nd) { | |
1725 | /* No device matches, just complain and bail out. */ | |
1726 | mlx5_glue->free_device_list(ibv_list); | |
1727 | DRV_LOG(WARNING, | |
1728 | "no Verbs device matches PCI device " PCI_PRI_FMT "," | |
1729 | " are kernel drivers loaded?", | |
1730 | pci_dev->addr.domain, pci_dev->addr.bus, | |
1731 | pci_dev->addr.devid, pci_dev->addr.function); | |
1732 | rte_errno = ENOENT; | |
1733 | ret = -rte_errno; | |
1734 | return ret; | |
1735 | } | |
1736 | nl_route = mlx5_nl_init(NETLINK_ROUTE); | |
1737 | nl_rdma = mlx5_nl_init(NETLINK_RDMA); | |
1738 | if (nd == 1) { | |
1739 | /* | |
1740 | * The single matching device found may have multiple ports. | 
1741 | * Each port may be a representor, so we have to check the port | 
1742 | * number and whether representors exist. | 
1743 | */ | |
1744 | if (nl_rdma >= 0) | |
1745 | np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name); | |
1746 | if (!np) | |
1747 | DRV_LOG(WARNING, "can not get IB device \"%s\"" | |
1748 | " ports number", ibv_match[0]->name); | |
11fdf7f2 | 1749 | } |
11fdf7f2 | 1750 | /* |
9f95a23c TL |
1751 | * Now we can determine the maximum number | 
1752 | * of devices to be spawned. | 
11fdf7f2 | 1753 | */ |
9f95a23c TL |
1754 | struct mlx5_dev_spawn_data list[np ? np : nd]; |
1755 | ||
1756 | if (np > 1) { | |
1757 | /* | |
1758 | * A single IB device with multiple ports was found; | 
1759 | * it may be an E-Switch master device with representors. | 
1760 | * We have to perform identification through the ports. | 
1761 | */ | |
1762 | assert(nl_rdma >= 0); | |
1763 | assert(ns == 0); | |
1764 | assert(nd == 1); | |
1765 | for (i = 1; i <= np; ++i) { | |
1766 | list[ns].max_port = np; | |
1767 | list[ns].ibv_port = i; | |
1768 | list[ns].ibv_dev = ibv_match[0]; | |
1769 | list[ns].eth_dev = NULL; | |
1770 | list[ns].pci_dev = pci_dev; | |
1771 | list[ns].ifindex = mlx5_nl_ifindex | |
1772 | (nl_rdma, list[ns].ibv_dev->name, i); | |
1773 | if (!list[ns].ifindex) { | |
1774 | /* | |
1775 | * No network interface index was found for the | 
1776 | * specified port, which means there is no | 
1777 | * representor on this port. That is OK: some | 
1778 | * ports can be disabled, for example when | 
1779 | * sriov_numvfs < sriov_totalvfs. | 
1780 | */ | |
1781 | continue; | |
1782 | } | |
1783 | ret = -1; | |
1784 | if (nl_route >= 0) | |
1785 | ret = mlx5_nl_switch_info | |
1786 | (nl_route, | |
1787 | list[ns].ifindex, | |
1788 | &list[ns].info); | |
1789 | if (ret || (!list[ns].info.representor && | |
1790 | !list[ns].info.master)) { | |
1791 | /* | |
1792 | * We failed to recognize representors via | 
1793 | * Netlink; try to perform the task via | 
1794 | * sysfs instead. | 
1795 | */ | |
1796 | ret = mlx5_sysfs_switch_info | |
1797 | (list[ns].ifindex, | |
1798 | &list[ns].info); | |
1799 | } | |
1800 | if (!ret && (list[ns].info.representor ^ | |
1801 | list[ns].info.master)) | |
1802 | ns++; | |
11fdf7f2 | 1803 | } |
9f95a23c TL |
1804 | if (!ns) { |
1805 | DRV_LOG(ERR, | |
1806 | "unable to recognize master/representors" | |
1807 | " on the IB device with multiple ports"); | |
1808 | rte_errno = ENOENT; | |
1809 | ret = -rte_errno; | |
1810 | goto exit; | |
1811 | } | |
1812 | } else { | |
1813 | /* | |
1814 | * The existence of several matching entries (nd > 1) means | |
1815 | * port representors have been instantiated. Neither existing | 
1816 | * Verbs calls nor sysfs entries can tell them apart; this can | 
1817 | * only be done through Netlink calls, assuming kernel drivers | 
1818 | * are recent enough to support them. | 
1819 | * | |
1820 | * In the event of identification failure through Netlink, | |
1821 | * try again through sysfs, then: | |
1822 | * | |
1823 | * 1. A single IB device matches (nd == 1) with a single | 
1824 | * port (np = 0/1) and is not a representor: assume | 
1825 | * no switch support. | 
1826 | * | |
1827 | * 2. Otherwise no safe assumptions can be made; | |
1828 | * complain louder and bail out. | |
1829 | */ | |
1830 | np = 1; | |
1831 | for (i = 0; i != nd; ++i) { | |
1832 | memset(&list[ns].info, 0, sizeof(list[ns].info)); | |
1833 | list[ns].max_port = 1; | |
1834 | list[ns].ibv_port = 1; | |
1835 | list[ns].ibv_dev = ibv_match[i]; | |
1836 | list[ns].eth_dev = NULL; | |
1837 | list[ns].pci_dev = pci_dev; | |
1838 | list[ns].ifindex = 0; | |
1839 | if (nl_rdma >= 0) | |
1840 | list[ns].ifindex = mlx5_nl_ifindex | |
1841 | (nl_rdma, list[ns].ibv_dev->name, 1); | |
1842 | if (!list[ns].ifindex) { | |
1843 | char ifname[IF_NAMESIZE]; | |
1844 | ||
1845 | /* | |
1846 | * Netlink failed; this may happen with an old | 
1847 | * ib_core kernel driver (before 4.16). | 
1848 | * We can assume the driver is old because | 
1849 | * here we are processing single-port IB | 
1850 | * devices. Try sysfs to retrieve the | 
1851 | * ifindex instead. This method works for | 
1852 | * the master device only. | 
1853 | */ | |
1854 | if (nd > 1) { | |
1855 | /* | |
1856 | * Multiple devices were found; assume | 
1857 | * they are representors. We can neither | 
1858 | * distinguish master from representor nor | 
1859 | * retrieve the ifindex via sysfs. | 
1860 | */ | |
1861 | continue; | |
1862 | } | |
1863 | ret = mlx5_get_master_ifname | |
1864 | (ibv_match[i]->ibdev_path, &ifname); | |
1865 | if (!ret) | |
1866 | list[ns].ifindex = | |
1867 | if_nametoindex(ifname); | |
1868 | if (!list[ns].ifindex) { | |
1869 | /* | |
1870 | * No network interface index was found | 
1871 | * for the specified device, which means | 
1872 | * it is neither a representor nor a | 
1873 | * master. | 
1874 | */ | |
1875 | continue; | |
1876 | } | |
1877 | } | |
1878 | ret = -1; | |
1879 | if (nl_route >= 0) | |
1880 | ret = mlx5_nl_switch_info | |
1881 | (nl_route, | |
1882 | list[ns].ifindex, | |
1883 | &list[ns].info); | |
1884 | if (ret || (!list[ns].info.representor && | |
1885 | !list[ns].info.master)) { | |
1886 | /* | |
1887 | * We failed to recognize representors via | 
1888 | * Netlink; try to perform the task via | 
1889 | * sysfs instead. | 
1890 | */ | |
1891 | ret = mlx5_sysfs_switch_info | |
1892 | (list[ns].ifindex, | |
1893 | &list[ns].info); | |
1894 | } | |
1895 | if (!ret && (list[ns].info.representor ^ | |
1896 | list[ns].info.master)) { | |
1897 | ns++; | |
1898 | } else if ((nd == 1) && | |
1899 | !list[ns].info.representor && | |
1900 | !list[ns].info.master) { | |
1901 | /* | |
1902 | * A single IB device with | 
1903 | * one physical port and an | 
1904 | * attached network device. | 
1905 | * Maybe SR-IOV is not enabled | 
1906 | * or there are no representors. | 
1907 | */ | |
1908 | DRV_LOG(INFO, "no E-Switch support detected"); | |
1909 | ns++; | |
1910 | break; | |
1911 | } | |
1912 | } | |
1913 | if (!ns) { | |
11fdf7f2 | 1914 | DRV_LOG(ERR, |
9f95a23c TL |
1915 | "unable to recognize master/representors" |
1916 | " on the multiple IB devices"); | |
1917 | rte_errno = ENOENT; | |
1918 | ret = -rte_errno; | |
1919 | goto exit; | |
11fdf7f2 TL |
1920 | } |
1921 | } | |
9f95a23c | 1922 | assert(ns); |
11fdf7f2 TL |
1923 | /* |
1924 | * Sort the list to probe devices in natural order for the user's convenience | 
1925 | * (i.e. master first, then representors from lowest to highest ID). | |
1926 | */ | |
9f95a23c TL |
1927 | qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp); |
1928 | /* Default configuration. */ | |
1929 | dev_config = (struct mlx5_dev_config){ | |
1930 | .hw_padding = 0, | |
1931 | .mps = MLX5_ARG_UNSET, | |
1932 | .tx_vec_en = 1, | |
1933 | .rx_vec_en = 1, | |
1934 | .txq_inline = MLX5_ARG_UNSET, | |
1935 | .txqs_inline = MLX5_ARG_UNSET, | |
1936 | .txqs_vec = MLX5_ARG_UNSET, | |
1937 | .inline_max_packet_sz = MLX5_ARG_UNSET, | |
1938 | .vf_nl_en = 1, | |
1939 | .mr_ext_memseg_en = 1, | |
1940 | .mprq = { | |
1941 | .enabled = 0, /* Disabled by default. */ | |
1942 | .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N, | |
1943 | .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, | |
1944 | .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, | |
1945 | }, | |
1946 | .dv_esw_en = 1, | |
1947 | }; | |
1948 | /* Device specific configuration. */ | |
11fdf7f2 | 1949 | switch (pci_dev->id.device_id) { |
9f95a23c TL |
1950 | case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF: |
1951 | dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS_BLUEFIELD; | |
1952 | break; | |
11fdf7f2 TL |
1953 | case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF: |
1954 | case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF: | |
1955 | case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF: | |
1956 | case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF: | |
9f95a23c | 1957 | dev_config.vf = 1; |
11fdf7f2 TL |
1958 | break; |
1959 | default: | |
9f95a23c | 1960 | break; |
11fdf7f2 | 1961 | } |
9f95a23c TL |
1962 | /* Set architecture-dependent default value if unset. */ |
1963 | if (dev_config.txqs_vec == MLX5_ARG_UNSET) | |
1964 | dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS; | |
1965 | for (i = 0; i != ns; ++i) { | |
11fdf7f2 TL |
1966 | uint32_t restore; |
1967 | ||
9f95a23c TL |
1968 | list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device, |
1969 | &list[i], | |
1970 | dev_config); | |
11fdf7f2 | 1971 | if (!list[i].eth_dev) { |
9f95a23c | 1972 | if (rte_errno != EBUSY && rte_errno != EEXIST) |
11fdf7f2 | 1973 | break; |
9f95a23c | 1974 | /* Device is disabled or already spawned. Ignore it. */ |
11fdf7f2 TL |
1975 | continue; |
1976 | } | |
1977 | restore = list[i].eth_dev->data->dev_flags; | |
1978 | rte_eth_copy_pci_info(list[i].eth_dev, pci_dev); | |
1979 | /* Restore non-PCI flags cleared by the above call. */ | |
1980 | list[i].eth_dev->data->dev_flags |= restore; | |
1981 | rte_eth_dev_probing_finish(list[i].eth_dev); | |
1982 | } | |
9f95a23c | 1983 | if (i != ns) { |
11fdf7f2 TL |
1984 | DRV_LOG(ERR, |
1985 | "probe of PCI device " PCI_PRI_FMT " aborted after" | |
1986 | " encountering an error: %s", | |
1987 | pci_dev->addr.domain, pci_dev->addr.bus, | |
1988 | pci_dev->addr.devid, pci_dev->addr.function, | |
1989 | strerror(rte_errno)); | |
1990 | ret = -rte_errno; | |
1991 | /* Roll back. */ | |
1992 | while (i--) { | |
1993 | if (!list[i].eth_dev) | |
1994 | continue; | |
1995 | mlx5_dev_close(list[i].eth_dev); | |
9f95a23c TL |
1996 | /* mac_addrs must not be freed because it is part of dev_private. */ | 
1997 | list[i].eth_dev->data->mac_addrs = NULL; | |
11fdf7f2 TL |
1998 | claim_zero(rte_eth_dev_release_port(list[i].eth_dev)); |
1999 | } | |
2000 | /* Restore original error. */ | |
2001 | rte_errno = -ret; | |
2002 | } else { | |
2003 | ret = 0; | |
2004 | } | |
9f95a23c TL |
2005 | exit: |
2006 | /* | |
2007 | * Do the routine cleanup: | |
2008 | * - close the opened Netlink sockets | 
2009 | * - free the InfiniBand device list | 
2010 | */ | |
2011 | if (nl_rdma >= 0) | |
2012 | close(nl_rdma); | |
2013 | if (nl_route >= 0) | |
2014 | close(nl_route); | |
2015 | assert(ibv_list); | |
2016 | mlx5_glue->free_device_list(ibv_list); | |
11fdf7f2 TL |
2017 | return ret; |
2018 | } | |
2019 | ||
9f95a23c TL |
2020 | /** |
2021 | * DPDK callback to remove a PCI device. | |
2022 | * | |
2023 | * This function removes all Ethernet devices belonging to a given PCI device. | 
2024 | * | |
2025 | * @param[in] pci_dev | |
2026 | * Pointer to the PCI device. | |
2027 | * | |
2028 | * @return | |
2029 | * 0 on success, the function cannot fail. | |
2030 | */ | |
2031 | static int | |
2032 | mlx5_pci_remove(struct rte_pci_device *pci_dev) | |
2033 | { | |
2034 | uint16_t port_id; | |
2035 | ||
2036 | RTE_ETH_FOREACH_DEV_OF(port_id, &pci_dev->device) | |
2037 | rte_eth_dev_close(port_id); | |
2038 | return 0; | |
2039 | } | |
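| /* | 
| * Note: RTE_ETH_DEV_CLOSE_REMOVE is set at spawn time, so the | 
| * rte_eth_dev_close() calls above also release the ports; no | 
| * explicit rte_eth_dev_release_port() is needed here. | 
| */ | 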
2040 | ||
11fdf7f2 TL |
2041 | static const struct rte_pci_id mlx5_pci_id_map[] = { |
2042 | { | |
2043 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2044 | PCI_DEVICE_ID_MELLANOX_CONNECTX4) | |
2045 | }, | |
2046 | { | |
2047 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2048 | PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) | |
2049 | }, | |
2050 | { | |
2051 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2052 | PCI_DEVICE_ID_MELLANOX_CONNECTX4LX) | |
2053 | }, | |
2054 | { | |
2055 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2056 | PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) | |
2057 | }, | |
2058 | { | |
2059 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2060 | PCI_DEVICE_ID_MELLANOX_CONNECTX5) | |
2061 | }, | |
2062 | { | |
2063 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2064 | PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) | |
2065 | }, | |
2066 | { | |
2067 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2068 | PCI_DEVICE_ID_MELLANOX_CONNECTX5EX) | |
2069 | }, | |
2070 | { | |
2071 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2072 | PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF) | |
2073 | }, | |
2074 | { | |
2075 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2076 | PCI_DEVICE_ID_MELLANOX_CONNECTX5BF) | |
2077 | }, | |
9f95a23c TL |
2078 | { |
2079 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2080 | PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF) | |
2081 | }, | |
2082 | { | |
2083 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2084 | PCI_DEVICE_ID_MELLANOX_CONNECTX6) | |
2085 | }, | |
2086 | { | |
2087 | RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, | |
2088 | PCI_DEVICE_ID_MELLANOX_CONNECTX6VF) | |
2089 | }, | |
11fdf7f2 TL |
2090 | { |
2091 | .vendor_id = 0 | |
2092 | } | |
2093 | }; | |
2094 | ||
2095 | static struct rte_pci_driver mlx5_driver = { | |
2096 | .driver = { | |
2097 | .name = MLX5_DRIVER_NAME | |
2098 | }, | |
2099 | .id_table = mlx5_pci_id_map, | |
2100 | .probe = mlx5_pci_probe, | |
9f95a23c TL |
2101 | .remove = mlx5_pci_remove, |
2102 | .dma_map = mlx5_dma_map, | |
2103 | .dma_unmap = mlx5_dma_unmap, | |
2104 | .drv_flags = (RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV | | |
2105 | RTE_PCI_DRV_PROBE_AGAIN), | |
11fdf7f2 TL |
2106 | }; |
2107 | ||
9f95a23c | 2108 | #ifdef RTE_IBVERBS_LINK_DLOPEN |
11fdf7f2 TL |
2109 | |
2110 | /** | |
2111 | * Suffix RTE_EAL_PMD_PATH with "-glue". | |
2112 | * | |
2113 | * This function performs a sanity check on RTE_EAL_PMD_PATH before | |
2114 | * suffixing its last component. | |
2115 | * | |
2116 | * @param[out] buf | 
2117 | * Output buffer; it should be large enough, otherwise NULL is returned. | 
2118 | * @param size | 
2119 | * Size of @p buf. | 
2120 | * | |
2121 | * @return | |
2122 | * Pointer to @p buf, or NULL in case the suffix cannot be appended. | 
2123 | */ | |
2124 | static char * | |
2125 | mlx5_glue_path(char *buf, size_t size) | |
2126 | { | |
2127 | static const char *const bad[] = { "/", ".", "..", NULL }; | |
2128 | const char *path = RTE_EAL_PMD_PATH; | |
2129 | size_t len = strlen(path); | |
2130 | size_t off; | |
2131 | int i; | |
2132 | ||
2133 | while (len && path[len - 1] == '/') | |
2134 | --len; | |
2135 | for (off = len; off && path[off - 1] != '/'; --off) | |
2136 | ; | |
2137 | for (i = 0; bad[i]; ++i) | |
2138 | if (!strncmp(path + off, bad[i], (int)(len - off))) | |
2139 | goto error; | |
2140 | i = snprintf(buf, size, "%.*s-glue", (int)len, path); | |
2141 | if (i == -1 || (size_t)i >= size) | |
2142 | goto error; | |
2143 | return buf; | |
2144 | error: | |
2145 | DRV_LOG(ERR, | |
2146 | "unable to append \"-glue\" to last component of" | |
2147 | " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\")," | |
2148 | " please re-configure DPDK"); | |
2149 | return NULL; | |
2150 | } | |
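| /* | 
| * Example (hypothetical path): with RTE_EAL_PMD_PATH set to | 
| * "/usr/local/lib/dpdk/pmds", this returns | 
| * "/usr/local/lib/dpdk/pmds-glue"; a last component of "/", "." | 
| * or ".." is rejected and NULL is returned instead. | 
| */ | 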
2151 | ||
2152 | /** | |
2153 | * Initialization routine for run-time dependency on rdma-core. | |
2154 | */ | |
2155 | static int | |
2156 | mlx5_glue_init(void) | |
2157 | { | |
2158 | char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")]; | |
2159 | const char *path[] = { | |
2160 | /* | |
2161 | * A basic security check is necessary before trusting | |
2162 | * MLX5_GLUE_PATH, which may override RTE_EAL_PMD_PATH. | |
2163 | */ | |
2164 | (geteuid() == getuid() && getegid() == getgid() ? | |
2165 | getenv("MLX5_GLUE_PATH") : NULL), | |
2166 | /* | |
2167 | * When RTE_EAL_PMD_PATH is set, use its glue-suffixed | |
2168 | * variant, otherwise let dlopen() look up libraries on its | |
2169 | * own. | |
2170 | */ | |
2171 | (*RTE_EAL_PMD_PATH ? | |
2172 | mlx5_glue_path(glue_path, sizeof(glue_path)) : ""), | |
2173 | }; | |
2174 | unsigned int i = 0; | |
2175 | void *handle = NULL; | |
2176 | void **sym; | |
2177 | const char *dlmsg; | |
2178 | ||
2179 | while (!handle && i != RTE_DIM(path)) { | |
2180 | const char *end; | |
2181 | size_t len; | |
2182 | int ret; | |
2183 | ||
2184 | if (!path[i]) { | |
2185 | ++i; | |
2186 | continue; | |
2187 | } | |
2188 | end = strpbrk(path[i], ":;"); | |
2189 | if (!end) | |
2190 | end = path[i] + strlen(path[i]); | |
2191 | len = end - path[i]; | |
2192 | ret = 0; | |
2193 | do { | |
2194 | char name[ret + 1]; | |
2195 | ||
2196 | ret = snprintf(name, sizeof(name), "%.*s%s" MLX5_GLUE, | |
2197 | (int)len, path[i], | |
2198 | (!len || *(end - 1) == '/') ? "" : "/"); | |
2199 | if (ret == -1) | |
2200 | break; | |
2201 | if (sizeof(name) != (size_t)ret + 1) | |
2202 | continue; | |
2203 | DRV_LOG(DEBUG, "looking for rdma-core glue as \"%s\"", | |
2204 | name); | |
2205 | handle = dlopen(name, RTLD_LAZY); | |
2206 | break; | |
2207 | } while (1); | |
2208 | path[i] = end + 1; | |
2209 | if (!*end) | |
2210 | ++i; | |
2211 | } | |
2212 | if (!handle) { | |
2213 | rte_errno = EINVAL; | |
2214 | dlmsg = dlerror(); | |
2215 | if (dlmsg) | |
2216 | DRV_LOG(WARNING, "cannot load glue library: %s", dlmsg); | |
2217 | goto glue_error; | |
2218 | } | |
2219 | sym = dlsym(handle, "mlx5_glue"); | |
2220 | if (!sym || !*sym) { | |
2221 | rte_errno = EINVAL; | |
2222 | dlmsg = dlerror(); | |
2223 | if (dlmsg) | |
2224 | DRV_LOG(ERR, "cannot resolve glue symbol: %s", dlmsg); | |
2225 | goto glue_error; | |
2226 | } | |
2227 | mlx5_glue = *sym; | |
2228 | return 0; | |
2229 | glue_error: | |
2230 | if (handle) | |
2231 | dlclose(handle); | |
2232 | DRV_LOG(WARNING, | |
2233 | "cannot initialize PMD due to missing run-time dependency on" | |
2234 | " rdma-core libraries (libibverbs, libmlx5)"); | |
2235 | return -rte_errno; | |
2236 | } | |
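| /* | 
| * Usage sketch (assumes real and effective IDs match; the path is | 
| * an example): running with MLX5_GLUE_PATH="/opt/mlx5" makes the | 
| * loop above try dlopen("/opt/mlx5/" MLX5_GLUE) before the | 
| * RTE_EAL_PMD_PATH "-glue" directory and the default dlopen() | 
| * search path; multiple directories may be separated by ':' or ';'. | 
| */ | 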
2237 | ||
2238 | #endif | |
2239 | ||
2240 | /** | |
2241 | * Driver initialization routine. | |
2242 | */ | |
2243 | RTE_INIT(rte_mlx5_pmd_init) | |
2244 | { | |
2245 | /* Initialize driver log type. */ | |
2246 | mlx5_logtype = rte_log_register("pmd.net.mlx5"); | |
2247 | if (mlx5_logtype >= 0) | |
2248 | rte_log_set_level(mlx5_logtype, RTE_LOG_NOTICE); | |
2249 | ||
2250 | /* Build the static tables for Verbs conversion. */ | |
2251 | mlx5_set_ptype_table(); | |
2252 | mlx5_set_cksum_table(); | |
2253 | mlx5_set_swp_types_table(); | |
2254 | /* | |
2255 | * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use | |
2256 | * huge pages. Calling ibv_fork_init() during init allows | |
2257 | * applications to use fork() safely for purposes other than | |
2258 | * using this PMD, which is not supported in forked processes. | |
2259 | */ | |
2260 | setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); | |
2261 | /* Match the size of Rx completion entry to the size of a cacheline. */ | |
2262 | if (RTE_CACHE_LINE_SIZE == 128) | |
2263 | setenv("MLX5_CQE_SIZE", "128", 0); | |
2264 | /* | |
2265 | * MLX5_DEVICE_FATAL_CLEANUP tells ibv_destroy functions to | |
2266 | * cleanup all the Verbs resources even when the device was removed. | |
2267 | */ | |
2268 | setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); | |
9f95a23c | 2269 | #ifdef RTE_IBVERBS_LINK_DLOPEN |
11fdf7f2 TL |
2270 | if (mlx5_glue_init()) |
2271 | return; | |
2272 | assert(mlx5_glue); | |
2273 | #endif | |
2274 | #ifndef NDEBUG | |
2275 | /* Glue structure must not contain any NULL pointers. */ | |
2276 | { | |
2277 | unsigned int i; | |
2278 | ||
2279 | for (i = 0; i != sizeof(*mlx5_glue) / sizeof(void *); ++i) | |
2280 | assert(((const void *const *)mlx5_glue)[i]); | |
2281 | } | |
2282 | #endif | |
2283 | if (strcmp(mlx5_glue->version, MLX5_GLUE_VERSION)) { | |
2284 | DRV_LOG(ERR, | |
2285 | "rdma-core glue \"%s\" mismatch: \"%s\" is required", | |
2286 | mlx5_glue->version, MLX5_GLUE_VERSION); | |
2287 | return; | |
2288 | } | |
2289 | mlx5_glue->fork_init(); | |
2290 | rte_pci_register(&mlx5_driver); | |
2291 | } | |
2292 | ||
2293 | RTE_PMD_EXPORT_NAME(net_mlx5, __COUNTER__); | |
2294 | RTE_PMD_REGISTER_PCI_TABLE(net_mlx5, mlx5_pci_id_map); | |
2295 | RTE_PMD_REGISTER_KMOD_DEP(net_mlx5, "* ib_uverbs & mlx5_core & mlx5_ib"); |