ceph/src/dpdk/drivers/net/mlx5/mlx5_rxq.c
1 /*-
2 * BSD LICENSE
3 *
4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <stddef.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <stdint.h>
39
40 /* Verbs header. */
41 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
42 #ifdef PEDANTIC
43 #pragma GCC diagnostic ignored "-Wpedantic"
44 #endif
45 #include <infiniband/verbs.h>
46 #include <infiniband/arch.h>
47 #include <infiniband/mlx5_hw.h>
48 #ifdef PEDANTIC
49 #pragma GCC diagnostic error "-Wpedantic"
50 #endif
51
52 /* DPDK headers don't like -pedantic. */
53 #ifdef PEDANTIC
54 #pragma GCC diagnostic ignored "-Wpedantic"
55 #endif
56 #include <rte_mbuf.h>
57 #include <rte_malloc.h>
58 #include <rte_ethdev.h>
59 #include <rte_common.h>
60 #ifdef PEDANTIC
61 #pragma GCC diagnostic error "-Wpedantic"
62 #endif
63
64 #include "mlx5.h"
65 #include "mlx5_rxtx.h"
66 #include "mlx5_utils.h"
67 #include "mlx5_autoconf.h"
68 #include "mlx5_defs.h"
69
70 /* Initialization data for hash RX queues. */
71 const struct hash_rxq_init hash_rxq_init[] = {
72 [HASH_RXQ_TCPV4] = {
73 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
74 IBV_EXP_RX_HASH_DST_IPV4 |
75 IBV_EXP_RX_HASH_SRC_PORT_TCP |
76 IBV_EXP_RX_HASH_DST_PORT_TCP),
77 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
78 .flow_priority = 0,
79 .flow_spec.tcp_udp = {
80 .type = IBV_EXP_FLOW_SPEC_TCP,
81 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
82 },
83 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
84 },
85 [HASH_RXQ_UDPV4] = {
86 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
87 IBV_EXP_RX_HASH_DST_IPV4 |
88 IBV_EXP_RX_HASH_SRC_PORT_UDP |
89 IBV_EXP_RX_HASH_DST_PORT_UDP),
90 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
91 .flow_priority = 0,
92 .flow_spec.tcp_udp = {
93 .type = IBV_EXP_FLOW_SPEC_UDP,
94 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
95 },
96 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
97 },
98 [HASH_RXQ_IPV4] = {
99 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
100 IBV_EXP_RX_HASH_DST_IPV4),
101 .dpdk_rss_hf = (ETH_RSS_IPV4 |
102 ETH_RSS_FRAG_IPV4),
103 .flow_priority = 1,
104 .flow_spec.ipv4 = {
105 .type = IBV_EXP_FLOW_SPEC_IPV4,
106 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
107 },
108 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
109 },
110 [HASH_RXQ_TCPV6] = {
111 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
112 IBV_EXP_RX_HASH_DST_IPV6 |
113 IBV_EXP_RX_HASH_SRC_PORT_TCP |
114 IBV_EXP_RX_HASH_DST_PORT_TCP),
115 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
116 .flow_priority = 0,
117 .flow_spec.tcp_udp = {
118 .type = IBV_EXP_FLOW_SPEC_TCP,
119 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
120 },
121 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
122 },
123 [HASH_RXQ_UDPV6] = {
124 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
125 IBV_EXP_RX_HASH_DST_IPV6 |
126 IBV_EXP_RX_HASH_SRC_PORT_UDP |
127 IBV_EXP_RX_HASH_DST_PORT_UDP),
128 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
129 .flow_priority = 0,
130 .flow_spec.tcp_udp = {
131 .type = IBV_EXP_FLOW_SPEC_UDP,
132 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
133 },
134 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
135 },
136 [HASH_RXQ_IPV6] = {
137 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
138 IBV_EXP_RX_HASH_DST_IPV6),
139 .dpdk_rss_hf = (ETH_RSS_IPV6 |
140 ETH_RSS_FRAG_IPV6),
141 .flow_priority = 1,
142 .flow_spec.ipv6 = {
143 .type = IBV_EXP_FLOW_SPEC_IPV6,
144 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
145 },
146 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
147 },
148 [HASH_RXQ_ETH] = {
149 .hash_fields = 0,
150 .dpdk_rss_hf = 0,
151 .flow_priority = 2,
152 .flow_spec.eth = {
153 .type = IBV_EXP_FLOW_SPEC_ETH,
154 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
155 },
156 .underlayer = NULL,
157 },
158 };
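/*
 * Illustrative note: the .underlayer pointers above chain each hash RX queue
 * type to the less specific type it is built upon. For HASH_RXQ_TCPV4, for
 * instance, the resulting flow rule stacks three specifications:
 *
 *   flow_spec.eth      (HASH_RXQ_ETH,   IBV_EXP_FLOW_SPEC_ETH)
 *   flow_spec.ipv4     (HASH_RXQ_IPV4,  IBV_EXP_FLOW_SPEC_IPV4)
 *   flow_spec.tcp_udp  (HASH_RXQ_TCPV4, IBV_EXP_FLOW_SPEC_TCP)
 *
 * priv_flow_attr() below walks this chain to size the attribute buffer and
 * copies the specifications with the outermost (Ethernet) one first.
 */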
159
160 /* Number of entries in hash_rxq_init[]. */
161 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
162
163 /* Initialization data for hash RX queue indirection tables. */
164 static const struct ind_table_init ind_table_init[] = {
165 {
166 .max_size = -1u, /* Superseded by HW limitations. */
167 .hash_types =
168 1 << HASH_RXQ_TCPV4 |
169 1 << HASH_RXQ_UDPV4 |
170 1 << HASH_RXQ_IPV4 |
171 1 << HASH_RXQ_TCPV6 |
172 1 << HASH_RXQ_UDPV6 |
173 1 << HASH_RXQ_IPV6 |
174 0,
175 .hash_types_n = 6,
176 },
177 {
178 .max_size = 1,
179 .hash_types = 1 << HASH_RXQ_ETH,
180 .hash_types_n = 1,
181 },
182 };
183
184 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
185
186 /* Default RSS hash key also used for ConnectX-3. */
187 uint8_t rss_hash_default_key[] = {
188 0x2c, 0xc6, 0x81, 0xd1,
189 0x5b, 0xdb, 0xf4, 0xf7,
190 0xfc, 0xa2, 0x83, 0x19,
191 0xdb, 0x1a, 0x3e, 0x94,
192 0x6b, 0x9e, 0x38, 0xd9,
193 0x2c, 0x9c, 0x03, 0xd1,
194 0xad, 0x99, 0x44, 0xa7,
195 0xd9, 0x56, 0x3d, 0x59,
196 0x06, 0x3c, 0x25, 0xf3,
197 0xfc, 0x1f, 0xdc, 0x2a,
198 };
199
200 /* Length of the default RSS hash key. */
201 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
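/*
 * Illustrative sketch (error handling omitted, app_rss_key is a 40-byte key
 * filled by the application): this default can be overridden through the
 * standard ethdev configuration, e.g.:
 *
 *   static uint8_t app_rss_key[40];
 *   struct rte_eth_conf conf = {
 *           .rxmode = { .mq_mode = ETH_MQ_RX_RSS },
 *           .rx_adv_conf.rss_conf = {
 *                   .rss_key = app_rss_key,
 *                   .rss_key_len = sizeof(app_rss_key),
 *                   .rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP,
 *           },
 *   };
 *   rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);
 */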
202
203 /**
204 * Populate flow steering rule for a given hash RX queue type using
205 * information from hash_rxq_init[]. Nothing is written to flow_attr when
206 * flow_attr_size is not large enough, but the required size is still returned.
207 *
208 * @param priv
209 * Pointer to private structure.
210 * @param[out] flow_attr
211 * Pointer to flow attribute structure to fill. Note that the allocated
212  *   area must be large enough to hold all flow specifications.
213 * @param flow_attr_size
214 * Entire size of flow_attr and trailing room for flow specifications.
215 * @param type
216 * Hash RX queue type to use for flow steering rule.
217 *
218 * @return
219 * Total size of the flow attribute buffer. No errors are defined.
220 */
221 size_t
222 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
223 size_t flow_attr_size, enum hash_rxq_type type)
224 {
225 size_t offset = sizeof(*flow_attr);
226 const struct hash_rxq_init *init = &hash_rxq_init[type];
227
228 assert(priv != NULL);
229 assert((size_t)type < RTE_DIM(hash_rxq_init));
230 do {
231 offset += init->flow_spec.hdr.size;
232 init = init->underlayer;
233 } while (init != NULL);
234 if (offset > flow_attr_size)
235 return offset;
236 flow_attr_size = offset;
237 init = &hash_rxq_init[type];
238 *flow_attr = (struct ibv_exp_flow_attr){
239 .type = IBV_EXP_FLOW_ATTR_NORMAL,
240 /* Priorities < 3 are reserved for flow director. */
241 .priority = init->flow_priority + 3,
242 .num_of_specs = 0,
243 .port = priv->port,
244 .flags = 0,
245 };
246 do {
247 offset -= init->flow_spec.hdr.size;
248 memcpy((void *)((uintptr_t)flow_attr + offset),
249 &init->flow_spec,
250 init->flow_spec.hdr.size);
251 ++flow_attr->num_of_specs;
252 init = init->underlayer;
253 } while (init != NULL);
254 return flow_attr_size;
255 }
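/*
 * Usage sketch (illustrative): since nothing is written when the buffer is
 * too small, callers typically query the required size first with a
 * zero-sized buffer, then call again once enough room has been allocated:
 *
 *   size_t size = priv_flow_attr(priv, NULL, 0, hash_rxq->type);
 *   struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *   if (attr != NULL)
 *           (void)priv_flow_attr(priv, attr, size, hash_rxq->type);
 */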
256
257 /**
258 * Convert hash type position in indirection table initializer to
259 * hash RX queue type.
260 *
261 * @param table
262 * Indirection table initializer.
263 * @param pos
264 * Hash type position.
265 *
266 * @return
267 * Hash RX queue type.
268 */
269 static enum hash_rxq_type
270 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
271 {
272 enum hash_rxq_type type = HASH_RXQ_TCPV4;
273
274 assert(pos < table->hash_types_n);
275 do {
276 if ((table->hash_types & (1 << type)) && (pos-- == 0))
277 break;
278 ++type;
279 } while (1);
280 return type;
281 }
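/*
 * Example (illustrative): with hash_types set to
 * (1 << HASH_RXQ_TCPV4 | 1 << HASH_RXQ_UDPV4 | 1 << HASH_RXQ_IPV4),
 * position 0 maps to HASH_RXQ_TCPV4, position 1 to HASH_RXQ_UDPV4 and
 * position 2 to HASH_RXQ_IPV4; types whose bit is cleared are skipped.
 */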
282
283 /**
284 * Filter out disabled hash RX queue types from ind_table_init[].
285 *
286 * @param priv
287 * Pointer to private structure.
288 * @param[out] table
289 * Output table.
290 *
291 * @return
292 * Number of table entries.
293 */
294 static unsigned int
295 priv_make_ind_table_init(struct priv *priv,
296 struct ind_table_init (*table)[IND_TABLE_INIT_N])
297 {
298 uint64_t rss_hf;
299 unsigned int i;
300 unsigned int j;
301 unsigned int table_n = 0;
302 /* Mandatory to receive frames not handled by normal hash RX queues. */
303 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
304
305 rss_hf = priv->rss_hf;
306 /* Process other protocols only if more than one queue. */
307 if (priv->rxqs_n > 1)
308 for (i = 0; (i != hash_rxq_init_n); ++i)
309 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
310 hash_types_sup |= (1 << i);
311
312 /* Filter out entries whose protocols are not in the set. */
313 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
314 unsigned int nb;
315 unsigned int h;
316
317 /* j is increased only if the table has valid protocols. */
318 assert(j <= i);
319 (*table)[j] = ind_table_init[i];
320 (*table)[j].hash_types &= hash_types_sup;
321 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
322 if (((*table)[j].hash_types >> h) & 0x1)
323 ++nb;
324 		(*table)[j].hash_types_n = nb;
325 if (nb) {
326 ++table_n;
327 ++j;
328 }
329 }
330 return table_n;
331 }
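/*
 * Example (illustrative): with several RX queues and priv->rss_hf set to
 * (ETH_RSS_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP), hash_types_sup ends up as
 * (1 << HASH_RXQ_ETH | 1 << HASH_RXQ_IPV4 | 1 << HASH_RXQ_TCPV4): the first
 * ind_table_init[] entry keeps two hash types, the second keeps its single
 * Ethernet type and the function returns 2.
 */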
332
333 /**
334 * Initialize hash RX queues and indirection table.
335 *
336 * @param priv
337 * Pointer to private structure.
338 *
339 * @return
340 * 0 on success, errno value on failure.
341 */
342 int
343 priv_create_hash_rxqs(struct priv *priv)
344 {
345 struct ibv_exp_wq *wqs[priv->reta_idx_n];
346 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
347 unsigned int ind_tables_n =
348 priv_make_ind_table_init(priv, &ind_table_init);
349 unsigned int hash_rxqs_n = 0;
350 struct hash_rxq (*hash_rxqs)[] = NULL;
351 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
352 unsigned int i;
353 unsigned int j;
354 unsigned int k;
355 int err = 0;
356
357 assert(priv->ind_tables == NULL);
358 assert(priv->ind_tables_n == 0);
359 assert(priv->hash_rxqs == NULL);
360 assert(priv->hash_rxqs_n == 0);
361 assert(priv->pd != NULL);
362 assert(priv->ctx != NULL);
363 if (priv->rxqs_n == 0)
364 return EINVAL;
365 assert(priv->rxqs != NULL);
366 if (ind_tables_n == 0) {
367 ERROR("all hash RX queue types have been filtered out,"
368 " indirection table cannot be created");
369 return EINVAL;
370 }
371 if (priv->rxqs_n & (priv->rxqs_n - 1)) {
372 INFO("%u RX queues are configured, consider rounding this"
373 " number to the next power of two for better balancing",
374 priv->rxqs_n);
375 DEBUG("indirection table extended to assume %u WQs",
376 priv->reta_idx_n);
377 }
378 for (i = 0; (i != priv->reta_idx_n); ++i) {
379 struct rxq_ctrl *rxq_ctrl;
380
381 rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
382 struct rxq_ctrl, rxq);
383 wqs[i] = rxq_ctrl->wq;
384 }
385 /* Get number of hash RX queues to configure. */
386 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
387 hash_rxqs_n += ind_table_init[i].hash_types_n;
388 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
389 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
390 /* Create indirection tables. */
391 ind_tables = rte_calloc(__func__, ind_tables_n,
392 sizeof((*ind_tables)[0]), 0);
393 if (ind_tables == NULL) {
394 err = ENOMEM;
395 ERROR("cannot allocate indirection tables container: %s",
396 strerror(err));
397 goto error;
398 }
399 for (i = 0; (i != ind_tables_n); ++i) {
400 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
401 .pd = priv->pd,
402 .log_ind_tbl_size = 0, /* Set below. */
403 .ind_tbl = wqs,
404 .comp_mask = 0,
405 };
406 unsigned int ind_tbl_size = ind_table_init[i].max_size;
407 struct ibv_exp_rwq_ind_table *ind_table;
408
409 if (priv->reta_idx_n < ind_tbl_size)
410 ind_tbl_size = priv->reta_idx_n;
411 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
412 errno = 0;
413 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
414 &ind_init_attr);
415 if (ind_table != NULL) {
416 (*ind_tables)[i] = ind_table;
417 continue;
418 }
419 /* Not clear whether errno is set. */
420 err = (errno ? errno : EINVAL);
421 ERROR("RX indirection table creation failed with error %d: %s",
422 err, strerror(err));
423 goto error;
424 }
425 /* Allocate array that holds hash RX queues and related data. */
426 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
427 sizeof((*hash_rxqs)[0]), 0);
428 if (hash_rxqs == NULL) {
429 err = ENOMEM;
430 ERROR("cannot allocate hash RX queues container: %s",
431 strerror(err));
432 goto error;
433 }
434 for (i = 0, j = 0, k = 0;
435 ((i != hash_rxqs_n) && (j != ind_tables_n));
436 ++i) {
437 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
438 enum hash_rxq_type type =
439 hash_rxq_type_from_pos(&ind_table_init[j], k);
440 struct rte_eth_rss_conf *priv_rss_conf =
441 (*priv->rss_conf)[type];
442 struct ibv_exp_rx_hash_conf hash_conf = {
443 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
444 .rx_hash_key_len = (priv_rss_conf ?
445 priv_rss_conf->rss_key_len :
446 rss_hash_default_key_len),
447 .rx_hash_key = (priv_rss_conf ?
448 priv_rss_conf->rss_key :
449 rss_hash_default_key),
450 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
451 .rwq_ind_tbl = (*ind_tables)[j],
452 };
453 struct ibv_exp_qp_init_attr qp_init_attr = {
454 .max_inl_recv = 0, /* Currently not supported. */
455 .qp_type = IBV_QPT_RAW_PACKET,
456 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
457 IBV_EXP_QP_INIT_ATTR_RX_HASH),
458 .pd = priv->pd,
459 .rx_hash_conf = &hash_conf,
460 .port_num = priv->port,
461 };
462
463 DEBUG("using indirection table %u for hash RX queue %u type %d",
464 j, i, type);
465 *hash_rxq = (struct hash_rxq){
466 .priv = priv,
467 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
468 .type = type,
469 };
470 if (hash_rxq->qp == NULL) {
471 err = (errno ? errno : EINVAL);
472 ERROR("Hash RX QP creation failure: %s",
473 strerror(err));
474 goto error;
475 }
476 if (++k < ind_table_init[j].hash_types_n)
477 continue;
478 /* Switch to the next indirection table and reset hash RX
479 * queue type array index. */
480 ++j;
481 k = 0;
482 }
483 priv->ind_tables = ind_tables;
484 priv->ind_tables_n = ind_tables_n;
485 priv->hash_rxqs = hash_rxqs;
486 priv->hash_rxqs_n = hash_rxqs_n;
487 assert(err == 0);
488 return 0;
489 error:
490 if (hash_rxqs != NULL) {
491 for (i = 0; (i != hash_rxqs_n); ++i) {
492 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
493
494 if (qp == NULL)
495 continue;
496 claim_zero(ibv_destroy_qp(qp));
497 }
498 rte_free(hash_rxqs);
499 }
500 if (ind_tables != NULL) {
501 for (j = 0; (j != ind_tables_n); ++j) {
502 struct ibv_exp_rwq_ind_table *ind_table =
503 (*ind_tables)[j];
504
505 if (ind_table == NULL)
506 continue;
507 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
508 }
509 rte_free(ind_tables);
510 }
511 return err;
512 }
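/*
 * Usage sketch (illustrative): this function is expected to be called while
 * starting the port, once the RX queues are configured, with its non-zero
 * return value treated as an errno:
 *
 *   err = priv_create_hash_rxqs(priv);
 *   if (err) {
 *           ERROR("%p: cannot create hash RX queues: %s",
 *                 (void *)priv, strerror(err));
 *           return err;
 *   }
 */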
513
514 /**
515 * Clean up hash RX queues and indirection table.
516 *
517 * @param priv
518 * Pointer to private structure.
519 */
520 void
521 priv_destroy_hash_rxqs(struct priv *priv)
522 {
523 unsigned int i;
524
525 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
526 if (priv->hash_rxqs_n == 0) {
527 assert(priv->hash_rxqs == NULL);
528 assert(priv->ind_tables == NULL);
529 return;
530 }
531 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
532 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
533 unsigned int j, k;
534
535 assert(hash_rxq->priv == priv);
536 assert(hash_rxq->qp != NULL);
537 /* Also check that there are no remaining flows. */
538 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
539 for (k = 0;
540 (k != RTE_DIM(hash_rxq->special_flow[j]));
541 ++k)
542 assert(hash_rxq->special_flow[j][k] == NULL);
543 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
544 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
545 assert(hash_rxq->mac_flow[j][k] == NULL);
546 claim_zero(ibv_destroy_qp(hash_rxq->qp));
547 }
548 priv->hash_rxqs_n = 0;
549 rte_free(priv->hash_rxqs);
550 priv->hash_rxqs = NULL;
551 for (i = 0; (i != priv->ind_tables_n); ++i) {
552 struct ibv_exp_rwq_ind_table *ind_table =
553 (*priv->ind_tables)[i];
554
555 assert(ind_table != NULL);
556 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
557 }
558 priv->ind_tables_n = 0;
559 rte_free(priv->ind_tables);
560 priv->ind_tables = NULL;
561 }
562
563 /**
564 * Check whether a given flow type is allowed.
565 *
566 * @param priv
567 * Pointer to private structure.
568 * @param type
569 * Flow type to check.
570 *
571 * @return
572 * Nonzero if the given flow type is allowed.
573 */
574 int
575 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
576 {
577 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
578 * has been requested. */
579 if (priv->promisc_req)
580 return type == HASH_RXQ_FLOW_TYPE_PROMISC;
581 switch (type) {
582 case HASH_RXQ_FLOW_TYPE_PROMISC:
583 return !!priv->promisc_req;
584 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
585 return !!priv->allmulti_req;
586 case HASH_RXQ_FLOW_TYPE_BROADCAST:
587 case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
588 /* If allmulti is enabled, broadcast and ipv6multi
589 * are unnecessary. */
590 return !priv->allmulti_req;
591 case HASH_RXQ_FLOW_TYPE_MAC:
592 return 1;
593 default:
594 /* Unsupported flow type is not allowed. */
595 return 0;
596 }
597 return 0;
598 }
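/*
 * Example (illustrative): with promisc_req set, every type except
 * HASH_RXQ_FLOW_TYPE_PROMISC is reported as disallowed; with allmulti_req
 * set (and promiscuous mode off), HASH_RXQ_FLOW_TYPE_ALLMULTI and
 * HASH_RXQ_FLOW_TYPE_MAC are allowed while the broadcast and IPv6 multicast
 * flows are considered unnecessary.
 */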
599
600 /**
601 * Automatically enable/disable flows according to configuration.
602 *
603 * @param priv
604 * Private structure.
605 *
606 * @return
607 * 0 on success, errno value on failure.
608 */
609 int
610 priv_rehash_flows(struct priv *priv)
611 {
612 enum hash_rxq_flow_type i;
613
614 for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
615 i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
616 ++i)
617 if (!priv_allow_flow_type(priv, i)) {
618 priv_special_flow_disable(priv, i);
619 } else {
620 int ret = priv_special_flow_enable(priv, i);
621
622 if (ret)
623 return ret;
624 }
625 if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
626 return priv_mac_addrs_enable(priv);
627 priv_mac_addrs_disable(priv);
628 return 0;
629 }
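/*
 * Usage sketch (illustrative): callers toggling an RX mode are expected to
 * update the related request flag first and then let this function
 * reconcile the special flows:
 *
 *   priv->promisc_req = 1;
 *   err = priv_rehash_flows(priv);
 *   if (err)
 *           ERROR("cannot enable promiscuous mode: %s", strerror(err));
 */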
630
631 /**
632 * Allocate RX queue elements.
633 *
634 * @param rxq_ctrl
635 * Pointer to RX queue structure.
636 * @param elts_n
637 * Number of elements to allocate.
638 * @param[in] pool
639 * If not NULL, fetch buffers from this array instead of allocating them
640 * with rte_pktmbuf_alloc().
641 *
642 * @return
643 * 0 on success, errno value on failure.
644 */
645 static int
646 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
647 struct rte_mbuf *(*pool)[])
648 {
649 const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
650 unsigned int i;
651 int ret = 0;
652
653 /* Iterate on segments. */
654 for (i = 0; (i != elts_n); ++i) {
655 struct rte_mbuf *buf;
656 volatile struct mlx5_wqe_data_seg *scat =
657 &(*rxq_ctrl->rxq.wqes)[i];
658
659 if (pool != NULL) {
660 buf = (*pool)[i];
661 assert(buf != NULL);
662 rte_pktmbuf_reset(buf);
663 rte_pktmbuf_refcnt_update(buf, 1);
664 } else
665 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
666 if (buf == NULL) {
667 assert(pool == NULL);
668 ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
669 ret = ENOMEM;
670 goto error;
671 }
672 /* Headroom is reserved by rte_pktmbuf_alloc(). */
673 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
674 /* Buffer is supposed to be empty. */
675 assert(rte_pktmbuf_data_len(buf) == 0);
676 assert(rte_pktmbuf_pkt_len(buf) == 0);
677 assert(!buf->next);
678 /* Only the first segment keeps headroom. */
679 if (i % sges_n)
680 SET_DATA_OFF(buf, 0);
681 PORT(buf) = rxq_ctrl->rxq.port_id;
682 DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
683 PKT_LEN(buf) = DATA_LEN(buf);
684 NB_SEGS(buf) = 1;
685 /* scat->addr must be able to store a pointer. */
686 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
687 *scat = (struct mlx5_wqe_data_seg){
688 .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
689 .byte_count = htonl(DATA_LEN(buf)),
690 .lkey = htonl(rxq_ctrl->mr->lkey),
691 };
692 (*rxq_ctrl->rxq.elts)[i] = buf;
693 }
694 DEBUG("%p: allocated and configured %u segments (max %u packets)",
695 (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
696 assert(ret == 0);
697 return 0;
698 error:
699 assert(pool == NULL);
700 elts_n = i;
701 for (i = 0; (i != elts_n); ++i) {
702 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
703 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
704 (*rxq_ctrl->rxq.elts)[i] = NULL;
705 }
706 DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
707 assert(ret > 0);
708 return ret;
709 }
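/*
 * Layout example (illustrative): with rxq.sges_n == 2 each packet spans
 * four consecutive elements, hence elts_n must be a multiple of 4 (this is
 * enforced in rxq_ctrl_setup()); only the first element of each group keeps
 * RTE_PKTMBUF_HEADROOM, the others have their data offset reset to 0 so the
 * whole data room is advertised in byte_count.
 */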
710
711 /**
712 * Free RX queue elements.
713 *
714 * @param rxq_ctrl
715 * Pointer to RX queue structure.
716 */
717 static void
718 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
719 {
720 unsigned int i;
721
722 DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
723 if (rxq_ctrl->rxq.elts == NULL)
724 return;
725
726 for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
727 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
728 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
729 (*rxq_ctrl->rxq.elts)[i] = NULL;
730 }
731 }
732
733 /**
734  * Clean up an RX queue.
735 *
736 * Destroy objects, free allocated memory and reset the structure for reuse.
737 *
738 * @param rxq_ctrl
739 * Pointer to RX queue structure.
740 */
741 void
742 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
743 {
744 struct ibv_exp_release_intf_params params;
745
746 DEBUG("cleaning up %p", (void *)rxq_ctrl);
747 rxq_free_elts(rxq_ctrl);
748 if (rxq_ctrl->fdir_queue != NULL)
749 priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
750 if (rxq_ctrl->if_wq != NULL) {
751 assert(rxq_ctrl->priv != NULL);
752 assert(rxq_ctrl->priv->ctx != NULL);
753 assert(rxq_ctrl->wq != NULL);
754 params = (struct ibv_exp_release_intf_params){
755 .comp_mask = 0,
756 };
757 claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
758 rxq_ctrl->if_wq,
759 &params));
760 }
761 if (rxq_ctrl->if_cq != NULL) {
762 assert(rxq_ctrl->priv != NULL);
763 assert(rxq_ctrl->priv->ctx != NULL);
764 assert(rxq_ctrl->cq != NULL);
765 params = (struct ibv_exp_release_intf_params){
766 .comp_mask = 0,
767 };
768 claim_zero(ibv_exp_release_intf(rxq_ctrl->priv->ctx,
769 rxq_ctrl->if_cq,
770 &params));
771 }
772 if (rxq_ctrl->wq != NULL)
773 claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
774 if (rxq_ctrl->cq != NULL)
775 claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
776 if (rxq_ctrl->rd != NULL) {
777 struct ibv_exp_destroy_res_domain_attr attr = {
778 .comp_mask = 0,
779 };
780
781 assert(rxq_ctrl->priv != NULL);
782 assert(rxq_ctrl->priv->ctx != NULL);
783 claim_zero(ibv_exp_destroy_res_domain(rxq_ctrl->priv->ctx,
784 rxq_ctrl->rd,
785 &attr));
786 }
787 if (rxq_ctrl->mr != NULL)
788 claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
789 memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
790 }
791
792 /**
793 * Reconfigure RX queue buffers.
794 *
795  * rxq_rehash() does not allocate mbufs, since allocating them outside the
796  * right thread (such as a control thread) may corrupt the pool.
797 * In case of failure, the queue is left untouched.
798 *
799 * @param dev
800 * Pointer to Ethernet device structure.
801 * @param rxq_ctrl
802 * RX queue pointer.
803 *
804 * @return
805 * 0 on success, errno value on failure.
806 */
807 int
808 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
809 {
810 unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
811 unsigned int i;
812 struct ibv_exp_wq_attr mod;
813 int err;
814
815 DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
816 (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
817 assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
818 /* From now on, any failure will render the queue unusable.
819 * Reinitialize WQ. */
820 mod = (struct ibv_exp_wq_attr){
821 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
822 .wq_state = IBV_EXP_WQS_RESET,
823 };
824 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
825 if (err) {
826 ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
827 assert(err > 0);
828 return err;
829 }
830 /* Snatch mbufs from original queue. */
831 claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
832 for (i = 0; i != elts_n; ++i) {
833 struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
834
835 assert(rte_mbuf_refcnt_read(buf) == 2);
836 rte_pktmbuf_free_seg(buf);
837 }
838 /* Change queue state to ready. */
839 mod = (struct ibv_exp_wq_attr){
840 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
841 .wq_state = IBV_EXP_WQS_RDY,
842 };
843 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
844 if (err) {
845 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
846 (void *)dev, strerror(err));
847 goto error;
848 }
849 /* Update doorbell counter. */
850 rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
851 rte_wmb();
852 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
853 error:
854 assert(err >= 0);
855 return err;
856 }
857
858 /**
859 * Initialize RX queue.
860 *
861 * @param tmpl
862 * Pointer to RX queue control template.
863 *
864 * @return
865 * 0 on success, errno value on failure.
866 */
867 static inline int
868 rxq_setup(struct rxq_ctrl *tmpl)
869 {
870 struct ibv_cq *ibcq = tmpl->cq;
871 struct mlx5_cq *cq = to_mxxx(cq, cq);
872 struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
873 struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
874 rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
875
876 if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
877 ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
878 "it should be set to %u", RTE_CACHE_LINE_SIZE);
879 return EINVAL;
880 }
881 if (elts == NULL)
882 return ENOMEM;
883 tmpl->rxq.rq_db = rwq->rq.db;
884 tmpl->rxq.cqe_n = log2above(ibcq->cqe);
885 tmpl->rxq.cq_ci = 0;
886 tmpl->rxq.rq_ci = 0;
887 tmpl->rxq.cq_db = cq->dbrec;
888 tmpl->rxq.wqes =
889 (volatile struct mlx5_wqe_data_seg (*)[])
890 (uintptr_t)rwq->rq.buff;
891 tmpl->rxq.cqes =
892 (volatile struct mlx5_cqe (*)[])
893 (uintptr_t)cq->active_buf->buf;
894 tmpl->rxq.elts = elts;
895 return 0;
896 }
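/*
 * Note (illustrative): on the usual 64-byte cache line targets the check
 * above requires 64-byte CQEs, which can be requested from libmlx5 through
 * the environment variable named in the error message before starting the
 * application, e.g.:
 *
 *   export MLX5_CQE_SIZE=64
 */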
897
898 /**
899  * Configure an RX queue.
900 *
901 * @param dev
902 * Pointer to Ethernet device structure.
903 * @param rxq_ctrl
904 * Pointer to RX queue structure.
905 * @param desc
906 * Number of descriptors to configure in queue.
907 * @param socket
908 * NUMA socket on which memory must be allocated.
909 * @param[in] conf
910 * Thresholds parameters.
911 * @param mp
912 * Memory pool for buffer allocations.
913 *
914 * @return
915 * 0 on success, errno value on failure.
916 */
917 int
918 rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
919 uint16_t desc, unsigned int socket,
920 const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
921 {
922 struct priv *priv = dev->data->dev_private;
923 struct rxq_ctrl tmpl = {
924 .priv = priv,
925 .socket = socket,
926 .rxq = {
927 .elts_n = log2above(desc),
928 .mp = mp,
929 .rss_hash = priv->rxqs_n > 1,
930 },
931 };
932 struct ibv_exp_wq_attr mod;
933 union {
934 struct ibv_exp_query_intf_params params;
935 struct ibv_exp_cq_init_attr cq;
936 struct ibv_exp_res_domain_init_attr rd;
937 struct ibv_exp_wq_init_attr wq;
938 struct ibv_exp_cq_attr cq_attr;
939 } attr;
940 enum ibv_exp_query_intf_status status;
941 unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
942 unsigned int cqe_n = desc - 1;
943 struct rte_mbuf *(*elts)[desc] = NULL;
944 int ret = 0;
945
946 (void)conf; /* Thresholds configuration (ignored). */
947 /* Enable scattered packets support for this queue if necessary. */
948 assert(mb_len >= RTE_PKTMBUF_HEADROOM);
949 /* If smaller than MRU, multi-segment support must be enabled. */
950 if (mb_len < (priv->mtu > dev->data->dev_conf.rxmode.max_rx_pkt_len ?
951 dev->data->dev_conf.rxmode.max_rx_pkt_len :
952 priv->mtu))
953 dev->data->dev_conf.rxmode.jumbo_frame = 1;
954 if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
955 (dev->data->dev_conf.rxmode.max_rx_pkt_len >
956 (mb_len - RTE_PKTMBUF_HEADROOM))) {
957 unsigned int size =
958 RTE_PKTMBUF_HEADROOM +
959 dev->data->dev_conf.rxmode.max_rx_pkt_len;
960 unsigned int sges_n;
961
962 /*
963 * Determine the number of SGEs needed for a full packet
964 * and round it to the next power of two.
965 */
966 sges_n = log2above((size / mb_len) + !!(size % mb_len));
967 tmpl.rxq.sges_n = sges_n;
968 /* Make sure rxq.sges_n did not overflow. */
969 size = mb_len * (1 << tmpl.rxq.sges_n);
970 size -= RTE_PKTMBUF_HEADROOM;
971 if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
972 ERROR("%p: too many SGEs (%u) needed to handle"
973 " requested maximum packet size %u",
974 (void *)dev,
975 1 << sges_n,
976 dev->data->dev_conf.rxmode.max_rx_pkt_len);
977 return EOVERFLOW;
978 }
979 }
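	/*
	 * Worked example (illustrative, assuming the default 128-byte
	 * RTE_PKTMBUF_HEADROOM and a 2048-byte mbuf data room): for
	 * max_rx_pkt_len == 9000, size == 9128, so sges_n ==
	 * log2above(4 + 1) == 3, i.e. 8 SGEs per packet, and
	 * 2048 * 8 - 128 == 16256 bytes of room, which passes the
	 * check above.
	 */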
980 DEBUG("%p: maximum number of segments per packet: %u",
981 (void *)dev, 1 << tmpl.rxq.sges_n);
982 if (desc % (1 << tmpl.rxq.sges_n)) {
983 ERROR("%p: number of RX queue descriptors (%u) is not a"
984 " multiple of SGEs per packet (%u)",
985 (void *)dev,
986 desc,
987 1 << tmpl.rxq.sges_n);
988 return EINVAL;
989 }
990 /* Toggle RX checksum offload if hardware supports it. */
991 if (priv->hw_csum)
992 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
993 if (priv->hw_csum_l2tun)
994 tmpl.rxq.csum_l2tun =
995 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
996 /* Use the entire RX mempool as the memory region. */
997 tmpl.mr = mlx5_mp2mr(priv->pd, mp);
998 if (tmpl.mr == NULL) {
999 ret = EINVAL;
1000 ERROR("%p: MR creation failure: %s",
1001 (void *)dev, strerror(ret));
1002 goto error;
1003 }
1004 attr.rd = (struct ibv_exp_res_domain_init_attr){
1005 .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
1006 IBV_EXP_RES_DOMAIN_MSG_MODEL),
1007 .thread_model = IBV_EXP_THREAD_SINGLE,
1008 .msg_model = IBV_EXP_MSG_HIGH_BW,
1009 };
1010 tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
1011 if (tmpl.rd == NULL) {
1012 ret = ENOMEM;
1013 ERROR("%p: RD creation failure: %s",
1014 (void *)dev, strerror(ret));
1015 goto error;
1016 }
1017 attr.cq = (struct ibv_exp_cq_init_attr){
1018 .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
1019 .res_domain = tmpl.rd,
1020 };
1021 if (priv->cqe_comp) {
1022 attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
1023 attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
1024 cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
1025 }
1026 tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, NULL, 0,
1027 &attr.cq);
1028 if (tmpl.cq == NULL) {
1029 ret = ENOMEM;
1030 ERROR("%p: CQ creation failure: %s",
1031 (void *)dev, strerror(ret));
1032 goto error;
1033 }
1034 DEBUG("priv->device_attr.max_qp_wr is %d",
1035 priv->device_attr.max_qp_wr);
1036 DEBUG("priv->device_attr.max_sge is %d",
1037 priv->device_attr.max_sge);
1038 /* Configure VLAN stripping. */
1039 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1040 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1041 attr.wq = (struct ibv_exp_wq_init_attr){
1042 .wq_context = NULL, /* Could be useful in the future. */
1043 .wq_type = IBV_EXP_WQT_RQ,
1044 /* Max number of outstanding WRs. */
1045 .max_recv_wr = desc >> tmpl.rxq.sges_n,
1046 /* Max number of scatter/gather elements in a WR. */
1047 .max_recv_sge = 1 << tmpl.rxq.sges_n,
1048 .pd = priv->pd,
1049 .cq = tmpl.cq,
1050 .comp_mask =
1051 IBV_EXP_CREATE_WQ_RES_DOMAIN |
1052 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1053 0,
1054 .res_domain = tmpl.rd,
1055 .vlan_offloads = (tmpl.rxq.vlan_strip ?
1056 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1057 0),
1058 };
1059 /* By default, FCS (CRC) is stripped by hardware. */
1060 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1061 tmpl.rxq.crc_present = 0;
1062 } else if (priv->hw_fcs_strip) {
1063 /* Ask HW/Verbs to leave CRC in place when supported. */
1064 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1065 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1066 tmpl.rxq.crc_present = 1;
1067 } else {
1068 WARN("%p: CRC stripping has been disabled but will still"
1069 " be performed by hardware, make sure MLNX_OFED and"
1070 " firmware are up to date",
1071 (void *)dev);
1072 tmpl.rxq.crc_present = 0;
1073 }
1074 DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1075 " incoming frames to hide it",
1076 (void *)dev,
1077 tmpl.rxq.crc_present ? "disabled" : "enabled",
1078 tmpl.rxq.crc_present << 2);
1079 if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1080 ; /* Nothing else to do. */
1081 else if (priv->hw_padding) {
1082 INFO("%p: enabling packet padding on queue %p",
1083 (void *)dev, (void *)rxq_ctrl);
1084 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1085 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1086 } else
1087 WARN("%p: packet padding has been requested but is not"
1088 " supported, make sure MLNX_OFED and firmware are"
1089 " up to date",
1090 (void *)dev);
1091
1092 tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1093 if (tmpl.wq == NULL) {
1094 ret = (errno ? errno : EINVAL);
1095 ERROR("%p: WQ creation failure: %s",
1096 (void *)dev, strerror(ret));
1097 goto error;
1098 }
1099 /*
1100 	 * Make sure the number of WRs*SGEs matches expectations since a queue
1101 * cannot allocate more than "desc" buffers.
1102 */
1103 if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1104 ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1105 ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1106 (void *)dev,
1107 (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1108 attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1109 ret = EINVAL;
1110 goto error;
1111 }
1112 /* Save port ID. */
1113 tmpl.rxq.port_id = dev->data->port_id;
1114 DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1115 attr.params = (struct ibv_exp_query_intf_params){
1116 .intf_scope = IBV_EXP_INTF_GLOBAL,
1117 .intf_version = 1,
1118 .intf = IBV_EXP_INTF_CQ,
1119 .obj = tmpl.cq,
1120 };
1121 tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1122 if (tmpl.if_cq == NULL) {
1123 ERROR("%p: CQ interface family query failed with status %d",
1124 (void *)dev, status);
1125 goto error;
1126 }
1127 attr.params = (struct ibv_exp_query_intf_params){
1128 .intf_scope = IBV_EXP_INTF_GLOBAL,
1129 .intf = IBV_EXP_INTF_WQ,
1130 .obj = tmpl.wq,
1131 };
1132 tmpl.if_wq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
1133 if (tmpl.if_wq == NULL) {
1134 ERROR("%p: WQ interface family query failed with status %d",
1135 (void *)dev, status);
1136 goto error;
1137 }
1138 /* Change queue state to ready. */
1139 mod = (struct ibv_exp_wq_attr){
1140 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1141 .wq_state = IBV_EXP_WQS_RDY,
1142 };
1143 ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1144 if (ret) {
1145 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1146 (void *)dev, strerror(ret));
1147 goto error;
1148 }
1149 ret = rxq_setup(&tmpl);
1150 if (ret) {
1151 ERROR("%p: cannot initialize RX queue structure: %s",
1152 (void *)dev, strerror(ret));
1153 goto error;
1154 }
1155 /* Reuse buffers from original queue if possible. */
1156 if (rxq_ctrl->rxq.elts_n) {
1157 assert(1 << rxq_ctrl->rxq.elts_n == desc);
1158 assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
1159 ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
1160 } else
1161 ret = rxq_alloc_elts(&tmpl, desc, NULL);
1162 if (ret) {
1163 ERROR("%p: RXQ allocation failed: %s",
1164 (void *)dev, strerror(ret));
1165 goto error;
1166 }
1167 /* Clean up rxq in case we're reinitializing it. */
1168 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1169 rxq_cleanup(rxq_ctrl);
1170 /* Move mbuf pointers to dedicated storage area in RX queue. */
1171 elts = (void *)(rxq_ctrl + 1);
1172 rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
1173 #ifndef NDEBUG
1174 memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1175 #endif
1176 rte_free(tmpl.rxq.elts);
1177 tmpl.rxq.elts = elts;
1178 *rxq_ctrl = tmpl;
1179 /* Update doorbell counter. */
1180 rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1181 rte_wmb();
1182 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
1183 DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1184 assert(ret == 0);
1185 return 0;
1186 error:
1187 elts = tmpl.rxq.elts;
1188 rxq_cleanup(&tmpl);
1189 rte_free(elts);
1190 assert(ret > 0);
1191 return ret;
1192 }
1193
1194 /**
1195  * DPDK callback to configure an RX queue.
1196 *
1197 * @param dev
1198 * Pointer to Ethernet device structure.
1199 * @param idx
1200 * RX queue index.
1201 * @param desc
1202 * Number of descriptors to configure in queue.
1203 * @param socket
1204 * NUMA socket on which memory must be allocated.
1205 * @param[in] conf
1206 * Thresholds parameters.
1207 * @param mp
1208 * Memory pool for buffer allocations.
1209 *
1210 * @return
1211 * 0 on success, negative errno value on failure.
1212 */
1213 int
1214 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1215 unsigned int socket, const struct rte_eth_rxconf *conf,
1216 struct rte_mempool *mp)
1217 {
1218 struct priv *priv = dev->data->dev_private;
1219 struct rxq *rxq = (*priv->rxqs)[idx];
1220 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1221 int ret;
1222
1223 if (mlx5_is_secondary())
1224 return -E_RTE_SECONDARY;
1225
1226 priv_lock(priv);
1227 if (!rte_is_power_of_2(desc)) {
1228 desc = 1 << log2above(desc);
1229 WARN("%p: increased number of descriptors in RX queue %u"
1230 " to the next power of two (%d)",
1231 (void *)dev, idx, desc);
1232 }
1233 DEBUG("%p: configuring queue %u for %u descriptors",
1234 (void *)dev, idx, desc);
1235 if (idx >= priv->rxqs_n) {
1236 ERROR("%p: queue index out of range (%u >= %u)",
1237 (void *)dev, idx, priv->rxqs_n);
1238 priv_unlock(priv);
1239 return -EOVERFLOW;
1240 }
1241 if (rxq != NULL) {
1242 DEBUG("%p: reusing already allocated queue index %u (%p)",
1243 (void *)dev, idx, (void *)rxq);
1244 if (priv->started) {
1245 priv_unlock(priv);
1246 return -EEXIST;
1247 }
1248 (*priv->rxqs)[idx] = NULL;
1249 rxq_cleanup(rxq_ctrl);
1250 } else {
1251 rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1252 desc * sizeof(struct rte_mbuf *),
1253 0, socket);
1254 if (rxq_ctrl == NULL) {
1255 ERROR("%p: unable to allocate queue index %u",
1256 (void *)dev, idx);
1257 priv_unlock(priv);
1258 return -ENOMEM;
1259 }
1260 }
1261 ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1262 if (ret)
1263 rte_free(rxq_ctrl);
1264 else {
1265 rxq_ctrl->rxq.stats.idx = idx;
1266 DEBUG("%p: adding RX queue %p to list",
1267 (void *)dev, (void *)rxq_ctrl);
1268 (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1269 /* Update receive callback. */
1270 priv_select_rx_function(priv);
1271 }
1272 priv_unlock(priv);
1273 return -ret;
1274 }
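/*
 * Usage sketch (illustrative, names are arbitrary): applications reach this
 * callback through the generic ethdev API, e.g.:
 *
 *   struct rte_mempool *mp = rte_pktmbuf_pool_create("rxq_pool", 8192, 256,
 *                                                    0, 2048 +
 *                                                    RTE_PKTMBUF_HEADROOM,
 *                                                    rte_socket_id());
 *
 *   if (rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(), NULL, mp))
 *           rte_exit(EXIT_FAILURE, "cannot set up RX queue 0\n");
 */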
1275
1276 /**
1277  * DPDK callback to release an RX queue.
1278 *
1279 * @param dpdk_rxq
1280 * Generic RX queue pointer.
1281 */
1282 void
1283 mlx5_rx_queue_release(void *dpdk_rxq)
1284 {
1285 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1286 struct rxq_ctrl *rxq_ctrl;
1287 struct priv *priv;
1288 unsigned int i;
1289
1290 if (mlx5_is_secondary())
1291 return;
1292
1293 if (rxq == NULL)
1294 return;
1295 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1296 priv = rxq_ctrl->priv;
1297 priv_lock(priv);
1298 for (i = 0; (i != priv->rxqs_n); ++i)
1299 if ((*priv->rxqs)[i] == rxq) {
1300 DEBUG("%p: removing RX queue %p from list",
1301 (void *)priv->dev, (void *)rxq_ctrl);
1302 (*priv->rxqs)[i] = NULL;
1303 break;
1304 }
1305 rxq_cleanup(rxq_ctrl);
1306 rte_free(rxq_ctrl);
1307 priv_unlock(priv);
1308 }
1309
1310 /**
1311 * DPDK callback for RX in secondary processes.
1312 *
1313 * This function configures all queues from primary process information
1314 * if necessary before reverting to the normal RX burst callback.
1315 *
1316 * @param dpdk_rxq
1317 * Generic pointer to RX queue structure.
1318 * @param[out] pkts
1319 * Array to store received packets.
1320 * @param pkts_n
1321 * Maximum number of packets in array.
1322 *
1323 * @return
1324 * Number of packets successfully received (<= pkts_n).
1325 */
1326 uint16_t
1327 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1328 uint16_t pkts_n)
1329 {
1330 struct rxq *rxq = dpdk_rxq;
1331 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1332 struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
1333 struct priv *primary_priv;
1334 unsigned int index;
1335
1336 if (priv == NULL)
1337 return 0;
1338 primary_priv =
1339 mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1340 /* Look for queue index in both private structures. */
1341 for (index = 0; index != priv->rxqs_n; ++index)
1342 if (((*primary_priv->rxqs)[index] == rxq) ||
1343 ((*priv->rxqs)[index] == rxq))
1344 break;
1345 if (index == priv->rxqs_n)
1346 return 0;
1347 rxq = (*priv->rxqs)[index];
1348 return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
1349 }