1 /*-
2 * BSD LICENSE
3 *
4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <stddef.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <stdint.h>
39 #include <fcntl.h>
40
41 /* Verbs header. */
42 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
43 #ifdef PEDANTIC
44 #pragma GCC diagnostic ignored "-Wpedantic"
45 #endif
46 #include <infiniband/verbs.h>
47 #include <infiniband/arch.h>
48 #include <infiniband/mlx5_hw.h>
49 #ifdef PEDANTIC
50 #pragma GCC diagnostic error "-Wpedantic"
51 #endif
52
53 /* DPDK headers don't like -pedantic. */
54 #ifdef PEDANTIC
55 #pragma GCC diagnostic ignored "-Wpedantic"
56 #endif
57 #include <rte_mbuf.h>
58 #include <rte_malloc.h>
59 #include <rte_ethdev.h>
60 #include <rte_common.h>
61 #include <rte_interrupts.h>
62 #include <rte_debug.h>
63 #ifdef PEDANTIC
64 #pragma GCC diagnostic error "-Wpedantic"
65 #endif
66
67 #include "mlx5.h"
68 #include "mlx5_rxtx.h"
69 #include "mlx5_utils.h"
70 #include "mlx5_autoconf.h"
71 #include "mlx5_defs.h"
72
73 /* Initialization data for hash RX queues. */
74 const struct hash_rxq_init hash_rxq_init[] = {
75 [HASH_RXQ_TCPV4] = {
76 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
77 IBV_EXP_RX_HASH_DST_IPV4 |
78 IBV_EXP_RX_HASH_SRC_PORT_TCP |
79 IBV_EXP_RX_HASH_DST_PORT_TCP),
80 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
81 .flow_priority = 0,
82 .flow_spec.tcp_udp = {
83 .type = IBV_EXP_FLOW_SPEC_TCP,
84 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
85 },
86 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
87 },
88 [HASH_RXQ_UDPV4] = {
89 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
90 IBV_EXP_RX_HASH_DST_IPV4 |
91 IBV_EXP_RX_HASH_SRC_PORT_UDP |
92 IBV_EXP_RX_HASH_DST_PORT_UDP),
93 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
94 .flow_priority = 0,
95 .flow_spec.tcp_udp = {
96 .type = IBV_EXP_FLOW_SPEC_UDP,
97 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
98 },
99 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
100 },
101 [HASH_RXQ_IPV4] = {
102 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV4 |
103 IBV_EXP_RX_HASH_DST_IPV4),
104 .dpdk_rss_hf = (ETH_RSS_IPV4 |
105 ETH_RSS_FRAG_IPV4),
106 .flow_priority = 1,
107 .flow_spec.ipv4 = {
108 .type = IBV_EXP_FLOW_SPEC_IPV4,
109 .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
110 },
111 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
112 },
113 [HASH_RXQ_TCPV6] = {
114 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
115 IBV_EXP_RX_HASH_DST_IPV6 |
116 IBV_EXP_RX_HASH_SRC_PORT_TCP |
117 IBV_EXP_RX_HASH_DST_PORT_TCP),
118 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
119 .flow_priority = 0,
120 .flow_spec.tcp_udp = {
121 .type = IBV_EXP_FLOW_SPEC_TCP,
122 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
123 },
124 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
125 },
126 [HASH_RXQ_UDPV6] = {
127 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
128 IBV_EXP_RX_HASH_DST_IPV6 |
129 IBV_EXP_RX_HASH_SRC_PORT_UDP |
130 IBV_EXP_RX_HASH_DST_PORT_UDP),
131 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
132 .flow_priority = 0,
133 .flow_spec.tcp_udp = {
134 .type = IBV_EXP_FLOW_SPEC_UDP,
135 .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
136 },
137 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
138 },
139 [HASH_RXQ_IPV6] = {
140 .hash_fields = (IBV_EXP_RX_HASH_SRC_IPV6 |
141 IBV_EXP_RX_HASH_DST_IPV6),
142 .dpdk_rss_hf = (ETH_RSS_IPV6 |
143 ETH_RSS_FRAG_IPV6),
144 .flow_priority = 1,
145 .flow_spec.ipv6 = {
146 .type = IBV_EXP_FLOW_SPEC_IPV6,
147 .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
148 },
149 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
150 },
151 [HASH_RXQ_ETH] = {
152 .hash_fields = 0,
153 .dpdk_rss_hf = 0,
154 .flow_priority = 2,
155 .flow_spec.eth = {
156 .type = IBV_EXP_FLOW_SPEC_ETH,
157 .size = sizeof(hash_rxq_init[0].flow_spec.eth),
158 },
159 .underlayer = NULL,
160 },
161 };
162
163 /* Number of entries in hash_rxq_init[]. */
164 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
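/*
 * Note (illustrative, not upstream text): each entry above chains to its
 * "underlayer" so that a complete flow specification stack can be built,
 * e.g. HASH_RXQ_TCPV4 -> HASH_RXQ_IPV4 -> HASH_RXQ_ETH. priv_flow_attr()
 * below walks this chain and lays the specifications out after the flow
 * attribute in outermost-first order (ETH, IPv4, TCP), using the flow
 * priority of the requested entry.
 */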
165
166 /* Initialization data for hash RX queue indirection tables. */
167 static const struct ind_table_init ind_table_init[] = {
168 {
169 .max_size = -1u, /* Superseded by HW limitations. */
170 .hash_types =
171 1 << HASH_RXQ_TCPV4 |
172 1 << HASH_RXQ_UDPV4 |
173 1 << HASH_RXQ_IPV4 |
174 1 << HASH_RXQ_TCPV6 |
175 1 << HASH_RXQ_UDPV6 |
176 1 << HASH_RXQ_IPV6 |
177 0,
178 .hash_types_n = 6,
179 },
180 {
181 .max_size = 1,
182 .hash_types = 1 << HASH_RXQ_ETH,
183 .hash_types_n = 1,
184 },
185 };
186
187 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
188
189 /* Default RSS hash key also used for ConnectX-3. */
190 uint8_t rss_hash_default_key[] = {
191 0x2c, 0xc6, 0x81, 0xd1,
192 0x5b, 0xdb, 0xf4, 0xf7,
193 0xfc, 0xa2, 0x83, 0x19,
194 0xdb, 0x1a, 0x3e, 0x94,
195 0x6b, 0x9e, 0x38, 0xd9,
196 0x2c, 0x9c, 0x03, 0xd1,
197 0xad, 0x99, 0x44, 0xa7,
198 0xd9, 0x56, 0x3d, 0x59,
199 0x06, 0x3c, 0x25, 0xf3,
200 0xfc, 0x1f, 0xdc, 0x2a,
201 };
202
203 /* Length of the default RSS hash key. */
204 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
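/*
 * Illustrative sketch (not part of the driver): an application can
 * override this default key through the regular DPDK RSS configuration,
 * e.g. (identifiers other than the DPDK API are hypothetical):
 *
 *	static uint8_t app_rss_key[40] = { 0x2c, 0xc6, 0x81, 0xd1, ... };
 *	struct rte_eth_conf dev_conf = {
 *		.rxmode = { .mq_mode = ETH_MQ_RX_RSS },
 *		.rx_adv_conf.rss_conf = {
 *			.rss_key = app_rss_key,
 *			.rss_key_len = sizeof(app_rss_key),
 *			.rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP,
 *		},
 *	};
 *	rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &dev_conf);
 */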
205
206 /**
207 * Populate flow steering rule for a given hash RX queue type using
208 * information from hash_rxq_init[]. Nothing is written to flow_attr when
209 * flow_attr_size is not large enough, but the required size is still returned.
210 *
211 * @param priv
212 * Pointer to private structure.
213 * @param[out] flow_attr
214 * Pointer to flow attribute structure to fill. Note that the allocated
215 * area must be large enough to hold all flow specifications.
216 * @param flow_attr_size
217 * Entire size of flow_attr and trailing room for flow specifications.
218 * @param type
219 * Hash RX queue type to use for flow steering rule.
220 *
221 * @return
222 * Total size of the flow attribute buffer. No errors are defined.
223 */
224 size_t
225 priv_flow_attr(struct priv *priv, struct ibv_exp_flow_attr *flow_attr,
226 size_t flow_attr_size, enum hash_rxq_type type)
227 {
228 size_t offset = sizeof(*flow_attr);
229 const struct hash_rxq_init *init = &hash_rxq_init[type];
230
231 assert(priv != NULL);
232 assert((size_t)type < RTE_DIM(hash_rxq_init));
233 do {
234 offset += init->flow_spec.hdr.size;
235 init = init->underlayer;
236 } while (init != NULL);
237 if (offset > flow_attr_size)
238 return offset;
239 flow_attr_size = offset;
240 init = &hash_rxq_init[type];
241 *flow_attr = (struct ibv_exp_flow_attr){
242 .type = IBV_EXP_FLOW_ATTR_NORMAL,
243 /* Priorities < 3 are reserved for flow director. */
244 .priority = init->flow_priority + 3,
245 .num_of_specs = 0,
246 .port = priv->port,
247 .flags = 0,
248 };
249 do {
250 offset -= init->flow_spec.hdr.size;
251 memcpy((void *)((uintptr_t)flow_attr + offset),
252 &init->flow_spec,
253 init->flow_spec.hdr.size);
254 ++flow_attr->num_of_specs;
255 init = init->underlayer;
256 } while (init != NULL);
257 return flow_attr_size;
258 }
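/*
 * Illustrative caller sketch (hypothetical, not upstream code): since
 * nothing is written when the buffer is too small, the required size can
 * be queried first and the rule populated in a second call:
 *
 *	size_t size = priv_flow_attr(priv, NULL, 0, HASH_RXQ_TCPV4);
 *	struct ibv_exp_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *	if (attr != NULL)
 *		priv_flow_attr(priv, attr, size, HASH_RXQ_TCPV4);
 */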
259
260 /**
261 * Convert hash type position in indirection table initializer to
262 * hash RX queue type.
263 *
264 * @param table
265 * Indirection table initializer.
266 * @param pos
267 * Hash type position.
268 *
269 * @return
270 * Hash RX queue type.
271 */
272 static enum hash_rxq_type
273 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
274 {
275 enum hash_rxq_type type = HASH_RXQ_TCPV4;
276
277 assert(pos < table->hash_types_n);
278 do {
279 if ((table->hash_types & (1 << type)) && (pos-- == 0))
280 break;
281 ++type;
282 } while (1);
283 return type;
284 }
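/*
 * Worked example (illustrative, assuming enum hash_rxq_type follows the
 * order of the hash_rxq_init[] initializer): with the first
 * ind_table_init[] entry, whose hash_types covers TCPv4/UDPv4/IPv4/
 * TCPv6/UDPv6/IPv6, pos 0 resolves to HASH_RXQ_TCPV4, pos 2 to
 * HASH_RXQ_IPV4 and pos 5 to HASH_RXQ_IPV6; with the second
 * (Ethernet-only) entry, pos 0 resolves to HASH_RXQ_ETH.
 */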
285
286 /**
287 * Filter out disabled hash RX queue types from ind_table_init[].
288 *
289 * @param priv
290 * Pointer to private structure.
291 * @param[out] table
292 * Output table.
293 *
294 * @return
295 * Number of table entries.
296 */
297 static unsigned int
298 priv_make_ind_table_init(struct priv *priv,
299 struct ind_table_init (*table)[IND_TABLE_INIT_N])
300 {
301 uint64_t rss_hf;
302 unsigned int i;
303 unsigned int j;
304 unsigned int table_n = 0;
305 /* Mandatory to receive frames not handled by normal hash RX queues. */
306 unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
307
308 rss_hf = priv->rss_hf;
309 /* Process other protocols only if more than one queue. */
310 if (priv->rxqs_n > 1)
311 for (i = 0; (i != hash_rxq_init_n); ++i)
312 if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
313 hash_types_sup |= (1 << i);
314
315 /* Filter out entries whose protocols are not in the set. */
316 for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
317 unsigned int nb;
318 unsigned int h;
319
320 /* j is increased only if the table has valid protocols. */
321 assert(j <= i);
322 (*table)[j] = ind_table_init[i];
323 (*table)[j].hash_types &= hash_types_sup;
324 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
325 if (((*table)[j].hash_types >> h) & 0x1)
326 ++nb;
327 (*table)[j].hash_types_n = nb;
328 if (nb) {
329 ++table_n;
330 ++j;
331 }
332 }
333 return table_n;
334 }
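/*
 * Worked example (illustrative): with several RX queues and
 * rss_hf == ETH_RSS_IP, only the IPv4 and IPv6 entries of
 * hash_rxq_init[] match, so the first output table keeps two hash types,
 * the second (single-entry) table keeps the mandatory HASH_RXQ_ETH, and
 * the function returns 2.
 */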
335
336 /**
337 * Initialize hash RX queues and indirection table.
338 *
339 * @param priv
340 * Pointer to private structure.
341 *
342 * @return
343 * 0 on success, errno value on failure.
344 */
345 int
346 priv_create_hash_rxqs(struct priv *priv)
347 {
348 struct ibv_exp_wq *wqs[priv->reta_idx_n];
349 struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
350 unsigned int ind_tables_n =
351 priv_make_ind_table_init(priv, &ind_table_init);
352 unsigned int hash_rxqs_n = 0;
353 struct hash_rxq (*hash_rxqs)[] = NULL;
354 struct ibv_exp_rwq_ind_table *(*ind_tables)[] = NULL;
355 unsigned int i;
356 unsigned int j;
357 unsigned int k;
358 int err = 0;
359
360 assert(priv->ind_tables == NULL);
361 assert(priv->ind_tables_n == 0);
362 assert(priv->hash_rxqs == NULL);
363 assert(priv->hash_rxqs_n == 0);
364 assert(priv->pd != NULL);
365 assert(priv->ctx != NULL);
366 if (priv->rxqs_n == 0)
367 return EINVAL;
368 assert(priv->rxqs != NULL);
369 if (ind_tables_n == 0) {
370 ERROR("all hash RX queue types have been filtered out,"
371 " indirection table cannot be created");
372 return EINVAL;
373 }
374 if (priv->rxqs_n & (priv->rxqs_n - 1)) {
375 INFO("%u RX queues are configured, consider rounding this"
376 " number to the next power of two for better balancing",
377 priv->rxqs_n);
378 DEBUG("indirection table extended to assume %u WQs",
379 priv->reta_idx_n);
380 }
381 for (i = 0; (i != priv->reta_idx_n); ++i) {
382 struct rxq_ctrl *rxq_ctrl;
383
384 rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
385 struct rxq_ctrl, rxq);
386 wqs[i] = rxq_ctrl->wq;
387 }
388 /* Get number of hash RX queues to configure. */
389 for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
390 hash_rxqs_n += ind_table_init[i].hash_types_n;
391 DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
392 hash_rxqs_n, priv->rxqs_n, ind_tables_n);
393 /* Create indirection tables. */
394 ind_tables = rte_calloc(__func__, ind_tables_n,
395 sizeof((*ind_tables)[0]), 0);
396 if (ind_tables == NULL) {
397 err = ENOMEM;
398 ERROR("cannot allocate indirection tables container: %s",
399 strerror(err));
400 goto error;
401 }
402 for (i = 0; (i != ind_tables_n); ++i) {
403 struct ibv_exp_rwq_ind_table_init_attr ind_init_attr = {
404 .pd = priv->pd,
405 .log_ind_tbl_size = 0, /* Set below. */
406 .ind_tbl = wqs,
407 .comp_mask = 0,
408 };
409 unsigned int ind_tbl_size = ind_table_init[i].max_size;
410 struct ibv_exp_rwq_ind_table *ind_table;
411
412 if (priv->reta_idx_n < ind_tbl_size)
413 ind_tbl_size = priv->reta_idx_n;
414 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
415 errno = 0;
416 ind_table = ibv_exp_create_rwq_ind_table(priv->ctx,
417 &ind_init_attr);
418 if (ind_table != NULL) {
419 (*ind_tables)[i] = ind_table;
420 continue;
421 }
422 /* Not clear whether errno is set. */
423 err = (errno ? errno : EINVAL);
424 ERROR("RX indirection table creation failed with error %d: %s",
425 err, strerror(err));
426 goto error;
427 }
428 /* Allocate array that holds hash RX queues and related data. */
429 hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
430 sizeof((*hash_rxqs)[0]), 0);
431 if (hash_rxqs == NULL) {
432 err = ENOMEM;
433 ERROR("cannot allocate hash RX queues container: %s",
434 strerror(err));
435 goto error;
436 }
437 for (i = 0, j = 0, k = 0;
438 ((i != hash_rxqs_n) && (j != ind_tables_n));
439 ++i) {
440 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
441 enum hash_rxq_type type =
442 hash_rxq_type_from_pos(&ind_table_init[j], k);
443 struct rte_eth_rss_conf *priv_rss_conf =
444 (*priv->rss_conf)[type];
445 struct ibv_exp_rx_hash_conf hash_conf = {
446 .rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ,
447 .rx_hash_key_len = (priv_rss_conf ?
448 priv_rss_conf->rss_key_len :
449 rss_hash_default_key_len),
450 .rx_hash_key = (priv_rss_conf ?
451 priv_rss_conf->rss_key :
452 rss_hash_default_key),
453 .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
454 .rwq_ind_tbl = (*ind_tables)[j],
455 };
456 struct ibv_exp_qp_init_attr qp_init_attr = {
457 .max_inl_recv = 0, /* Currently not supported. */
458 .qp_type = IBV_QPT_RAW_PACKET,
459 .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
460 IBV_EXP_QP_INIT_ATTR_RX_HASH),
461 .pd = priv->pd,
462 .rx_hash_conf = &hash_conf,
463 .port_num = priv->port,
464 };
465
466 DEBUG("using indirection table %u for hash RX queue %u type %d",
467 j, i, type);
468 *hash_rxq = (struct hash_rxq){
469 .priv = priv,
470 .qp = ibv_exp_create_qp(priv->ctx, &qp_init_attr),
471 .type = type,
472 };
473 if (hash_rxq->qp == NULL) {
474 err = (errno ? errno : EINVAL);
475 ERROR("Hash RX QP creation failure: %s",
476 strerror(err));
477 goto error;
478 }
479 if (++k < ind_table_init[j].hash_types_n)
480 continue;
481 /* Switch to the next indirection table and reset hash RX
482 * queue type array index. */
483 ++j;
484 k = 0;
485 }
486 priv->ind_tables = ind_tables;
487 priv->ind_tables_n = ind_tables_n;
488 priv->hash_rxqs = hash_rxqs;
489 priv->hash_rxqs_n = hash_rxqs_n;
490 assert(err == 0);
491 return 0;
492 error:
493 if (hash_rxqs != NULL) {
494 for (i = 0; (i != hash_rxqs_n); ++i) {
495 struct ibv_qp *qp = (*hash_rxqs)[i].qp;
496
497 if (qp == NULL)
498 continue;
499 claim_zero(ibv_destroy_qp(qp));
500 }
501 rte_free(hash_rxqs);
502 }
503 if (ind_tables != NULL) {
504 for (j = 0; (j != ind_tables_n); ++j) {
505 struct ibv_exp_rwq_ind_table *ind_table =
506 (*ind_tables)[j];
507
508 if (ind_table == NULL)
509 continue;
510 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
511 }
512 rte_free(ind_tables);
513 }
514 return err;
515 }
516
517 /**
518 * Clean up hash RX queues and indirection table.
519 *
520 * @param priv
521 * Pointer to private structure.
522 */
523 void
524 priv_destroy_hash_rxqs(struct priv *priv)
525 {
526 unsigned int i;
527
528 DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
529 if (priv->hash_rxqs_n == 0) {
530 assert(priv->hash_rxqs == NULL);
531 assert(priv->ind_tables == NULL);
532 return;
533 }
534 for (i = 0; (i != priv->hash_rxqs_n); ++i) {
535 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
536 unsigned int j, k;
537
538 assert(hash_rxq->priv == priv);
539 assert(hash_rxq->qp != NULL);
540 /* Also check that there are no remaining flows. */
541 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
542 for (k = 0;
543 (k != RTE_DIM(hash_rxq->special_flow[j]));
544 ++k)
545 assert(hash_rxq->special_flow[j][k] == NULL);
546 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
547 for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
548 assert(hash_rxq->mac_flow[j][k] == NULL);
549 claim_zero(ibv_destroy_qp(hash_rxq->qp));
550 }
551 priv->hash_rxqs_n = 0;
552 rte_free(priv->hash_rxqs);
553 priv->hash_rxqs = NULL;
554 for (i = 0; (i != priv->ind_tables_n); ++i) {
555 struct ibv_exp_rwq_ind_table *ind_table =
556 (*priv->ind_tables)[i];
557
558 assert(ind_table != NULL);
559 claim_zero(ibv_exp_destroy_rwq_ind_table(ind_table));
560 }
561 priv->ind_tables_n = 0;
562 rte_free(priv->ind_tables);
563 priv->ind_tables = NULL;
564 }
565
566 /**
567 * Check whether a given flow type is allowed.
568 *
569 * @param priv
570 * Pointer to private structure.
571 * @param type
572 * Flow type to check.
573 *
574 * @return
575 * Nonzero if the given flow type is allowed.
576 */
577 int
578 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
579 {
580 /* Only FLOW_TYPE_PROMISC is allowed when promiscuous mode
581 * has been requested. */
582 if (priv->promisc_req)
583 return type == HASH_RXQ_FLOW_TYPE_PROMISC;
584 switch (type) {
585 case HASH_RXQ_FLOW_TYPE_PROMISC:
586 return !!priv->promisc_req;
587 case HASH_RXQ_FLOW_TYPE_ALLMULTI:
588 return !!priv->allmulti_req;
589 case HASH_RXQ_FLOW_TYPE_BROADCAST:
590 case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
591 /* If allmulti is enabled, broadcast and ipv6multi
592 * are unnecessary. */
593 return !priv->allmulti_req;
594 case HASH_RXQ_FLOW_TYPE_MAC:
595 return 1;
596 default:
597 /* Unsupported flow type is not allowed. */
598 return 0;
599 }
600 return 0;
601 }
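/*
 * Illustrative summary (not upstream code): with promisc_req set, only
 * HASH_RXQ_FLOW_TYPE_PROMISC is allowed; with only allmulti_req set,
 * ALLMULTI and MAC flows are allowed while BROADCAST and IPV6MULTI are
 * reported as unnecessary; with neither set, everything except PROMISC
 * and ALLMULTI is allowed.
 */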
602
603 /**
604 * Automatically enable/disable flows according to configuration.
605 *
606 * @param priv
607 * Private structure.
608 *
609 * @return
610 * 0 on success, errno value on failure.
611 */
612 int
613 priv_rehash_flows(struct priv *priv)
614 {
615 enum hash_rxq_flow_type i;
616
617 for (i = HASH_RXQ_FLOW_TYPE_PROMISC;
618 i != RTE_DIM((*priv->hash_rxqs)[0].special_flow);
619 ++i)
620 if (!priv_allow_flow_type(priv, i)) {
621 priv_special_flow_disable(priv, i);
622 } else {
623 int ret = priv_special_flow_enable(priv, i);
624
625 if (ret)
626 return ret;
627 }
628 if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
629 return priv_mac_addrs_enable(priv);
630 priv_mac_addrs_disable(priv);
631 return 0;
632 }
633
634 /**
635 * Allocate RX queue elements.
636 *
637 * @param rxq_ctrl
638 * Pointer to RX queue structure.
639 * @param elts_n
640 * Number of elements to allocate.
641 * @param[in] pool
642 * If not NULL, fetch buffers from this array instead of allocating them
643 * with rte_pktmbuf_alloc().
644 *
645 * @return
646 * 0 on success, errno value on failure.
647 */
648 static int
649 rxq_alloc_elts(struct rxq_ctrl *rxq_ctrl, unsigned int elts_n,
650 struct rte_mbuf *(*pool)[])
651 {
652 const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
653 unsigned int i;
654 int ret = 0;
655
656 /* Iterate on segments. */
657 for (i = 0; (i != elts_n); ++i) {
658 struct rte_mbuf *buf;
659 volatile struct mlx5_wqe_data_seg *scat =
660 &(*rxq_ctrl->rxq.wqes)[i];
661
662 if (pool != NULL) {
663 buf = (*pool)[i];
664 assert(buf != NULL);
665 rte_pktmbuf_reset(buf);
666 rte_pktmbuf_refcnt_update(buf, 1);
667 } else
668 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
669 if (buf == NULL) {
670 assert(pool == NULL);
671 ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
672 ret = ENOMEM;
673 goto error;
674 }
675 /* Headroom is reserved by rte_pktmbuf_alloc(). */
676 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
677 /* Buffer is supposed to be empty. */
678 assert(rte_pktmbuf_data_len(buf) == 0);
679 assert(rte_pktmbuf_pkt_len(buf) == 0);
680 assert(!buf->next);
681 /* Only the first segment keeps headroom. */
682 if (i % sges_n)
683 SET_DATA_OFF(buf, 0);
684 PORT(buf) = rxq_ctrl->rxq.port_id;
685 DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
686 PKT_LEN(buf) = DATA_LEN(buf);
687 NB_SEGS(buf) = 1;
688 /* scat->addr must be able to store a pointer. */
689 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
690 *scat = (struct mlx5_wqe_data_seg){
691 .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
692 .byte_count = htonl(DATA_LEN(buf)),
693 .lkey = htonl(rxq_ctrl->mr->lkey),
694 };
695 (*rxq_ctrl->rxq.elts)[i] = buf;
696 }
697 DEBUG("%p: allocated and configured %u segments (max %u packets)",
698 (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
699 assert(ret == 0);
700 return 0;
701 error:
702 assert(pool == NULL);
703 elts_n = i;
704 for (i = 0; (i != elts_n); ++i) {
705 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
706 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
707 (*rxq_ctrl->rxq.elts)[i] = NULL;
708 }
709 DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
710 assert(ret > 0);
711 return ret;
712 }
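/*
 * Illustrative layout (hypothetical numbers): with elts_n == 512 and
 * rxq.sges_n == 2, each packet spans 1 << 2 == 4 segments, so the ring
 * holds 512 / 4 == 128 packets; only every fourth mbuf (i % 4 == 0)
 * keeps its headroom, the other segments start at data offset 0 so the
 * hardware can scatter a frame contiguously across them.
 */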
713
714 /**
715 * Free RX queue elements.
716 *
717 * @param rxq_ctrl
718 * Pointer to RX queue structure.
719 */
720 static void
721 rxq_free_elts(struct rxq_ctrl *rxq_ctrl)
722 {
723 unsigned int i;
724
725 DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
726 if (rxq_ctrl->rxq.elts == NULL)
727 return;
728
729 for (i = 0; (i != (1u << rxq_ctrl->rxq.elts_n)); ++i) {
730 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
731 rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
732 (*rxq_ctrl->rxq.elts)[i] = NULL;
733 }
734 }
735
736 /**
737 * Clean up a RX queue.
738 *
739 * Destroy objects, free allocated memory and reset the structure for reuse.
740 *
741 * @param rxq_ctrl
742 * Pointer to RX queue structure.
743 */
744 void
745 rxq_cleanup(struct rxq_ctrl *rxq_ctrl)
746 {
747 DEBUG("cleaning up %p", (void *)rxq_ctrl);
748 rxq_free_elts(rxq_ctrl);
749 if (rxq_ctrl->fdir_queue != NULL)
750 priv_fdir_queue_destroy(rxq_ctrl->priv, rxq_ctrl->fdir_queue);
751 if (rxq_ctrl->wq != NULL)
752 claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
753 if (rxq_ctrl->cq != NULL)
754 claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
755 if (rxq_ctrl->channel != NULL)
756 claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
757 if (rxq_ctrl->mr != NULL)
758 claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
759 memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
760 }
761
762 /**
763 * Reconfigure RX queue buffers.
764 *
765 * rxq_rehash() does not allocate mbufs: allocating them from the wrong
766 * thread (i.e. anything other than a control thread) may corrupt the pool.
767 * In case of failure, the queue is left untouched.
768 *
769 * @param dev
770 * Pointer to Ethernet device structure.
771 * @param rxq_ctrl
772 * RX queue pointer.
773 *
774 * @return
775 * 0 on success, errno value on failure.
776 */
777 int
778 rxq_rehash(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl)
779 {
780 unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
781 unsigned int i;
782 struct ibv_exp_wq_attr mod;
783 int err;
784
785 DEBUG("%p: rehashing queue %p with %u SGE(s) per packet",
786 (void *)dev, (void *)rxq_ctrl, 1 << rxq_ctrl->rxq.sges_n);
787 assert(!(elts_n % (1 << rxq_ctrl->rxq.sges_n)));
788 /* From now on, any failure will render the queue unusable.
789 * Reinitialize WQ. */
790 mod = (struct ibv_exp_wq_attr){
791 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
792 .wq_state = IBV_EXP_WQS_RESET,
793 };
794 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
795 if (err) {
796 ERROR("%p: cannot reset WQ: %s", (void *)dev, strerror(err));
797 assert(err > 0);
798 return err;
799 }
800 /* Snatch mbufs from original queue. */
801 claim_zero(rxq_alloc_elts(rxq_ctrl, elts_n, rxq_ctrl->rxq.elts));
802 for (i = 0; i != elts_n; ++i) {
803 struct rte_mbuf *buf = (*rxq_ctrl->rxq.elts)[i];
804
805 assert(rte_mbuf_refcnt_read(buf) == 2);
806 rte_pktmbuf_free_seg(buf);
807 }
808 /* Change queue state to ready. */
809 mod = (struct ibv_exp_wq_attr){
810 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
811 .wq_state = IBV_EXP_WQS_RDY,
812 };
813 err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
814 if (err) {
815 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
816 (void *)dev, strerror(err));
817 goto error;
818 }
819 /* Update doorbell counter. */
820 rxq_ctrl->rxq.rq_ci = elts_n >> rxq_ctrl->rxq.sges_n;
821 rte_wmb();
822 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
823 error:
824 assert(err >= 0);
825 return err;
826 }
827
828 /**
829 * Initialize RX queue.
830 *
831 * @param tmpl
832 * Pointer to RX queue control template.
833 *
834 * @return
835 * 0 on success, errno value on failure.
836 */
837 static inline int
838 rxq_setup(struct rxq_ctrl *tmpl)
839 {
840 struct ibv_cq *ibcq = tmpl->cq;
841 struct mlx5_cq *cq = to_mxxx(cq, cq);
842 struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
843 struct rte_mbuf *(*elts)[1 << tmpl->rxq.elts_n] =
844 rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
845
846 if (cq->cqe_sz != RTE_CACHE_LINE_SIZE) {
847 ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
848 "it should be set to %u", RTE_CACHE_LINE_SIZE);
849 return EINVAL;
850 }
851 if (elts == NULL)
852 return ENOMEM;
853 tmpl->rxq.rq_db = rwq->rq.db;
854 tmpl->rxq.cqe_n = log2above(ibcq->cqe);
855 tmpl->rxq.cq_ci = 0;
856 tmpl->rxq.rq_ci = 0;
857 tmpl->rxq.cq_db = cq->dbrec;
858 tmpl->rxq.wqes =
859 (volatile struct mlx5_wqe_data_seg (*)[])
860 (uintptr_t)rwq->rq.buff;
861 tmpl->rxq.cqes =
862 (volatile struct mlx5_cqe (*)[])
863 (uintptr_t)cq->active_buf->buf;
864 tmpl->rxq.elts = elts;
865 return 0;
866 }
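/*
 * Illustrative note: the CQE size check above refers to the libmlx5
 * MLX5_CQE_SIZE environment variable; on a system with 64-byte cache
 * lines the application would typically be launched with e.g.
 * "MLX5_CQE_SIZE=64 ./app ..." so that CQEs match RTE_CACHE_LINE_SIZE.
 */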
867
868 /**
869 * Configure a RX queue.
870 *
871 * @param dev
872 * Pointer to Ethernet device structure.
873 * @param rxq_ctrl
874 * Pointer to RX queue structure.
875 * @param desc
876 * Number of descriptors to configure in queue.
877 * @param socket
878 * NUMA socket on which memory must be allocated.
879 * @param[in] conf
880 * Thresholds parameters.
881 * @param mp
882 * Memory pool for buffer allocations.
883 *
884 * @return
885 * 0 on success, errno value on failure.
886 */
887 int
888 rxq_ctrl_setup(struct rte_eth_dev *dev, struct rxq_ctrl *rxq_ctrl,
889 uint16_t desc, unsigned int socket,
890 const struct rte_eth_rxconf *conf, struct rte_mempool *mp)
891 {
892 struct priv *priv = dev->data->dev_private;
893 struct rxq_ctrl tmpl = {
894 .priv = priv,
895 .socket = socket,
896 .rxq = {
897 .elts_n = log2above(desc),
898 .mp = mp,
899 .rss_hash = priv->rxqs_n > 1,
900 },
901 };
902 struct ibv_exp_wq_attr mod;
903 union {
904 struct ibv_exp_cq_init_attr cq;
905 struct ibv_exp_wq_init_attr wq;
906 struct ibv_exp_cq_attr cq_attr;
907 } attr;
908 unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
909 unsigned int cqe_n = desc - 1;
910 struct rte_mbuf *(*elts)[desc] = NULL;
911 int ret = 0;
912
913 (void)conf; /* Thresholds configuration (ignored). */
914 /* Enable scattered packets support for this queue if necessary. */
915 assert(mb_len >= RTE_PKTMBUF_HEADROOM);
916 if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
917 (mb_len - RTE_PKTMBUF_HEADROOM)) {
918 tmpl.rxq.sges_n = 0;
919 } else if (dev->data->dev_conf.rxmode.enable_scatter) {
920 unsigned int size =
921 RTE_PKTMBUF_HEADROOM +
922 dev->data->dev_conf.rxmode.max_rx_pkt_len;
923 unsigned int sges_n;
924
925 /*
926 * Determine the number of SGEs needed for a full packet
927 * and round it to the next power of two.
928 */
929 sges_n = log2above((size / mb_len) + !!(size % mb_len));
930 tmpl.rxq.sges_n = sges_n;
931 /* Make sure rxq.sges_n did not overflow. */
932 size = mb_len * (1 << tmpl.rxq.sges_n);
933 size -= RTE_PKTMBUF_HEADROOM;
934 if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
935 ERROR("%p: too many SGEs (%u) needed to handle"
936 " requested maximum packet size %u",
937 (void *)dev,
938 1 << sges_n,
939 dev->data->dev_conf.rxmode.max_rx_pkt_len);
940 return EOVERFLOW;
941 }
942 } else {
943 WARN("%p: the requested maximum Rx packet size (%u) is"
944 " larger than a single mbuf (%u) and scattered"
945 " mode has not been requested",
946 (void *)dev,
947 dev->data->dev_conf.rxmode.max_rx_pkt_len,
948 mb_len - RTE_PKTMBUF_HEADROOM);
949 }
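/*
 * Worked example (hypothetical numbers): with mb_len == 2048,
 * RTE_PKTMBUF_HEADROOM == 128 and max_rx_pkt_len == 9000, the scattered
 * branch above computes size == 9128, 9128 / 2048 == 4 with a remainder,
 * hence sges_n == log2above(5) == 3, i.e. 8 SGEs providing
 * 8 * 2048 - 128 == 16256 bytes per packet, enough for 9000.
 */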
950 DEBUG("%p: maximum number of segments per packet: %u",
951 (void *)dev, 1 << tmpl.rxq.sges_n);
952 if (desc % (1 << tmpl.rxq.sges_n)) {
953 ERROR("%p: number of RX queue descriptors (%u) is not a"
954 " multiple of SGEs per packet (%u)",
955 (void *)dev,
956 desc,
957 1 << tmpl.rxq.sges_n);
958 return EINVAL;
959 }
960 /* Toggle RX checksum offload if hardware supports it. */
961 if (priv->hw_csum)
962 tmpl.rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
963 if (priv->hw_csum_l2tun)
964 tmpl.rxq.csum_l2tun =
965 !!dev->data->dev_conf.rxmode.hw_ip_checksum;
966 /* Use the entire RX mempool as the memory region. */
967 tmpl.mr = mlx5_mp2mr(priv->pd, mp);
968 if (tmpl.mr == NULL) {
969 ret = EINVAL;
970 ERROR("%p: MR creation failure: %s",
971 (void *)dev, strerror(ret));
972 goto error;
973 }
974 if (dev->data->dev_conf.intr_conf.rxq) {
975 tmpl.channel = ibv_create_comp_channel(priv->ctx);
976 if (tmpl.channel == NULL) {
977 dev->data->dev_conf.intr_conf.rxq = 0;
978 ret = ENOMEM;
979 ERROR("%p: Comp Channel creation failure: %s",
980 (void *)dev, strerror(ret));
981 goto error;
982 }
983 }
984 attr.cq = (struct ibv_exp_cq_init_attr){
985 .comp_mask = 0,
986 };
987 if (priv->cqe_comp) {
988 attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
989 attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
990 cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
991 }
992 tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
993 &attr.cq);
994 if (tmpl.cq == NULL) {
995 ret = ENOMEM;
996 ERROR("%p: CQ creation failure: %s",
997 (void *)dev, strerror(ret));
998 goto error;
999 }
1000 DEBUG("priv->device_attr.max_qp_wr is %d",
1001 priv->device_attr.max_qp_wr);
1002 DEBUG("priv->device_attr.max_sge is %d",
1003 priv->device_attr.max_sge);
1004 /* Configure VLAN stripping. */
1005 tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
1006 !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1007 attr.wq = (struct ibv_exp_wq_init_attr){
1008 .wq_context = NULL, /* Could be useful in the future. */
1009 .wq_type = IBV_EXP_WQT_RQ,
1010 /* Max number of outstanding WRs. */
1011 .max_recv_wr = desc >> tmpl.rxq.sges_n,
1012 /* Max number of scatter/gather elements in a WR. */
1013 .max_recv_sge = 1 << tmpl.rxq.sges_n,
1014 .pd = priv->pd,
1015 .cq = tmpl.cq,
1016 .comp_mask =
1017 IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
1018 0,
1019 .vlan_offloads = (tmpl.rxq.vlan_strip ?
1020 IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
1021 0),
1022 };
1023 /* By default, FCS (CRC) is stripped by hardware. */
1024 if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1025 tmpl.rxq.crc_present = 0;
1026 } else if (priv->hw_fcs_strip) {
1027 /* Ask HW/Verbs to leave CRC in place when supported. */
1028 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
1029 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1030 tmpl.rxq.crc_present = 1;
1031 } else {
1032 WARN("%p: CRC stripping has been disabled but will still"
1033 " be performed by hardware, make sure MLNX_OFED and"
1034 " firmware are up to date",
1035 (void *)dev);
1036 tmpl.rxq.crc_present = 0;
1037 }
1038 DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1039 " incoming frames to hide it",
1040 (void *)dev,
1041 tmpl.rxq.crc_present ? "disabled" : "enabled",
1042 tmpl.rxq.crc_present << 2);
1043 if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
1044 ; /* Nothing else to do. */
1045 else if (priv->hw_padding) {
1046 INFO("%p: enabling packet padding on queue %p",
1047 (void *)dev, (void *)rxq_ctrl);
1048 attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
1049 attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
1050 } else
1051 WARN("%p: packet padding has been requested but is not"
1052 " supported, make sure MLNX_OFED and firmware are"
1053 " up to date",
1054 (void *)dev);
1055
1056 tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
1057 if (tmpl.wq == NULL) {
1058 ret = (errno ? errno : EINVAL);
1059 ERROR("%p: WQ creation failure: %s",
1060 (void *)dev, strerror(ret));
1061 goto error;
1062 }
1063 /*
1064 * Make sure number of WRs*SGEs match expectations since a queue
1065 * cannot allocate more than "desc" buffers.
1066 */
1067 if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
1068 ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
1069 ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1070 (void *)dev,
1071 (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
1072 attr.wq.max_recv_wr, attr.wq.max_recv_sge);
1073 ret = EINVAL;
1074 goto error;
1075 }
1076 /* Save port ID. */
1077 tmpl.rxq.port_id = dev->data->port_id;
1078 DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
1079 /* Change queue state to ready. */
1080 mod = (struct ibv_exp_wq_attr){
1081 .attr_mask = IBV_EXP_WQ_ATTR_STATE,
1082 .wq_state = IBV_EXP_WQS_RDY,
1083 };
1084 ret = ibv_exp_modify_wq(tmpl.wq, &mod);
1085 if (ret) {
1086 ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
1087 (void *)dev, strerror(ret));
1088 goto error;
1089 }
1090 ret = rxq_setup(&tmpl);
1091 if (ret) {
1092 ERROR("%p: cannot initialize RX queue structure: %s",
1093 (void *)dev, strerror(ret));
1094 goto error;
1095 }
1096 /* Reuse buffers from original queue if possible. */
1097 if (rxq_ctrl->rxq.elts_n) {
1098 assert(1 << rxq_ctrl->rxq.elts_n == desc);
1099 assert(rxq_ctrl->rxq.elts != tmpl.rxq.elts);
1100 ret = rxq_alloc_elts(&tmpl, desc, rxq_ctrl->rxq.elts);
1101 } else
1102 ret = rxq_alloc_elts(&tmpl, desc, NULL);
1103 if (ret) {
1104 ERROR("%p: RXQ allocation failed: %s",
1105 (void *)dev, strerror(ret));
1106 goto error;
1107 }
1108 /* Clean up rxq in case we're reinitializing it. */
1109 DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq_ctrl);
1110 rxq_cleanup(rxq_ctrl);
1111 /* Move mbuf pointers to dedicated storage area in RX queue. */
1112 elts = (void *)(rxq_ctrl + 1);
1113 rte_memcpy(elts, tmpl.rxq.elts, sizeof(*elts));
1114 #ifndef NDEBUG
1115 memset(tmpl.rxq.elts, 0x55, sizeof(*elts));
1116 #endif
1117 rte_free(tmpl.rxq.elts);
1118 tmpl.rxq.elts = elts;
1119 *rxq_ctrl = tmpl;
1120 /* Update doorbell counter. */
1121 rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
1122 rte_wmb();
1123 *rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
1124 DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
1125 assert(ret == 0);
1126 return 0;
1127 error:
1128 elts = tmpl.rxq.elts;
1129 rxq_cleanup(&tmpl);
1130 rte_free(elts);
1131 assert(ret > 0);
1132 return ret;
1133 }
1134
1135 /**
1136 * DPDK callback to configure a RX queue.
1137 *
1138 * @param dev
1139 * Pointer to Ethernet device structure.
1140 * @param idx
1141 * RX queue index.
1142 * @param desc
1143 * Number of descriptors to configure in queue.
1144 * @param socket
1145 * NUMA socket on which memory must be allocated.
1146 * @param[in] conf
1147 * Thresholds parameters.
1148 * @param mp
1149 * Memory pool for buffer allocations.
1150 *
1151 * @return
1152 * 0 on success, negative errno value on failure.
1153 */
1154 int
1155 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
1156 unsigned int socket, const struct rte_eth_rxconf *conf,
1157 struct rte_mempool *mp)
1158 {
1159 struct priv *priv = dev->data->dev_private;
1160 struct rxq *rxq = (*priv->rxqs)[idx];
1161 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1162 int ret;
1163
1164 if (mlx5_is_secondary())
1165 return -E_RTE_SECONDARY;
1166
1167 priv_lock(priv);
1168 if (!rte_is_power_of_2(desc)) {
1169 desc = 1 << log2above(desc);
1170 WARN("%p: increased number of descriptors in RX queue %u"
1171 " to the next power of two (%d)",
1172 (void *)dev, idx, desc);
1173 }
1174 DEBUG("%p: configuring queue %u for %u descriptors",
1175 (void *)dev, idx, desc);
1176 if (idx >= priv->rxqs_n) {
1177 ERROR("%p: queue index out of range (%u >= %u)",
1178 (void *)dev, idx, priv->rxqs_n);
1179 priv_unlock(priv);
1180 return -EOVERFLOW;
1181 }
1182 if (rxq != NULL) {
1183 DEBUG("%p: reusing already allocated queue index %u (%p)",
1184 (void *)dev, idx, (void *)rxq);
1185 if (priv->started) {
1186 priv_unlock(priv);
1187 return -EEXIST;
1188 }
1189 (*priv->rxqs)[idx] = NULL;
1190 rxq_cleanup(rxq_ctrl);
1191 /* Resize if rxq size is changed. */
1192 if (rxq_ctrl->rxq.elts_n != log2above(desc)) {
1193 rxq_ctrl = rte_realloc(rxq_ctrl,
1194 sizeof(*rxq_ctrl) +
1195 desc * sizeof(struct rte_mbuf *),
1196 RTE_CACHE_LINE_SIZE);
1197 if (!rxq_ctrl) {
1198 ERROR("%p: unable to reallocate queue index %u",
1199 (void *)dev, idx);
1200 priv_unlock(priv);
1201 return -ENOMEM;
1202 }
1203 }
1204 } else {
1205 rxq_ctrl = rte_calloc_socket("RXQ", 1, sizeof(*rxq_ctrl) +
1206 desc * sizeof(struct rte_mbuf *),
1207 0, socket);
1208 if (rxq_ctrl == NULL) {
1209 ERROR("%p: unable to allocate queue index %u",
1210 (void *)dev, idx);
1211 priv_unlock(priv);
1212 return -ENOMEM;
1213 }
1214 }
1215 ret = rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
1216 if (ret)
1217 rte_free(rxq_ctrl);
1218 else {
1219 rxq_ctrl->rxq.stats.idx = idx;
1220 DEBUG("%p: adding RX queue %p to list",
1221 (void *)dev, (void *)rxq_ctrl);
1222 (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
1223 /* Update receive callback. */
1224 priv_select_rx_function(priv);
1225 }
1226 priv_unlock(priv);
1227 return -ret;
1228 }
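/*
 * Illustrative application-side sketch (hypothetical identifiers): this
 * callback is reached through the generic ethdev API, e.g.:
 *
 *	struct rte_mempool *mp =
 *		rte_pktmbuf_pool_create("rx_pool", 8192, 256, 0,
 *					RTE_MBUF_DEFAULT_BUF_SIZE,
 *					rte_socket_id());
 *
 *	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
 *				     NULL, mp);
 */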
1229
1230 /**
1231 * DPDK callback to release a RX queue.
1232 *
1233 * @param dpdk_rxq
1234 * Generic RX queue pointer.
1235 */
1236 void
1237 mlx5_rx_queue_release(void *dpdk_rxq)
1238 {
1239 struct rxq *rxq = (struct rxq *)dpdk_rxq;
1240 struct rxq_ctrl *rxq_ctrl;
1241 struct priv *priv;
1242 unsigned int i;
1243
1244 if (mlx5_is_secondary())
1245 return;
1246
1247 if (rxq == NULL)
1248 return;
1249 rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1250 priv = rxq_ctrl->priv;
1251 priv_lock(priv);
1252 if (priv_flow_rxq_in_use(priv, rxq))
1253 rte_panic("Rx queue %p is still used by a flow and cannot be"
1254 " removed\n", (void *)rxq_ctrl);
1255 for (i = 0; (i != priv->rxqs_n); ++i)
1256 if ((*priv->rxqs)[i] == rxq) {
1257 DEBUG("%p: removing RX queue %p from list",
1258 (void *)priv->dev, (void *)rxq_ctrl);
1259 (*priv->rxqs)[i] = NULL;
1260 break;
1261 }
1262 rxq_cleanup(rxq_ctrl);
1263 rte_free(rxq_ctrl);
1264 priv_unlock(priv);
1265 }
1266
1267 /**
1268 * DPDK callback for RX in secondary processes.
1269 *
1270 * This function configures all queues from primary process information
1271 * if necessary before reverting to the normal RX burst callback.
1272 *
1273 * @param dpdk_rxq
1274 * Generic pointer to RX queue structure.
1275 * @param[out] pkts
1276 * Array to store received packets.
1277 * @param pkts_n
1278 * Maximum number of packets in array.
1279 *
1280 * @return
1281 * Number of packets successfully received (<= pkts_n).
1282 */
1283 uint16_t
1284 mlx5_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
1285 uint16_t pkts_n)
1286 {
1287 struct rxq *rxq = dpdk_rxq;
1288 struct rxq_ctrl *rxq_ctrl = container_of(rxq, struct rxq_ctrl, rxq);
1289 struct priv *priv = mlx5_secondary_data_setup(rxq_ctrl->priv);
1290 struct priv *primary_priv;
1291 unsigned int index;
1292
1293 if (priv == NULL)
1294 return 0;
1295 primary_priv =
1296 mlx5_secondary_data[priv->dev->data->port_id].primary_priv;
1297 /* Look for queue index in both private structures. */
1298 for (index = 0; index != priv->rxqs_n; ++index)
1299 if (((*primary_priv->rxqs)[index] == rxq) ||
1300 ((*priv->rxqs)[index] == rxq))
1301 break;
1302 if (index == priv->rxqs_n)
1303 return 0;
1304 rxq = (*priv->rxqs)[index];
1305 return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
1306 }
1307
1308 /**
1309 * Fill epoll fd list for rxq interrupts.
1310 *
1311 * @param priv
1312 * Private structure.
1313 *
1314 * @return
1315 * 0 on success, negative on failure.
1316 */
1317 int
1318 priv_intr_efd_enable(struct priv *priv)
1319 {
1320 unsigned int i;
1321 unsigned int rxqs_n = priv->rxqs_n;
1322 unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1323 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1324
1325 if (n == 0)
1326 return 0;
1327 if (n < rxqs_n) {
1328 WARN("rxqs num is larger than EAL max interrupt vector "
1329 "%u > %u unable to supprt rxq interrupts",
1330 rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
1331 return -EINVAL;
1332 }
1333 intr_handle->type = RTE_INTR_HANDLE_EXT;
1334 for (i = 0; i != n; ++i) {
1335 struct rxq *rxq = (*priv->rxqs)[i];
1336 struct rxq_ctrl *rxq_ctrl =
1337 container_of(rxq, struct rxq_ctrl, rxq);
1338 int fd = rxq_ctrl->channel->fd;
1339 int flags;
1340 int rc;
1341
1342 flags = fcntl(fd, F_GETFL);
1343 rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
1344 if (rc < 0) {
1345 WARN("failed to change rxq interrupt file "
1346 "descriptor %d for queue index %d", fd, i);
1347 return -1;
1348 }
1349 intr_handle->efds[i] = fd;
1350 }
1351 intr_handle->nb_efd = n;
1352 return 0;
1353 }
1354
1355 /**
1356 * Clean epoll fd list for rxq interrupts.
1357 *
1358 * @param priv
1359 * Private structure.
1360 */
1361 void
1362 priv_intr_efd_disable(struct priv *priv)
1363 {
1364 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1365
1366 rte_intr_free_epoll_fd(intr_handle);
1367 }
1368
1369 /**
1370 * Create and init interrupt vector array.
1371 *
1372 * @param priv
1373 * Private structure.
1374 *
1375 * @return
1376 * 0 on success, negative on failure.
1377 */
1378 int
1379 priv_create_intr_vec(struct priv *priv)
1380 {
1381 unsigned int rxqs_n = priv->rxqs_n;
1382 unsigned int i;
1383 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1384
1385 if (rxqs_n == 0)
1386 return 0;
1387 intr_handle->intr_vec = (int *)
1388 rte_malloc("intr_vec", rxqs_n * sizeof(int), 0);
1389 if (intr_handle->intr_vec == NULL) {
1390 WARN("Failed to allocate memory for intr_vec "
1391 "rxq interrupt will not be supported");
1392 return -ENOMEM;
1393 }
1394 for (i = 0; i != rxqs_n; ++i) {
1395 /* 1:1 mapping between rxq and interrupt. */
1396 intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
1397 }
1398 return 0;
1399 }
1400
1401 /**
1402 * Destroy the interrupt vector array created by priv_create_intr_vec().
1403 *
1404 * @param priv
1405 * Private structure.
1409 */
1410 void
1411 priv_destroy_intr_vec(struct priv *priv)
1412 {
1413 struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
1414
1415 rte_free(intr_handle->intr_vec);
1416 }