/*-
 *   BSD LICENSE
 *
 *   Copyright 2012-2015 6WIND S.A.
 *   Copyright 2012 Mellanox.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of 6WIND S.A. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Known limitations:
 * - RSS hash key and options cannot be modified.
 * - Hardware counters aren't implemented.
 */

/* System headers. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <limits.h>
#include <assert.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <dirent.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <fcntl.h>

/* Verbs header. */
/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* DPDK headers don't like -pedantic. */
#ifdef PEDANTIC
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_dev.h>
#include <rte_mbuf.h>
#include <rte_errno.h>
#include <rte_mempool.h>
#include <rte_prefetch.h>
#include <rte_malloc.h>
#include <rte_spinlock.h>
#include <rte_atomic.h>
#include <rte_version.h>
#include <rte_log.h>
#include <rte_alarm.h>
#include <rte_memory.h>
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif

/* Generated configuration header. */
#include "mlx4_autoconf.h"

/* PMD header. */
#include "mlx4.h"

/* Runtime logging through RTE_LOG() is enabled when not in debugging mode.
 * Intermediate LOG_*() macros add the required end-of-line characters. */
#ifndef NDEBUG
#define INFO(...) DEBUG(__VA_ARGS__)
#define WARN(...) DEBUG(__VA_ARGS__)
#define ERROR(...) DEBUG(__VA_ARGS__)
#else
#define LOG__(level, m, ...) \
	RTE_LOG(level, PMD, MLX4_DRIVER_NAME ": " m "%c", __VA_ARGS__)
#define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n')
#define INFO(...) LOG_(INFO, __VA_ARGS__)
#define WARN(...) LOG_(WARNING, __VA_ARGS__)
#define ERROR(...) LOG_(ERR, __VA_ARGS__)
#endif
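
/*
 * Illustrative expansion (not part of the driver): a call such as
 * LOG_(INFO, "x=%d", x) becomes
 * RTE_LOG(INFO, PMD, MLX4_DRIVER_NAME ": " "x=%d" "%c", x, '\n'),
 * i.e. the trailing "%c" conversion consumes the extra '\n' argument so
 * every message ends with a newline even with a variable argument list.
 */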

/* Convenience macros for accessing mbuf fields. */
#define NEXT(m) ((m)->next)
#define DATA_LEN(m) ((m)->data_len)
#define PKT_LEN(m) ((m)->pkt_len)
#define DATA_OFF(m) ((m)->data_off)
#define SET_DATA_OFF(m, o) ((m)->data_off = (o))
#define NB_SEGS(m) ((m)->nb_segs)
#define PORT(m) ((m)->port)

/* Work Request ID data type (64 bit). */
typedef union {
	struct {
		uint32_t id;
		uint16_t offset;
	} data;
	uint64_t raw;
} wr_id_t;

#define WR_ID(o) (((wr_id_t *)&(o))->data)

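/*
 * Usage sketch (illustrative, with made-up values): a single 64-bit WR ID
 * can carry both an element index and a data offset, e.g.:
 *
 *   struct ibv_recv_wr wr;
 *
 *   WR_ID(wr.wr_id).id = 42;      // index of the element in (*elts)[]
 *   WR_ID(wr.wr_id).offset = 64;  // data offset inside the mbuf
 *
 * Both fields can later be recovered from the completion's wr_id alone.
 */
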
/* Transpose flags. Useful to convert IBV to DPDK flags. */
#define TRANSPOSE(val, from, to) \
	(((from) >= (to)) ? \
	 (((val) & (from)) / ((from) / (to))) : \
	 (((val) & (from)) * ((to) / (from))))
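
/*
 * Worked example (illustrative): with from = 0x10 and to = 0x02 the macro
 * expands to ((val & 0x10) / (0x10 / 0x02)), i.e. a division by 8 that
 * moves bit 4 down to bit 1; with from = 0x02 and to = 0x10 it multiplies
 * by 8 instead. Either way a single flag bit changes position without
 * branching on the value itself.
 */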

struct mlx4_rxq_stats {
	unsigned int idx; /**< Mapping index. */
#ifdef MLX4_PMD_SOFT_COUNTERS
	uint64_t ipackets; /**< Total of successfully received packets. */
	uint64_t ibytes; /**< Total of successfully received bytes. */
#endif
	uint64_t idropped; /**< Total of packets dropped when RX ring full. */
	uint64_t rx_nombuf; /**< Total of RX mbuf allocation failures. */
};

struct mlx4_txq_stats {
	unsigned int idx; /**< Mapping index. */
#ifdef MLX4_PMD_SOFT_COUNTERS
	uint64_t opackets; /**< Total of successfully sent packets. */
	uint64_t obytes; /**< Total of successfully sent bytes. */
#endif
	uint64_t odropped; /**< Total of packets not sent when TX ring full. */
};

/* RX element (scattered packets). */
struct rxq_elt_sp {
	struct ibv_recv_wr wr; /* Work Request. */
	struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
	struct rte_mbuf *bufs[MLX4_PMD_SGE_WR_N]; /* SGEs buffers. */
};

/* RX element. */
struct rxq_elt {
	struct ibv_recv_wr wr; /* Work Request. */
	struct ibv_sge sge; /* Scatter/Gather Element. */
	/* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
};

/* RX queue descriptor. */
struct rxq {
	struct priv *priv; /* Back pointer to private data. */
	struct rte_mempool *mp; /* Memory Pool for allocations. */
	struct ibv_mr *mr; /* Memory Region (for mp). */
	struct ibv_cq *cq; /* Completion Queue. */
	struct ibv_qp *qp; /* Queue Pair. */
	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
	/*
	 * Each VLAN ID requires a separate flow steering rule.
	 */
	BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
	struct ibv_flow *mac_flow[MLX4_MAX_MAC_ADDRESSES][MLX4_MAX_VLAN_IDS];
	struct ibv_flow *promisc_flow; /* Promiscuous flow. */
	struct ibv_flow *allmulti_flow; /* Multicast flow. */
	unsigned int port_id; /* Port ID for incoming packets. */
	unsigned int elts_n; /* (*elts)[] length. */
	unsigned int elts_head; /* Current index in (*elts)[]. */
	union {
		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
		struct rxq_elt (*no_sp)[]; /* RX elements. */
	} elts;
	unsigned int sp:1; /* Use scattered RX elements. */
	unsigned int csum:1; /* Enable checksum offloading. */
	unsigned int csum_l2tun:1; /* Same for L2 tunnels. */
	struct mlx4_rxq_stats stats; /* RX queue counters. */
	unsigned int socket; /* CPU socket ID for allocations. */
	struct ibv_exp_res_domain *rd; /* Resource Domain. */
};

/* TX element. */
struct txq_elt {
	struct rte_mbuf *buf;
};

/* Linear buffer type. It is used when transmitting buffers with too many
 * segments that do not fit the hardware queue (see max_send_sge).
 * Extra segments are copied (linearized) in such buffers, replacing the
 * last SGE during TX.
 * The size is arbitrary but large enough to hold a jumbo frame with
 * 8 segments considering mbuf.buf_len is about 2048 bytes. */
typedef uint8_t linear_t[16384];

/* TX queue descriptor. */
struct txq {
	struct priv *priv; /* Back pointer to private data. */
	struct {
		const struct rte_mempool *mp; /* Cached Memory Pool. */
		struct ibv_mr *mr; /* Memory Region (for mp). */
		uint32_t lkey; /* mr->lkey */
	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
	struct ibv_cq *cq; /* Completion Queue. */
	struct ibv_qp *qp; /* Queue Pair. */
	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
#if MLX4_PMD_MAX_INLINE > 0
	uint32_t max_inline; /* Max inline send size <= MLX4_PMD_MAX_INLINE. */
#endif
	unsigned int elts_n; /* (*elts)[] length. */
	struct txq_elt (*elts)[]; /* TX elements. */
	unsigned int elts_head; /* Current index in (*elts)[]. */
	unsigned int elts_tail; /* First element awaiting completion. */
	unsigned int elts_comp; /* Number of completion requests. */
	unsigned int elts_comp_cd; /* Countdown for next completion request. */
	unsigned int elts_comp_cd_init; /* Initial value for countdown. */
	struct mlx4_txq_stats stats; /* TX queue counters. */
	linear_t (*elts_linear)[]; /* Linearized buffers. */
	struct ibv_mr *mr_linear; /* Memory Region for linearized buffers. */
	unsigned int socket; /* CPU socket ID for allocations. */
	struct ibv_exp_res_domain *rd; /* Resource Domain. */
};

struct priv {
	struct rte_eth_dev *dev; /* Ethernet device. */
	struct ibv_context *ctx; /* Verbs context. */
	struct ibv_device_attr device_attr; /* Device properties. */
	struct ibv_pd *pd; /* Protection Domain. */
	/*
	 * MAC addresses array and configuration bit-field.
	 * An extra entry that cannot be modified by the DPDK is reserved
	 * for broadcast frames (destination MAC address ff:ff:ff:ff:ff:ff).
	 */
	struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
	BITFIELD_DECLARE(mac_configured, uint32_t, MLX4_MAX_MAC_ADDRESSES);
	/* VLAN filters. */
	struct {
		unsigned int enabled:1; /* If enabled. */
		unsigned int id:12; /* VLAN ID (0-4095). */
	} vlan_filter[MLX4_MAX_VLAN_IDS]; /* VLAN filters table. */
	/* Device properties. */
	uint16_t mtu; /* Configured MTU. */
	uint8_t port; /* Physical port number. */
	unsigned int started:1; /* Device started, flows enabled. */
	unsigned int promisc:1; /* Device in promiscuous mode. */
	unsigned int allmulti:1; /* Device receives all multicast packets. */
	unsigned int hw_qpg:1; /* QP groups are supported. */
	unsigned int hw_tss:1; /* TSS is supported. */
	unsigned int hw_rss:1; /* RSS is supported. */
	unsigned int hw_csum:1; /* Checksum offload is supported. */
	unsigned int hw_csum_l2tun:1; /* Same for L2 tunnels. */
	unsigned int rss:1; /* RSS is enabled. */
	unsigned int vf:1; /* This is a VF device. */
	unsigned int pending_alarm:1; /* An alarm is pending. */
#ifdef INLINE_RECV
	unsigned int inl_recv_size; /* Inline recv size. */
#endif
	unsigned int max_rss_tbl_sz; /* Maximum number of RSS queues. */
	/* RX/TX queues. */
	struct rxq rxq_parent; /* Parent queue when RSS is enabled. */
	unsigned int rxqs_n; /* RX queues array size. */
	unsigned int txqs_n; /* TX queues array size. */
	struct rxq *(*rxqs)[]; /* RX queues. */
	struct txq *(*txqs)[]; /* TX queues. */
	struct rte_intr_handle intr_handle; /* Interrupt handler. */
	rte_spinlock_t lock; /* Lock for control functions. */
};

/* Local storage for secondary process data. */
struct mlx4_secondary_data {
	struct rte_eth_dev_data data; /* Local device data. */
	struct priv *primary_priv; /* Private structure from primary. */
	struct rte_eth_dev_data *shared_dev_data; /* Shared device data. */
	rte_spinlock_t lock; /* Port configuration lock. */
} mlx4_secondary_data[RTE_MAX_ETHPORTS];

/**
 * Check if running as a secondary process.
 *
 * @return
 *   Nonzero if running as a secondary process.
 */
static inline int
mlx4_is_secondary(void)
{
	return rte_eal_process_type() != RTE_PROC_PRIMARY;
}

/**
 * Return private structure associated with an Ethernet device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   Pointer to private structure.
 */
static struct priv *
mlx4_get_priv(struct rte_eth_dev *dev)
{
	struct mlx4_secondary_data *sd;

	if (!mlx4_is_secondary())
		return dev->data->dev_private;
	sd = &mlx4_secondary_data[dev->data->port_id];
	return sd->data.dev_private;
}

/**
 * Lock private structure to protect it from concurrent access in the
 * control path.
 *
 * @param priv
 *   Pointer to private structure.
 */
static void
priv_lock(struct priv *priv)
{
	rte_spinlock_lock(&priv->lock);
}

/**
 * Unlock private structure.
 *
 * @param priv
 *   Pointer to private structure.
 */
static void
priv_unlock(struct priv *priv)
{
	rte_spinlock_unlock(&priv->lock);
}

/* Allocate a buffer on the stack and fill it with a printf format string. */
#define MKSTR(name, ...) \
	char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \
	\
	snprintf(name, sizeof(name), __VA_ARGS__)
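
/*
 * Usage sketch (illustrative): the first snprintf(NULL, 0, ...) pass only
 * computes the required length, the second formats into the stack array:
 *
 *   MKSTR(path, "%s/device/net", "/sys/class/infiniband/mlx4_0");
 *   // path[] is now exactly large enough for the formatted string.
 */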

/**
 * Get interface name from private structure.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[out] ifname
 *   Interface name output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_ifname(const struct priv *priv, char (*ifname)[IF_NAMESIZE])
{
	DIR *dir;
	struct dirent *dent;
	unsigned int dev_type = 0;
	unsigned int dev_port_prev = ~0u;
	char match[IF_NAMESIZE] = "";

	{
		MKSTR(path, "%s/device/net", priv->ctx->device->ibdev_path);

		dir = opendir(path);
		if (dir == NULL)
			return -1;
	}
	while ((dent = readdir(dir)) != NULL) {
		char *name = dent->d_name;
		FILE *file;
		unsigned int dev_port;
		int r;

		if ((name[0] == '.') &&
		    ((name[1] == '\0') ||
		     ((name[1] == '.') && (name[2] == '\0'))))
			continue;

		MKSTR(path, "%s/device/net/%s/%s",
		      priv->ctx->device->ibdev_path, name,
		      (dev_type ? "dev_id" : "dev_port"));

		file = fopen(path, "rb");
		if (file == NULL) {
			if (errno != ENOENT)
				continue;
			/*
			 * Switch to dev_id when dev_port does not exist as
			 * is the case with Linux kernel versions < 3.15.
			 */
try_dev_id:
			match[0] = '\0';
			if (dev_type)
				break;
			dev_type = 1;
			dev_port_prev = ~0u;
			rewinddir(dir);
			continue;
		}
		r = fscanf(file, (dev_type ? "%x" : "%u"), &dev_port);
		fclose(file);
		if (r != 1)
			continue;
		/*
		 * Switch to dev_id when dev_port returns the same value for
		 * all ports. May happen when using a MOFED release older than
		 * 3.0 with a Linux kernel >= 3.15.
		 */
		if (dev_port == dev_port_prev)
			goto try_dev_id;
		dev_port_prev = dev_port;
		if (dev_port == (priv->port - 1u))
			snprintf(match, sizeof(match), "%s", name);
	}
	closedir(dir);
	if (match[0] == '\0')
		return -1;
	strncpy(*ifname, match, sizeof(*ifname));
	return 0;
}
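
/*
 * Example sysfs layout walked above (illustrative paths and values):
 *
 *   /sys/class/infiniband/mlx4_0/device/net/eth2/dev_port -> "0"
 *   /sys/class/infiniband/mlx4_0/device/net/eth3/dev_port -> "1"
 *
 * With priv->port == 2, the entry whose dev_port equals 1 ("eth3" here)
 * is copied to match[] and returned as the interface name.
 */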

/**
 * Read from sysfs entry.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[in] entry
 *   Entry name relative to sysfs path.
 * @param[out] buf
 *   Data output buffer.
 * @param size
 *   Buffer size.
 *
 * @return
 *   Number of bytes read on success, -1 on failure and errno is set.
 */
static int
priv_sysfs_read(const struct priv *priv, const char *entry,
		char *buf, size_t size)
{
	char ifname[IF_NAMESIZE];
	FILE *file;
	int ret;
	int err;

	if (priv_get_ifname(priv, &ifname))
		return -1;

	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
	      ifname, entry);

	file = fopen(path, "rb");
	if (file == NULL)
		return -1;
	ret = fread(buf, 1, size, file);
	err = errno;
	if (((size_t)ret < size) && (ferror(file)))
		ret = -1;
	else
		ret = size;
	fclose(file);
	errno = err;
	return ret;
}

/**
 * Write to sysfs entry.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param[in] entry
 *   Entry name relative to sysfs path.
 * @param[in] buf
 *   Data buffer.
 * @param size
 *   Buffer size.
 *
 * @return
 *   Number of bytes written on success, -1 on failure and errno is set.
 */
static int
priv_sysfs_write(const struct priv *priv, const char *entry,
		 char *buf, size_t size)
{
	char ifname[IF_NAMESIZE];
	FILE *file;
	int ret;
	int err;

	if (priv_get_ifname(priv, &ifname))
		return -1;

	MKSTR(path, "%s/device/net/%s/%s", priv->ctx->device->ibdev_path,
	      ifname, entry);

	file = fopen(path, "wb");
	if (file == NULL)
		return -1;
	ret = fwrite(buf, 1, size, file);
	err = errno;
	if (((size_t)ret < size) || (ferror(file)))
		ret = -1;
	else
		ret = size;
	fclose(file);
	errno = err;
	return ret;
}

/**
 * Get unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param[out] value
 *   Value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_sysfs_ulong(struct priv *priv, const char *name, unsigned long *value)
{
	int ret;
	unsigned long value_ret;
	char value_str[32];

	ret = priv_sysfs_read(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot read %s value from sysfs: %s",
		      name, strerror(errno));
		return -1;
	}
	value_str[ret] = '\0';
	errno = 0;
	value_ret = strtoul(value_str, NULL, 0);
	if (errno) {
		DEBUG("invalid %s value `%s': %s", name, value_str,
		      strerror(errno));
		return -1;
	}
	*value = value_ret;
	return 0;
}

/**
 * Set unsigned long sysfs property.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[in] name
 *   Entry name relative to sysfs path.
 * @param value
 *   Value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_sysfs_ulong(struct priv *priv, const char *name, unsigned long value)
{
	int ret;
	MKSTR(value_str, "%lu", value);

	ret = priv_sysfs_write(priv, name, value_str, (sizeof(value_str) - 1));
	if (ret == -1) {
		DEBUG("cannot write %s `%s' (%lu) to sysfs: %s",
		      name, value_str, value, strerror(errno));
		return -1;
	}
	return 0;
}

/**
 * Perform ifreq ioctl() on associated Ethernet device.
 *
 * @param[in] priv
 *   Pointer to private structure.
 * @param req
 *   Request number to pass to ioctl().
 * @param[out] ifr
 *   Interface request structure output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_ifreq(const struct priv *priv, int req, struct ifreq *ifr)
{
	int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	int ret = -1;

	if (sock == -1)
		return ret;
	if (priv_get_ifname(priv, &ifr->ifr_name) == 0)
		ret = ioctl(sock, req, ifr);
	close(sock);
	return ret;
}

/**
 * Get device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param[out] mtu
 *   MTU value output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_mtu(struct priv *priv, uint16_t *mtu)
{
	unsigned long ulong_mtu;

	if (priv_get_sysfs_ulong(priv, "mtu", &ulong_mtu) == -1)
		return -1;
	*mtu = ulong_mtu;
	return 0;
}

/**
 * Set device MTU.
 *
 * @param priv
 *   Pointer to private structure.
 * @param mtu
 *   MTU value to set.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_mtu(struct priv *priv, uint16_t mtu)
{
	uint16_t new_mtu;

	if (priv_set_sysfs_ulong(priv, "mtu", mtu) ||
	    priv_get_mtu(priv, &new_mtu))
		return -1;
	if (new_mtu == mtu)
		return 0;
	errno = EINVAL;
	return -1;
}

/**
 * Set device flags.
 *
 * @param priv
 *   Pointer to private structure.
 * @param keep
 *   Bitmask for flags that must remain untouched.
 * @param flags
 *   Bitmask for flags to modify.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_set_flags(struct priv *priv, unsigned int keep, unsigned int flags)
{
	unsigned long tmp;

	if (priv_get_sysfs_ulong(priv, "flags", &tmp) == -1)
		return -1;
	tmp &= keep;
	tmp |= (flags & (~keep));
	return priv_set_sysfs_ulong(priv, "flags", tmp);
}
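
/*
 * Usage sketch (illustrative, hypothetical call sites): bringing the
 * interface up or down boils down to a read-modify-write of "flags":
 *
 *   priv_set_flags(priv, ~IFF_UP, IFF_UP);   // set IFF_UP, keep the rest
 *   priv_set_flags(priv, ~IFF_UP, ~IFF_UP);  // clear IFF_UP, keep the rest
 */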

/* Device configuration. */

static int
txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc,
	  unsigned int socket, const struct rte_eth_txconf *conf);

static void
txq_cleanup(struct txq *txq);

static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp);

static void
rxq_cleanup(struct rxq *rxq);

/**
 * Ethernet device configuration.
 *
 * Prepare the driver for a given number of TX and RX queues.
 * Allocate parent RSS queue when several RX queues are requested.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int rxqs_n = dev->data->nb_rx_queues;
	unsigned int txqs_n = dev->data->nb_tx_queues;
	unsigned int tmp;
	int ret;

	priv->rxqs = (void *)dev->data->rx_queues;
	priv->txqs = (void *)dev->data->tx_queues;
	if (txqs_n != priv->txqs_n) {
		INFO("%p: TX queues number update: %u -> %u",
		     (void *)dev, priv->txqs_n, txqs_n);
		priv->txqs_n = txqs_n;
	}
	if (rxqs_n == priv->rxqs_n)
		return 0;
	if (!rte_is_power_of_2(rxqs_n)) {
		unsigned n_active;

		n_active = rte_align32pow2(rxqs_n + 1) >> 1;
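		/*
		 * Worked example (illustrative): rxqs_n = 6 gives
		 * rte_align32pow2(7) = 8 and n_active = 8 >> 1 = 4, the
		 * largest power of two not above rxqs_n.
		 */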
		WARN("%p: number of RX queues must be a power"
		     " of 2: %u queues among %u will be active",
		     (void *)dev, n_active, rxqs_n);
	}

	INFO("%p: RX queues number update: %u -> %u",
	     (void *)dev, priv->rxqs_n, rxqs_n);
	/* If RSS is enabled, disable it first. */
	if (priv->rss) {
		unsigned int i;

		/* Only if there are no remaining child RX queues. */
		for (i = 0; (i != priv->rxqs_n); ++i)
			if ((*priv->rxqs)[i] != NULL)
				return EINVAL;
		rxq_cleanup(&priv->rxq_parent);
		priv->rss = 0;
		priv->rxqs_n = 0;
	}
	if (rxqs_n <= 1) {
		/* Nothing else to do. */
		priv->rxqs_n = rxqs_n;
		return 0;
	}
	/* Allocate a new RSS parent queue if supported by hardware. */
	if (!priv->hw_rss) {
		ERROR("%p: only a single RX queue can be configured when"
		      " hardware doesn't support RSS",
		      (void *)dev);
		return EINVAL;
	}
	/* Fail if hardware doesn't support that many RSS queues. */
	if (rxqs_n >= priv->max_rss_tbl_sz) {
		ERROR("%p: only %u RX queues can be configured for RSS",
		      (void *)dev, priv->max_rss_tbl_sz);
		return EINVAL;
	}
	priv->rss = 1;
	tmp = priv->rxqs_n;
	priv->rxqs_n = rxqs_n;
	ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, 0, NULL, NULL);
	if (!ret)
		return 0;
	/* Failure, rollback. */
	priv->rss = 0;
	priv->rxqs_n = tmp;
	assert(ret > 0);
	return ret;
}

/**
 * DPDK callback for Ethernet device configuration.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_dev_configure(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	priv_lock(priv);
	ret = dev_configure(dev);
	assert(ret >= 0);
	priv_unlock(priv);
	return -ret;
}

static uint16_t mlx4_tx_burst(void *, struct rte_mbuf **, uint16_t);
static uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);

/**
 * Configure secondary process queues from a private data pointer (primary
 * or secondary) and update burst callbacks. Can take place only once.
 *
 * All queues must have been previously created by the primary process to
 * avoid undefined behavior.
 *
 * @param priv
 *   Private data pointer from either primary or secondary process.
 *
 * @return
 *   Private data pointer from secondary process, NULL in case of error.
 */
static struct priv *
mlx4_secondary_data_setup(struct priv *priv)
{
	unsigned int port_id = 0;
	struct mlx4_secondary_data *sd;
	void **tx_queues;
	void **rx_queues;
	unsigned int nb_tx_queues;
	unsigned int nb_rx_queues;
	unsigned int i;

	/* priv must be valid at this point. */
	assert(priv != NULL);
	/* priv->dev must also be valid but may point to local memory from
	 * another process, possibly with the same address and must not
	 * be dereferenced yet. */
	assert(priv->dev != NULL);
	/* Determine port ID by finding out where priv comes from. */
	while (1) {
		sd = &mlx4_secondary_data[port_id];
		rte_spinlock_lock(&sd->lock);
		/* Primary process? */
		if (sd->primary_priv == priv)
			break;
		/* Secondary process? */
		if (sd->data.dev_private == priv)
			break;
		rte_spinlock_unlock(&sd->lock);
		if (++port_id == RTE_DIM(mlx4_secondary_data))
			port_id = 0;
	}
	/* Switch to secondary private structure. If private data has already
	 * been updated by another thread, there is nothing else to do. */
	priv = sd->data.dev_private;
	if (priv->dev->data == &sd->data)
		goto end;
	/* Sanity checks. Secondary private structure is supposed to point
	 * to local eth_dev, itself still pointing to the shared device data
	 * structure allocated by the primary process. */
	assert(sd->shared_dev_data != &sd->data);
	assert(sd->data.nb_tx_queues == 0);
	assert(sd->data.tx_queues == NULL);
	assert(sd->data.nb_rx_queues == 0);
	assert(sd->data.rx_queues == NULL);
	assert(priv != sd->primary_priv);
	assert(priv->dev->data == sd->shared_dev_data);
	assert(priv->txqs_n == 0);
	assert(priv->txqs == NULL);
	assert(priv->rxqs_n == 0);
	assert(priv->rxqs == NULL);
	nb_tx_queues = sd->shared_dev_data->nb_tx_queues;
	nb_rx_queues = sd->shared_dev_data->nb_rx_queues;
	/* Allocate local storage for queues. */
	tx_queues = rte_zmalloc("secondary ethdev->tx_queues",
				sizeof(sd->data.tx_queues[0]) * nb_tx_queues,
				RTE_CACHE_LINE_SIZE);
	rx_queues = rte_zmalloc("secondary ethdev->rx_queues",
				sizeof(sd->data.rx_queues[0]) * nb_rx_queues,
				RTE_CACHE_LINE_SIZE);
	if (tx_queues == NULL || rx_queues == NULL)
		goto error;
	/* Lock to prevent control operations during setup. */
	priv_lock(priv);
	/* TX queues. */
	for (i = 0; i != nb_tx_queues; ++i) {
		struct txq *primary_txq = (*sd->primary_priv->txqs)[i];
		struct txq *txq;

		if (primary_txq == NULL)
			continue;
		txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0,
					primary_txq->socket);
		if (txq != NULL) {
			if (txq_setup(priv->dev,
				      txq,
				      primary_txq->elts_n * MLX4_PMD_SGE_WR_N,
				      primary_txq->socket,
				      NULL) == 0) {
				txq->stats.idx = primary_txq->stats.idx;
				tx_queues[i] = txq;
				continue;
			}
			rte_free(txq);
		}
		while (i) {
			txq = tx_queues[--i];
			txq_cleanup(txq);
			rte_free(txq);
		}
		goto error;
	}
	/* RX queues. */
	for (i = 0; i != nb_rx_queues; ++i) {
		struct rxq *primary_rxq = (*sd->primary_priv->rxqs)[i];

		if (primary_rxq == NULL)
			continue;
		/* Not supported yet. */
		rx_queues[i] = NULL;
	}
	/* Update everything. */
	priv->txqs = (void *)tx_queues;
	priv->txqs_n = nb_tx_queues;
	priv->rxqs = (void *)rx_queues;
	priv->rxqs_n = nb_rx_queues;
	sd->data.rx_queues = rx_queues;
	sd->data.tx_queues = tx_queues;
	sd->data.nb_rx_queues = nb_rx_queues;
	sd->data.nb_tx_queues = nb_tx_queues;
	sd->data.dev_link = sd->shared_dev_data->dev_link;
	sd->data.mtu = sd->shared_dev_data->mtu;
	memcpy(sd->data.rx_queue_state, sd->shared_dev_data->rx_queue_state,
	       sizeof(sd->data.rx_queue_state));
	memcpy(sd->data.tx_queue_state, sd->shared_dev_data->tx_queue_state,
	       sizeof(sd->data.tx_queue_state));
	sd->data.dev_flags = sd->shared_dev_data->dev_flags;
	/* Use local data from now on. */
	rte_mb();
	priv->dev->data = &sd->data;
	rte_mb();
	priv->dev->tx_pkt_burst = mlx4_tx_burst;
	priv->dev->rx_pkt_burst = removed_rx_burst;
	priv_unlock(priv);
end:
	/* More sanity checks. */
	assert(priv->dev->tx_pkt_burst == mlx4_tx_burst);
	assert(priv->dev->rx_pkt_burst == removed_rx_burst);
	assert(priv->dev->data == &sd->data);
	rte_spinlock_unlock(&sd->lock);
	return priv;
error:
	priv_unlock(priv);
	rte_free(tx_queues);
	rte_free(rx_queues);
	rte_spinlock_unlock(&sd->lock);
	return NULL;
}

/* TX queues handling. */

/**
 * Allocate TX queue elements.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param elts_n
 *   Number of elements to allocate.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
txq_alloc_elts(struct txq *txq, unsigned int elts_n)
{
	unsigned int i;
	struct txq_elt (*elts)[elts_n] =
		rte_calloc_socket("TXQ", 1, sizeof(*elts), 0, txq->socket);
	linear_t (*elts_linear)[elts_n] =
		rte_calloc_socket("TXQ", 1, sizeof(*elts_linear), 0,
				  txq->socket);
	struct ibv_mr *mr_linear = NULL;
	int ret = 0;

	if ((elts == NULL) || (elts_linear == NULL)) {
		ERROR("%p: can't allocate packets array", (void *)txq);
		ret = ENOMEM;
		goto error;
	}
	mr_linear =
		ibv_reg_mr(txq->priv->pd, elts_linear, sizeof(*elts_linear),
			   IBV_ACCESS_LOCAL_WRITE);
	if (mr_linear == NULL) {
		ERROR("%p: unable to configure MR, ibv_reg_mr() failed",
		      (void *)txq);
		ret = EINVAL;
		goto error;
	}
	for (i = 0; (i != elts_n); ++i) {
		struct txq_elt *elt = &(*elts)[i];

		elt->buf = NULL;
	}
	DEBUG("%p: allocated and configured %u WRs", (void *)txq, elts_n);
	txq->elts_n = elts_n;
	txq->elts = elts;
	txq->elts_head = 0;
	txq->elts_tail = 0;
	txq->elts_comp = 0;
	/* Request send completion every MLX4_PMD_TX_PER_COMP_REQ packets or
	 * at least 4 times per ring. */
	txq->elts_comp_cd_init =
		((MLX4_PMD_TX_PER_COMP_REQ < (elts_n / 4)) ?
		 MLX4_PMD_TX_PER_COMP_REQ : (elts_n / 4));
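	/*
	 * Worked example (illustrative): with MLX4_PMD_TX_PER_COMP_REQ = 64,
	 * elts_n = 512 yields a countdown of 64 (8 completions per ring)
	 * while elts_n = 128 yields 32, the 4-per-ring floor.
	 */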
	txq->elts_comp_cd = txq->elts_comp_cd_init;
	txq->elts_linear = elts_linear;
	txq->mr_linear = mr_linear;
	assert(ret == 0);
	return 0;
error:
	if (mr_linear != NULL)
		claim_zero(ibv_dereg_mr(mr_linear));

	rte_free(elts_linear);
	rte_free(elts);

	DEBUG("%p: failed, freed everything", (void *)txq);
	assert(ret > 0);
	return ret;
}

/**
 * Free TX queue elements.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static void
txq_free_elts(struct txq *txq)
{
	unsigned int elts_n = txq->elts_n;
	unsigned int elts_head = txq->elts_head;
	unsigned int elts_tail = txq->elts_tail;
	struct txq_elt (*elts)[elts_n] = txq->elts;
	linear_t (*elts_linear)[elts_n] = txq->elts_linear;
	struct ibv_mr *mr_linear = txq->mr_linear;

	DEBUG("%p: freeing WRs", (void *)txq);
	txq->elts_n = 0;
	txq->elts_head = 0;
	txq->elts_tail = 0;
	txq->elts_comp = 0;
	txq->elts_comp_cd = 0;
	txq->elts_comp_cd_init = 0;
	txq->elts = NULL;
	txq->elts_linear = NULL;
	txq->mr_linear = NULL;
	if (mr_linear != NULL)
		claim_zero(ibv_dereg_mr(mr_linear));

	rte_free(elts_linear);
	if (elts == NULL)
		return;
	while (elts_tail != elts_head) {
		struct txq_elt *elt = &(*elts)[elts_tail];

		assert(elt->buf != NULL);
		rte_pktmbuf_free(elt->buf);
#ifndef NDEBUG
		/* Poisoning. */
		memset(elt, 0x77, sizeof(*elt));
#endif
		if (++elts_tail == elts_n)
			elts_tail = 0;
	}
	rte_free(elts);
}

/**
 * Clean up a TX queue.
 *
 * Destroy objects, free allocated memory and reset the structure for reuse.
 *
 * @param txq
 *   Pointer to TX queue structure.
 */
static void
txq_cleanup(struct txq *txq)
{
	struct ibv_exp_release_intf_params params;
	size_t i;

	DEBUG("cleaning up %p", (void *)txq);
	txq_free_elts(txq);
	if (txq->if_qp != NULL) {
		assert(txq->priv != NULL);
		assert(txq->priv->ctx != NULL);
		assert(txq->qp != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(txq->priv->ctx,
						txq->if_qp,
						&params));
	}
	if (txq->if_cq != NULL) {
		assert(txq->priv != NULL);
		assert(txq->priv->ctx != NULL);
		assert(txq->cq != NULL);
		params = (struct ibv_exp_release_intf_params){
			.comp_mask = 0,
		};
		claim_zero(ibv_exp_release_intf(txq->priv->ctx,
						txq->if_cq,
						&params));
	}
	if (txq->qp != NULL)
		claim_zero(ibv_destroy_qp(txq->qp));
	if (txq->cq != NULL)
		claim_zero(ibv_destroy_cq(txq->cq));
	if (txq->rd != NULL) {
		struct ibv_exp_destroy_res_domain_attr attr = {
			.comp_mask = 0,
		};

		assert(txq->priv != NULL);
		assert(txq->priv->ctx != NULL);
		claim_zero(ibv_exp_destroy_res_domain(txq->priv->ctx,
						      txq->rd,
						      &attr));
	}
	for (i = 0; (i != elemof(txq->mp2mr)); ++i) {
		if (txq->mp2mr[i].mp == NULL)
			break;
		assert(txq->mp2mr[i].mr != NULL);
		claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
	}
	memset(txq, 0, sizeof(*txq));
}

/**
 * Manage TX completions.
 *
 * When sending a burst, mlx4_tx_burst() posts several WRs.
 * To improve performance, a completion event is only required once every
 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 * for other WRs, but this information would not be used anyway.
 *
 * @param txq
 *   Pointer to TX queue structure.
 *
 * @return
 *   0 on success, -1 on failure.
 */
static int
txq_complete(struct txq *txq)
{
	unsigned int elts_comp = txq->elts_comp;
	unsigned int elts_tail = txq->elts_tail;
	const unsigned int elts_n = txq->elts_n;
	int wcs_n;

	if (unlikely(elts_comp == 0))
		return 0;
#ifdef DEBUG_SEND
	DEBUG("%p: processing %u work requests completions",
	      (void *)txq, elts_comp);
#endif
	wcs_n = txq->if_cq->poll_cnt(txq->cq, elts_comp);
	if (unlikely(wcs_n == 0))
		return 0;
	if (unlikely(wcs_n < 0)) {
		DEBUG("%p: ibv_poll_cq() failed (wcs_n=%d)",
		      (void *)txq, wcs_n);
		return -1;
	}
	elts_comp -= wcs_n;
	assert(elts_comp <= txq->elts_comp);
	/*
	 * Assume WC status is successful as nothing can be done about it
	 * anyway.
	 */
	elts_tail += wcs_n * txq->elts_comp_cd_init;
	if (elts_tail >= elts_n)
		elts_tail -= elts_n;
	txq->elts_tail = elts_tail;
	txq->elts_comp = elts_comp;
	return 0;
}
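
/*
 * Worked example for the tail arithmetic above (illustrative): with
 * elts_n = 256 and elts_comp_cd_init = 64, polling wcs_n = 2 completions
 * advances elts_tail by 128 entries, wrapping once it reaches elts_n,
 * since each completion stands for a full countdown's worth of sends.
 */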

struct mlx4_check_mempool_data {
	int ret;
	char *start;
	char *end;
};

/* Called by mlx4_check_mempool() when iterating the memory chunks. */
static void mlx4_check_mempool_cb(struct rte_mempool *mp,
	void *opaque, struct rte_mempool_memhdr *memhdr,
	unsigned mem_idx)
{
	struct mlx4_check_mempool_data *data = opaque;

	(void)mp;
	(void)mem_idx;

	/* It already failed, skip the next chunks. */
	if (data->ret != 0)
		return;
	/* It is the first chunk. */
	if (data->start == NULL && data->end == NULL) {
		data->start = memhdr->addr;
		data->end = data->start + memhdr->len;
		return;
	}
	if (data->end == memhdr->addr) {
		data->end += memhdr->len;
		return;
	}
	if (data->start == (char *)memhdr->addr + memhdr->len) {
		data->start -= memhdr->len;
		return;
	}
	/* Error, mempool is not virtually contiguous. */
	data->ret = -1;
}
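
/*
 * Worked example (illustrative, made-up addresses): chunks
 * [0x1000, 0x2000) then [0x2000, 0x3000) grow the area to
 * [0x1000, 0x3000); a third chunk at 0x5000 matches neither end and
 * sets ret = -1, marking the mempool as not virtually contiguous.
 */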

/**
 * Check if a mempool can be used: it must be virtually contiguous.
 *
 * @param[in] mp
 *   Pointer to memory pool.
 * @param[out] start
 *   Pointer to the start address of the mempool virtual memory area.
 * @param[out] end
 *   Pointer to the end address of the mempool virtual memory area.
 *
 * @return
 *   0 on success (mempool is virtually contiguous), -1 on error.
 */
static int mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start,
	uintptr_t *end)
{
	struct mlx4_check_mempool_data data;

	memset(&data, 0, sizeof(data));
	rte_mempool_mem_iter(mp, mlx4_check_mempool_cb, &data);
	*start = (uintptr_t)data.start;
	*end = (uintptr_t)data.end;

	return data.ret;
}

/* For best performance, this function should not be inlined. */
static struct ibv_mr *mlx4_mp2mr(struct ibv_pd *, struct rte_mempool *)
	__attribute__((noinline));

/**
 * Register mempool as a memory region.
 *
 * @param pd
 *   Pointer to protection domain.
 * @param mp
 *   Pointer to memory pool.
 *
 * @return
 *   Memory region pointer, NULL in case of error.
 */
static struct ibv_mr *
mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
{
	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
	uintptr_t start;
	uintptr_t end;
	unsigned int i;

	if (mlx4_check_mempool(mp, &start, &end) != 0) {
		ERROR("mempool %p: not virtually contiguous",
		      (void *)mp);
		return NULL;
	}

	DEBUG("mempool %p area start=%p end=%p size=%zu",
	      (void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	/* Round start and end to page boundary if found in memory segments. */
	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
		uintptr_t addr = (uintptr_t)ms[i].addr;
		size_t len = ms[i].len;
		unsigned int align = ms[i].hugepage_sz;

		if ((start > addr) && (start < addr + len))
			start = RTE_ALIGN_FLOOR(start, align);
		if ((end > addr) && (end < addr + len))
			end = RTE_ALIGN_CEIL(end, align);
	}
	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
	      (void *)mp, (void *)start, (void *)end,
	      (size_t)(end - start));
	return ibv_reg_mr(pd,
			  (void *)start,
			  end - start,
			  IBV_ACCESS_LOCAL_WRITE);
}
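
/*
 * Example of the rounding above (illustrative, made-up addresses): with a
 * 2 MiB hugepage segment, a mempool area of [0x201200, 0x3ff000) becomes
 * [0x200000, 0x400000), so the resulting MR covers whole hugepages rather
 * than cutting through the pages at either end.
 */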

/**
 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
 * the cloned mbuf is allocated is returned instead.
 *
 * @param buf
 *   Pointer to mbuf.
 *
 * @return
 *   Memory pool where data is located for given mbuf.
 */
static struct rte_mempool *
txq_mb2mp(struct rte_mbuf *buf)
{
	if (unlikely(RTE_MBUF_INDIRECT(buf)))
		return rte_mbuf_from_indirect(buf)->pool;
	return buf->pool;
}

/**
 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
 * remove an entry first.
 *
 * @param txq
 *   Pointer to TX queue structure.
 * @param[in] mp
 *   Memory Pool for which a Memory Region lkey must be returned.
 *
 * @return
 *   mr->lkey on success, (uint32_t)-1 on failure.
 */
static uint32_t
txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
{
	unsigned int i;
	struct ibv_mr *mr;

	for (i = 0; (i != elemof(txq->mp2mr)); ++i) {
		if (unlikely(txq->mp2mr[i].mp == NULL)) {
			/* Unknown MP, add a new MR for it. */
			break;
		}
		if (txq->mp2mr[i].mp == mp) {
			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
			return txq->mp2mr[i].lkey;
		}
	}
	/* Add a new entry, register MR first. */
	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
	      (void *)txq, mp->name, (void *)mp);
	mr = mlx4_mp2mr(txq->priv->pd, mp);
	if (unlikely(mr == NULL)) {
		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
		      (void *)txq);
		return (uint32_t)-1;
	}
	if (unlikely(i == elemof(txq->mp2mr))) {
		/* Table is full, remove oldest entry. */
		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
		      (void *)txq);
		--i;
		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
	}
	/* Store the new entry. */
	txq->mp2mr[i].mp = mp;
	txq->mp2mr[i].mr = mr;
	txq->mp2mr[i].lkey = mr->lkey;
	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIx32,
	      (void *)txq, mp->name, (void *)mp, txq->mp2mr[i].lkey);
	return txq->mp2mr[i].lkey;
}
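
/*
 * Cache behavior sketch (illustrative): a TX burst from a known mempool
 * hits one of the MLX4_PMD_TX_MP_CACHE entries and returns its cached
 * lkey without any verbs call; an unknown mempool triggers a (slow)
 * ibv_reg_mr() through mlx4_mp2mr(), and when the table is full the
 * oldest entry (index 0) is deregistered and shifted out first.
 */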
1396 | ||
1397 | struct txq_mp2mr_mbuf_check_data { | |
1398 | int ret; | |
1399 | }; | |
1400 | ||
1401 | /** | |
1402 | * Callback function for rte_mempool_obj_iter() to check whether a given | |
1403 | * mempool object looks like a mbuf. | |
1404 | * | |
1405 | * @param[in] mp | |
1406 | * The mempool pointer | |
1407 | * @param[in] arg | |
1408 | * Context data (struct txq_mp2mr_mbuf_check_data). Contains the | |
1409 | * return value. | |
1410 | * @param[in] obj | |
1411 | * Object address. | |
1412 | * @param index | |
1413 | * Object index, unused. | |
1414 | */ | |
1415 | static void | |
1416 | txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj, | |
1417 | uint32_t index __rte_unused) | |
1418 | { | |
1419 | struct txq_mp2mr_mbuf_check_data *data = arg; | |
1420 | struct rte_mbuf *buf = obj; | |
1421 | ||
1422 | /* Check whether mbuf structure fits element size and whether mempool | |
1423 | * pointer is valid. */ | |
1424 | if (sizeof(*buf) > mp->elt_size || buf->pool != mp) | |
1425 | data->ret = -1; | |
1426 | } | |
1427 | ||
1428 | /** | |
1429 | * Iterator function for rte_mempool_walk() to register existing mempools and | |
1430 | * fill the MP to MR cache of a TX queue. | |
1431 | * | |
1432 | * @param[in] mp | |
1433 | * Memory Pool to register. | |
1434 | * @param *arg | |
1435 | * Pointer to TX queue structure. | |
1436 | */ | |
1437 | static void | |
1438 | txq_mp2mr_iter(struct rte_mempool *mp, void *arg) | |
1439 | { | |
1440 | struct txq *txq = arg; | |
1441 | struct txq_mp2mr_mbuf_check_data data = { | |
1442 | .ret = 0, | |
1443 | }; | |
1444 | ||
1445 | /* Register mempool only if the first element looks like a mbuf. */ | |
1446 | if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 || | |
1447 | data.ret == -1) | |
1448 | return; | |
1449 | txq_mp2mr(txq, mp); | |
1450 | } | |
1451 | ||
1452 | #if MLX4_PMD_SGE_WR_N > 1 | |
1453 | ||
1454 | /** | |
1455 | * Copy scattered mbuf contents to a single linear buffer. | |
1456 | * | |
1457 | * @param[out] linear | |
1458 | * Linear output buffer. | |
1459 | * @param[in] buf | |
1460 | * Scattered input buffer. | |
1461 | * | |
1462 | * @return | |
1463 | * Number of bytes copied to the output buffer or 0 if not large enough. | |
1464 | */ | |
1465 | static unsigned int | |
1466 | linearize_mbuf(linear_t *linear, struct rte_mbuf *buf) | |
1467 | { | |
1468 | unsigned int size = 0; | |
1469 | unsigned int offset; | |
1470 | ||
1471 | do { | |
1472 | unsigned int len = DATA_LEN(buf); | |
1473 | ||
1474 | offset = size; | |
1475 | size += len; | |
1476 | if (unlikely(size > sizeof(*linear))) | |
1477 | return 0; | |
1478 | memcpy(&(*linear)[offset], | |
1479 | rte_pktmbuf_mtod(buf, uint8_t *), | |
1480 | len); | |
1481 | buf = NEXT(buf); | |
1482 | } while (buf != NULL); | |
1483 | return size; | |
1484 | } | |
1485 | ||
1486 | /** | |
1487 | * Handle scattered buffers for mlx4_tx_burst(). | |
1488 | * | |
1489 | * @param txq | |
1490 | * TX queue structure. | |
1491 | * @param segs | |
1492 | * Number of segments in buf. | |
1493 | * @param elt | |
1494 | * TX queue element to fill. | |
1495 | * @param[in] buf | |
1496 | * Buffer to process. | |
1497 | * @param elts_head | |
1498 | * Index of the linear buffer to use if necessary (normally txq->elts_head). | |
1499 | * @param[out] sges | |
1500 | * Array filled with SGEs on success. | |
1501 | * | |
1502 | * @return | |
1503 | * A structure containing the processed packet size in bytes and the | |
1504 | * number of SGEs. Both fields are set to (unsigned int)-1 in case of | |
1505 | * failure. | |
1506 | */ | |
1507 | static struct tx_burst_sg_ret { | |
1508 | unsigned int length; | |
1509 | unsigned int num; | |
1510 | } | |
1511 | tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt, | |
1512 | struct rte_mbuf *buf, unsigned int elts_head, | |
1513 | struct ibv_sge (*sges)[MLX4_PMD_SGE_WR_N]) | |
1514 | { | |
1515 | unsigned int sent_size = 0; | |
1516 | unsigned int j; | |
1517 | int linearize = 0; | |
1518 | ||
1519 | /* When there are too many segments, extra segments are | |
1520 | * linearized in the last SGE. */ | |
1521 | if (unlikely(segs > elemof(*sges))) { | |
1522 | segs = (elemof(*sges) - 1); | |
1523 | linearize = 1; | |
1524 | } | |
1525 | /* Update element. */ | |
1526 | elt->buf = buf; | |
1527 | /* Register segments as SGEs. */ | |
1528 | for (j = 0; (j != segs); ++j) { | |
1529 | struct ibv_sge *sge = &(*sges)[j]; | |
1530 | uint32_t lkey; | |
1531 | ||
1532 | /* Retrieve Memory Region key for this memory pool. */ | |
1533 | lkey = txq_mp2mr(txq, txq_mb2mp(buf)); | |
1534 | if (unlikely(lkey == (uint32_t)-1)) { | |
1535 | /* MR does not exist. */ | |
1536 | DEBUG("%p: unable to get MP <-> MR association", | |
1537 | (void *)txq); | |
1538 | /* Clean up TX element. */ | |
1539 | elt->buf = NULL; | |
1540 | goto stop; | |
1541 | } | |
1542 | /* Update SGE. */ | |
1543 | sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); | |
1544 | if (txq->priv->vf) | |
1545 | rte_prefetch0((volatile void *) | |
1546 | (uintptr_t)sge->addr); | |
1547 | sge->length = DATA_LEN(buf); | |
1548 | sge->lkey = lkey; | |
1549 | sent_size += sge->length; | |
1550 | buf = NEXT(buf); | |
1551 | } | |
1552 | /* If buf is not NULL here and is not going to be linearized, | |
1553 | * nb_segs is not valid. */ | |
1554 | assert(j == segs); | |
1555 | assert((buf == NULL) || (linearize)); | |
1556 | /* Linearize extra segments. */ | |
1557 | if (linearize) { | |
1558 | struct ibv_sge *sge = &(*sges)[segs]; | |
1559 | linear_t *linear = &(*txq->elts_linear)[elts_head]; | |
1560 | unsigned int size = linearize_mbuf(linear, buf); | |
1561 | ||
1562 | assert(segs == (elemof(*sges) - 1)); | |
1563 | if (size == 0) { | |
1564 | /* Invalid packet. */ | |
1565 | DEBUG("%p: packet too large to be linearized.", | |
1566 | (void *)txq); | |
1567 | /* Clean up TX element. */ | |
1568 | elt->buf = NULL; | |
1569 | goto stop; | |
1570 | } | |
1571 | /* If MLX4_PMD_SGE_WR_N is 1, free mbuf immediately. */ | |
1572 | if (elemof(*sges) == 1) { | |
1573 | do { | |
1574 | struct rte_mbuf *next = NEXT(buf); | |
1575 | ||
1576 | rte_pktmbuf_free_seg(buf); | |
1577 | buf = next; | |
1578 | } while (buf != NULL); | |
1579 | elt->buf = NULL; | |
1580 | } | |
1581 | /* Update SGE. */ | |
1582 | sge->addr = (uintptr_t)&(*linear)[0]; | |
1583 | sge->length = size; | |
1584 | sge->lkey = txq->mr_linear->lkey; | |
1585 | sent_size += size; | |
1586 | /* Include last segment. */ | |
1587 | segs++; | |
1588 | } | |
1589 | return (struct tx_burst_sg_ret){ | |
1590 | .length = sent_size, | |
1591 | .num = segs, | |
1592 | }; | |
1593 | stop: | |
1594 | return (struct tx_burst_sg_ret){ | |
1595 | .length = -1, | |
1596 | .num = -1, | |
1597 | }; | |
1598 | } | |
1599 | ||
1600 | #endif /* MLX4_PMD_SGE_WR_N > 1 */ | |
1601 | ||
1602 | /** | |
1603 | * DPDK callback for TX. | |
1604 | * | |
1605 | * @param dpdk_txq | |
1606 | * Generic pointer to TX queue structure. | |
1607 | * @param[in] pkts | |
1608 | * Packets to transmit. | |
1609 | * @param pkts_n | |
1610 | * Number of packets in array. | |
1611 | * | |
1612 | * @return | |
1613 | * Number of packets successfully transmitted (<= pkts_n). | |
1614 | */ | |
1615 | static uint16_t | |
1616 | mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) | |
1617 | { | |
1618 | struct txq *txq = (struct txq *)dpdk_txq; | |
1619 | unsigned int elts_head = txq->elts_head; | |
1620 | const unsigned int elts_n = txq->elts_n; | |
1621 | unsigned int elts_comp_cd = txq->elts_comp_cd; | |
1622 | unsigned int elts_comp = 0; | |
1623 | unsigned int i; | |
1624 | unsigned int max; | |
1625 | int err; | |
1626 | ||
1627 | assert(elts_comp_cd != 0); | |
1628 | txq_complete(txq); | |
1629 | max = (elts_n - (elts_head - txq->elts_tail)); | |
1630 | if (max > elts_n) | |
1631 | max -= elts_n; | |
1632 | assert(max >= 1); | |
1633 | assert(max <= elts_n); | |
1634 | /* Always leave one free entry in the ring. */ | |
1635 | --max; | |
1636 | if (max == 0) | |
1637 | return 0; | |
1638 | if (max > pkts_n) | |
1639 | max = pkts_n; | |
1640 | for (i = 0; (i != max); ++i) { | |
1641 | struct rte_mbuf *buf = pkts[i]; | |
1642 | unsigned int elts_head_next = | |
1643 | (((elts_head + 1) == elts_n) ? 0 : elts_head + 1); | |
1644 | struct txq_elt *elt_next = &(*txq->elts)[elts_head_next]; | |
1645 | struct txq_elt *elt = &(*txq->elts)[elts_head]; | |
1646 | unsigned int segs = NB_SEGS(buf); | |
1647 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
1648 | unsigned int sent_size = 0; | |
1649 | #endif | |
1650 | uint32_t send_flags = 0; | |
1651 | ||
1652 | /* Clean up old buffer. */ | |
1653 | if (likely(elt->buf != NULL)) { | |
1654 | struct rte_mbuf *tmp = elt->buf; | |
1655 | ||
1656 | #ifndef NDEBUG | |
1657 | /* Poisoning. */ | |
1658 | memset(elt, 0x66, sizeof(*elt)); | |
1659 | #endif | |
1660 | /* Faster than rte_pktmbuf_free(). */ | |
1661 | do { | |
1662 | struct rte_mbuf *next = NEXT(tmp); | |
1663 | ||
1664 | rte_pktmbuf_free_seg(tmp); | |
1665 | tmp = next; | |
1666 | } while (tmp != NULL); | |
1667 | } | |
1668 | /* Request TX completion. */ | |
1669 | if (unlikely(--elts_comp_cd == 0)) { | |
1670 | elts_comp_cd = txq->elts_comp_cd_init; | |
1671 | ++elts_comp; | |
1672 | send_flags |= IBV_EXP_QP_BURST_SIGNALED; | |
1673 | } | |
		/* Should we enable HW CKSUM offload? */
1675 | if (buf->ol_flags & | |
1676 | (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) { | |
1677 | send_flags |= IBV_EXP_QP_BURST_IP_CSUM; | |
1678 | /* HW does not support checksum offloads at arbitrary | |
1679 | * offsets but automatically recognizes the packet | |
1680 | * type. For inner L3/L4 checksums, only VXLAN (UDP) | |
1681 | * tunnels are currently supported. */ | |
1682 | if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type)) | |
1683 | send_flags |= IBV_EXP_QP_BURST_TUNNEL; | |
1684 | } | |
1685 | if (likely(segs == 1)) { | |
1686 | uintptr_t addr; | |
1687 | uint32_t length; | |
1688 | uint32_t lkey; | |
1689 | ||
1690 | /* Retrieve buffer information. */ | |
1691 | addr = rte_pktmbuf_mtod(buf, uintptr_t); | |
1692 | length = DATA_LEN(buf); | |
1693 | /* Retrieve Memory Region key for this memory pool. */ | |
1694 | lkey = txq_mp2mr(txq, txq_mb2mp(buf)); | |
1695 | if (unlikely(lkey == (uint32_t)-1)) { | |
1696 | /* MR does not exist. */ | |
1697 | DEBUG("%p: unable to get MP <-> MR" | |
1698 | " association", (void *)txq); | |
1699 | /* Clean up TX element. */ | |
1700 | elt->buf = NULL; | |
1701 | goto stop; | |
1702 | } | |
1703 | /* Update element. */ | |
1704 | elt->buf = buf; | |
1705 | if (txq->priv->vf) | |
1706 | rte_prefetch0((volatile void *) | |
1707 | (uintptr_t)addr); | |
1708 | RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); | |
1709 | /* Put packet into send queue. */ | |
1710 | #if MLX4_PMD_MAX_INLINE > 0 | |
1711 | if (length <= txq->max_inline) | |
1712 | err = txq->if_qp->send_pending_inline | |
1713 | (txq->qp, | |
1714 | (void *)addr, | |
1715 | length, | |
1716 | send_flags); | |
1717 | else | |
1718 | #endif | |
1719 | err = txq->if_qp->send_pending | |
1720 | (txq->qp, | |
1721 | addr, | |
1722 | length, | |
1723 | lkey, | |
1724 | send_flags); | |
1725 | if (unlikely(err)) | |
1726 | goto stop; | |
1727 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
1728 | sent_size += length; | |
1729 | #endif | |
1730 | } else { | |
1731 | #if MLX4_PMD_SGE_WR_N > 1 | |
1732 | struct ibv_sge sges[MLX4_PMD_SGE_WR_N]; | |
1733 | struct tx_burst_sg_ret ret; | |
1734 | ||
1735 | ret = tx_burst_sg(txq, segs, elt, buf, elts_head, | |
1736 | &sges); | |
1737 | if (ret.length == (unsigned int)-1) | |
1738 | goto stop; | |
1739 | RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf); | |
1740 | /* Put SG list into send queue. */ | |
1741 | err = txq->if_qp->send_pending_sg_list | |
1742 | (txq->qp, | |
1743 | sges, | |
1744 | ret.num, | |
1745 | send_flags); | |
1746 | if (unlikely(err)) | |
1747 | goto stop; | |
1748 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
1749 | sent_size += ret.length; | |
1750 | #endif | |
1751 | #else /* MLX4_PMD_SGE_WR_N > 1 */ | |
1752 | DEBUG("%p: TX scattered buffers support not" | |
1753 | " compiled in", (void *)txq); | |
1754 | goto stop; | |
1755 | #endif /* MLX4_PMD_SGE_WR_N > 1 */ | |
1756 | } | |
1757 | elts_head = elts_head_next; | |
1758 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
1759 | /* Increment sent bytes counter. */ | |
1760 | txq->stats.obytes += sent_size; | |
1761 | #endif | |
1762 | } | |
1763 | stop: | |
1764 | /* Take a shortcut if nothing must be sent. */ | |
1765 | if (unlikely(i == 0)) | |
1766 | return 0; | |
1767 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
1768 | /* Increment sent packets counter. */ | |
1769 | txq->stats.opackets += i; | |
1770 | #endif | |
1771 | /* Ring QP doorbell. */ | |
1772 | err = txq->if_qp->send_flush(txq->qp); | |
1773 | if (unlikely(err)) { | |
1774 | /* A nonzero value is not supposed to be returned. | |
1775 | * Nothing can be done about it. */ | |
1776 | DEBUG("%p: send_flush() failed with error %d", | |
1777 | (void *)txq, err); | |
1778 | } | |
1779 | txq->elts_head = elts_head; | |
1780 | txq->elts_comp += elts_comp; | |
1781 | txq->elts_comp_cd = elts_comp_cd; | |
1782 | return i; | |
1783 | } | |
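/*
 * Note: the free-entry computation at the top of mlx4_tx_burst() relies
 * on well-defined unsigned wraparound: when elts_head has wrapped below
 * elts_tail, (elts_head - elts_tail) underflows, and the subsequent
 * "max -= elts_n" correction brings the result back into range. A
 * self-contained sketch of that arithmetic, with hypothetical names:
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#include <assert.h>

/* Free entries of a ring whose indices head/tail live in [0, n). */
static unsigned int
ring_free(unsigned int head, unsigned int tail, unsigned int n)
{
	unsigned int max = n - (head - tail);	/* May wrap twice. */

	if (max > n)
		max -= n;
	assert(max >= 1);
	assert(max <= n);
	return max - 1;	/* Always leave one free entry. */
}
#endif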
1784 | ||
1785 | /** | |
1786 | * DPDK callback for TX in secondary processes. | |
1787 | * | |
1788 | * This function configures all queues from primary process information | |
1789 | * if necessary before reverting to the normal TX burst callback. | |
1790 | * | |
1791 | * @param dpdk_txq | |
1792 | * Generic pointer to TX queue structure. | |
1793 | * @param[in] pkts | |
1794 | * Packets to transmit. | |
1795 | * @param pkts_n | |
1796 | * Number of packets in array. | |
1797 | * | |
1798 | * @return | |
1799 | * Number of packets successfully transmitted (<= pkts_n). | |
1800 | */ | |
1801 | static uint16_t | |
1802 | mlx4_tx_burst_secondary_setup(void *dpdk_txq, struct rte_mbuf **pkts, | |
1803 | uint16_t pkts_n) | |
1804 | { | |
1805 | struct txq *txq = dpdk_txq; | |
1806 | struct priv *priv = mlx4_secondary_data_setup(txq->priv); | |
1807 | struct priv *primary_priv; | |
1808 | unsigned int index; | |
1809 | ||
1810 | if (priv == NULL) | |
1811 | return 0; | |
1812 | primary_priv = | |
1813 | mlx4_secondary_data[priv->dev->data->port_id].primary_priv; | |
1814 | /* Look for queue index in both private structures. */ | |
1815 | for (index = 0; index != priv->txqs_n; ++index) | |
1816 | if (((*primary_priv->txqs)[index] == txq) || | |
1817 | ((*priv->txqs)[index] == txq)) | |
1818 | break; | |
1819 | if (index == priv->txqs_n) | |
1820 | return 0; | |
1821 | txq = (*priv->txqs)[index]; | |
1822 | return priv->dev->tx_pkt_burst(txq, pkts, pkts_n); | |
1823 | } | |
1824 | ||
1825 | /** | |
1826 | * Configure a TX queue. | |
1827 | * | |
1828 | * @param dev | |
1829 | * Pointer to Ethernet device structure. | |
1830 | * @param txq | |
1831 | * Pointer to TX queue structure. | |
1832 | * @param desc | |
1833 | * Number of descriptors to configure in queue. | |
1834 | * @param socket | |
1835 | * NUMA socket on which memory must be allocated. | |
1836 | * @param[in] conf | |
1837 | * Thresholds parameters. | |
1838 | * | |
1839 | * @return | |
1840 | * 0 on success, errno value on failure. | |
1841 | */ | |
1842 | static int | |
1843 | txq_setup(struct rte_eth_dev *dev, struct txq *txq, uint16_t desc, | |
1844 | unsigned int socket, const struct rte_eth_txconf *conf) | |
1845 | { | |
1846 | struct priv *priv = mlx4_get_priv(dev); | |
1847 | struct txq tmpl = { | |
1848 | .priv = priv, | |
1849 | .socket = socket | |
1850 | }; | |
1851 | union { | |
1852 | struct ibv_exp_query_intf_params params; | |
1853 | struct ibv_exp_qp_init_attr init; | |
1854 | struct ibv_exp_res_domain_init_attr rd; | |
1855 | struct ibv_exp_cq_init_attr cq; | |
1856 | struct ibv_exp_qp_attr mod; | |
1857 | } attr; | |
1858 | enum ibv_exp_query_intf_status status; | |
1859 | int ret = 0; | |
1860 | ||
1861 | (void)conf; /* Thresholds configuration (ignored). */ | |
1862 | if (priv == NULL) | |
1863 | return EINVAL; | |
1864 | if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) { | |
1865 | ERROR("%p: invalid number of TX descriptors (must be a" | |
1866 | " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N); | |
1867 | return EINVAL; | |
1868 | } | |
1869 | desc /= MLX4_PMD_SGE_WR_N; | |
1870 | /* MRs will be registered in mp2mr[] later. */ | |
1871 | attr.rd = (struct ibv_exp_res_domain_init_attr){ | |
1872 | .comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL | | |
1873 | IBV_EXP_RES_DOMAIN_MSG_MODEL), | |
1874 | .thread_model = IBV_EXP_THREAD_SINGLE, | |
1875 | .msg_model = IBV_EXP_MSG_HIGH_BW, | |
1876 | }; | |
1877 | tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd); | |
1878 | if (tmpl.rd == NULL) { | |
1879 | ret = ENOMEM; | |
1880 | ERROR("%p: RD creation failure: %s", | |
1881 | (void *)dev, strerror(ret)); | |
1882 | goto error; | |
1883 | } | |
1884 | attr.cq = (struct ibv_exp_cq_init_attr){ | |
1885 | .comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN, | |
1886 | .res_domain = tmpl.rd, | |
1887 | }; | |
1888 | tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq); | |
1889 | if (tmpl.cq == NULL) { | |
1890 | ret = ENOMEM; | |
1891 | ERROR("%p: CQ creation failure: %s", | |
1892 | (void *)dev, strerror(ret)); | |
1893 | goto error; | |
1894 | } | |
1895 | DEBUG("priv->device_attr.max_qp_wr is %d", | |
1896 | priv->device_attr.max_qp_wr); | |
1897 | DEBUG("priv->device_attr.max_sge is %d", | |
1898 | priv->device_attr.max_sge); | |
1899 | attr.init = (struct ibv_exp_qp_init_attr){ | |
1900 | /* CQ to be associated with the send queue. */ | |
1901 | .send_cq = tmpl.cq, | |
1902 | /* CQ to be associated with the receive queue. */ | |
1903 | .recv_cq = tmpl.cq, | |
1904 | .cap = { | |
1905 | /* Max number of outstanding WRs. */ | |
1906 | .max_send_wr = ((priv->device_attr.max_qp_wr < desc) ? | |
1907 | priv->device_attr.max_qp_wr : | |
1908 | desc), | |
1909 | /* Max number of scatter/gather elements in a WR. */ | |
1910 | .max_send_sge = ((priv->device_attr.max_sge < | |
1911 | MLX4_PMD_SGE_WR_N) ? | |
1912 | priv->device_attr.max_sge : | |
1913 | MLX4_PMD_SGE_WR_N), | |
1914 | #if MLX4_PMD_MAX_INLINE > 0 | |
1915 | .max_inline_data = MLX4_PMD_MAX_INLINE, | |
1916 | #endif | |
1917 | }, | |
1918 | .qp_type = IBV_QPT_RAW_PACKET, | |
1919 | /* Do *NOT* enable this, completions events are managed per | |
1920 | * TX burst. */ | |
1921 | .sq_sig_all = 0, | |
1922 | .pd = priv->pd, | |
1923 | .res_domain = tmpl.rd, | |
1924 | .comp_mask = (IBV_EXP_QP_INIT_ATTR_PD | | |
1925 | IBV_EXP_QP_INIT_ATTR_RES_DOMAIN), | |
1926 | }; | |
1927 | tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init); | |
1928 | if (tmpl.qp == NULL) { | |
1929 | ret = (errno ? errno : EINVAL); | |
1930 | ERROR("%p: QP creation failure: %s", | |
1931 | (void *)dev, strerror(ret)); | |
1932 | goto error; | |
1933 | } | |
1934 | #if MLX4_PMD_MAX_INLINE > 0 | |
1935 | /* ibv_create_qp() updates this value. */ | |
1936 | tmpl.max_inline = attr.init.cap.max_inline_data; | |
1937 | #endif | |
1938 | attr.mod = (struct ibv_exp_qp_attr){ | |
1939 | /* Move the QP to this state. */ | |
1940 | .qp_state = IBV_QPS_INIT, | |
1941 | /* Primary port number. */ | |
1942 | .port_num = priv->port | |
1943 | }; | |
1944 | ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, | |
1945 | (IBV_EXP_QP_STATE | IBV_EXP_QP_PORT)); | |
1946 | if (ret) { | |
1947 | ERROR("%p: QP state to IBV_QPS_INIT failed: %s", | |
1948 | (void *)dev, strerror(ret)); | |
1949 | goto error; | |
1950 | } | |
1951 | ret = txq_alloc_elts(&tmpl, desc); | |
1952 | if (ret) { | |
1953 | ERROR("%p: TXQ allocation failed: %s", | |
1954 | (void *)dev, strerror(ret)); | |
1955 | goto error; | |
1956 | } | |
1957 | attr.mod = (struct ibv_exp_qp_attr){ | |
1958 | .qp_state = IBV_QPS_RTR | |
1959 | }; | |
1960 | ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); | |
1961 | if (ret) { | |
1962 | ERROR("%p: QP state to IBV_QPS_RTR failed: %s", | |
1963 | (void *)dev, strerror(ret)); | |
1964 | goto error; | |
1965 | } | |
1966 | attr.mod.qp_state = IBV_QPS_RTS; | |
1967 | ret = ibv_exp_modify_qp(tmpl.qp, &attr.mod, IBV_EXP_QP_STATE); | |
1968 | if (ret) { | |
1969 | ERROR("%p: QP state to IBV_QPS_RTS failed: %s", | |
1970 | (void *)dev, strerror(ret)); | |
1971 | goto error; | |
1972 | } | |
1973 | attr.params = (struct ibv_exp_query_intf_params){ | |
1974 | .intf_scope = IBV_EXP_INTF_GLOBAL, | |
1975 | .intf = IBV_EXP_INTF_CQ, | |
1976 | .obj = tmpl.cq, | |
1977 | }; | |
1978 | tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status); | |
	if (tmpl.if_cq == NULL) {
		ret = EINVAL;
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
1984 | attr.params = (struct ibv_exp_query_intf_params){ | |
1985 | .intf_scope = IBV_EXP_INTF_GLOBAL, | |
1986 | .intf = IBV_EXP_INTF_QP_BURST, | |
1987 | .obj = tmpl.qp, | |
1988 | #ifdef HAVE_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK | |
1989 | /* MC loopback must be disabled when not using a VF. */ | |
1990 | .family_flags = | |
1991 | (!priv->vf ? | |
1992 | IBV_EXP_QP_BURST_CREATE_DISABLE_ETH_LOOPBACK : | |
1993 | 0), | |
1994 | #endif | |
1995 | }; | |
1996 | tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status); | |
	if (tmpl.if_qp == NULL) {
		ret = EINVAL;
		ERROR("%p: QP interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
2002 | /* Clean up txq in case we're reinitializing it. */ | |
	DEBUG("%p: cleaning up old txq just in case", (void *)txq);
2004 | txq_cleanup(txq); | |
2005 | *txq = tmpl; | |
2006 | DEBUG("%p: txq updated with %p", (void *)txq, (void *)&tmpl); | |
2007 | /* Pre-register known mempools. */ | |
2008 | rte_mempool_walk(txq_mp2mr_iter, txq); | |
2009 | assert(ret == 0); | |
2010 | return 0; | |
2011 | error: | |
2012 | txq_cleanup(&tmpl); | |
2013 | assert(ret > 0); | |
2014 | return ret; | |
2015 | } | |
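/*
 * Note: txq_setup() above drives the QP through the standard verbs
 * state ladder, one ibv_exp_modify_qp() call per transition, with
 * element allocation sandwiched between INIT and RTR. Condensed into a
 * helper (qp_to_rts is a hypothetical name, error handling kept):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#include <infiniband/verbs.h>

static int
qp_to_rts(struct ibv_qp *qp, uint8_t port)
{
	struct ibv_exp_qp_attr mod = {
		.qp_state = IBV_QPS_INIT,
		.port_num = port,
	};
	int ret;

	ret = ibv_exp_modify_qp(qp, &mod,
				(IBV_EXP_QP_STATE | IBV_EXP_QP_PORT));
	if (ret)
		return ret;
	mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RTR };
	ret = ibv_exp_modify_qp(qp, &mod, IBV_EXP_QP_STATE);
	if (ret)
		return ret;
	mod.qp_state = IBV_QPS_RTS;
	return ibv_exp_modify_qp(qp, &mod, IBV_EXP_QP_STATE);
}
#endif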
2016 | ||
2017 | /** | |
2018 | * DPDK callback to configure a TX queue. | |
2019 | * | |
2020 | * @param dev | |
2021 | * Pointer to Ethernet device structure. | |
2022 | * @param idx | |
2023 | * TX queue index. | |
2024 | * @param desc | |
2025 | * Number of descriptors to configure in queue. | |
2026 | * @param socket | |
2027 | * NUMA socket on which memory must be allocated. | |
2028 | * @param[in] conf | |
2029 | * Thresholds parameters. | |
2030 | * | |
2031 | * @return | |
2032 | * 0 on success, negative errno value on failure. | |
2033 | */ | |
2034 | static int | |
2035 | mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, | |
2036 | unsigned int socket, const struct rte_eth_txconf *conf) | |
2037 | { | |
2038 | struct priv *priv = dev->data->dev_private; | |
2039 | struct txq *txq = (*priv->txqs)[idx]; | |
2040 | int ret; | |
2041 | ||
2042 | if (mlx4_is_secondary()) | |
2043 | return -E_RTE_SECONDARY; | |
2044 | priv_lock(priv); | |
2045 | DEBUG("%p: configuring queue %u for %u descriptors", | |
2046 | (void *)dev, idx, desc); | |
2047 | if (idx >= priv->txqs_n) { | |
2048 | ERROR("%p: queue index out of range (%u >= %u)", | |
2049 | (void *)dev, idx, priv->txqs_n); | |
2050 | priv_unlock(priv); | |
2051 | return -EOVERFLOW; | |
2052 | } | |
2053 | if (txq != NULL) { | |
2054 | DEBUG("%p: reusing already allocated queue index %u (%p)", | |
2055 | (void *)dev, idx, (void *)txq); | |
2056 | if (priv->started) { | |
2057 | priv_unlock(priv); | |
2058 | return -EEXIST; | |
2059 | } | |
2060 | (*priv->txqs)[idx] = NULL; | |
2061 | txq_cleanup(txq); | |
2062 | } else { | |
2063 | txq = rte_calloc_socket("TXQ", 1, sizeof(*txq), 0, socket); | |
2064 | if (txq == NULL) { | |
2065 | ERROR("%p: unable to allocate queue index %u", | |
2066 | (void *)dev, idx); | |
2067 | priv_unlock(priv); | |
2068 | return -ENOMEM; | |
2069 | } | |
2070 | } | |
2071 | ret = txq_setup(dev, txq, desc, socket, conf); | |
2072 | if (ret) | |
2073 | rte_free(txq); | |
2074 | else { | |
2075 | txq->stats.idx = idx; | |
2076 | DEBUG("%p: adding TX queue %p to list", | |
2077 | (void *)dev, (void *)txq); | |
2078 | (*priv->txqs)[idx] = txq; | |
2079 | /* Update send callback. */ | |
2080 | dev->tx_pkt_burst = mlx4_tx_burst; | |
2081 | } | |
2082 | priv_unlock(priv); | |
2083 | return -ret; | |
2084 | } | |
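/*
 * Note: internal helpers such as txq_setup() return positive errno
 * values, while rte_ethdev callbacks must return negative ones, hence
 * the final "return -ret" above. The convention in one line (helper()
 * is hypothetical):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
static int
dpdk_callback(void)
{
	int ret = helper();	/* 0 on success, errno value on failure. */

	return -ret;	/* 0 stays 0, EINVAL becomes -EINVAL. */
}
#endif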
2085 | ||
2086 | /** | |
2087 | * DPDK callback to release a TX queue. | |
2088 | * | |
2089 | * @param dpdk_txq | |
2090 | * Generic TX queue pointer. | |
2091 | */ | |
2092 | static void | |
2093 | mlx4_tx_queue_release(void *dpdk_txq) | |
2094 | { | |
2095 | struct txq *txq = (struct txq *)dpdk_txq; | |
2096 | struct priv *priv; | |
2097 | unsigned int i; | |
2098 | ||
2099 | if (mlx4_is_secondary()) | |
2100 | return; | |
2101 | if (txq == NULL) | |
2102 | return; | |
2103 | priv = txq->priv; | |
2104 | priv_lock(priv); | |
2105 | for (i = 0; (i != priv->txqs_n); ++i) | |
2106 | if ((*priv->txqs)[i] == txq) { | |
2107 | DEBUG("%p: removing TX queue %p from list", | |
2108 | (void *)priv->dev, (void *)txq); | |
2109 | (*priv->txqs)[i] = NULL; | |
2110 | break; | |
2111 | } | |
2112 | txq_cleanup(txq); | |
2113 | rte_free(txq); | |
2114 | priv_unlock(priv); | |
2115 | } | |
2116 | ||
2117 | /* RX queues handling. */ | |
2118 | ||
2119 | /** | |
2120 | * Allocate RX queue elements with scattered packets support. | |
2121 | * | |
2122 | * @param rxq | |
2123 | * Pointer to RX queue structure. | |
2124 | * @param elts_n | |
2125 | * Number of elements to allocate. | |
2126 | * @param[in] pool | |
2127 | * If not NULL, fetch buffers from this array instead of allocating them | |
2128 | * with rte_pktmbuf_alloc(). | |
2129 | * | |
2130 | * @return | |
2131 | * 0 on success, errno value on failure. | |
2132 | */ | |
2133 | static int | |
2134 | rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n, | |
2135 | struct rte_mbuf **pool) | |
2136 | { | |
2137 | unsigned int i; | |
2138 | struct rxq_elt_sp (*elts)[elts_n] = | |
2139 | rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, | |
2140 | rxq->socket); | |
2141 | int ret = 0; | |
2142 | ||
2143 | if (elts == NULL) { | |
2144 | ERROR("%p: can't allocate packets array", (void *)rxq); | |
2145 | ret = ENOMEM; | |
2146 | goto error; | |
2147 | } | |
2148 | /* For each WR (packet). */ | |
2149 | for (i = 0; (i != elts_n); ++i) { | |
2150 | unsigned int j; | |
2151 | struct rxq_elt_sp *elt = &(*elts)[i]; | |
2152 | struct ibv_recv_wr *wr = &elt->wr; | |
2153 | struct ibv_sge (*sges)[(elemof(elt->sges))] = &elt->sges; | |
2154 | ||
2155 | /* These two arrays must have the same size. */ | |
2156 | assert(elemof(elt->sges) == elemof(elt->bufs)); | |
2157 | /* Configure WR. */ | |
2158 | wr->wr_id = i; | |
2159 | wr->next = &(*elts)[(i + 1)].wr; | |
2160 | wr->sg_list = &(*sges)[0]; | |
2161 | wr->num_sge = elemof(*sges); | |
2162 | /* For each SGE (segment). */ | |
2163 | for (j = 0; (j != elemof(elt->bufs)); ++j) { | |
2164 | struct ibv_sge *sge = &(*sges)[j]; | |
2165 | struct rte_mbuf *buf; | |
2166 | ||
2167 | if (pool != NULL) { | |
2168 | buf = *(pool++); | |
2169 | assert(buf != NULL); | |
2170 | rte_pktmbuf_reset(buf); | |
2171 | } else | |
2172 | buf = rte_pktmbuf_alloc(rxq->mp); | |
2173 | if (buf == NULL) { | |
2174 | assert(pool == NULL); | |
2175 | ERROR("%p: empty mbuf pool", (void *)rxq); | |
2176 | ret = ENOMEM; | |
2177 | goto error; | |
2178 | } | |
2179 | elt->bufs[j] = buf; | |
2180 | /* Headroom is reserved by rte_pktmbuf_alloc(). */ | |
2181 | assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); | |
2182 | /* Buffer is supposed to be empty. */ | |
2183 | assert(rte_pktmbuf_data_len(buf) == 0); | |
2184 | assert(rte_pktmbuf_pkt_len(buf) == 0); | |
2185 | /* sge->addr must be able to store a pointer. */ | |
2186 | assert(sizeof(sge->addr) >= sizeof(uintptr_t)); | |
2187 | if (j == 0) { | |
2188 | /* The first SGE keeps its headroom. */ | |
2189 | sge->addr = rte_pktmbuf_mtod(buf, uintptr_t); | |
2190 | sge->length = (buf->buf_len - | |
2191 | RTE_PKTMBUF_HEADROOM); | |
2192 | } else { | |
				/* Subsequent SGEs lose their headroom. */
2194 | assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); | |
2195 | SET_DATA_OFF(buf, 0); | |
2196 | sge->addr = (uintptr_t)buf->buf_addr; | |
2197 | sge->length = buf->buf_len; | |
2198 | } | |
2199 | sge->lkey = rxq->mr->lkey; | |
2200 | /* Redundant check for tailroom. */ | |
2201 | assert(sge->length == rte_pktmbuf_tailroom(buf)); | |
2202 | } | |
2203 | } | |
2204 | /* The last WR pointer must be NULL. */ | |
2205 | (*elts)[(i - 1)].wr.next = NULL; | |
2206 | DEBUG("%p: allocated and configured %u WRs (%zu segments)", | |
2207 | (void *)rxq, elts_n, (elts_n * elemof((*elts)[0].sges))); | |
2208 | rxq->elts_n = elts_n; | |
2209 | rxq->elts_head = 0; | |
2210 | rxq->elts.sp = elts; | |
2211 | assert(ret == 0); | |
2212 | return 0; | |
2213 | error: | |
2214 | if (elts != NULL) { | |
2215 | assert(pool == NULL); | |
2216 | for (i = 0; (i != elemof(*elts)); ++i) { | |
2217 | unsigned int j; | |
2218 | struct rxq_elt_sp *elt = &(*elts)[i]; | |
2219 | ||
2220 | for (j = 0; (j != elemof(elt->bufs)); ++j) { | |
2221 | struct rte_mbuf *buf = elt->bufs[j]; | |
2222 | ||
2223 | if (buf != NULL) | |
2224 | rte_pktmbuf_free_seg(buf); | |
2225 | } | |
2226 | } | |
2227 | rte_free(elts); | |
2228 | } | |
2229 | DEBUG("%p: failed, freed everything", (void *)rxq); | |
2230 | assert(ret > 0); | |
2231 | return ret; | |
2232 | } | |
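/*
 * Note: rxq_alloc_elts_sp() links each work request to its successor
 * inside the loop, then fixes the resulting off-by-one on purpose:
 * after the loop i == elts_n, so (*elts)[i - 1] is the last element
 * and its stale one-past-the-end link is overwritten with NULL. The
 * pattern in isolation (chain_wrs is a hypothetical name):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#include <stddef.h>
#include <infiniband/verbs.h>

/* Chain n receive WRs so one ibv_post_recv() can repost them all. */
static void
chain_wrs(struct ibv_recv_wr *wr, unsigned int n)
{
	unsigned int i;

	for (i = 0; (i != n); ++i)
		wr[i].next = &wr[i + 1];	/* Last one fixed below. */
	wr[i - 1].next = NULL;	/* i == n here; terminate the chain. */
}
#endif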
2233 | ||
2234 | /** | |
2235 | * Free RX queue elements with scattered packets support. | |
2236 | * | |
2237 | * @param rxq | |
2238 | * Pointer to RX queue structure. | |
2239 | */ | |
2240 | static void | |
2241 | rxq_free_elts_sp(struct rxq *rxq) | |
2242 | { | |
2243 | unsigned int i; | |
2244 | unsigned int elts_n = rxq->elts_n; | |
2245 | struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp; | |
2246 | ||
2247 | DEBUG("%p: freeing WRs", (void *)rxq); | |
2248 | rxq->elts_n = 0; | |
2249 | rxq->elts.sp = NULL; | |
2250 | if (elts == NULL) | |
2251 | return; | |
2252 | for (i = 0; (i != elemof(*elts)); ++i) { | |
2253 | unsigned int j; | |
2254 | struct rxq_elt_sp *elt = &(*elts)[i]; | |
2255 | ||
2256 | for (j = 0; (j != elemof(elt->bufs)); ++j) { | |
2257 | struct rte_mbuf *buf = elt->bufs[j]; | |
2258 | ||
2259 | if (buf != NULL) | |
2260 | rte_pktmbuf_free_seg(buf); | |
2261 | } | |
2262 | } | |
2263 | rte_free(elts); | |
2264 | } | |
2265 | ||
2266 | /** | |
2267 | * Allocate RX queue elements. | |
2268 | * | |
2269 | * @param rxq | |
2270 | * Pointer to RX queue structure. | |
2271 | * @param elts_n | |
2272 | * Number of elements to allocate. | |
2273 | * @param[in] pool | |
2274 | * If not NULL, fetch buffers from this array instead of allocating them | |
2275 | * with rte_pktmbuf_alloc(). | |
2276 | * | |
2277 | * @return | |
2278 | * 0 on success, errno value on failure. | |
2279 | */ | |
2280 | static int | |
2281 | rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n, struct rte_mbuf **pool) | |
2282 | { | |
2283 | unsigned int i; | |
2284 | struct rxq_elt (*elts)[elts_n] = | |
2285 | rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0, | |
2286 | rxq->socket); | |
2287 | int ret = 0; | |
2288 | ||
2289 | if (elts == NULL) { | |
2290 | ERROR("%p: can't allocate packets array", (void *)rxq); | |
2291 | ret = ENOMEM; | |
2292 | goto error; | |
2293 | } | |
2294 | /* For each WR (packet). */ | |
2295 | for (i = 0; (i != elts_n); ++i) { | |
2296 | struct rxq_elt *elt = &(*elts)[i]; | |
2297 | struct ibv_recv_wr *wr = &elt->wr; | |
2298 | struct ibv_sge *sge = &(*elts)[i].sge; | |
2299 | struct rte_mbuf *buf; | |
2300 | ||
2301 | if (pool != NULL) { | |
2302 | buf = *(pool++); | |
2303 | assert(buf != NULL); | |
2304 | rte_pktmbuf_reset(buf); | |
2305 | } else | |
2306 | buf = rte_pktmbuf_alloc(rxq->mp); | |
2307 | if (buf == NULL) { | |
2308 | assert(pool == NULL); | |
2309 | ERROR("%p: empty mbuf pool", (void *)rxq); | |
2310 | ret = ENOMEM; | |
2311 | goto error; | |
2312 | } | |
2313 | /* Configure WR. Work request ID contains its own index in | |
2314 | * the elts array and the offset between SGE buffer header and | |
2315 | * its data. */ | |
2316 | WR_ID(wr->wr_id).id = i; | |
2317 | WR_ID(wr->wr_id).offset = | |
2318 | (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) - | |
2319 | (uintptr_t)buf); | |
2320 | wr->next = &(*elts)[(i + 1)].wr; | |
2321 | wr->sg_list = sge; | |
2322 | wr->num_sge = 1; | |
2323 | /* Headroom is reserved by rte_pktmbuf_alloc(). */ | |
2324 | assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM); | |
2325 | /* Buffer is supposed to be empty. */ | |
2326 | assert(rte_pktmbuf_data_len(buf) == 0); | |
2327 | assert(rte_pktmbuf_pkt_len(buf) == 0); | |
2328 | /* sge->addr must be able to store a pointer. */ | |
2329 | assert(sizeof(sge->addr) >= sizeof(uintptr_t)); | |
2330 | /* SGE keeps its headroom. */ | |
2331 | sge->addr = (uintptr_t) | |
2332 | ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM); | |
2333 | sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM); | |
2334 | sge->lkey = rxq->mr->lkey; | |
2335 | /* Redundant check for tailroom. */ | |
2336 | assert(sge->length == rte_pktmbuf_tailroom(buf)); | |
2337 | /* Make sure elts index and SGE mbuf pointer can be deduced | |
2338 | * from WR ID. */ | |
2339 | if ((WR_ID(wr->wr_id).id != i) || | |
2340 | ((void *)((uintptr_t)sge->addr - | |
2341 | WR_ID(wr->wr_id).offset) != buf)) { | |
2342 | ERROR("%p: cannot store index and offset in WR ID", | |
2343 | (void *)rxq); | |
2344 | sge->addr = 0; | |
2345 | rte_pktmbuf_free(buf); | |
2346 | ret = EOVERFLOW; | |
2347 | goto error; | |
2348 | } | |
2349 | } | |
2350 | /* The last WR pointer must be NULL. */ | |
2351 | (*elts)[(i - 1)].wr.next = NULL; | |
2352 | DEBUG("%p: allocated and configured %u single-segment WRs", | |
2353 | (void *)rxq, elts_n); | |
2354 | rxq->elts_n = elts_n; | |
2355 | rxq->elts_head = 0; | |
2356 | rxq->elts.no_sp = elts; | |
2357 | assert(ret == 0); | |
2358 | return 0; | |
2359 | error: | |
2360 | if (elts != NULL) { | |
2361 | assert(pool == NULL); | |
2362 | for (i = 0; (i != elemof(*elts)); ++i) { | |
2363 | struct rxq_elt *elt = &(*elts)[i]; | |
2364 | struct rte_mbuf *buf; | |
2365 | ||
2366 | if (elt->sge.addr == 0) | |
2367 | continue; | |
2368 | assert(WR_ID(elt->wr.wr_id).id == i); | |
2369 | buf = (void *)((uintptr_t)elt->sge.addr - | |
2370 | WR_ID(elt->wr.wr_id).offset); | |
2371 | rte_pktmbuf_free_seg(buf); | |
2372 | } | |
2373 | rte_free(elts); | |
2374 | } | |
2375 | DEBUG("%p: failed, freed everything", (void *)rxq); | |
2376 | assert(ret > 0); | |
2377 | return ret; | |
2378 | } | |
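/*
 * Note: rxq_alloc_elts() stores two things in each 64-bit wr_id: the
 * element index and the offset between the mbuf pointer and its data
 * area, so completions can recover both without a lookup table. The
 * WR_ID() accessor is defined earlier in this file; the sketch below
 * shows one plausible layout (the union and field widths here are
 * assumptions for illustration):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#include <stdint.h>

typedef union {
	struct {
		uint32_t id;	 /* Index in the elts[] array. */
		uint16_t offset; /* SGE address minus mbuf address. */
	} data;
	uint64_t raw;
} wr_id_t;

#define WR_ID(o) (((wr_id_t *)&(o))->data)

/* Recover the mbuf pointer from a completed WR. */
static inline void *
wr_id_to_mbuf(uintptr_t sge_addr, uint64_t wr_id)
{
	return (void *)(sge_addr - WR_ID(wr_id).offset);
}
#endif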
2379 | ||
2380 | /** | |
2381 | * Free RX queue elements. | |
2382 | * | |
2383 | * @param rxq | |
2384 | * Pointer to RX queue structure. | |
2385 | */ | |
2386 | static void | |
2387 | rxq_free_elts(struct rxq *rxq) | |
2388 | { | |
2389 | unsigned int i; | |
2390 | unsigned int elts_n = rxq->elts_n; | |
2391 | struct rxq_elt (*elts)[elts_n] = rxq->elts.no_sp; | |
2392 | ||
2393 | DEBUG("%p: freeing WRs", (void *)rxq); | |
2394 | rxq->elts_n = 0; | |
2395 | rxq->elts.no_sp = NULL; | |
2396 | if (elts == NULL) | |
2397 | return; | |
2398 | for (i = 0; (i != elemof(*elts)); ++i) { | |
2399 | struct rxq_elt *elt = &(*elts)[i]; | |
2400 | struct rte_mbuf *buf; | |
2401 | ||
2402 | if (elt->sge.addr == 0) | |
2403 | continue; | |
2404 | assert(WR_ID(elt->wr.wr_id).id == i); | |
2405 | buf = (void *)((uintptr_t)elt->sge.addr - | |
2406 | WR_ID(elt->wr.wr_id).offset); | |
2407 | rte_pktmbuf_free_seg(buf); | |
2408 | } | |
2409 | rte_free(elts); | |
2410 | } | |
2411 | ||
2412 | /** | |
2413 | * Delete flow steering rule. | |
2414 | * | |
2415 | * @param rxq | |
2416 | * Pointer to RX queue structure. | |
2417 | * @param mac_index | |
2418 | * MAC address index. | |
2419 | * @param vlan_index | |
2420 | * VLAN index. | |
2421 | */ | |
2422 | static void | |
2423 | rxq_del_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) | |
2424 | { | |
2425 | #ifndef NDEBUG | |
2426 | struct priv *priv = rxq->priv; | |
2427 | const uint8_t (*mac)[ETHER_ADDR_LEN] = | |
2428 | (const uint8_t (*)[ETHER_ADDR_LEN]) | |
2429 | priv->mac[mac_index].addr_bytes; | |
2430 | #endif | |
2431 | assert(rxq->mac_flow[mac_index][vlan_index] != NULL); | |
2432 | DEBUG("%p: removing MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u" | |
2433 | " (VLAN ID %" PRIu16 ")", | |
2434 | (void *)rxq, | |
2435 | (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5], | |
2436 | mac_index, priv->vlan_filter[vlan_index].id); | |
2437 | claim_zero(ibv_destroy_flow(rxq->mac_flow[mac_index][vlan_index])); | |
2438 | rxq->mac_flow[mac_index][vlan_index] = NULL; | |
2439 | } | |
2440 | ||
2441 | /** | |
2442 | * Unregister a MAC address from a RX queue. | |
2443 | * | |
2444 | * @param rxq | |
2445 | * Pointer to RX queue structure. | |
2446 | * @param mac_index | |
2447 | * MAC address index. | |
2448 | */ | |
2449 | static void | |
2450 | rxq_mac_addr_del(struct rxq *rxq, unsigned int mac_index) | |
2451 | { | |
2452 | struct priv *priv = rxq->priv; | |
2453 | unsigned int i; | |
2454 | unsigned int vlans = 0; | |
2455 | ||
2456 | assert(mac_index < elemof(priv->mac)); | |
2457 | if (!BITFIELD_ISSET(rxq->mac_configured, mac_index)) | |
2458 | return; | |
2459 | for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { | |
2460 | if (!priv->vlan_filter[i].enabled) | |
2461 | continue; | |
2462 | rxq_del_flow(rxq, mac_index, i); | |
2463 | vlans++; | |
2464 | } | |
	if (!vlans)
		rxq_del_flow(rxq, mac_index, 0);
2468 | BITFIELD_RESET(rxq->mac_configured, mac_index); | |
2469 | } | |
2470 | ||
2471 | /** | |
2472 | * Unregister all MAC addresses from a RX queue. | |
2473 | * | |
2474 | * @param rxq | |
2475 | * Pointer to RX queue structure. | |
2476 | */ | |
2477 | static void | |
2478 | rxq_mac_addrs_del(struct rxq *rxq) | |
2479 | { | |
2480 | struct priv *priv = rxq->priv; | |
2481 | unsigned int i; | |
2482 | ||
2483 | for (i = 0; (i != elemof(priv->mac)); ++i) | |
2484 | rxq_mac_addr_del(rxq, i); | |
2485 | } | |
2486 | ||
2487 | static int rxq_promiscuous_enable(struct rxq *); | |
2488 | static void rxq_promiscuous_disable(struct rxq *); | |
2489 | ||
2490 | /** | |
2491 | * Add single flow steering rule. | |
2492 | * | |
2493 | * @param rxq | |
2494 | * Pointer to RX queue structure. | |
2495 | * @param mac_index | |
2496 | * MAC address index to register. | |
2497 | * @param vlan_index | |
2498 | * VLAN index. Use -1 for a flow without VLAN. | |
2499 | * | |
2500 | * @return | |
2501 | * 0 on success, errno value on failure. | |
2502 | */ | |
2503 | static int | |
2504 | rxq_add_flow(struct rxq *rxq, unsigned int mac_index, unsigned int vlan_index) | |
2505 | { | |
2506 | struct ibv_flow *flow; | |
2507 | struct priv *priv = rxq->priv; | |
2508 | const uint8_t (*mac)[ETHER_ADDR_LEN] = | |
2509 | (const uint8_t (*)[ETHER_ADDR_LEN]) | |
2510 | priv->mac[mac_index].addr_bytes; | |
2511 | ||
2512 | /* Allocate flow specification on the stack. */ | |
2513 | struct __attribute__((packed)) { | |
2514 | struct ibv_flow_attr attr; | |
2515 | struct ibv_flow_spec_eth spec; | |
2516 | } data; | |
2517 | struct ibv_flow_attr *attr = &data.attr; | |
2518 | struct ibv_flow_spec_eth *spec = &data.spec; | |
2519 | ||
2520 | assert(mac_index < elemof(priv->mac)); | |
2521 | assert((vlan_index < elemof(priv->vlan_filter)) || (vlan_index == -1u)); | |
2522 | /* | |
2523 | * No padding must be inserted by the compiler between attr and spec. | |
2524 | * This layout is expected by libibverbs. | |
2525 | */ | |
2526 | assert(((uint8_t *)attr + sizeof(*attr)) == (uint8_t *)spec); | |
2527 | *attr = (struct ibv_flow_attr){ | |
2528 | .type = IBV_FLOW_ATTR_NORMAL, | |
2529 | .num_of_specs = 1, | |
2530 | .port = priv->port, | |
2531 | .flags = 0 | |
2532 | }; | |
2533 | *spec = (struct ibv_flow_spec_eth){ | |
2534 | .type = IBV_FLOW_SPEC_ETH, | |
2535 | .size = sizeof(*spec), | |
2536 | .val = { | |
2537 | .dst_mac = { | |
2538 | (*mac)[0], (*mac)[1], (*mac)[2], | |
2539 | (*mac)[3], (*mac)[4], (*mac)[5] | |
2540 | }, | |
2541 | .vlan_tag = ((vlan_index != -1u) ? | |
2542 | htons(priv->vlan_filter[vlan_index].id) : | |
2543 | 0), | |
2544 | }, | |
2545 | .mask = { | |
2546 | .dst_mac = "\xff\xff\xff\xff\xff\xff", | |
2547 | .vlan_tag = ((vlan_index != -1u) ? htons(0xfff) : 0), | |
2548 | } | |
2549 | }; | |
2550 | DEBUG("%p: adding MAC address %02x:%02x:%02x:%02x:%02x:%02x index %u" | |
2551 | " (VLAN %s %" PRIu16 ")", | |
2552 | (void *)rxq, | |
2553 | (*mac)[0], (*mac)[1], (*mac)[2], (*mac)[3], (*mac)[4], (*mac)[5], | |
2554 | mac_index, | |
2555 | ((vlan_index != -1u) ? "ID" : "index"), | |
2556 | ((vlan_index != -1u) ? priv->vlan_filter[vlan_index].id : -1u)); | |
2557 | /* Create related flow. */ | |
2558 | errno = 0; | |
2559 | flow = ibv_create_flow(rxq->qp, attr); | |
2560 | if (flow == NULL) { | |
2561 | /* It's not clear whether errno is always set in this case. */ | |
2562 | ERROR("%p: flow configuration failed, errno=%d: %s", | |
2563 | (void *)rxq, errno, | |
2564 | (errno ? strerror(errno) : "Unknown error")); | |
2565 | if (errno) | |
2566 | return errno; | |
2567 | return EINVAL; | |
2568 | } | |
2569 | if (vlan_index == -1u) | |
2570 | vlan_index = 0; | |
2571 | assert(rxq->mac_flow[mac_index][vlan_index] == NULL); | |
2572 | rxq->mac_flow[mac_index][vlan_index] = flow; | |
2573 | return 0; | |
2574 | } | |
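/*
 * Note: libibverbs expects the ibv_flow_spec_eth structure to start
 * exactly sizeof(struct ibv_flow_attr) bytes after attr, which is why
 * rxq_add_flow() packs both into one struct and asserts contiguity at
 * run time. With a C11 toolchain the same check can be done at compile
 * time (flow_data is a hypothetical name):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#include <assert.h>
#include <stddef.h>
#include <infiniband/verbs.h>

struct __attribute__((packed)) flow_data {
	struct ibv_flow_attr attr;
	struct ibv_flow_spec_eth spec;
};

static_assert(offsetof(struct flow_data, spec) ==
	      sizeof(struct ibv_flow_attr),
	      "spec must immediately follow attr");
#endif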
2575 | ||
2576 | /** | |
2577 | * Register a MAC address in a RX queue. | |
2578 | * | |
2579 | * @param rxq | |
2580 | * Pointer to RX queue structure. | |
2581 | * @param mac_index | |
2582 | * MAC address index to register. | |
2583 | * | |
2584 | * @return | |
2585 | * 0 on success, errno value on failure. | |
2586 | */ | |
2587 | static int | |
2588 | rxq_mac_addr_add(struct rxq *rxq, unsigned int mac_index) | |
2589 | { | |
2590 | struct priv *priv = rxq->priv; | |
2591 | unsigned int i; | |
2592 | unsigned int vlans = 0; | |
2593 | int ret; | |
2594 | ||
2595 | assert(mac_index < elemof(priv->mac)); | |
2596 | if (BITFIELD_ISSET(rxq->mac_configured, mac_index)) | |
2597 | rxq_mac_addr_del(rxq, mac_index); | |
2598 | /* Fill VLAN specifications. */ | |
2599 | for (i = 0; (i != elemof(priv->vlan_filter)); ++i) { | |
2600 | if (!priv->vlan_filter[i].enabled) | |
2601 | continue; | |
2602 | /* Create related flow. */ | |
2603 | ret = rxq_add_flow(rxq, mac_index, i); | |
2604 | if (!ret) { | |
2605 | vlans++; | |
2606 | continue; | |
2607 | } | |
2608 | /* Failure, rollback. */ | |
2609 | while (i != 0) | |
2610 | if (priv->vlan_filter[--i].enabled) | |
2611 | rxq_del_flow(rxq, mac_index, i); | |
2612 | assert(ret > 0); | |
2613 | return ret; | |
2614 | } | |
2615 | /* In case there is no VLAN filter. */ | |
2616 | if (!vlans) { | |
2617 | ret = rxq_add_flow(rxq, mac_index, -1); | |
2618 | if (ret) | |
2619 | return ret; | |
2620 | } | |
2621 | BITFIELD_SET(rxq->mac_configured, mac_index); | |
2622 | return 0; | |
2623 | } | |
2624 | ||
2625 | /** | |
2626 | * Register all MAC addresses in a RX queue. | |
2627 | * | |
2628 | * @param rxq | |
2629 | * Pointer to RX queue structure. | |
2630 | * | |
2631 | * @return | |
2632 | * 0 on success, errno value on failure. | |
2633 | */ | |
2634 | static int | |
2635 | rxq_mac_addrs_add(struct rxq *rxq) | |
2636 | { | |
2637 | struct priv *priv = rxq->priv; | |
2638 | unsigned int i; | |
2639 | int ret; | |
2640 | ||
2641 | for (i = 0; (i != elemof(priv->mac)); ++i) { | |
2642 | if (!BITFIELD_ISSET(priv->mac_configured, i)) | |
2643 | continue; | |
2644 | ret = rxq_mac_addr_add(rxq, i); | |
2645 | if (!ret) | |
2646 | continue; | |
2647 | /* Failure, rollback. */ | |
2648 | while (i != 0) | |
2649 | rxq_mac_addr_del(rxq, --i); | |
2650 | assert(ret > 0); | |
2651 | return ret; | |
2652 | } | |
2653 | return 0; | |
2654 | } | |
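/*
 * Note: rxq_mac_addr_add(), rxq_mac_addrs_add() above and
 * priv_mac_addr_add() below all share the same failure handling: undo
 * every completed step in reverse order before propagating the error.
 * Reduced to its skeleton (do_one/undo_one are hypothetical):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
static int
apply_all(unsigned int n,
	  int (*do_one)(unsigned int),
	  void (*undo_one)(unsigned int))
{
	unsigned int i;
	int ret;

	for (i = 0; (i != n); ++i) {
		ret = do_one(i);
		if (!ret)
			continue;
		/* Failure, rollback. */
		while (i != 0)
			undo_one(--i);
		return ret;
	}
	return 0;
}
#endif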
2655 | ||
2656 | /** | |
2657 | * Unregister a MAC address. | |
2658 | * | |
2659 | * In RSS mode, the MAC address is unregistered from the parent queue, | |
2660 | * otherwise it is unregistered from each queue directly. | |
2661 | * | |
2662 | * @param priv | |
2663 | * Pointer to private structure. | |
2664 | * @param mac_index | |
2665 | * MAC address index. | |
2666 | */ | |
2667 | static void | |
2668 | priv_mac_addr_del(struct priv *priv, unsigned int mac_index) | |
2669 | { | |
2670 | unsigned int i; | |
2671 | ||
2672 | assert(mac_index < elemof(priv->mac)); | |
2673 | if (!BITFIELD_ISSET(priv->mac_configured, mac_index)) | |
2674 | return; | |
2675 | if (priv->rss) { | |
2676 | rxq_mac_addr_del(&priv->rxq_parent, mac_index); | |
2677 | goto end; | |
2678 | } | |
2679 | for (i = 0; (i != priv->dev->data->nb_rx_queues); ++i) | |
2680 | rxq_mac_addr_del((*priv->rxqs)[i], mac_index); | |
2681 | end: | |
2682 | BITFIELD_RESET(priv->mac_configured, mac_index); | |
2683 | } | |
2684 | ||
2685 | /** | |
2686 | * Register a MAC address. | |
2687 | * | |
2688 | * In RSS mode, the MAC address is registered in the parent queue, | |
2689 | * otherwise it is registered in each queue directly. | |
2690 | * | |
2691 | * @param priv | |
2692 | * Pointer to private structure. | |
2693 | * @param mac_index | |
2694 | * MAC address index to use. | |
2695 | * @param mac | |
2696 | * MAC address to register. | |
2697 | * | |
2698 | * @return | |
2699 | * 0 on success, errno value on failure. | |
2700 | */ | |
2701 | static int | |
2702 | priv_mac_addr_add(struct priv *priv, unsigned int mac_index, | |
2703 | const uint8_t (*mac)[ETHER_ADDR_LEN]) | |
2704 | { | |
2705 | unsigned int i; | |
2706 | int ret; | |
2707 | ||
2708 | assert(mac_index < elemof(priv->mac)); | |
2709 | /* First, make sure this address isn't already configured. */ | |
2710 | for (i = 0; (i != elemof(priv->mac)); ++i) { | |
2711 | /* Skip this index, it's going to be reconfigured. */ | |
2712 | if (i == mac_index) | |
2713 | continue; | |
2714 | if (!BITFIELD_ISSET(priv->mac_configured, i)) | |
2715 | continue; | |
2716 | if (memcmp(priv->mac[i].addr_bytes, *mac, sizeof(*mac))) | |
2717 | continue; | |
2718 | /* Address already configured elsewhere, return with error. */ | |
2719 | return EADDRINUSE; | |
2720 | } | |
2721 | if (BITFIELD_ISSET(priv->mac_configured, mac_index)) | |
2722 | priv_mac_addr_del(priv, mac_index); | |
2723 | priv->mac[mac_index] = (struct ether_addr){ | |
2724 | { | |
2725 | (*mac)[0], (*mac)[1], (*mac)[2], | |
2726 | (*mac)[3], (*mac)[4], (*mac)[5] | |
2727 | } | |
2728 | }; | |
2729 | /* If device isn't started, this is all we need to do. */ | |
2730 | if (!priv->started) { | |
2731 | #ifndef NDEBUG | |
2732 | /* Verify that all queues have this index disabled. */ | |
2733 | for (i = 0; (i != priv->rxqs_n); ++i) { | |
2734 | if ((*priv->rxqs)[i] == NULL) | |
2735 | continue; | |
2736 | assert(!BITFIELD_ISSET | |
2737 | ((*priv->rxqs)[i]->mac_configured, mac_index)); | |
2738 | } | |
2739 | #endif | |
2740 | goto end; | |
2741 | } | |
2742 | if (priv->rss) { | |
2743 | ret = rxq_mac_addr_add(&priv->rxq_parent, mac_index); | |
2744 | if (ret) | |
2745 | return ret; | |
2746 | goto end; | |
2747 | } | |
2748 | for (i = 0; (i != priv->rxqs_n); ++i) { | |
2749 | if ((*priv->rxqs)[i] == NULL) | |
2750 | continue; | |
2751 | ret = rxq_mac_addr_add((*priv->rxqs)[i], mac_index); | |
2752 | if (!ret) | |
2753 | continue; | |
2754 | /* Failure, rollback. */ | |
2755 | while (i != 0) | |
2756 | if ((*priv->rxqs)[(--i)] != NULL) | |
2757 | rxq_mac_addr_del((*priv->rxqs)[i], mac_index); | |
2758 | return ret; | |
2759 | } | |
2760 | end: | |
2761 | BITFIELD_SET(priv->mac_configured, mac_index); | |
2762 | return 0; | |
2763 | } | |
2764 | ||
2765 | /** | |
2766 | * Enable allmulti mode in a RX queue. | |
2767 | * | |
2768 | * @param rxq | |
2769 | * Pointer to RX queue structure. | |
2770 | * | |
2771 | * @return | |
2772 | * 0 on success, errno value on failure. | |
2773 | */ | |
2774 | static int | |
2775 | rxq_allmulticast_enable(struct rxq *rxq) | |
2776 | { | |
2777 | struct ibv_flow *flow; | |
2778 | struct ibv_flow_attr attr = { | |
2779 | .type = IBV_FLOW_ATTR_MC_DEFAULT, | |
2780 | .num_of_specs = 0, | |
2781 | .port = rxq->priv->port, | |
2782 | .flags = 0 | |
2783 | }; | |
2784 | ||
2785 | DEBUG("%p: enabling allmulticast mode", (void *)rxq); | |
2786 | if (rxq->allmulti_flow != NULL) | |
2787 | return EBUSY; | |
2788 | errno = 0; | |
2789 | flow = ibv_create_flow(rxq->qp, &attr); | |
2790 | if (flow == NULL) { | |
2791 | /* It's not clear whether errno is always set in this case. */ | |
2792 | ERROR("%p: flow configuration failed, errno=%d: %s", | |
2793 | (void *)rxq, errno, | |
2794 | (errno ? strerror(errno) : "Unknown error")); | |
2795 | if (errno) | |
2796 | return errno; | |
2797 | return EINVAL; | |
2798 | } | |
2799 | rxq->allmulti_flow = flow; | |
2800 | DEBUG("%p: allmulticast mode enabled", (void *)rxq); | |
2801 | return 0; | |
2802 | } | |
2803 | ||
2804 | /** | |
2805 | * Disable allmulti mode in a RX queue. | |
2806 | * | |
2807 | * @param rxq | |
2808 | * Pointer to RX queue structure. | |
2809 | */ | |
2810 | static void | |
2811 | rxq_allmulticast_disable(struct rxq *rxq) | |
2812 | { | |
2813 | DEBUG("%p: disabling allmulticast mode", (void *)rxq); | |
2814 | if (rxq->allmulti_flow == NULL) | |
2815 | return; | |
2816 | claim_zero(ibv_destroy_flow(rxq->allmulti_flow)); | |
2817 | rxq->allmulti_flow = NULL; | |
2818 | DEBUG("%p: allmulticast mode disabled", (void *)rxq); | |
2819 | } | |
2820 | ||
2821 | /** | |
2822 | * Enable promiscuous mode in a RX queue. | |
2823 | * | |
2824 | * @param rxq | |
2825 | * Pointer to RX queue structure. | |
2826 | * | |
2827 | * @return | |
2828 | * 0 on success, errno value on failure. | |
2829 | */ | |
2830 | static int | |
2831 | rxq_promiscuous_enable(struct rxq *rxq) | |
2832 | { | |
2833 | struct ibv_flow *flow; | |
2834 | struct ibv_flow_attr attr = { | |
2835 | .type = IBV_FLOW_ATTR_ALL_DEFAULT, | |
2836 | .num_of_specs = 0, | |
2837 | .port = rxq->priv->port, | |
2838 | .flags = 0 | |
2839 | }; | |
2840 | ||
2841 | if (rxq->priv->vf) | |
2842 | return 0; | |
2843 | DEBUG("%p: enabling promiscuous mode", (void *)rxq); | |
2844 | if (rxq->promisc_flow != NULL) | |
2845 | return EBUSY; | |
2846 | errno = 0; | |
2847 | flow = ibv_create_flow(rxq->qp, &attr); | |
2848 | if (flow == NULL) { | |
2849 | /* It's not clear whether errno is always set in this case. */ | |
2850 | ERROR("%p: flow configuration failed, errno=%d: %s", | |
2851 | (void *)rxq, errno, | |
2852 | (errno ? strerror(errno) : "Unknown error")); | |
2853 | if (errno) | |
2854 | return errno; | |
2855 | return EINVAL; | |
2856 | } | |
2857 | rxq->promisc_flow = flow; | |
2858 | DEBUG("%p: promiscuous mode enabled", (void *)rxq); | |
2859 | return 0; | |
2860 | } | |
2861 | ||
2862 | /** | |
2863 | * Disable promiscuous mode in a RX queue. | |
2864 | * | |
2865 | * @param rxq | |
2866 | * Pointer to RX queue structure. | |
2867 | */ | |
2868 | static void | |
2869 | rxq_promiscuous_disable(struct rxq *rxq) | |
2870 | { | |
2871 | if (rxq->priv->vf) | |
2872 | return; | |
2873 | DEBUG("%p: disabling promiscuous mode", (void *)rxq); | |
2874 | if (rxq->promisc_flow == NULL) | |
2875 | return; | |
2876 | claim_zero(ibv_destroy_flow(rxq->promisc_flow)); | |
2877 | rxq->promisc_flow = NULL; | |
2878 | DEBUG("%p: promiscuous mode disabled", (void *)rxq); | |
2879 | } | |
2880 | ||
2881 | /** | |
2882 | * Clean up a RX queue. | |
2883 | * | |
2884 | * Destroy objects, free allocated memory and reset the structure for reuse. | |
2885 | * | |
2886 | * @param rxq | |
2887 | * Pointer to RX queue structure. | |
2888 | */ | |
2889 | static void | |
2890 | rxq_cleanup(struct rxq *rxq) | |
2891 | { | |
2892 | struct ibv_exp_release_intf_params params; | |
2893 | ||
2894 | DEBUG("cleaning up %p", (void *)rxq); | |
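	/*
	 * Teardown order matters: QP and CQ interfaces are released
	 * first, flows are destroyed before their QP, the QP before its
	 * CQ, and the resource domain and memory region last.
	 */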
2895 | if (rxq->sp) | |
2896 | rxq_free_elts_sp(rxq); | |
2897 | else | |
2898 | rxq_free_elts(rxq); | |
2899 | if (rxq->if_qp != NULL) { | |
2900 | assert(rxq->priv != NULL); | |
2901 | assert(rxq->priv->ctx != NULL); | |
2902 | assert(rxq->qp != NULL); | |
2903 | params = (struct ibv_exp_release_intf_params){ | |
2904 | .comp_mask = 0, | |
2905 | }; | |
2906 | claim_zero(ibv_exp_release_intf(rxq->priv->ctx, | |
2907 | rxq->if_qp, | |
2908 | ¶ms)); | |
2909 | } | |
2910 | if (rxq->if_cq != NULL) { | |
2911 | assert(rxq->priv != NULL); | |
2912 | assert(rxq->priv->ctx != NULL); | |
2913 | assert(rxq->cq != NULL); | |
2914 | params = (struct ibv_exp_release_intf_params){ | |
2915 | .comp_mask = 0, | |
2916 | }; | |
2917 | claim_zero(ibv_exp_release_intf(rxq->priv->ctx, | |
2918 | rxq->if_cq, | |
2919 | ¶ms)); | |
2920 | } | |
2921 | if (rxq->qp != NULL) { | |
2922 | rxq_promiscuous_disable(rxq); | |
2923 | rxq_allmulticast_disable(rxq); | |
2924 | rxq_mac_addrs_del(rxq); | |
2925 | claim_zero(ibv_destroy_qp(rxq->qp)); | |
2926 | } | |
2927 | if (rxq->cq != NULL) | |
2928 | claim_zero(ibv_destroy_cq(rxq->cq)); | |
2929 | if (rxq->rd != NULL) { | |
2930 | struct ibv_exp_destroy_res_domain_attr attr = { | |
2931 | .comp_mask = 0, | |
2932 | }; | |
2933 | ||
2934 | assert(rxq->priv != NULL); | |
2935 | assert(rxq->priv->ctx != NULL); | |
2936 | claim_zero(ibv_exp_destroy_res_domain(rxq->priv->ctx, | |
2937 | rxq->rd, | |
2938 | &attr)); | |
2939 | } | |
2940 | if (rxq->mr != NULL) | |
2941 | claim_zero(ibv_dereg_mr(rxq->mr)); | |
2942 | memset(rxq, 0, sizeof(*rxq)); | |
2943 | } | |
2944 | ||
2945 | /** | |
2946 | * Translate RX completion flags to packet type. | |
2947 | * | |
2948 | * @param flags | |
2949 | * RX completion flags returned by poll_length_flags(). | |
2950 | * | |
2951 | * @note: fix mlx4_dev_supported_ptypes_get() if any change here. | |
2952 | * | |
2953 | * @return | |
2954 | * Packet type for struct rte_mbuf. | |
2955 | */ | |
2956 | static inline uint32_t | |
2957 | rxq_cq_to_pkt_type(uint32_t flags) | |
2958 | { | |
2959 | uint32_t pkt_type; | |
2960 | ||
2961 | if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) | |
2962 | pkt_type = | |
2963 | TRANSPOSE(flags, | |
2964 | IBV_EXP_CQ_RX_OUTER_IPV4_PACKET, RTE_PTYPE_L3_IPV4) | | |
2965 | TRANSPOSE(flags, | |
2966 | IBV_EXP_CQ_RX_OUTER_IPV6_PACKET, RTE_PTYPE_L3_IPV6) | | |
2967 | TRANSPOSE(flags, | |
2968 | IBV_EXP_CQ_RX_IPV4_PACKET, RTE_PTYPE_INNER_L3_IPV4) | | |
2969 | TRANSPOSE(flags, | |
2970 | IBV_EXP_CQ_RX_IPV6_PACKET, RTE_PTYPE_INNER_L3_IPV6); | |
2971 | else | |
2972 | pkt_type = | |
2973 | TRANSPOSE(flags, | |
2974 | IBV_EXP_CQ_RX_IPV4_PACKET, RTE_PTYPE_L3_IPV4) | | |
2975 | TRANSPOSE(flags, | |
2976 | IBV_EXP_CQ_RX_IPV6_PACKET, RTE_PTYPE_L3_IPV6); | |
2977 | return pkt_type; | |
2978 | } | |
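/*
 * Note: TRANSPOSE() moves a flag bit from its position in one bit mask
 * to its position in another; with power-of-two masks the division or
 * multiplication folds into a shift. The actual macro is defined
 * earlier in this file and may differ; the sketch below is one
 * definition consistent with its use above:
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#define TRANSPOSE(val, from, to) \
	(((from) >= (to)) ? \
	 (((val) & (from)) / ((from) / (to))) : \
	 (((val) & (from)) * ((to) / (from))))

/* E.g. TRANSPOSE(flags, 0x10, 0x02) yields 0x02 iff 0x10 is set. */
#endif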
2979 | ||
2980 | /** | |
2981 | * Translate RX completion flags to offload flags. | |
2982 | * | |
2983 | * @param[in] rxq | |
2984 | * Pointer to RX queue structure. | |
2985 | * @param flags | |
2986 | * RX completion flags returned by poll_length_flags(). | |
2987 | * | |
2988 | * @return | |
2989 | * Offload flags (ol_flags) for struct rte_mbuf. | |
2990 | */ | |
2991 | static inline uint32_t | |
2992 | rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags) | |
2993 | { | |
2994 | uint32_t ol_flags = 0; | |
2995 | ||
2996 | if (rxq->csum) | |
2997 | ol_flags |= | |
2998 | TRANSPOSE(flags, | |
2999 | IBV_EXP_CQ_RX_IP_CSUM_OK, | |
3000 | PKT_RX_IP_CKSUM_GOOD) | | |
3001 | TRANSPOSE(flags, | |
3002 | IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK, | |
3003 | PKT_RX_L4_CKSUM_GOOD); | |
3004 | if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun)) | |
3005 | ol_flags |= | |
3006 | TRANSPOSE(flags, | |
3007 | IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK, | |
3008 | PKT_RX_IP_CKSUM_GOOD) | | |
3009 | TRANSPOSE(flags, | |
3010 | IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK, | |
3011 | PKT_RX_L4_CKSUM_GOOD); | |
3012 | return ol_flags; | |
3013 | } | |
3014 | ||
3015 | static uint16_t | |
3016 | mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); | |
3017 | ||
3018 | /** | |
3019 | * DPDK callback for RX with scattered packets support. | |
3020 | * | |
3021 | * @param dpdk_rxq | |
3022 | * Generic pointer to RX queue structure. | |
3023 | * @param[out] pkts | |
3024 | * Array to store received packets. | |
3025 | * @param pkts_n | |
3026 | * Maximum number of packets in array. | |
3027 | * | |
3028 | * @return | |
3029 | * Number of packets successfully received (<= pkts_n). | |
3030 | */ | |
3031 | static uint16_t | |
3032 | mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) | |
3033 | { | |
3034 | struct rxq *rxq = (struct rxq *)dpdk_rxq; | |
3035 | struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; | |
3036 | const unsigned int elts_n = rxq->elts_n; | |
3037 | unsigned int elts_head = rxq->elts_head; | |
3038 | struct ibv_recv_wr head; | |
3039 | struct ibv_recv_wr **next = &head.next; | |
3040 | struct ibv_recv_wr *bad_wr; | |
3041 | unsigned int i; | |
3042 | unsigned int pkts_ret = 0; | |
3043 | int ret; | |
3044 | ||
3045 | if (unlikely(!rxq->sp)) | |
3046 | return mlx4_rx_burst(dpdk_rxq, pkts, pkts_n); | |
3047 | if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */ | |
3048 | return 0; | |
3049 | for (i = 0; (i != pkts_n); ++i) { | |
3050 | struct rxq_elt_sp *elt = &(*elts)[elts_head]; | |
3051 | struct ibv_recv_wr *wr = &elt->wr; | |
3052 | uint64_t wr_id = wr->wr_id; | |
3053 | unsigned int len; | |
3054 | unsigned int pkt_buf_len; | |
3055 | struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */ | |
3056 | struct rte_mbuf **pkt_buf_next = &pkt_buf; | |
3057 | unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM; | |
3058 | unsigned int j = 0; | |
3059 | uint32_t flags; | |
3060 | ||
3061 | /* Sanity checks. */ | |
3062 | #ifdef NDEBUG | |
3063 | (void)wr_id; | |
3064 | #endif | |
3065 | assert(wr_id < rxq->elts_n); | |
3066 | assert(wr->sg_list == elt->sges); | |
3067 | assert(wr->num_sge == elemof(elt->sges)); | |
3068 | assert(elts_head < rxq->elts_n); | |
3069 | assert(rxq->elts_head < rxq->elts_n); | |
3070 | ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL, | |
3071 | &flags); | |
3072 | if (unlikely(ret < 0)) { | |
3073 | struct ibv_wc wc; | |
3074 | int wcs_n; | |
3075 | ||
3076 | DEBUG("rxq=%p, poll_length() failed (ret=%d)", | |
3077 | (void *)rxq, ret); | |
3078 | /* ibv_poll_cq() must be used in case of failure. */ | |
3079 | wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); | |
3080 | if (unlikely(wcs_n == 0)) | |
3081 | break; | |
3082 | if (unlikely(wcs_n < 0)) { | |
3083 | DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", | |
3084 | (void *)rxq, wcs_n); | |
3085 | break; | |
3086 | } | |
3087 | assert(wcs_n == 1); | |
3088 | if (unlikely(wc.status != IBV_WC_SUCCESS)) { | |
3089 | /* Whatever, just repost the offending WR. */ | |
3090 | DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" | |
3091 | " completion status (%d): %s", | |
3092 | (void *)rxq, wc.wr_id, wc.status, | |
3093 | ibv_wc_status_str(wc.status)); | |
3094 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
3095 | /* Increment dropped packets counter. */ | |
3096 | ++rxq->stats.idropped; | |
3097 | #endif | |
3098 | /* Link completed WRs together for repost. */ | |
3099 | *next = wr; | |
3100 | next = &wr->next; | |
3101 | goto repost; | |
3102 | } | |
3103 | ret = wc.byte_len; | |
3104 | } | |
3105 | if (ret == 0) | |
3106 | break; | |
3107 | len = ret; | |
3108 | pkt_buf_len = len; | |
3109 | /* Link completed WRs together for repost. */ | |
3110 | *next = wr; | |
3111 | next = &wr->next; | |
3112 | /* | |
3113 | * Replace spent segments with new ones, concatenate and | |
3114 | * return them as pkt_buf. | |
3115 | */ | |
3116 | while (1) { | |
3117 | struct ibv_sge *sge = &elt->sges[j]; | |
3118 | struct rte_mbuf *seg = elt->bufs[j]; | |
3119 | struct rte_mbuf *rep; | |
3120 | unsigned int seg_tailroom; | |
3121 | ||
3122 | /* | |
3123 | * Fetch initial bytes of packet descriptor into a | |
3124 | * cacheline while allocating rep. | |
3125 | */ | |
3126 | rte_prefetch0(seg); | |
3127 | rep = rte_mbuf_raw_alloc(rxq->mp); | |
3128 | if (unlikely(rep == NULL)) { | |
3129 | /* | |
3130 | * Unable to allocate a replacement mbuf, | |
3131 | * repost WR. | |
3132 | */ | |
3133 | DEBUG("rxq=%p, wr_id=%" PRIu64 ":" | |
3134 | " can't allocate a new mbuf", | |
3135 | (void *)rxq, wr_id); | |
3136 | if (pkt_buf != NULL) { | |
3137 | *pkt_buf_next = NULL; | |
3138 | rte_pktmbuf_free(pkt_buf); | |
3139 | } | |
3140 | /* Increase out of memory counters. */ | |
3141 | ++rxq->stats.rx_nombuf; | |
3142 | ++rxq->priv->dev->data->rx_mbuf_alloc_failed; | |
3143 | goto repost; | |
3144 | } | |
3145 | #ifndef NDEBUG | |
3146 | /* Poison user-modifiable fields in rep. */ | |
3147 | NEXT(rep) = (void *)((uintptr_t)-1); | |
3148 | SET_DATA_OFF(rep, 0xdead); | |
3149 | DATA_LEN(rep) = 0xd00d; | |
3150 | PKT_LEN(rep) = 0xdeadd00d; | |
3151 | NB_SEGS(rep) = 0x2a; | |
3152 | PORT(rep) = 0x2a; | |
3153 | rep->ol_flags = -1; | |
3154 | #endif | |
3155 | assert(rep->buf_len == seg->buf_len); | |
3156 | /* Reconfigure sge to use rep instead of seg. */ | |
3157 | assert(sge->lkey == rxq->mr->lkey); | |
3158 | sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom); | |
3159 | elt->bufs[j] = rep; | |
3160 | ++j; | |
3161 | /* Update pkt_buf if it's the first segment, or link | |
3162 | * seg to the previous one and update pkt_buf_next. */ | |
3163 | *pkt_buf_next = seg; | |
3164 | pkt_buf_next = &NEXT(seg); | |
3165 | /* Update seg information. */ | |
3166 | seg_tailroom = (seg->buf_len - seg_headroom); | |
3167 | assert(sge->length == seg_tailroom); | |
3168 | SET_DATA_OFF(seg, seg_headroom); | |
3169 | if (likely(len <= seg_tailroom)) { | |
3170 | /* Last segment. */ | |
3171 | DATA_LEN(seg) = len; | |
3172 | PKT_LEN(seg) = len; | |
3173 | /* Sanity check. */ | |
3174 | assert(rte_pktmbuf_headroom(seg) == | |
3175 | seg_headroom); | |
3176 | assert(rte_pktmbuf_tailroom(seg) == | |
3177 | (seg_tailroom - len)); | |
3178 | break; | |
3179 | } | |
3180 | DATA_LEN(seg) = seg_tailroom; | |
3181 | PKT_LEN(seg) = seg_tailroom; | |
3182 | /* Sanity check. */ | |
3183 | assert(rte_pktmbuf_headroom(seg) == seg_headroom); | |
3184 | assert(rte_pktmbuf_tailroom(seg) == 0); | |
3185 | /* Fix len and clear headroom for next segments. */ | |
3186 | len -= seg_tailroom; | |
3187 | seg_headroom = 0; | |
3188 | } | |
3189 | /* Update head and tail segments. */ | |
3190 | *pkt_buf_next = NULL; | |
3191 | assert(pkt_buf != NULL); | |
3192 | assert(j != 0); | |
3193 | NB_SEGS(pkt_buf) = j; | |
3194 | PORT(pkt_buf) = rxq->port_id; | |
3195 | PKT_LEN(pkt_buf) = pkt_buf_len; | |
3196 | pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); | |
3197 | pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); | |
3198 | ||
3199 | /* Return packet. */ | |
3200 | *(pkts++) = pkt_buf; | |
3201 | ++pkts_ret; | |
3202 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
3203 | /* Increase bytes counter. */ | |
3204 | rxq->stats.ibytes += pkt_buf_len; | |
3205 | #endif | |
3206 | repost: | |
3207 | if (++elts_head >= elts_n) | |
3208 | elts_head = 0; | |
3209 | continue; | |
3210 | } | |
3211 | if (unlikely(i == 0)) | |
3212 | return 0; | |
3213 | *next = NULL; | |
3214 | /* Repost WRs. */ | |
3215 | #ifdef DEBUG_RECV | |
3216 | DEBUG("%p: reposting %d WRs", (void *)rxq, i); | |
3217 | #endif | |
3218 | ret = ibv_post_recv(rxq->qp, head.next, &bad_wr); | |
3219 | if (unlikely(ret)) { | |
3220 | /* Inability to repost WRs is fatal. */ | |
3221 | DEBUG("%p: ibv_post_recv(): failed for WR %p: %s", | |
3222 | (void *)rxq->priv, | |
3223 | (void *)bad_wr, | |
3224 | strerror(ret)); | |
3225 | abort(); | |
3226 | } | |
3227 | rxq->elts_head = elts_head; | |
3228 | #ifdef MLX4_PMD_SOFT_COUNTERS | |
3229 | /* Increase packets counter. */ | |
3230 | rxq->stats.ipackets += pkts_ret; | |
3231 | #endif | |
3232 | return pkts_ret; | |
3233 | } | |
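/*
 * Note: mlx4_rx_burst_sp() builds the returned mbuf chain through a
 * tail pointer (pkt_buf/pkt_buf_next): the first store initializes the
 * head and every later store appends, with no special case for either.
 * The pattern in isolation (struct seg and chain are hypothetical):
 */
#if 0	/* Illustrative sketch only, not compiled into the driver. */
#include <stddef.h>

struct seg {
	struct seg *next;
	/* Payload omitted. */
};

static struct seg *
chain(struct seg *segs[], unsigned int n)
{
	struct seg *head = NULL;
	struct seg **tail = &head;
	unsigned int i;

	for (i = 0; (i != n); ++i) {
		*tail = segs[i];
		tail = &segs[i]->next;
	}
	*tail = NULL;	/* Terminate the chain. */
	return head;
}
#endif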
3234 | ||
3235 | /** | |
3236 | * DPDK callback for RX. | |
3237 | * | |
 * The following function is the same as mlx4_rx_burst_sp(), except it doesn't
 * manage scattered packets. It improves performance when the MRU is lower
 * than the size of the first segment.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
static uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_sge sges[pkts_n];
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(rxq->sp))
		return mlx4_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[elts_head];
		struct ibv_recv_wr *wr = &elt->wr;
		uint64_t wr_id = wr->wr_id;
		unsigned int len;
		struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
			WR_ID(wr_id).offset);
		struct rte_mbuf *rep;
		uint32_t flags;

		/* Sanity checks. */
		assert(WR_ID(wr_id).id < rxq->elts_n);
		assert(wr->sg_list == &elt->sge);
		assert(wr->num_sge == 1);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_mbuf_prefetch_part1(seg);
		rte_mbuf_prefetch_part2(seg);
		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
						    &flags);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX4_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				/* Add SGE to array for repost. */
				sges[i] = elt->sge;
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		len = ret;
		rep = rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
			      " can't allocate a new mbuf",
			      (void *)rxq, WR_ID(wr_id).id);
			/* Increase out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}

		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		WR_ID(wr->wr_id).offset =
			(((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
			 (uintptr_t)rep);
		assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);

		/* Add SGE to array for repost. */
		sges[i] = elt->sge;

		/* Update seg information. */
		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
		NB_SEGS(seg) = 1;
		PORT(seg) = rxq->port_id;
		NEXT(seg) = NULL;
		PKT_LEN(seg) = len;
		DATA_LEN(seg) = len;
		seg->packet_type = rxq_cq_to_pkt_type(flags);
		seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);

		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
#ifdef MLX4_PMD_SOFT_COUNTERS
		/* Increase bytes counter. */
		rxq->stats.ibytes += len;
#endif
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost WRs. */
#ifdef DEBUG_RECV
	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
	ret = rxq->if_qp->recv_burst(rxq->qp, sges, i);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
#ifdef MLX4_PMD_SOFT_COUNTERS
	/* Increase packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}

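/*
 * Illustrative sketch (not part of the driver): applications never call
 * mlx4_rx_burst() directly; it is reached through rte_eth_rx_burst(),
 * which dispatches on dev->rx_pkt_burst. Assuming port 0, queue 0:
 *
 *	struct rte_mbuf *bufs[32];
 *	uint16_t n = rte_eth_rx_burst(0, 0, bufs, 32);
 *
 *	while (n != 0)
 *		rte_pktmbuf_free(bufs[--n]);
 */
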
/**
 * DPDK callback for RX in secondary processes.
 *
 * This function configures all queues from primary process information
 * if necessary before reverting to the normal RX burst callback.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
static uint16_t
mlx4_rx_burst_secondary_setup(void *dpdk_rxq, struct rte_mbuf **pkts,
			      uint16_t pkts_n)
{
	struct rxq *rxq = dpdk_rxq;
	struct priv *priv = mlx4_secondary_data_setup(rxq->priv);
	struct priv *primary_priv;
	unsigned int index;

	if (priv == NULL)
		return 0;
	primary_priv =
		mlx4_secondary_data[priv->dev->data->port_id].primary_priv;
	/* Look for queue index in both private structures. */
	for (index = 0; index != priv->rxqs_n; ++index)
		if (((*primary_priv->rxqs)[index] == rxq) ||
		    ((*priv->rxqs)[index] == rxq))
			break;
	if (index == priv->rxqs_n)
		return 0;
	rxq = (*priv->rxqs)[index];
	return priv->dev->rx_pkt_burst(rxq, pkts, pkts_n);
}

/**
 * Allocate a Queue Pair.
 * Optionally set up inline receive if supported.
 *
 * @param priv
 *   Pointer to private structure.
 * @param cq
 *   Completion queue to associate with QP.
 * @param desc
 *   Number of descriptors in QP (hint only).
 * @param rd
 *   Resource domain to associate with QP.
 *
 * @return
 *   QP pointer or NULL in case of error.
 */
static struct ibv_qp *
rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
	     struct ibv_exp_res_domain *rd)
{
	struct ibv_exp_qp_init_attr attr = {
		/* CQ to be associated with the send queue. */
		.send_cq = cq,
		/* CQ to be associated with the receive queue. */
		.recv_cq = cq,
		.cap = {
			/* Max number of outstanding WRs. */
			.max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
					priv->device_attr.max_qp_wr :
					desc),
			/* Max number of scatter/gather elements in a WR. */
			.max_recv_sge = ((priv->device_attr.max_sge <
					  MLX4_PMD_SGE_WR_N) ?
					 priv->device_attr.max_sge :
					 MLX4_PMD_SGE_WR_N),
		},
		.qp_type = IBV_QPT_RAW_PACKET,
		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN),
		.pd = priv->pd,
		.res_domain = rd,
	};

#ifdef INLINE_RECV
	attr.max_inl_recv = priv->inl_recv_size;
	attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
#endif
	return ibv_exp_create_qp(priv->ctx, &attr);
}

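/*
 * For reference: the capability clamping above is simply
 * RTE_MIN(priv->device_attr.max_qp_wr, desc) and
 * RTE_MIN(priv->device_attr.max_sge, MLX4_PMD_SGE_WR_N), keeping the
 * requested sizes within what the device reports it can handle.
 */
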
#ifdef RSS_SUPPORT

/**
 * Allocate a RSS Queue Pair.
 * Optionally set up inline receive if supported.
 *
 * @param priv
 *   Pointer to private structure.
 * @param cq
 *   Completion queue to associate with QP.
 * @param desc
 *   Number of descriptors in QP (hint only).
 * @param parent
 *   If nonzero, create a parent QP, otherwise a child.
 * @param rd
 *   Resource domain to associate with QP.
 *
 * @return
 *   QP pointer or NULL in case of error.
 */
static struct ibv_qp *
rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
		 int parent, struct ibv_exp_res_domain *rd)
{
	struct ibv_exp_qp_init_attr attr = {
		/* CQ to be associated with the send queue. */
		.send_cq = cq,
		/* CQ to be associated with the receive queue. */
		.recv_cq = cq,
		.cap = {
			/* Max number of outstanding WRs. */
			.max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
					priv->device_attr.max_qp_wr :
					desc),
			/* Max number of scatter/gather elements in a WR. */
			.max_recv_sge = ((priv->device_attr.max_sge <
					  MLX4_PMD_SGE_WR_N) ?
					 priv->device_attr.max_sge :
					 MLX4_PMD_SGE_WR_N),
		},
		.qp_type = IBV_QPT_RAW_PACKET,
		.comp_mask = (IBV_EXP_QP_INIT_ATTR_PD |
			      IBV_EXP_QP_INIT_ATTR_RES_DOMAIN |
			      IBV_EXP_QP_INIT_ATTR_QPG),
		.pd = priv->pd,
		.res_domain = rd,
	};

#ifdef INLINE_RECV
	attr.max_inl_recv = priv->inl_recv_size;
	attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_INL_RECV;
#endif
	if (parent) {
		attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
		/* TSS isn't necessary. */
		attr.qpg.parent_attrib.tss_child_count = 0;
		attr.qpg.parent_attrib.rss_child_count =
			rte_align32pow2(priv->rxqs_n + 1) >> 1;
		DEBUG("initializing parent RSS queue");
	} else {
		attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
		attr.qpg.qpg_parent = priv->rxq_parent.qp;
		DEBUG("initializing child RSS queue");
	}
	return ibv_exp_create_qp(priv->ctx, &attr);
}

#endif /* RSS_SUPPORT */

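/*
 * Worked example (illustrative): rte_align32pow2() rounds its argument up
 * to the next power of two, so rss_child_count above is the number of RX
 * queues rounded *down* to a power of two. With priv->rxqs_n = 6:
 *
 *	rte_align32pow2(6 + 1) >> 1 == 8 >> 1 == 4
 *
 * i.e. four RSS children; mlx4_rx_queue_setup() later flags queue
 * indexes 4 and 5 as inactive using the same formula.
 */
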
/**
 * Reconfigure a RX queue with new parameters.
 *
 * rxq_rehash() does not allocate mbufs, as doing so from the wrong thread
 * (any thread other than a control thread) may corrupt the pool.
 * In case of failure, the queue is left untouched.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   RX queue pointer.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
{
	struct priv *priv = rxq->priv;
	struct rxq tmpl = *rxq;
	unsigned int mbuf_n;
	unsigned int desc_n;
	struct rte_mbuf **pool;
	unsigned int i, k;
	struct ibv_exp_qp_attr mod;
	struct ibv_recv_wr *bad_wr;
	unsigned int mb_len;
	int err;
	int parent = (rxq == &priv->rxq_parent);

	if (parent) {
		ERROR("%p: cannot rehash parent queue %p",
		      (void *)dev, (void *)rxq);
		return EINVAL;
	}
	mb_len = rte_pktmbuf_data_room_size(rxq->mp);
	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
	/* Number of descriptors and mbufs currently allocated. */
	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX4_PMD_SGE_WR_N : 1));
	mbuf_n = desc_n;
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum) {
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum = tmpl.csum;
	}
	if (priv->hw_csum_l2tun) {
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
		rxq->csum_l2tun = tmpl.csum_l2tun;
	}
	/* Enable scattered packets support for this queue if necessary. */
	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc_n /= MLX4_PMD_SGE_WR_N;
	} else
		tmpl.sp = 0;
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
	/* If scatter mode is the same as before, nothing to do. */
	if (tmpl.sp == rxq->sp) {
		DEBUG("%p: nothing to do", (void *)dev);
		return 0;
	}
	/* Remove attached flows if RSS is disabled (no parent queue). */
	if (!priv->rss) {
		rxq_allmulticast_disable(&tmpl);
		rxq_promiscuous_disable(&tmpl);
		rxq_mac_addrs_del(&tmpl);
		/* Update original queue in case of failure. */
		rxq->allmulti_flow = tmpl.allmulti_flow;
		rxq->promisc_flow = tmpl.promisc_flow;
		memcpy(rxq->mac_configured, tmpl.mac_configured,
		       sizeof(rxq->mac_configured));
		memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
	}
	/* From now on, any failure will render the queue unusable.
	 * Reinitialize QP. */
	mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RESET };
	err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
	if (err) {
		ERROR("%p: cannot reset QP: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	err = ibv_resize_cq(tmpl.cq, desc_n);
	if (err) {
		ERROR("%p: cannot resize CQ: %s", (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	mod = (struct ibv_exp_qp_attr){
		/* Move the QP to this state. */
		.qp_state = IBV_QPS_INIT,
		/* Primary port number. */
		.port_num = priv->port
	};
	err = ibv_exp_modify_qp(tmpl.qp, &mod,
				(IBV_EXP_QP_STATE |
#ifdef RSS_SUPPORT
				 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
#endif /* RSS_SUPPORT */
				 IBV_EXP_QP_PORT));
	if (err) {
		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
		      (void *)dev, strerror(err));
		assert(err > 0);
		return err;
	}
	}
	/* Reconfigure flows. Do not care for errors. */
	if (!priv->rss) {
		rxq_mac_addrs_add(&tmpl);
		if (priv->promisc)
			rxq_promiscuous_enable(&tmpl);
		if (priv->allmulti)
			rxq_allmulticast_enable(&tmpl);
		/* Update original queue in case of failure. */
		rxq->allmulti_flow = tmpl.allmulti_flow;
		rxq->promisc_flow = tmpl.promisc_flow;
		memcpy(rxq->mac_configured, tmpl.mac_configured,
		       sizeof(rxq->mac_configured));
		memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
	}
	/* Allocate pool. */
	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
	if (pool == NULL) {
		ERROR("%p: cannot allocate memory", (void *)dev);
		return ENOBUFS;
	}
	/* Snatch mbufs from original queue. */
	k = 0;
	if (rxq->sp) {
		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;

		for (i = 0; (i != elemof(*elts)); ++i) {
			struct rxq_elt_sp *elt = &(*elts)[i];
			unsigned int j;

			for (j = 0; (j != elemof(elt->bufs)); ++j) {
				assert(elt->bufs[j] != NULL);
				pool[k++] = elt->bufs[j];
			}
		}
	} else {
		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;

		for (i = 0; (i != elemof(*elts)); ++i) {
			struct rxq_elt *elt = &(*elts)[i];
			struct rte_mbuf *buf = (void *)
				((uintptr_t)elt->sge.addr -
				 WR_ID(elt->wr.wr_id).offset);

			assert(WR_ID(elt->wr.wr_id).id == i);
			pool[k++] = buf;
		}
	}
	assert(k == mbuf_n);
	tmpl.elts_n = 0;
	tmpl.elts.sp = NULL;
	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
	err = ((tmpl.sp) ?
	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
	       rxq_alloc_elts(&tmpl, desc_n, pool));
	if (err) {
		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
		rte_free(pool);
		assert(err > 0);
		return err;
	}
	assert(tmpl.elts_n == desc_n);
	assert(tmpl.elts.sp != NULL);
	rte_free(pool);
	/* Clean up original data. */
	rxq->elts_n = 0;
	rte_free(rxq->elts.sp);
	rxq->elts.sp = NULL;
	/* Post WRs. */
	err = ibv_post_recv(tmpl.qp,
			    (tmpl.sp ?
			     &(*tmpl.elts.sp)[0].wr :
			     &(*tmpl.elts.no_sp)[0].wr),
			    &bad_wr);
	if (err) {
		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
		      (void *)dev,
		      (void *)bad_wr,
		      strerror(err));
		goto skip_rtr;
	}
	mod = (struct ibv_exp_qp_attr){
		.qp_state = IBV_QPS_RTR
	};
	err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
	if (err)
		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
		      (void *)dev, strerror(err));
skip_rtr:
	*rxq = tmpl;
	assert(err >= 0);
	return err;
}

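/*
 * For reference: rxq_rehash() walks the Verbs QP state machine
 * RESET -> INIT -> RTR (a receive-only QP never needs RTS), reusing the
 * mbufs snatched from the original queue so that no allocation happens
 * outside the control thread.
 */
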
/**
 * Configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param rxq
 *   Pointer to RX queue structure.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param inactive
 *   If true, the queue is disabled because its index is greater than or
 *   equal to the real number of queues, which must be a power of 2.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
	  unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
	  struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq tmpl = {
		.priv = priv,
		.mp = mp,
		.socket = socket
	};
	struct ibv_exp_qp_attr mod;
	union {
		struct ibv_exp_query_intf_params params;
		struct ibv_exp_cq_init_attr cq;
		struct ibv_exp_res_domain_init_attr rd;
	} attr;
	enum ibv_exp_query_intf_status status;
	struct ibv_recv_wr *bad_wr;
	unsigned int mb_len;
	int ret = 0;
	int parent = (rxq == &priv->rxq_parent);

	(void)conf; /* Thresholds configuration (ignored). */
	/*
	 * If this is a parent queue, hardware must support RSS and
	 * RSS must be enabled.
	 */
	assert((!parent) || ((priv->hw_rss) && (priv->rss)));
	if (parent) {
		/* Even if unused, ibv_create_cq() requires at least one
		 * descriptor. */
		desc = 1;
		goto skip_mr;
	}
	mb_len = rte_pktmbuf_data_room_size(mp);
	if ((desc == 0) || (desc % MLX4_PMD_SGE_WR_N)) {
		ERROR("%p: invalid number of RX descriptors (must be a"
		      " multiple of %d)", (void *)dev, MLX4_PMD_SGE_WR_N);
		return EINVAL;
	}
	/* Toggle RX checksum offload if hardware supports it. */
	if (priv->hw_csum)
		tmpl.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	if (priv->hw_csum_l2tun)
		tmpl.csum_l2tun = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
	/* Enable scattered packets support for this queue if necessary. */
	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
	     (mb_len - RTE_PKTMBUF_HEADROOM))) {
		tmpl.sp = 1;
		desc /= MLX4_PMD_SGE_WR_N;
	}
	DEBUG("%p: %s scattered packets support (%u WRs)",
	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
	/* Use the entire RX mempool as the memory region. */
	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
	if (tmpl.mr == NULL) {
		ret = EINVAL;
		ERROR("%p: MR creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
skip_mr:
	attr.rd = (struct ibv_exp_res_domain_init_attr){
		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
			      IBV_EXP_RES_DOMAIN_MSG_MODEL),
		.thread_model = IBV_EXP_THREAD_SINGLE,
		.msg_model = IBV_EXP_MSG_HIGH_BW,
	};
	tmpl.rd = ibv_exp_create_res_domain(priv->ctx, &attr.rd);
	if (tmpl.rd == NULL) {
		ret = ENOMEM;
		ERROR("%p: RD creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	attr.cq = (struct ibv_exp_cq_init_attr){
		.comp_mask = IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN,
		.res_domain = tmpl.rd,
	};
	tmpl.cq = ibv_exp_create_cq(priv->ctx, desc, NULL, NULL, 0, &attr.cq);
	if (tmpl.cq == NULL) {
		ret = ENOMEM;
		ERROR("%p: CQ creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	DEBUG("priv->device_attr.max_qp_wr is %d",
	      priv->device_attr.max_qp_wr);
	DEBUG("priv->device_attr.max_sge is %d",
	      priv->device_attr.max_sge);
#ifdef RSS_SUPPORT
	if (priv->rss && !inactive)
		tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
					   tmpl.rd);
	else
#endif /* RSS_SUPPORT */
		tmpl.qp = rxq_setup_qp(priv, tmpl.cq, desc, tmpl.rd);
	if (tmpl.qp == NULL) {
		ret = (errno ? errno : EINVAL);
		ERROR("%p: QP creation failure: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	mod = (struct ibv_exp_qp_attr){
		/* Move the QP to this state. */
		.qp_state = IBV_QPS_INIT,
		/* Primary port number. */
		.port_num = priv->port
	};
	ret = ibv_exp_modify_qp(tmpl.qp, &mod,
				(IBV_EXP_QP_STATE |
#ifdef RSS_SUPPORT
				 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
#endif /* RSS_SUPPORT */
				 IBV_EXP_QP_PORT));
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	if ((parent) || (!priv->rss)) {
		/* Configure MAC and broadcast addresses. */
		ret = rxq_mac_addrs_add(&tmpl);
		if (ret) {
			ERROR("%p: QP flow attachment failed: %s",
			      (void *)dev, strerror(ret));
			goto error;
		}
	}
	/* Allocate descriptors for RX queues, except for the RSS parent. */
	if (parent)
		goto skip_alloc;
	if (tmpl.sp)
		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
	else
		ret = rxq_alloc_elts(&tmpl, desc, NULL);
	if (ret) {
		ERROR("%p: RXQ allocation failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	ret = ibv_post_recv(tmpl.qp,
			    (tmpl.sp ?
			     &(*tmpl.elts.sp)[0].wr :
			     &(*tmpl.elts.no_sp)[0].wr),
			    &bad_wr);
	if (ret) {
		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
		      (void *)dev,
		      (void *)bad_wr,
		      strerror(ret));
		goto error;
	}
skip_alloc:
	mod = (struct ibv_exp_qp_attr){
		.qp_state = IBV_QPS_RTR
	};
	ret = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
	if (ret) {
		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
		      (void *)dev, strerror(ret));
		goto error;
	}
	/* Save port ID. */
	tmpl.port_id = dev->data->port_id;
	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_CQ,
		.obj = tmpl.cq,
	};
	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_cq == NULL) {
		ret = EINVAL;
		ERROR("%p: CQ interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	attr.params = (struct ibv_exp_query_intf_params){
		.intf_scope = IBV_EXP_INTF_GLOBAL,
		.intf = IBV_EXP_INTF_QP_BURST,
		.obj = tmpl.qp,
	};
	tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
	if (tmpl.if_qp == NULL) {
		ret = EINVAL;
		ERROR("%p: QP interface family query failed with status %d",
		      (void *)dev, status);
		goto error;
	}
	/* Clean up rxq in case we're reinitializing it. */
	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
	rxq_cleanup(rxq);
	*rxq = tmpl;
	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
	assert(ret == 0);
	return 0;
error:
	rxq_cleanup(&tmpl);
	assert(ret > 0);
	return ret;
}

/**
 * DPDK callback to configure a RX queue.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param idx
 *   RX queue index.
 * @param desc
 *   Number of descriptors to configure in queue.
 * @param socket
 *   NUMA socket on which memory must be allocated.
 * @param[in] conf
 *   Thresholds parameters.
 * @param mp
 *   Memory pool for buffer allocations.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
		    unsigned int socket, const struct rte_eth_rxconf *conf,
		    struct rte_mempool *mp)
{
	struct priv *priv = dev->data->dev_private;
	struct rxq *rxq = (*priv->rxqs)[idx];
	int inactive = 0;
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	priv_lock(priv);
	DEBUG("%p: configuring queue %u for %u descriptors",
	      (void *)dev, idx, desc);
	if (idx >= priv->rxqs_n) {
		ERROR("%p: queue index out of range (%u >= %u)",
		      (void *)dev, idx, priv->rxqs_n);
		priv_unlock(priv);
		return -EOVERFLOW;
	}
	if (rxq != NULL) {
		DEBUG("%p: reusing already allocated queue index %u (%p)",
		      (void *)dev, idx, (void *)rxq);
		if (priv->started) {
			priv_unlock(priv);
			return -EEXIST;
		}
		(*priv->rxqs)[idx] = NULL;
		rxq_cleanup(rxq);
	} else {
		rxq = rte_calloc_socket("RXQ", 1, sizeof(*rxq), 0, socket);
		if (rxq == NULL) {
			ERROR("%p: unable to allocate queue index %u",
			      (void *)dev, idx);
			priv_unlock(priv);
			return -ENOMEM;
		}
	}
	if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1)
		inactive = 1;
	ret = rxq_setup(dev, rxq, desc, socket, inactive, conf, mp);
	if (ret)
		rte_free(rxq);
	else {
		rxq->stats.idx = idx;
		DEBUG("%p: adding RX queue %p to list",
		      (void *)dev, (void *)rxq);
		(*priv->rxqs)[idx] = rxq;
		/* Update receive callback. */
		if (rxq->sp)
			dev->rx_pkt_burst = mlx4_rx_burst_sp;
		else
			dev->rx_pkt_burst = mlx4_rx_burst;
	}
	priv_unlock(priv);
	return -ret;
}

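/*
 * Illustrative application-side call (assumptions: port 0 and a mempool
 * named "mbuf_pool" created beforehand; 512 is a multiple of
 * MLX4_PMD_SGE_WR_N as required by rxq_setup()):
 *
 *	ret = rte_eth_rx_queue_setup(0, queue_id, 512, rte_socket_id(),
 *				     NULL, mbuf_pool);
 *
 * which lands here through dev->dev_ops->rx_queue_setup.
 */
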
/**
 * DPDK callback to release a RX queue.
 *
 * @param dpdk_rxq
 *   Generic RX queue pointer.
 */
static void
mlx4_rx_queue_release(void *dpdk_rxq)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct priv *priv;
	unsigned int i;

	if (mlx4_is_secondary())
		return;
	if (rxq == NULL)
		return;
	priv = rxq->priv;
	priv_lock(priv);
	assert(rxq != &priv->rxq_parent);
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] == rxq) {
			DEBUG("%p: removing RX queue %p from list",
			      (void *)priv->dev, (void *)rxq);
			(*priv->rxqs)[i] = NULL;
			break;
		}
	rxq_cleanup(rxq);
	rte_free(rxq);
	priv_unlock(priv);
}

static void
priv_dev_interrupt_handler_install(struct priv *, struct rte_eth_dev *);

/**
 * DPDK callback to start the device.
 *
 * Simulate device start by attaching all configured flows.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_dev_start(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i = 0;
	unsigned int r;
	struct rxq *rxq;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	priv_lock(priv);
	if (priv->started) {
		priv_unlock(priv);
		return 0;
	}
	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
	priv->started = 1;
	if (priv->rss) {
		rxq = &priv->rxq_parent;
		r = 1;
	} else {
		rxq = (*priv->rxqs)[0];
		r = priv->rxqs_n;
	}
	/* Iterate only once when RSS is enabled. */
	do {
		int ret;

		/* Ignore nonexistent RX queues. */
		if (rxq == NULL)
			continue;
		ret = rxq_mac_addrs_add(rxq);
		if (!ret && priv->promisc)
			ret = rxq_promiscuous_enable(rxq);
		if (!ret && priv->allmulti)
			ret = rxq_allmulticast_enable(rxq);
		if (!ret)
			continue;
		WARN("%p: QP flow attachment failed: %s",
		     (void *)dev, strerror(ret));
		/* Rollback. */
		while (i != 0) {
			rxq = (*priv->rxqs)[--i];
			if (rxq != NULL) {
				rxq_allmulticast_disable(rxq);
				rxq_promiscuous_disable(rxq);
				rxq_mac_addrs_del(rxq);
			}
		}
		priv->started = 0;
		priv_unlock(priv);
		return -ret;
	} while ((--r) && ((rxq = (*priv->rxqs)[++i]), i));
	priv_dev_interrupt_handler_install(priv, dev);
	priv_unlock(priv);
	return 0;
}

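/*
 * Illustrative application-side usage (assumption: port 0 already
 * configured with rte_eth_dev_configure() and its queues set up):
 *
 *	if (rte_eth_dev_start(0) < 0)
 *		rte_exit(EXIT_FAILURE, "cannot start port 0\n");
 *
 * rte_eth_dev_start() reaches mlx4_dev_start() through
 * dev->dev_ops->dev_start.
 */
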
/**
 * DPDK callback to stop the device.
 *
 * Simulate device stop by detaching all configured flows.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_dev_stop(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i = 0;
	unsigned int r;
	struct rxq *rxq;

	if (mlx4_is_secondary())
		return;
	priv_lock(priv);
	if (!priv->started) {
		priv_unlock(priv);
		return;
	}
	DEBUG("%p: detaching flows from all RX queues", (void *)dev);
	priv->started = 0;
	if (priv->rss) {
		rxq = &priv->rxq_parent;
		r = 1;
	} else {
		rxq = (*priv->rxqs)[0];
		r = priv->rxqs_n;
	}
	/* Iterate only once when RSS is enabled. */
	do {
		/* Ignore nonexistent RX queues. */
		if (rxq == NULL)
			continue;
		rxq_allmulticast_disable(rxq);
		rxq_promiscuous_disable(rxq);
		rxq_mac_addrs_del(rxq);
	} while ((--r) && ((rxq = (*priv->rxqs)[++i]), i));
	priv_unlock(priv);
}

/**
 * Dummy DPDK callback for TX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
static uint16_t
removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_txq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/**
 * Dummy DPDK callback for RX.
 *
 * This function is used to temporarily replace the real callback during
 * unsafe control operations on the queue, or in case of error.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
static uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

static void
priv_dev_interrupt_handler_uninstall(struct priv *, struct rte_eth_dev *);

/**
 * DPDK callback to close the device.
 *
 * Destroy all queues and objects, free memory.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_dev_close(struct rte_eth_dev *dev)
{
	struct priv *priv = mlx4_get_priv(dev);
	void *tmp;
	unsigned int i;

	if (priv == NULL)
		return;
	priv_lock(priv);
	DEBUG("%p: closing device \"%s\"",
	      (void *)dev,
	      ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
	/* Prevent crashes when queues are still in use. This is unfortunately
	 * still required for DPDK 1.3 because some programs (such as testpmd)
	 * never release them before closing the device. */
	dev->rx_pkt_burst = removed_rx_burst;
	dev->tx_pkt_burst = removed_tx_burst;
	if (priv->rxqs != NULL) {
		/* XXX race condition if mlx4_rx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->rxqs_n); ++i) {
			tmp = (*priv->rxqs)[i];
			if (tmp == NULL)
				continue;
			(*priv->rxqs)[i] = NULL;
			rxq_cleanup(tmp);
			rte_free(tmp);
		}
		priv->rxqs_n = 0;
		priv->rxqs = NULL;
	}
	if (priv->txqs != NULL) {
		/* XXX race condition if mlx4_tx_burst() is still running. */
		usleep(1000);
		for (i = 0; (i != priv->txqs_n); ++i) {
			tmp = (*priv->txqs)[i];
			if (tmp == NULL)
				continue;
			(*priv->txqs)[i] = NULL;
			txq_cleanup(tmp);
			rte_free(tmp);
		}
		priv->txqs_n = 0;
		priv->txqs = NULL;
	}
	if (priv->rss)
		rxq_cleanup(&priv->rxq_parent);
	if (priv->pd != NULL) {
		assert(priv->ctx != NULL);
		claim_zero(ibv_dealloc_pd(priv->pd));
		claim_zero(ibv_close_device(priv->ctx));
	} else
		assert(priv->ctx == NULL);
	priv_dev_interrupt_handler_uninstall(priv, dev);
	priv_unlock(priv);
	memset(priv, 0, sizeof(*priv));
}

/**
 * Change the link state (UP / DOWN).
 *
 * @param priv
 *   Pointer to Ethernet device private data.
 * @param up
 *   Nonzero for link up, otherwise link down.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
priv_set_link(struct priv *priv, int up)
{
	struct rte_eth_dev *dev = priv->dev;
	int err;
	unsigned int i;

	if (up) {
		err = priv_set_flags(priv, ~IFF_UP, IFF_UP);
		if (err)
			return err;
		for (i = 0; i < priv->rxqs_n; i++)
			if ((*priv->rxqs)[i]->sp)
				break;
		/* Check if an sp queue exists.
		 * Note: Some old frames might be received.
		 */
		if (i == priv->rxqs_n)
			dev->rx_pkt_burst = mlx4_rx_burst;
		else
			dev->rx_pkt_burst = mlx4_rx_burst_sp;
		dev->tx_pkt_burst = mlx4_tx_burst;
	} else {
		err = priv_set_flags(priv, ~IFF_UP, ~IFF_UP);
		if (err)
			return err;
		dev->rx_pkt_burst = removed_rx_burst;
		dev->tx_pkt_burst = removed_tx_burst;
	}
	return 0;
}

/**
 * DPDK callback to bring the link DOWN.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
mlx4_set_link_down(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int err;

	priv_lock(priv);
	err = priv_set_link(priv, 0);
	priv_unlock(priv);
	return err;
}

/**
 * DPDK callback to bring the link UP.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
mlx4_set_link_up(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	int err;

	priv_lock(priv);
	err = priv_set_link(priv, 1);
	priv_unlock(priv);
	return err;
}

/**
 * DPDK callback to get information about the device.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] info
 *   Info structure output buffer.
 */
static void
mlx4_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
{
	struct priv *priv = mlx4_get_priv(dev);
	unsigned int max;
	char ifname[IF_NAMESIZE];

	if (priv == NULL)
		return;
	priv_lock(priv);
	/* FIXME: we should ask the device for these values. */
	info->min_rx_bufsize = 32;
	info->max_rx_pktlen = 65536;
	/*
	 * Since we need one CQ per QP, the limit is the minimum number
	 * between the two values.
	 */
	max = ((priv->device_attr.max_cq > priv->device_attr.max_qp) ?
	       priv->device_attr.max_qp : priv->device_attr.max_cq);
	/* max_rx_queues is uint16_t; clamp max to 65535 so the cast
	 * cannot wrap around to 0. */
	if (max >= 65535)
		max = 65535;
	info->max_rx_queues = max;
	info->max_tx_queues = max;
	/* Last array entry is reserved for broadcast. */
	info->max_mac_addrs = (elemof(priv->mac) - 1);
	info->rx_offload_capa =
		(priv->hw_csum ?
		 (DEV_RX_OFFLOAD_IPV4_CKSUM |
		  DEV_RX_OFFLOAD_UDP_CKSUM |
		  DEV_RX_OFFLOAD_TCP_CKSUM) :
		 0);
	info->tx_offload_capa =
		(priv->hw_csum ?
		 (DEV_TX_OFFLOAD_IPV4_CKSUM |
		  DEV_TX_OFFLOAD_UDP_CKSUM |
		  DEV_TX_OFFLOAD_TCP_CKSUM) :
		 0);
	if (priv_get_ifname(priv, &ifname) == 0)
		info->if_index = if_nametoindex(ifname);
	info->speed_capa =
			ETH_LINK_SPEED_1G |
			ETH_LINK_SPEED_10G |
			ETH_LINK_SPEED_20G |
			ETH_LINK_SPEED_40G |
			ETH_LINK_SPEED_56G;
	priv_unlock(priv);
}

static const uint32_t *
mlx4_dev_supported_ptypes_get(struct rte_eth_dev *dev)
{
	static const uint32_t ptypes[] = {
		/* refers to rxq_cq_to_pkt_type() */
		RTE_PTYPE_L3_IPV4,
		RTE_PTYPE_L3_IPV6,
		RTE_PTYPE_INNER_L3_IPV4,
		RTE_PTYPE_INNER_L3_IPV6,
		RTE_PTYPE_UNKNOWN
	};

	if (dev->rx_pkt_burst == mlx4_rx_burst ||
	    dev->rx_pkt_burst == mlx4_rx_burst_sp)
		return ptypes;
	return NULL;
}

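/*
 * Illustrative application-side usage (port 0 assumed): the table above
 * is what rte_eth_dev_get_supported_ptypes() reports, e.g.:
 *
 *	uint32_t ptypes[8];
 *	int n = rte_eth_dev_get_supported_ptypes(0, RTE_PTYPE_L3_MASK,
 *						 ptypes, 8);
 *
 * which fills ptypes with the RTE_PTYPE_L3_* entries above that match
 * the mask.
 */
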
/**
 * DPDK callback to get device statistics.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] stats
 *   Stats structure output buffer.
 */
static void
mlx4_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
	struct priv *priv = mlx4_get_priv(dev);
	struct rte_eth_stats tmp = {0};
	unsigned int i;
	unsigned int idx;

	if (priv == NULL)
		return;
	priv_lock(priv);
	/* Add software counters. */
	for (i = 0; (i != priv->rxqs_n); ++i) {
		struct rxq *rxq = (*priv->rxqs)[i];

		if (rxq == NULL)
			continue;
		idx = rxq->stats.idx;
		if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
#ifdef MLX4_PMD_SOFT_COUNTERS
			tmp.q_ipackets[idx] += rxq->stats.ipackets;
			tmp.q_ibytes[idx] += rxq->stats.ibytes;
#endif
			tmp.q_errors[idx] += (rxq->stats.idropped +
					      rxq->stats.rx_nombuf);
		}
#ifdef MLX4_PMD_SOFT_COUNTERS
		tmp.ipackets += rxq->stats.ipackets;
		tmp.ibytes += rxq->stats.ibytes;
#endif
		tmp.ierrors += rxq->stats.idropped;
		tmp.rx_nombuf += rxq->stats.rx_nombuf;
	}
	for (i = 0; (i != priv->txqs_n); ++i) {
		struct txq *txq = (*priv->txqs)[i];

		if (txq == NULL)
			continue;
		idx = txq->stats.idx;
		if (idx < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
#ifdef MLX4_PMD_SOFT_COUNTERS
			tmp.q_opackets[idx] += txq->stats.opackets;
			tmp.q_obytes[idx] += txq->stats.obytes;
#endif
			tmp.q_errors[idx] += txq->stats.odropped;
		}
#ifdef MLX4_PMD_SOFT_COUNTERS
		tmp.opackets += txq->stats.opackets;
		tmp.obytes += txq->stats.obytes;
#endif
		tmp.oerrors += txq->stats.odropped;
	}
#ifndef MLX4_PMD_SOFT_COUNTERS
	/* FIXME: retrieve and add hardware counters. */
#endif
	*stats = tmp;
	priv_unlock(priv);
}

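/*
 * Illustrative application-side usage (port 0 assumed):
 *
 *	struct rte_eth_stats stats;
 *
 *	rte_eth_stats_get(0, &stats);
 *	printf("rx=%" PRIu64 " rx_errors=%" PRIu64 "\n",
 *	       stats.ipackets, stats.ierrors);
 */
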
/**
 * DPDK callback to clear device statistics.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_stats_reset(struct rte_eth_dev *dev)
{
	struct priv *priv = mlx4_get_priv(dev);
	unsigned int i;
	unsigned int idx;

	if (priv == NULL)
		return;
	priv_lock(priv);
	for (i = 0; (i != priv->rxqs_n); ++i) {
		if ((*priv->rxqs)[i] == NULL)
			continue;
		idx = (*priv->rxqs)[i]->stats.idx;
		(*priv->rxqs)[i]->stats =
			(struct mlx4_rxq_stats){ .idx = idx };
	}
	for (i = 0; (i != priv->txqs_n); ++i) {
		if ((*priv->txqs)[i] == NULL)
			continue;
		idx = (*priv->txqs)[i]->stats.idx;
		(*priv->txqs)[i]->stats =
			(struct mlx4_txq_stats){ .idx = idx };
	}
#ifndef MLX4_PMD_SOFT_COUNTERS
	/* FIXME: reset hardware counters. */
#endif
	priv_unlock(priv);
}

/**
 * DPDK callback to remove a MAC address.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param index
 *   MAC address index.
 */
static void
mlx4_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
{
	struct priv *priv = dev->data->dev_private;

	if (mlx4_is_secondary())
		return;
	priv_lock(priv);
	DEBUG("%p: removing MAC address from index %" PRIu32,
	      (void *)dev, index);
	/* Last array entry is reserved for broadcast. */
	if (index >= (elemof(priv->mac) - 1))
		goto end;
	priv_mac_addr_del(priv, index);
end:
	priv_unlock(priv);
}

/**
 * DPDK callback to add a MAC address.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mac_addr
 *   MAC address to register.
 * @param index
 *   MAC address index.
 * @param vmdq
 *   VMDq pool index to associate address with (ignored).
 */
static void
mlx4_mac_addr_add(struct rte_eth_dev *dev, struct ether_addr *mac_addr,
		  uint32_t index, uint32_t vmdq)
{
	struct priv *priv = dev->data->dev_private;

	if (mlx4_is_secondary())
		return;
	(void)vmdq;
	priv_lock(priv);
	DEBUG("%p: adding MAC address at index %" PRIu32,
	      (void *)dev, index);
	/* Last array entry is reserved for broadcast. */
	if (index >= (elemof(priv->mac) - 1))
		goto end;
	priv_mac_addr_add(priv, index,
			  (const uint8_t (*)[ETHER_ADDR_LEN])
			  mac_addr->addr_bytes);
end:
	priv_unlock(priv);
}

/**
 * DPDK callback to set the primary MAC address.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param mac_addr
 *   MAC address to register.
 */
static void
mlx4_mac_addr_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
{
	DEBUG("%p: setting primary MAC address", (void *)dev);
	mlx4_mac_addr_remove(dev, 0);
	mlx4_mac_addr_add(dev, mac_addr, 0, 0);
}

/**
 * DPDK callback to enable promiscuous mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	if (mlx4_is_secondary())
		return;
	priv_lock(priv);
	if (priv->promisc) {
		priv_unlock(priv);
		return;
	}
	/* If device isn't started, this is all we need to do. */
	if (!priv->started)
		goto end;
	if (priv->rss) {
		ret = rxq_promiscuous_enable(&priv->rxq_parent);
		if (ret) {
			priv_unlock(priv);
			return;
		}
		goto end;
	}
	for (i = 0; (i != priv->rxqs_n); ++i) {
		if ((*priv->rxqs)[i] == NULL)
			continue;
		ret = rxq_promiscuous_enable((*priv->rxqs)[i]);
		if (!ret)
			continue;
		/* Failure, rollback. */
		while (i != 0)
			if ((*priv->rxqs)[--i] != NULL)
				rxq_promiscuous_disable((*priv->rxqs)[i]);
		priv_unlock(priv);
		return;
	}
end:
	priv->promisc = 1;
	priv_unlock(priv);
}

/**
 * DPDK callback to disable promiscuous mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;

	if (mlx4_is_secondary())
		return;
	priv_lock(priv);
	if (!priv->promisc) {
		priv_unlock(priv);
		return;
	}
	if (priv->rss) {
		rxq_promiscuous_disable(&priv->rxq_parent);
		goto end;
	}
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] != NULL)
			rxq_promiscuous_disable((*priv->rxqs)[i]);
end:
	priv->promisc = 0;
	priv_unlock(priv);
}

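/*
 * Illustrative application-side usage (port 0 assumed): these two
 * callbacks back the generic API, e.g.
 *
 *	rte_eth_promiscuous_enable(0);
 *	...
 *	rte_eth_promiscuous_disable(0);
 */
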
/**
 * DPDK callback to enable allmulti mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_allmulticast_enable(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;
	int ret;

	if (mlx4_is_secondary())
		return;
	priv_lock(priv);
	if (priv->allmulti) {
		priv_unlock(priv);
		return;
	}
	/* If device isn't started, this is all we need to do. */
	if (!priv->started)
		goto end;
	if (priv->rss) {
		ret = rxq_allmulticast_enable(&priv->rxq_parent);
		if (ret) {
			priv_unlock(priv);
			return;
		}
		goto end;
	}
	for (i = 0; (i != priv->rxqs_n); ++i) {
		if ((*priv->rxqs)[i] == NULL)
			continue;
		ret = rxq_allmulticast_enable((*priv->rxqs)[i]);
		if (!ret)
			continue;
		/* Failure, rollback. */
		while (i != 0)
			if ((*priv->rxqs)[--i] != NULL)
				rxq_allmulticast_disable((*priv->rxqs)[i]);
		priv_unlock(priv);
		return;
	}
end:
	priv->allmulti = 1;
	priv_unlock(priv);
}

/**
 * DPDK callback to disable allmulti mode.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 */
static void
mlx4_allmulticast_disable(struct rte_eth_dev *dev)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;

	if (mlx4_is_secondary())
		return;
	priv_lock(priv);
	if (!priv->allmulti) {
		priv_unlock(priv);
		return;
	}
	if (priv->rss) {
		rxq_allmulticast_disable(&priv->rxq_parent);
		goto end;
	}
	for (i = 0; (i != priv->rxqs_n); ++i)
		if ((*priv->rxqs)[i] != NULL)
			rxq_allmulticast_disable((*priv->rxqs)[i]);
end:
	priv->allmulti = 0;
	priv_unlock(priv);
}

/**
 * Retrieve physical link information (unlocked version).
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 */
static int
mlx4_link_update_unlocked(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = mlx4_get_priv(dev);
	struct ethtool_cmd edata = {
		.cmd = ETHTOOL_GSET
	};
	struct ifreq ifr;
	struct rte_eth_link dev_link;
	int link_speed = 0;

	if (priv == NULL)
		return -EINVAL;
	(void)wait_to_complete;
	if (priv_ifreq(priv, SIOCGIFFLAGS, &ifr)) {
		WARN("ioctl(SIOCGIFFLAGS) failed: %s", strerror(errno));
		return -1;
	}
	memset(&dev_link, 0, sizeof(dev_link));
	dev_link.link_status = ((ifr.ifr_flags & IFF_UP) &&
				(ifr.ifr_flags & IFF_RUNNING));
	ifr.ifr_data = (void *)&edata;
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GSET) failed: %s",
		     strerror(errno));
		return -1;
	}
	link_speed = ethtool_cmd_speed(&edata);
	if (link_speed == -1)
		dev_link.link_speed = 0;
	else
		dev_link.link_speed = link_speed;
	dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
				  ETH_LINK_SPEED_FIXED);
	if (memcmp(&dev_link, &dev->data->dev_link, sizeof(dev_link))) {
		/* Link status changed. */
		dev->data->dev_link = dev_link;
		return 0;
	}
	/* Link status is still the same. */
	return -1;
}

/**
 * DPDK callback to retrieve physical link information.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param wait_to_complete
 *   Wait for request completion (ignored).
 */
static int
mlx4_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	struct priv *priv = mlx4_get_priv(dev);
	int ret;

	if (priv == NULL)
		return -EINVAL;
	priv_lock(priv);
	ret = mlx4_link_update_unlocked(dev, wait_to_complete);
	priv_unlock(priv);
	return ret;
}

4898 | /** | |
4899 | * DPDK callback to change the MTU. | |
4900 | * | |
4901 | * Setting the MTU affects hardware MRU (packets larger than the MTU cannot be | |
4902 | * received). Use this as a hint to enable/disable scattered packets support | |
4903 | * and improve performance when not needed. | |
4904 | * Since failure is not an option, reconfiguring queues on the fly is not | |
4905 | * recommended. | |
4906 | * | |
4907 | * @param dev | |
4908 | * Pointer to Ethernet device structure. | |
4909 | * @param in_mtu | |
4910 | * New MTU. | |
4911 | * | |
4912 | * @return | |
4913 | * 0 on success, negative errno value on failure. | |
4914 | */ | |
4915 | static int | |
4916 | mlx4_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu) | |
4917 | { | |
4918 | struct priv *priv = dev->data->dev_private; | |
4919 | int ret = 0; | |
4920 | unsigned int i; | |
4921 | uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) = | |
4922 | mlx4_rx_burst; | |
4923 | ||
4924 | if (mlx4_is_secondary()) | |
4925 | return -E_RTE_SECONDARY; | |
4926 | priv_lock(priv); | |
4927 | /* Set kernel interface MTU first. */ | |
4928 | if (priv_set_mtu(priv, mtu)) { | |
4929 | ret = errno; | |
4930 | WARN("cannot set port %u MTU to %u: %s", priv->port, mtu, | |
4931 | strerror(ret)); | |
4932 | goto out; | |
4933 | } else | |
4934 | DEBUG("adapter port %u MTU set to %u", priv->port, mtu); | |
4935 | priv->mtu = mtu; | |
4936 | /* Temporarily replace RX handler with a fake one, assuming it has not | |
4937 | * been copied elsewhere. */ | |
4938 | dev->rx_pkt_burst = removed_rx_burst; | |
4939 | /* Make sure everyone has left mlx4_rx_burst() and uses | |
4940 | * removed_rx_burst() instead. */ | |
4941 | rte_wmb(); | |
4942 | usleep(1000); | |
4943 | /* Reconfigure each RX queue. */ | |
4944 | for (i = 0; (i != priv->rxqs_n); ++i) { | |
4945 | struct rxq *rxq = (*priv->rxqs)[i]; | |
4946 | unsigned int mb_len; | |
4947 | unsigned int max_frame_len; | |
4948 | int sp; | |
4949 | ||
4950 | if (rxq == NULL) | |
4951 | continue; | |
4952 | /* Calculate new maximum frame length according to MTU and | |
4953 | * toggle scattered support (sp) if necessary. */ | |
4954 | max_frame_len = (priv->mtu + ETHER_HDR_LEN + | |
4955 | (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN)); | |
4956 | mb_len = rte_pktmbuf_data_room_size(rxq->mp); | |
4957 | assert(mb_len >= RTE_PKTMBUF_HEADROOM); | |
4958 | sp = (max_frame_len > (mb_len - RTE_PKTMBUF_HEADROOM)); | |
4959 | /* Provide new values to rxq_setup(). */ | |
4960 | dev->data->dev_conf.rxmode.jumbo_frame = sp; | |
4961 | dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len; | |
4962 | ret = rxq_rehash(dev, rxq); | |
4963 | if (ret) { | |
4964 | /* Force SP RX if that queue requires it and abort. */ | |
4965 | if (rxq->sp) | |
4966 | rx_func = mlx4_rx_burst_sp; | |
4967 | break; | |
4968 | } | |
4969 | /* Reenable non-RSS queue attributes. No need to check | |
4970 | * for errors at this stage. */ | |
4971 | if (!priv->rss) { | |
4972 | rxq_mac_addrs_add(rxq); | |
4973 | if (priv->promisc) | |
4974 | rxq_promiscuous_enable(rxq); | |
4975 | if (priv->allmulti) | |
4976 | rxq_allmulticast_enable(rxq); | |
4977 | } | |
4978 | /* Scattered burst function takes priority. */ | |
4979 | if (rxq->sp) | |
4980 | rx_func = mlx4_rx_burst_sp; | |
4981 | } | |
4982 | /* Burst functions can now be called again. */ | |
4983 | rte_wmb(); | |
4984 | dev->rx_pkt_burst = rx_func; | |
4985 | out: | |
4986 | priv_unlock(priv); | |
4987 | assert(ret >= 0); | |
4988 | return -ret; | |
4989 | } | |
4990 | ||
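/*
 * The MTU path above relies on a lock-free handoff rather than a lock in
 * the datapath. A minimal sketch of that pattern, assuming the no-op
 * removed_rx_burst() defined elsewhere in this file; the helper name and
 * the 1 ms grace period are illustrative, not a driver API.
 */
#if 0
static void
example_quiesce_rx(struct rte_eth_dev *dev)
{
	/* Point the datapath at a burst function that drops all calls. */
	dev->rx_pkt_burst = removed_rx_burst;
	/* Make the new pointer visible to other lcores. */
	rte_wmb();
	/* Wait for calls already inside the old function to drain. */
	usleep(1000);
}
#endif
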
/**
 * DPDK callback to get flow control status.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[out] fc_conf
 *   Flow control output buffer.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_GPAUSEPARAM
	};
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	ifr.ifr_data = (void *)&ethpause;
	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}

	fc_conf->autoneg = ethpause.autoneg;
	if (ethpause.rx_pause && ethpause.tx_pause)
		fc_conf->mode = RTE_FC_FULL;
	else if (ethpause.rx_pause)
		fc_conf->mode = RTE_FC_RX_PAUSE;
	else if (ethpause.tx_pause)
		fc_conf->mode = RTE_FC_TX_PAUSE;
	else
		fc_conf->mode = RTE_FC_NONE;
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

/**
 * DPDK callback to modify flow control parameters.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param[in] fc_conf
 *   Flow control parameters.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
{
	struct priv *priv = dev->data->dev_private;
	struct ifreq ifr;
	struct ethtool_pauseparam ethpause = {
		.cmd = ETHTOOL_SPAUSEPARAM
	};
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	ifr.ifr_data = (void *)&ethpause;
	ethpause.autoneg = fc_conf->autoneg;
	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
	    (fc_conf->mode & RTE_FC_RX_PAUSE))
		ethpause.rx_pause = 1;
	else
		ethpause.rx_pause = 0;

	if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
	    (fc_conf->mode & RTE_FC_TX_PAUSE))
		ethpause.tx_pause = 1;
	else
		ethpause.tx_pause = 0;

	priv_lock(priv);
	if (priv_ifreq(priv, SIOCETHTOOL, &ifr)) {
		ret = errno;
		WARN("ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
		     " failed: %s",
		     strerror(ret));
		goto out;
	}
	ret = 0;

out:
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

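/*
 * A hedged condensation of the RTE_FC_* to pauseparam mapping used above;
 * the helper is illustrative only. RTE_FC_FULL implies both directions,
 * which is why each flag checks for two modes.
 */
#if 0
static void
example_fc_mode_to_pause(enum rte_eth_fc_mode mode,
			 struct ethtool_pauseparam *pause)
{
	pause->rx_pause = (mode == RTE_FC_FULL) || (mode == RTE_FC_RX_PAUSE);
	pause->tx_pause = (mode == RTE_FC_FULL) || (mode == RTE_FC_TX_PAUSE);
}
#endif
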
/**
 * Configure a VLAN filter.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param vlan_id
 *   VLAN ID to filter.
 * @param on
 *   Toggle filter.
 *
 * @return
 *   0 on success, errno value on failure.
 */
static int
vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
{
	struct priv *priv = dev->data->dev_private;
	unsigned int i;
	unsigned int j = -1;

	DEBUG("%p: %s VLAN filter ID %" PRIu16,
	      (void *)dev, (on ? "enable" : "disable"), vlan_id);
	for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
		if (!priv->vlan_filter[i].enabled) {
			/* Unused index, remember it. */
			j = i;
			continue;
		}
		if (priv->vlan_filter[i].id != vlan_id)
			continue;
		/* This VLAN ID is already known, use its index. */
		j = i;
		break;
	}
	/* Check if there's room for another VLAN filter. */
	if (j == (unsigned int)-1)
		return ENOMEM;
	/*
	 * VLAN filters apply to all configured MAC addresses, flow
	 * specifications must be reconfigured accordingly.
	 */
	priv->vlan_filter[j].id = vlan_id;
	if ((on) && (!priv->vlan_filter[j].enabled)) {
		/*
		 * Filter is disabled, enable it.
		 * Rehashing flows in all RX queues is necessary.
		 */
		if (priv->rss)
			rxq_mac_addrs_del(&priv->rxq_parent);
		else
			for (i = 0; (i != priv->rxqs_n); ++i)
				if ((*priv->rxqs)[i] != NULL)
					rxq_mac_addrs_del((*priv->rxqs)[i]);
		priv->vlan_filter[j].enabled = 1;
		if (priv->started) {
			if (priv->rss)
				rxq_mac_addrs_add(&priv->rxq_parent);
			else
				for (i = 0; (i != priv->rxqs_n); ++i) {
					if ((*priv->rxqs)[i] == NULL)
						continue;
					rxq_mac_addrs_add((*priv->rxqs)[i]);
				}
		}
	} else if ((!on) && (priv->vlan_filter[j].enabled)) {
		/*
		 * Filter is enabled, disable it.
		 * Rehashing flows in all RX queues is necessary.
		 */
		if (priv->rss)
			rxq_mac_addrs_del(&priv->rxq_parent);
		else
			for (i = 0; (i != priv->rxqs_n); ++i)
				if ((*priv->rxqs)[i] != NULL)
					rxq_mac_addrs_del((*priv->rxqs)[i]);
		priv->vlan_filter[j].enabled = 0;
		if (priv->started) {
			if (priv->rss)
				rxq_mac_addrs_add(&priv->rxq_parent);
			else
				for (i = 0; (i != priv->rxqs_n); ++i) {
					if ((*priv->rxqs)[i] == NULL)
						continue;
					rxq_mac_addrs_add((*priv->rxqs)[i]);
				}
		}
	}
	return 0;
}

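/*
 * Illustrative reduction of the slot search in vlan_filter_set() above:
 * return the index of an existing entry for vlan_id, otherwise a free
 * slot, otherwise -1. The helper name is an assumption; the driver
 * inlines this logic instead of calling out to a function.
 */
#if 0
static int
example_vlan_slot(const struct priv *priv, uint16_t vlan_id)
{
	int free_slot = -1;
	unsigned int i;

	for (i = 0; (i != elemof(priv->vlan_filter)); ++i) {
		if (!priv->vlan_filter[i].enabled) {
			/* Remember a free slot in case the ID is new. */
			free_slot = i;
			continue;
		}
		if (priv->vlan_filter[i].id == vlan_id)
			return i;
	}
	return free_slot;
}
#endif
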
/**
 * DPDK callback to configure a VLAN filter.
 *
 * @param dev
 *   Pointer to Ethernet device structure.
 * @param vlan_id
 *   VLAN ID to filter.
 * @param on
 *   Toggle filter.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_vlan_filter_set(struct rte_eth_dev *dev, uint16_t vlan_id, int on)
{
	struct priv *priv = dev->data->dev_private;
	int ret;

	if (mlx4_is_secondary())
		return -E_RTE_SECONDARY;
	priv_lock(priv);
	ret = vlan_filter_set(dev, vlan_id, on);
	priv_unlock(priv);
	assert(ret >= 0);
	return -ret;
}

static const struct eth_dev_ops mlx4_dev_ops = {
	.dev_configure = mlx4_dev_configure,
	.dev_start = mlx4_dev_start,
	.dev_stop = mlx4_dev_stop,
	.dev_set_link_down = mlx4_set_link_down,
	.dev_set_link_up = mlx4_set_link_up,
	.dev_close = mlx4_dev_close,
	.promiscuous_enable = mlx4_promiscuous_enable,
	.promiscuous_disable = mlx4_promiscuous_disable,
	.allmulticast_enable = mlx4_allmulticast_enable,
	.allmulticast_disable = mlx4_allmulticast_disable,
	.link_update = mlx4_link_update,
	.stats_get = mlx4_stats_get,
	.stats_reset = mlx4_stats_reset,
	.queue_stats_mapping_set = NULL,
	.dev_infos_get = mlx4_dev_infos_get,
	.dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get,
	.vlan_filter_set = mlx4_vlan_filter_set,
	.vlan_tpid_set = NULL,
	.vlan_strip_queue_set = NULL,
	.vlan_offload_set = NULL,
	.rx_queue_setup = mlx4_rx_queue_setup,
	.tx_queue_setup = mlx4_tx_queue_setup,
	.rx_queue_release = mlx4_rx_queue_release,
	.tx_queue_release = mlx4_tx_queue_release,
	.dev_led_on = NULL,
	.dev_led_off = NULL,
	.flow_ctrl_get = mlx4_dev_get_flow_ctrl,
	.flow_ctrl_set = mlx4_dev_set_flow_ctrl,
	.priority_flow_ctrl_set = NULL,
	.mac_addr_remove = mlx4_mac_addr_remove,
	.mac_addr_add = mlx4_mac_addr_add,
	.mac_addr_set = mlx4_mac_addr_set,
	.mtu_set = mlx4_dev_set_mtu,
};

/**
 * Get PCI information from struct ibv_device.
 *
 * @param device
 *   Pointer to the verbs device structure.
 * @param[out] pci_addr
 *   PCI bus address output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
			    struct rte_pci_addr *pci_addr)
{
	FILE *file;
	char line[32];
	int ret = -1;
	MKSTR(path, "%s/device/uevent", device->ibdev_path);

	file = fopen(path, "rb");
	if (file == NULL)
		return -1;
	while (fgets(line, sizeof(line), file) == line) {
		size_t len = strlen(line);
		int c;

		/* Truncate long lines. */
		if (len == (sizeof(line) - 1))
			while (line[(len - 1)] != '\n') {
				c = fgetc(file);
				if (c == EOF)
					break;
				line[(len - 1)] = c;
			}
		/* Extract information. */
		if (sscanf(line,
			   "PCI_SLOT_NAME="
			   "%" SCNx16 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
			   &pci_addr->domain,
			   &pci_addr->bus,
			   &pci_addr->devid,
			   &pci_addr->function) == 4) {
			ret = 0;
			break;
		}
	}
	fclose(file);
	/* Report failure if no PCI_SLOT_NAME line was found, as documented;
	 * pci_addr would otherwise be left uninitialized. */
	return ret;
}

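/*
 * For reference, the parser above looks for a line of the form
 * "PCI_SLOT_NAME=0000:83:00.0" in the sysfs uevent file (the address is a
 * made-up example). A hedged usage sketch, assuming "ibdev" came from
 * ibv_get_device_list():
 */
#if 0
	struct rte_pci_addr addr;

	if (mlx4_ibv_device_to_pci_addr(ibdev, &addr) == 0)
		printf("%04" PRIx16 ":%02" PRIx8 ":%02" PRIx8 ".%" PRIx8 "\n",
		       addr.domain, addr.bus, addr.devid, addr.function);
#endif
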
/**
 * Get MAC address by querying netdevice.
 *
 * @param[in] priv
 *   struct priv for the requested device.
 * @param[out] mac
 *   MAC address output buffer.
 *
 * @return
 *   0 on success, -1 on failure and errno is set.
 */
static int
priv_get_mac(struct priv *priv, uint8_t (*mac)[ETHER_ADDR_LEN])
{
	struct ifreq request;

	if (priv_ifreq(priv, SIOCGIFHWADDR, &request))
		return -1;
	memcpy(mac, request.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
	return 0;
}

/* Support up to 32 adapters. */
static struct {
	struct rte_pci_addr pci_addr; /* Associated PCI address. */
	uint32_t ports; /* Physical ports bitfield. */
} mlx4_dev[32];

/**
 * Get device index in mlx4_dev[] from PCI bus address.
 *
 * @param[in] pci_addr
 *   PCI bus address to look for.
 *
 * @return
 *   mlx4_dev[] index on success, -1 on failure.
 */
static int
mlx4_dev_idx(struct rte_pci_addr *pci_addr)
{
	unsigned int i;
	int ret = -1;

	assert(pci_addr != NULL);
	for (i = 0; (i != elemof(mlx4_dev)); ++i) {
		if ((mlx4_dev[i].pci_addr.domain == pci_addr->domain) &&
		    (mlx4_dev[i].pci_addr.bus == pci_addr->bus) &&
		    (mlx4_dev[i].pci_addr.devid == pci_addr->devid) &&
		    (mlx4_dev[i].pci_addr.function == pci_addr->function))
			return i;
		if ((mlx4_dev[i].ports == 0) && (ret == -1))
			ret = i;
	}
	return ret;
}

/**
 * Retrieve integer value from environment variable.
 *
 * @param[in] name
 *   Environment variable name.
 *
 * @return
 *   Integer value, 0 if the variable is not set.
 */
static int
mlx4_getenv_int(const char *name)
{
	const char *val = getenv(name);

	if (val == NULL)
		return 0;
	return atoi(val);
}

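/*
 * Note that atoi() above cannot distinguish "unset", "0" and garbage. A
 * stricter variant is sketched below under the assumption that such
 * reporting is wanted; the driver deliberately keeps the simple form and
 * the helper name is illustrative.
 */
#if 0
static int
example_getenv_int(const char *name, int def)
{
	const char *val = getenv(name);
	char *end;
	long v;

	if (val == NULL)
		return def;
	v = strtol(val, &end, 0);
	/* Reject empty strings and trailing garbage. */
	if ((end == val) || (*end != '\0'))
		return def;
	return (int)v;
}
#endif
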
static void
mlx4_dev_link_status_handler(void *);
static void
mlx4_dev_interrupt_handler(struct rte_intr_handle *, void *);

/**
 * Link status handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 *
 * @return
 *   Nonzero if the callback process can be called immediately.
 */
static int
priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
{
	struct ibv_async_event event;
	int port_change = 0;
	int ret = 0;

	/* Read all messages and acknowledge them. */
	for (;;) {
		if (ibv_get_async_event(priv->ctx, &event))
			break;

		if (event.event_type == IBV_EVENT_PORT_ACTIVE ||
		    event.event_type == IBV_EVENT_PORT_ERR)
			port_change = 1;
		else
			DEBUG("event type %d on port %d not handled",
			      event.event_type, event.element.port_num);
		ibv_ack_async_event(&event);
	}

	if (port_change ^ priv->pending_alarm) {
		struct rte_eth_link *link = &dev->data->dev_link;

		priv->pending_alarm = 0;
		mlx4_link_update_unlocked(dev, 0);
		if (((link->link_speed == 0) && link->link_status) ||
		    ((link->link_speed != 0) && !link->link_status)) {
			/* Inconsistent status, check again later. */
			priv->pending_alarm = 1;
			rte_eal_alarm_set(MLX4_ALARM_TIMEOUT_US,
					  mlx4_dev_link_status_handler,
					  dev);
		} else
			ret = 1;
	}
	return ret;
}

/**
 * Handle delayed link status event.
 *
 * @param arg
 *   Registered argument.
 */
static void
mlx4_dev_link_status_handler(void *arg)
{
	struct rte_eth_dev *dev = arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	priv_lock(priv);
	assert(priv->pending_alarm == 1);
	ret = priv_dev_link_status_handler(priv, dev);
	priv_unlock(priv);
	if (ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
}

/**
 * Handle interrupts from the NIC.
 *
 * @param[in] intr_handle
 *   Interrupt handle.
 * @param cb_arg
 *   Callback argument.
 */
static void
mlx4_dev_interrupt_handler(struct rte_intr_handle *intr_handle, void *cb_arg)
{
	struct rte_eth_dev *dev = cb_arg;
	struct priv *priv = dev->data->dev_private;
	int ret;

	(void)intr_handle;
	priv_lock(priv);
	ret = priv_dev_link_status_handler(priv, dev);
	priv_unlock(priv);
	if (ret)
		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
}

/**
 * Uninstall interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
static void
priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
{
	if (!dev->data->dev_conf.intr_conf.lsc)
		return;
	rte_intr_callback_unregister(&priv->intr_handle,
				     mlx4_dev_interrupt_handler,
				     dev);
	if (priv->pending_alarm)
		rte_eal_alarm_cancel(mlx4_dev_link_status_handler, dev);
	priv->pending_alarm = 0;
	priv->intr_handle.fd = 0;
	priv->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
}

/**
 * Install interrupt handler.
 *
 * @param priv
 *   Pointer to private structure.
 * @param dev
 *   Pointer to the rte_eth_dev structure.
 */
static void
priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
{
	int rc, flags;

	if (!dev->data->dev_conf.intr_conf.lsc)
		return;
	assert(priv->ctx->async_fd > 0);
	flags = fcntl(priv->ctx->async_fd, F_GETFL);
	rc = fcntl(priv->ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
	if (rc < 0) {
		INFO("failed to change file descriptor of async event queue");
		dev->data->dev_conf.intr_conf.lsc = 0;
	} else {
		priv->intr_handle.fd = priv->ctx->async_fd;
		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
		rte_intr_callback_register(&priv->intr_handle,
					   mlx4_dev_interrupt_handler,
					   dev);
	}
}

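/*
 * The O_NONBLOCK setup above, isolated into a hedged standalone sketch;
 * the helper name is illustrative. The async fd must not block because
 * ibv_get_async_event() is later called on it from the EAL interrupt
 * thread in a read-until-empty loop.
 */
#if 0
static int
example_set_nonblock(int fd)
{
	int flags = fcntl(fd, F_GETFL);

	if (flags == -1)
		return -1;
	return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
#endif
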
static struct eth_driver mlx4_driver;

/**
 * DPDK callback to register a PCI device.
 *
 * This function creates an Ethernet device for each port of a given
 * PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx4_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
{
	struct ibv_device **list;
	struct ibv_device *ibv_dev;
	int err = 0;
	struct ibv_context *attr_ctx = NULL;
	struct ibv_device_attr device_attr;
	unsigned int vf;
	int idx;
	int i;

	(void)pci_drv;
	assert(pci_drv == &mlx4_driver.pci_drv);
	/* Get mlx4_dev[] index. */
	idx = mlx4_dev_idx(&pci_dev->addr);
	if (idx == -1) {
		ERROR("this driver cannot support any more adapters");
		return -ENOMEM;
	}
	DEBUG("using driver device index %d", idx);

	/* Save PCI address. */
	mlx4_dev[idx].pci_addr = pci_dev->addr;
	list = ibv_get_device_list(&i);
	if (list == NULL) {
		assert(errno);
		if (errno == ENOSYS) {
			WARN("cannot list devices, is ib_uverbs loaded?");
			return 0;
		}
		return -errno;
	}
	assert(i >= 0);
	/*
	 * For each listed device, check related sysfs entry against
	 * the provided PCI ID.
	 */
	while (i != 0) {
		struct rte_pci_addr pci_addr;

		--i;
		DEBUG("checking device \"%s\"", list[i]->name);
		if (mlx4_ibv_device_to_pci_addr(list[i], &pci_addr))
			continue;
		if ((pci_dev->addr.domain != pci_addr.domain) ||
		    (pci_dev->addr.bus != pci_addr.bus) ||
		    (pci_dev->addr.devid != pci_addr.devid) ||
		    (pci_dev->addr.function != pci_addr.function))
			continue;
		vf = (pci_dev->id.device_id ==
		      PCI_DEVICE_ID_MELLANOX_CONNECTX3VF);
		INFO("PCI information matches, using device \"%s\" (VF: %s)",
		     list[i]->name, (vf ? "true" : "false"));
		attr_ctx = ibv_open_device(list[i]);
		err = errno;
		break;
	}
	if (attr_ctx == NULL) {
		ibv_free_device_list(list);
		switch (err) {
		case 0:
			WARN("cannot access device, is mlx4_ib loaded?");
			return 0;
		case EINVAL:
			WARN("cannot use device, are drivers up to date?");
			return 0;
		}
		assert(err > 0);
		return -err;
	}
	ibv_dev = list[i];

	DEBUG("device opened");
	if (ibv_query_device(attr_ctx, &device_attr))
		goto error;
	INFO("%u port(s) detected", device_attr.phys_port_cnt);

	for (i = 0; i < device_attr.phys_port_cnt; i++) {
		uint32_t port = i + 1; /* ports are indexed from one */
		uint32_t test = (1 << i);
		struct ibv_context *ctx = NULL;
		struct ibv_port_attr port_attr;
		struct ibv_pd *pd = NULL;
		struct priv *priv = NULL;
		struct rte_eth_dev *eth_dev = NULL;
#ifdef HAVE_EXP_QUERY_DEVICE
		struct ibv_exp_device_attr exp_device_attr;
#endif /* HAVE_EXP_QUERY_DEVICE */
		struct ether_addr mac;

#ifdef HAVE_EXP_QUERY_DEVICE
		exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS;
#ifdef RSS_SUPPORT
		exp_device_attr.comp_mask |= IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ;
#endif /* RSS_SUPPORT */
#endif /* HAVE_EXP_QUERY_DEVICE */

		DEBUG("using port %u (%08" PRIx32 ")", port, test);

		ctx = ibv_open_device(ibv_dev);
		if (ctx == NULL)
			goto port_error;

		/* Check port status. */
		err = ibv_query_port(ctx, port, &port_attr);
		if (err) {
			ERROR("port query failed: %s", strerror(err));
			goto port_error;
		}

		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
			ERROR("port %d is not configured in Ethernet mode",
			      port);
			goto port_error;
		}

		if (port_attr.state != IBV_PORT_ACTIVE)
			DEBUG("port %d is not active: \"%s\" (%d)",
			      port, ibv_port_state_str(port_attr.state),
			      port_attr.state);

		/* Allocate protection domain. */
		pd = ibv_alloc_pd(ctx);
		if (pd == NULL) {
			ERROR("PD allocation failure");
			err = ENOMEM;
			goto port_error;
		}

		mlx4_dev[idx].ports |= test;

		/* from rte_ethdev.c */
		priv = rte_zmalloc("ethdev private structure",
				   sizeof(*priv),
				   RTE_CACHE_LINE_SIZE);
		if (priv == NULL) {
			ERROR("priv allocation failure");
			err = ENOMEM;
			goto port_error;
		}

		priv->ctx = ctx;
		priv->device_attr = device_attr;
		priv->port = port;
		priv->pd = pd;
		priv->mtu = ETHER_MTU;
#ifdef HAVE_EXP_QUERY_DEVICE
		if (ibv_exp_query_device(ctx, &exp_device_attr)) {
			ERROR("ibv_exp_query_device() failed");
			goto port_error;
		}
#ifdef RSS_SUPPORT
		if ((exp_device_attr.exp_device_cap_flags &
		     IBV_EXP_DEVICE_QPG) &&
		    (exp_device_attr.exp_device_cap_flags &
		     IBV_EXP_DEVICE_UD_RSS) &&
		    (exp_device_attr.comp_mask &
		     IBV_EXP_DEVICE_ATTR_RSS_TBL_SZ) &&
		    (exp_device_attr.max_rss_tbl_sz > 0)) {
			priv->hw_qpg = 1;
			priv->hw_rss = 1;
			priv->max_rss_tbl_sz = exp_device_attr.max_rss_tbl_sz;
		} else {
			priv->hw_qpg = 0;
			priv->hw_rss = 0;
			priv->max_rss_tbl_sz = 0;
		}
		priv->hw_tss = !!(exp_device_attr.exp_device_cap_flags &
				  IBV_EXP_DEVICE_UD_TSS);
		DEBUG("device flags: %s%s%s",
		      (priv->hw_qpg ? "IBV_DEVICE_QPG " : ""),
		      (priv->hw_tss ? "IBV_DEVICE_TSS " : ""),
		      (priv->hw_rss ? "IBV_DEVICE_RSS " : ""));
		if (priv->hw_rss)
			DEBUG("maximum RSS indirection table size: %u",
			      exp_device_attr.max_rss_tbl_sz);
#endif /* RSS_SUPPORT */

		priv->hw_csum =
			((exp_device_attr.exp_device_cap_flags &
			  IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) &&
			 (exp_device_attr.exp_device_cap_flags &
			  IBV_EXP_DEVICE_RX_CSUM_IP_PKT));
		DEBUG("checksum offloading is %ssupported",
		      (priv->hw_csum ? "" : "not "));

		priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags &
					 IBV_EXP_DEVICE_VXLAN_SUPPORT);
		DEBUG("L2 tunnel checksum offloads are %ssupported",
		      (priv->hw_csum_l2tun ? "" : "not "));

#ifdef INLINE_RECV
		priv->inl_recv_size = mlx4_getenv_int("MLX4_INLINE_RECV_SIZE");

		if (priv->inl_recv_size) {
			exp_device_attr.comp_mask =
				IBV_EXP_DEVICE_ATTR_INLINE_RECV_SZ;
			if (ibv_exp_query_device(ctx, &exp_device_attr)) {
				INFO("Couldn't query device for inline-receive"
				     " capabilities.");
				priv->inl_recv_size = 0;
			} else {
				if ((unsigned)exp_device_attr.inline_recv_sz <
				    priv->inl_recv_size) {
					INFO("Max inline-receive (%d) <"
					     " requested inline-receive (%u)",
					     exp_device_attr.inline_recv_sz,
					     priv->inl_recv_size);
					priv->inl_recv_size =
						exp_device_attr.inline_recv_sz;
				}
			}
			INFO("Set inline receive size to %u",
			     priv->inl_recv_size);
		}
#endif /* INLINE_RECV */
#endif /* HAVE_EXP_QUERY_DEVICE */

		(void)mlx4_getenv_int;
		priv->vf = vf;
		/* Configure the first MAC address by default. */
		if (priv_get_mac(priv, &mac.addr_bytes)) {
			ERROR("cannot get MAC address, is mlx4_en loaded?"
			      " (errno: %s)", strerror(errno));
			goto port_error;
		}
		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
		     priv->port,
		     mac.addr_bytes[0], mac.addr_bytes[1],
		     mac.addr_bytes[2], mac.addr_bytes[3],
		     mac.addr_bytes[4], mac.addr_bytes[5]);
		/* Register MAC and broadcast addresses. */
		claim_zero(priv_mac_addr_add(priv, 0,
					     (const uint8_t (*)[ETHER_ADDR_LEN])
					     mac.addr_bytes));
		claim_zero(priv_mac_addr_add(priv, (elemof(priv->mac) - 1),
					     &(const uint8_t [ETHER_ADDR_LEN])
					     { "\xff\xff\xff\xff\xff\xff" }));
#ifndef NDEBUG
		{
			char ifname[IF_NAMESIZE];

			if (priv_get_ifname(priv, &ifname) == 0)
				DEBUG("port %u ifname is \"%s\"",
				      priv->port, ifname);
			else
				DEBUG("port %u ifname is unknown", priv->port);
		}
#endif
		/* Get actual MTU if possible. */
		priv_get_mtu(priv, &priv->mtu);
		DEBUG("port %u MTU is %u", priv->port, priv->mtu);

		/* from rte_ethdev.c */
		{
			char name[RTE_ETH_NAME_MAX_LEN];

			snprintf(name, sizeof(name), "%s port %u",
				 ibv_get_device_name(ibv_dev), port);
			eth_dev = rte_eth_dev_allocate(name);
		}
		if (eth_dev == NULL) {
			ERROR("cannot allocate rte ethdev");
			err = ENOMEM;
			goto port_error;
		}

		/* Secondary processes have to use local storage for their
		 * private data as well as a copy of eth_dev->data, but this
		 * pointer must not be modified before burst functions are
		 * actually called. */
		if (mlx4_is_secondary()) {
			struct mlx4_secondary_data *sd =
				&mlx4_secondary_data[eth_dev->data->port_id];

			sd->primary_priv = eth_dev->data->dev_private;
			if (sd->primary_priv == NULL) {
				ERROR("no private data for port %u",
				      eth_dev->data->port_id);
				err = EINVAL;
				goto port_error;
			}
			sd->shared_dev_data = eth_dev->data;
			rte_spinlock_init(&sd->lock);
			memcpy(sd->data.name, sd->shared_dev_data->name,
			       sizeof(sd->data.name));
			sd->data.dev_private = priv;
			sd->data.rx_mbuf_alloc_failed = 0;
			sd->data.mtu = ETHER_MTU;
			sd->data.port_id = sd->shared_dev_data->port_id;
			sd->data.mac_addrs = priv->mac;
			eth_dev->tx_pkt_burst = mlx4_tx_burst_secondary_setup;
			eth_dev->rx_pkt_burst = mlx4_rx_burst_secondary_setup;
		} else {
			eth_dev->data->dev_private = priv;
			eth_dev->data->rx_mbuf_alloc_failed = 0;
			eth_dev->data->mtu = ETHER_MTU;
			eth_dev->data->mac_addrs = priv->mac;
		}
		eth_dev->pci_dev = pci_dev;

		rte_eth_copy_pci_info(eth_dev, pci_dev);

		eth_dev->driver = &mlx4_driver;

		priv->dev = eth_dev;
		eth_dev->dev_ops = &mlx4_dev_ops;
		TAILQ_INIT(&eth_dev->link_intr_cbs);

		/* Bring Ethernet device up. */
		DEBUG("forcing Ethernet interface up");
		priv_set_flags(priv, ~IFF_UP, IFF_UP);
		continue;

port_error:
		rte_free(priv);
		if (pd)
			claim_zero(ibv_dealloc_pd(pd));
		if (ctx)
			claim_zero(ibv_close_device(ctx));
		if (eth_dev)
			rte_eth_dev_release_port(eth_dev);
		break;
	}

	/*
	 * XXX if something went wrong in the loop above, there is a resource
	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
	 * long as the dpdk does not provide a way to deallocate an ethdev and
	 * a way to enumerate the registered ethdevs to free the previous
	 * ones.
	 */

	/* No port found, complain. */
	if (!mlx4_dev[idx].ports) {
		err = ENODEV;
		goto error;
	}

error:
	if (attr_ctx)
		claim_zero(ibv_close_device(attr_ctx));
	if (list)
		ibv_free_device_list(list);
	assert(err >= 0);
	return -err;
}

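/*
 * A hedged, illustrative sketch (not part of the driver) of minimal
 * application-side bring-up once a port has been probed by this PMD;
 * "pool" is an assumed pre-created mempool, the helper name and the
 * descriptor count of 512 are made up, and error handling is trimmed.
 */
#if 0
static int
example_port_init(uint8_t port, struct rte_mempool *pool)
{
	struct rte_eth_conf conf;

	memset(&conf, 0, sizeof(conf));
	if (rte_eth_dev_configure(port, 1, 1, &conf) < 0)
		return -1;
	if (rte_eth_rx_queue_setup(port, 0, 512, SOCKET_ID_ANY,
				   NULL, pool) < 0)
		return -1;
	if (rte_eth_tx_queue_setup(port, 0, 512, SOCKET_ID_ANY, NULL) < 0)
		return -1;
	return rte_eth_dev_start(port);
}
#endif
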
static const struct rte_pci_id mlx4_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX3)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
	},
	{
		.vendor_id = 0
	}
};

static struct eth_driver mlx4_driver = {
	.pci_drv = {
		.driver = {
			.name = MLX4_DRIVER_NAME
		},
		.id_table = mlx4_pci_id_map,
		.probe = mlx4_pci_probe,
		.drv_flags = RTE_PCI_DRV_INTR_LSC,
	},
	.dev_private_size = sizeof(struct priv)
};

/**
 * Driver initialization routine.
 */
RTE_INIT(rte_mlx4_pmd_init);
static void
rte_mlx4_pmd_init(void)
{
	RTE_BUILD_BUG_ON(sizeof(wr_id_t) != sizeof(uint64_t));
	/*
	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
	 * huge pages. Calling ibv_fork_init() during init allows
	 * applications to use fork() safely for purposes other than
	 * using this PMD, which is not supported in forked processes.
	 */
	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
	ibv_fork_init();
	rte_eal_pci_register(&mlx4_driver.pci_drv);
}

RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx4, mlx4_pci_id_map);