]>
Commit | Line | Data |
---|---|---|
0de1b425 WT |
1 | /* |
2 | * Copyright (c) 2018, 2019 Nicira, Inc. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #include <config.h> | |
18 | ||
19 | #include "netdev-linux-private.h" | |
20 | #include "netdev-linux.h" | |
21 | #include "netdev-afxdp.h" | |
22 | #include "netdev-afxdp-pool.h" | |
23 | ||
24 | #include <errno.h> | |
25 | #include <inttypes.h> | |
26 | #include <linux/rtnetlink.h> | |
27 | #include <linux/if_xdp.h> | |
28 | #include <net/if.h> | |
e8568993 YHW |
29 | #include <numa.h> |
30 | #include <numaif.h> | |
e50547b5 | 31 | #include <poll.h> |
0de1b425 WT |
32 | #include <stdlib.h> |
33 | #include <sys/resource.h> | |
34 | #include <sys/socket.h> | |
35 | #include <sys/types.h> | |
36 | #include <unistd.h> | |
37 | ||
38 | #include "coverage.h" | |
39 | #include "dp-packet.h" | |
40 | #include "dpif-netdev.h" | |
41 | #include "fatal-signal.h" | |
42 | #include "openvswitch/compiler.h" | |
43 | #include "openvswitch/dynamic-string.h" | |
44 | #include "openvswitch/list.h" | |
28d05016 | 45 | #include "openvswitch/thread.h" |
0de1b425 | 46 | #include "openvswitch/vlog.h" |
e8568993 | 47 | #include "ovs-numa.h" |
0de1b425 WT |
48 | #include "packets.h" |
49 | #include "socket-util.h" | |
50 | #include "util.h" | |
51 | ||
52 | #ifndef SOL_XDP | |
53 | #define SOL_XDP 283 | |
54 | #endif | |
55 | ||
56 | COVERAGE_DEFINE(afxdp_cq_empty); | |
57 | COVERAGE_DEFINE(afxdp_fq_full); | |
58 | COVERAGE_DEFINE(afxdp_tx_full); | |
59 | COVERAGE_DEFINE(afxdp_cq_skip); | |
60 | ||
61 | VLOG_DEFINE_THIS_MODULE(netdev_afxdp); | |
62 | ||
63 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); | |
64 | ||
65 | #define MAX_XSKQ 16 | |
66 | #define FRAME_HEADROOM XDP_PACKET_HEADROOM | |
67 | #define OVS_XDP_HEADROOM 128 | |
68 | #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE | |
69 | #define FRAME_SHIFT XSK_UMEM__DEFAULT_FRAME_SHIFT | |
70 | #define FRAME_SHIFT_MASK ((1 << FRAME_SHIFT) - 1) | |
71 | ||
72 | #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS | |
73 | #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS | |
74 | ||
e50547b5 WT |
75 | #ifdef HAVE_XDP_NEED_WAKEUP |
76 | #define NEED_WAKEUP_DEFAULT true | |
77 | #else | |
78 | #define NEED_WAKEUP_DEFAULT false | |
79 | #endif | |
80 | ||
0de1b425 WT |
81 | /* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets |
82 | * still on processing in threads. Number of packets currently in OVS | |
83 | * processing is hard to estimate because it depends on number of ports. | |
84 | * Setting NUM_FRAMES twice as large than total of ring sizes should be | |
85 | * enough for most corner cases. | |
86 | */ | |
87 | #define NUM_FRAMES (4 * (PROD_NUM_DESCS + CONS_NUM_DESCS)) | |
88 | #define BATCH_SIZE NETDEV_MAX_BURST | |
89 | ||
90 | BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES)); | |
91 | BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS); | |
92 | ||
93 | #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base)) | |
94 | ||
95 | static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id, | |
e8f56344 IM |
96 | enum afxdp_mode mode, |
97 | bool use_need_wakeup, | |
98 | bool report_socket_failures); | |
99 | static void xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode); | |
0de1b425 WT |
100 | static void xsk_destroy(struct xsk_socket_info *xsk); |
101 | static int xsk_configure_all(struct netdev *netdev); | |
102 | static void xsk_destroy_all(struct netdev *netdev); | |
103 | ||
/* Table of supported XDP attach modes, indexed by enum afxdp_mode.  Each
 * entry carries the human-readable name used in configuration/logging,
 * the AF_XDP socket bind flags, and the XDP program attach flags for
 * that mode. */
static struct {
    const char *name;
    uint32_t bind_flags;
    uint32_t xdp_flags;
} xdp_modes[] = {
    [OVS_AF_XDP_MODE_UNSPEC] = {
        .name = "unspecified",
        .bind_flags = 0,
        .xdp_flags = 0,
    },
    [OVS_AF_XDP_MODE_BEST_EFFORT] = {
        .name = "best-effort",
        .bind_flags = 0,
        .xdp_flags = 0,
    },
    [OVS_AF_XDP_MODE_NATIVE_ZC] = {
        .name = "native-with-zerocopy",
        .bind_flags = XDP_ZEROCOPY,
        .xdp_flags = XDP_FLAGS_DRV_MODE,
    },
    [OVS_AF_XDP_MODE_NATIVE] = {
        .name = "native",
        .bind_flags = XDP_COPY,
        .xdp_flags = XDP_FLAGS_DRV_MODE,
    },
    [OVS_AF_XDP_MODE_GENERIC] = {
        .name = "generic",
        .bind_flags = XDP_COPY,
        .xdp_flags = XDP_FLAGS_SKB_MODE,
    },
};
135 | ||
/* A umem pool that OVS has stopped using but whose frames may still be
 * referenced (e.g. in rings of an already-closed XDP socket).  Such pools
 * are parked on 'unused_pools' until every frame is accounted for, then
 * freed by netdev_afxdp_sweep_unused_pools(). */
struct unused_pool {
    struct xsk_umem_info *umem_info;
    int lost_in_rings; /* Number of packets left in tx, rx, cq and fq. */
    struct ovs_list list_node;
};

static struct ovs_mutex unused_pools_mutex = OVS_MUTEX_INITIALIZER;
static struct ovs_list unused_pools OVS_GUARDED_BY(unused_pools_mutex) =
    OVS_LIST_INITIALIZER(&unused_pools);
145 | ||
/* Per-umem state: the page-aligned packet buffer shared with the kernel,
 * the fill (fq) and completion (cq) rings attached to it, and the OVS-side
 * pools of free frames (mpool) and packet metadata (xpool). */
struct xsk_umem_info {
    struct umem_pool mpool;     /* Pool of free umem frame addresses. */
    struct xpacket_pool xpool;  /* Pool of dp_packet_afxdp metadata. */
    struct xsk_ring_prod fq;    /* Fill ring: frames handed to the kernel. */
    struct xsk_ring_cons cq;    /* Completion ring: frames done with TX. */
    struct xsk_umem *umem;      /* libbpf umem handle. */
    void *buffer;               /* Start of the frame memory region. */
};
154 | ||
/* Per-queue AF_XDP socket state. */
struct xsk_socket_info {
    struct xsk_ring_cons rx;    /* RX descriptor ring. */
    struct xsk_ring_prod tx;    /* TX descriptor ring. */
    struct xsk_umem_info *umem; /* Backing umem for this socket. */
    struct xsk_socket *xsk;     /* libbpf socket handle. */
    uint32_t outstanding_tx; /* Number of descriptors filled in tx and cq. */
    uint32_t available_rx;   /* Number of descriptors filled in rx and fq. */
    atomic_uint64_t tx_dropped; /* Count of packets dropped on transmit. */
};
164 | ||
/* Per-txq spinlock, padded to a full cache line to avoid false sharing
 * between adjacent locks in the 'tx_locks' array. */
struct netdev_afxdp_tx_lock {
    /* Padding to make netdev_afxdp_tx_lock exactly one cache line long. */
    PADDED_MEMBERS(CACHE_LINE_SIZE,
        struct ovs_spin lock;
    );
};
171 | ||
#ifdef HAVE_XDP_NEED_WAKEUP
/* Kicks the kernel to process the fill queue, but only when the device has
 * 'use-need-wakeup' enabled and the kernel has set the need_wakeup flag on
 * the fill ring.  A zero-timeout poll() on the socket fd is sufficient to
 * trigger the kernel's RX processing. */
static inline void
xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
                        struct netdev *netdev, int fd)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct pollfd pfd;
    int ret;

    if (!dev->use_need_wakeup) {
        return;
    }

    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
        pfd.fd = fd;
        pfd.events = POLLIN;

        ret = poll(&pfd, 1, 0);
        if (OVS_UNLIKELY(ret < 0)) {
            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
                         netdev_get_name(netdev),
                         ovs_strerror(errno));
        }
    }
}

/* Returns true if the kernel requested a wakeup on the TX ring. */
static inline bool
xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
{
    return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
}

#else /* !HAVE_XDP_NEED_WAKEUP */
/* need_wakeup support is not compiled in; RX needs no explicit kick. */
static inline void
xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED,
                        struct netdev *netdev OVS_UNUSED,
                        int fd OVS_UNUSED)
{
    /* Nothing. */
}

/* Without need_wakeup support, the TX path must always kick the kernel. */
static inline bool
xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
{
    return true;
}
#endif /* HAVE_XDP_NEED_WAKEUP */
219 | ||
/* Releases all memory owned by a retired umem pool: the packet buffer, the
 * free-frame pool and the packet metadata pool.  The caller is responsible
 * for freeing the 'struct unused_pool' wrapper itself. */
static void
netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
{
    /* Free the packet buffer. */
    free_pagealign(pool->umem_info->buffer);

    /* Cleanup umem pool. */
    umem_pool_cleanup(&pool->umem_info->mpool);

    /* Cleanup metadata pool. */
    xpacket_pool_cleanup(&pool->umem_info->xpool);

    free(pool->umem_info);
}
234 | ||
/* Walks the global list of retired umem pools and frees each pool whose
 * frames are all accounted for (returned to the memory pool or known lost
 * in rings), i.e. neither OVS nor the kernel can still reference them.
 * 'aux' is unused; the signature matches the fatal-signal hook type. */
static void
netdev_afxdp_sweep_unused_pools(void *aux OVS_UNUSED)
{
    struct unused_pool *pool, *next;
    unsigned int count;

    ovs_mutex_lock(&unused_pools_mutex);
    LIST_FOR_EACH_SAFE (pool, next, list_node, &unused_pools) {

        count = umem_pool_count(&pool->umem_info->mpool);
        ovs_assert(count + pool->lost_in_rings <= NUM_FRAMES);

        if (count + pool->lost_in_rings == NUM_FRAMES) {
            /* OVS doesn't use this memory pool anymore. Kernel doesn't
             * use it since closing the xdp socket. So, it's safe to free
             * the pool now. */
            VLOG_DBG("Freeing umem pool at 0x%"PRIxPTR,
                     (uintptr_t) pool->umem_info);
            ovs_list_remove(&pool->list_node);
            netdev_afxdp_cleanup_unused_pool(pool);
            free(pool);
        }
    }
    ovs_mutex_unlock(&unused_pools_mutex);
}
260 | ||
/* Registers 'buffer' ('size' bytes) as an AF_XDP umem and builds the
 * OVS-side pools on top of it: every frame address goes into the free-frame
 * pool, and one dp_packet_afxdp metadata entry per frame is pre-initialized.
 *
 * Returns the new umem info on success.  On failure, logs the reason,
 * undoes all partial setup and returns NULL; 'buffer' itself is NOT freed
 * (the caller owns it). */
static struct xsk_umem_info *
xsk_configure_umem(void *buffer, uint64_t size)
{
    struct xsk_umem_config uconfig;
    struct xsk_umem_info *umem;
    int ret;
    int i;

    umem = xzalloc(sizeof *umem);

    memset(&uconfig, 0, sizeof uconfig);
    uconfig.fill_size = PROD_NUM_DESCS;
    uconfig.comp_size = CONS_NUM_DESCS;
    uconfig.frame_size = FRAME_SIZE;
    uconfig.frame_headroom = OVS_XDP_HEADROOM;

    ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
                           &uconfig);
    if (ret) {
        VLOG_ERR("xsk_umem__create failed: %s.", ovs_strerror(errno));
        free(umem);
        return NULL;
    }

    umem->buffer = buffer;

    /* Set-up umem pool. */
    if (umem_pool_init(&umem->mpool, NUM_FRAMES) < 0) {
        VLOG_ERR("umem_pool_init failed");
        if (xsk_umem__delete(umem->umem)) {
            VLOG_ERR("xsk_umem__delete failed");
        }
        free(umem);
        return NULL;
    }

    /* Seed the free-frame pool with every frame address, pushed in
     * descending order so the lowest addresses pop out first. */
    for (i = NUM_FRAMES - 1; i >= 0; i--) {
        void *elem;

        elem = ALIGNED_CAST(void *, (char *)umem->buffer + i * FRAME_SIZE);
        umem_elem_push(&umem->mpool, elem);
    }

    /* Set-up metadata. */
    if (xpacket_pool_init(&umem->xpool, NUM_FRAMES) < 0) {
        VLOG_ERR("xpacket_pool_init failed");
        umem_pool_cleanup(&umem->mpool);
        if (xsk_umem__delete(umem->umem)) {
            VLOG_ERR("xsk_umem__delete failed");
        }
        free(umem);
        return NULL;
    }

    VLOG_DBG("%s: xpacket pool from %p to %p", __func__,
             umem->xpool.array,
             (char *)umem->xpool.array +
             NUM_FRAMES * sizeof(struct dp_packet_afxdp));

    /* Pre-initialize the per-frame dp_packet metadata so the RX path only
     * has to fill in size/offsets. */
    for (i = NUM_FRAMES - 1; i >= 0; i--) {
        struct dp_packet_afxdp *xpacket;
        struct dp_packet *packet;

        xpacket = &umem->xpool.array[i];
        xpacket->mpool = &umem->mpool;

        packet = &xpacket->packet;
        packet->source = DPBUF_AFXDP;
    }

    return umem;
}
333 | ||
/* Creates an AF_XDP socket on device 'ifindex', queue 'queue_id', bound to
 * 'umem', using the bind/attach flags of 'mode'.  Verifies that an XDP
 * program was actually attached, then pre-populates the fill ring with
 * PROD_NUM_DESCS frames so the kernel can start receiving immediately.
 *
 * With 'report_socket_failures' false, socket-creation failures are logged
 * at debug level only (used while probing modes in best-effort setup).
 * Returns NULL on any failure; 'umem' is left for the caller to clean up. */
static struct xsk_socket_info *
xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
                     uint32_t queue_id, enum afxdp_mode mode,
                     bool use_need_wakeup, bool report_socket_failures)
{
    struct xsk_socket_config cfg;
    struct xsk_socket_info *xsk;
    char devname[IF_NAMESIZE];
    uint32_t idx = 0, prog_id;
    int ret;
    int i;

    xsk = xzalloc(sizeof *xsk);
    xsk->umem = umem;
    cfg.rx_size = CONS_NUM_DESCS;
    cfg.tx_size = PROD_NUM_DESCS;
    cfg.libbpf_flags = 0;
    cfg.bind_flags = xdp_modes[mode].bind_flags;
    cfg.xdp_flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST;

#ifdef HAVE_XDP_NEED_WAKEUP
    if (use_need_wakeup) {
        cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
    }
#endif

    if (if_indextoname(ifindex, devname) == NULL) {
        VLOG_ERR("ifindex %d to devname failed (%s)",
                 ifindex, ovs_strerror(errno));
        free(xsk);
        return NULL;
    }

    ret = xsk_socket__create(&xsk->xsk, devname, queue_id, umem->umem,
                             &xsk->rx, &xsk->tx, &cfg);
    if (ret) {
        VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
             "xsk_socket__create failed (%s) mode: %s, "
             "use-need-wakeup: %s, qid: %d",
             ovs_strerror(errno), xdp_modes[mode].name,
             use_need_wakeup ? "true" : "false", queue_id);
        free(xsk);
        return NULL;
    }

    /* Make sure the built-in AF_XDP program is loaded. */
    ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
    if (ret || !prog_id) {
        if (ret) {
            VLOG_ERR("Get XDP prog ID failed (%s)", ovs_strerror(errno));
        } else {
            VLOG_ERR("No XDP program is loaded at ifindex %d", ifindex);
        }
        xsk_socket__delete(xsk->xsk);
        free(xsk);
        return NULL;
    }

    /* Reserve space for a full fill ring worth of frames; retry because
     * reserve can transiently fail. */
    while (!xsk_ring_prod__reserve(&xsk->umem->fq,
                                   PROD_NUM_DESCS, &idx)) {
        VLOG_WARN_RL(&rl, "Retry xsk_ring_prod__reserve to FILL queue");
    }

    /* Hand PROD_NUM_DESCS free frames to the kernel via the fill ring. */
    for (i = 0;
         i < PROD_NUM_DESCS * FRAME_SIZE;
         i += FRAME_SIZE) {
        void *elem;
        uint64_t addr;

        elem = umem_elem_pop(&xsk->umem->mpool);
        addr = UMEM2DESC(elem, xsk->umem->buffer);

        *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = addr;
    }

    xsk_ring_prod__submit(&xsk->umem->fq,
                          PROD_NUM_DESCS);
    return xsk;
}
413 | ||
/* Allocates a fresh umem region and creates an AF_XDP socket for
 * 'xdp_queue_id' on device 'ifindex'.  Also opportunistically sweeps any
 * retired umem pools first.  Returns NULL on failure with all partial
 * allocations released. */
static struct xsk_socket_info *
xsk_configure(int ifindex, int xdp_queue_id, enum afxdp_mode mode,
              bool use_need_wakeup, bool report_socket_failures)
{
    struct xsk_socket_info *xsk;
    struct xsk_umem_info *umem;
    void *bufs;

    netdev_afxdp_sweep_unused_pools(NULL);

    /* Umem memory region. */
    bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);
    memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);

    /* Create AF_XDP socket. */
    umem = xsk_configure_umem(bufs, NUM_FRAMES * FRAME_SIZE);
    if (!umem) {
        free_pagealign(bufs);
        return NULL;
    }

    VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);

    xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, mode,
                               use_need_wakeup, report_socket_failures);
    if (!xsk) {
        /* Clean up umem and xpacket pool. */
        if (xsk_umem__delete(umem->umem)) {
            VLOG_ERR("xsk_umem__delete failed.");
        }
        free_pagealign(bufs);
        umem_pool_cleanup(&umem->mpool);
        xpacket_pool_cleanup(&umem->xpool);
        free(umem);
    }
    return xsk;
}
451 | ||
/* Configures a single AF_XDP queue 'queue_id' on 'dev' in the given 'mode'
 * and stores the resulting socket in dev->xsks[queue_id].
 *
 * Returns 0 on success, -1 on failure (with dev->xsks[queue_id] cleared).
 * With 'report_socket_failures' false, failures log at debug level only. */
static int
xsk_configure_queue(struct netdev_linux *dev, int ifindex, int queue_id,
                    enum afxdp_mode mode, bool report_socket_failures)
{
    struct xsk_socket_info *xsk_info;

    VLOG_DBG("%s: configuring queue: %d, mode: %s, use-need-wakeup: %s.",
             netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name,
             dev->use_need_wakeup ? "true" : "false");
    xsk_info = xsk_configure(ifindex, queue_id, mode, dev->use_need_wakeup,
                             report_socket_failures);
    if (!xsk_info) {
        VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
             "%s: Failed to create AF_XDP socket on queue %d in %s mode.",
             netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name);
        dev->xsks[queue_id] = NULL;
        return -1;
    }
    dev->xsks[queue_id] = xsk_info;
    atomic_init(&xsk_info->tx_dropped, 0);
    xsk_info->outstanding_tx = 0;
    /* The fill ring was fully pre-populated in xsk_configure_socket(). */
    xsk_info->available_rx = PROD_NUM_DESCS;
    return 0;
}
476 | ||
477 | ||
0de1b425 WT |
/* Creates AF_XDP sockets for every RX queue of 'netdev' and allocates the
 * per-txq spinlocks.  In best-effort mode, queue 0 is first probed with
 * each mode from most to least preferable (zerocopy first), and the first
 * mode that works is used for the remaining queues.
 *
 * Returns 0 on success or EINVAL on failure (with everything torn down). */
static int
xsk_configure_all(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    int i, ifindex, n_rxq, n_txq;
    int qid = 0;

    ifindex = linux_get_ifindex(netdev_get_name(netdev));

    ovs_assert(dev->xsks == NULL);
    ovs_assert(dev->tx_locks == NULL);

    n_rxq = netdev_n_rxq(netdev);
    dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);

    if (dev->xdp_mode == OVS_AF_XDP_MODE_BEST_EFFORT) {
        /* Trying to configure first queue with different modes to
         * find the most suitable. */
        for (i = OVS_AF_XDP_MODE_NATIVE_ZC; i < OVS_AF_XDP_MODE_MAX; i++) {
            /* Only the very last candidate mode reports failures loudly. */
            if (!xsk_configure_queue(dev, ifindex, qid, i,
                                     i == OVS_AF_XDP_MODE_MAX - 1)) {
                dev->xdp_mode_in_use = i;
                VLOG_INFO("%s: %s XDP mode will be in use.",
                          netdev_get_name(netdev), xdp_modes[i].name);
                break;
            }
        }
        if (i == OVS_AF_XDP_MODE_MAX) {
            VLOG_ERR("%s: Failed to detect suitable XDP mode.",
                     netdev_get_name(netdev));
            goto err;
        }
        qid++;
    } else {
        dev->xdp_mode_in_use = dev->xdp_mode;
    }

    /* Configure remaining queues. */
    for (; qid < n_rxq; qid++) {
        if (xsk_configure_queue(dev, ifindex, qid,
                                dev->xdp_mode_in_use, true)) {
            VLOG_ERR("%s: Failed to create AF_XDP socket on queue %d.",
                     netdev_get_name(netdev), qid);
            goto err;
        }
    }

    n_txq = netdev_n_txq(netdev);
    dev->tx_locks = xzalloc_cacheline(n_txq * sizeof *dev->tx_locks);

    for (i = 0; i < n_txq; i++) {
        ovs_spin_init(&dev->tx_locks[i].lock);
    }

    return 0;

err:
    xsk_destroy_all(netdev);
    return EINVAL;
}
538 | ||
/* Closes an AF_XDP socket and retires its umem.  The umem's memory cannot
 * be freed immediately because some frames may still be in flight, so it
 * is parked on the 'unused_pools' list with a count of the frames known to
 * be lost in rings; a final sweep attempts immediate reclamation. */
static void
xsk_destroy(struct xsk_socket_info *xsk_info)
{
    struct xsk_umem *umem;
    struct unused_pool *pool;

    xsk_socket__delete(xsk_info->xsk);
    xsk_info->xsk = NULL;

    umem = xsk_info->umem->umem;
    if (xsk_umem__delete(umem)) {
        VLOG_ERR("xsk_umem__delete failed.");
    }

    pool = xzalloc(sizeof *pool);
    pool->umem_info = xsk_info->umem;
    pool->lost_in_rings = xsk_info->outstanding_tx + xsk_info->available_rx;

    ovs_mutex_lock(&unused_pools_mutex);
    ovs_list_push_back(&unused_pools, &pool->list_node);
    ovs_mutex_unlock(&unused_pools_mutex);

    free(xsk_info);

    netdev_afxdp_sweep_unused_pools(NULL);
}
565 | ||
/* Tears down every AF_XDP socket of 'netdev', detaches the XDP program
 * from the interface, and destroys the per-txq spinlocks.  Safe to call
 * when sockets or locks were never allocated (e.g. on setup failure). */
static void
xsk_destroy_all(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    int i, ifindex;

    if (dev->xsks) {
        for (i = 0; i < netdev_n_rxq(netdev); i++) {
            if (dev->xsks[i]) {
                xsk_destroy(dev->xsks[i]);
                dev->xsks[i] = NULL;
                VLOG_DBG("%s: Destroyed xsk[%d].", netdev_get_name(netdev), i);
            }
        }

        free(dev->xsks);
        dev->xsks = NULL;
    }

    VLOG_INFO("%s: Removing xdp program.", netdev_get_name(netdev));
    ifindex = linux_get_ifindex(netdev_get_name(netdev));
    xsk_remove_xdp_program(ifindex, dev->xdp_mode_in_use);

    if (dev->tx_locks) {
        for (i = 0; i < netdev_n_txq(netdev); i++) {
            ovs_spin_destroy(&dev->tx_locks[i].lock);
        }
        free_cacheline(dev->tx_locks);
        dev->tx_locks = NULL;
    }
}
597 | ||
0de1b425 WT |
/* Parses the AF_XDP options in 'args' ('n_rxq', 'xdp-mode',
 * 'use-need-wakeup'), validates them, and records them as the requested
 * configuration, triggering a reconfiguration only if something changed.
 *
 * Returns 0 on success or EINVAL for an out-of-range 'n_rxq' or an
 * unrecognized 'xdp-mode'. */
int
netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
                        char **errp OVS_UNUSED)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    const char *str_xdp_mode;
    enum afxdp_mode xdp_mode;
    bool need_wakeup;
    int new_n_rxq;

    ovs_mutex_lock(&dev->mutex);
    new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
    if (new_n_rxq > MAX_XSKQ) {
        ovs_mutex_unlock(&dev->mutex);
        VLOG_ERR("%s: Too big 'n_rxq' (%d > %d).",
                 netdev_get_name(netdev), new_n_rxq, MAX_XSKQ);
        return EINVAL;
    }

    /* Match 'xdp-mode' against the mode table; UNSPEC is intentionally
     * excluded from the search range. */
    str_xdp_mode = smap_get_def(args, "xdp-mode", "best-effort");
    for (xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT;
         xdp_mode < OVS_AF_XDP_MODE_MAX;
         xdp_mode++) {
        if (!strcasecmp(str_xdp_mode, xdp_modes[xdp_mode].name)) {
            break;
        }
    }
    if (xdp_mode == OVS_AF_XDP_MODE_MAX) {
        VLOG_ERR("%s: Incorrect xdp-mode (%s).",
                 netdev_get_name(netdev), str_xdp_mode);
        ovs_mutex_unlock(&dev->mutex);
        return EINVAL;
    }

    need_wakeup = smap_get_bool(args, "use-need-wakeup", NEED_WAKEUP_DEFAULT);
#ifndef HAVE_XDP_NEED_WAKEUP
    if (need_wakeup) {
        VLOG_WARN("XDP need_wakeup is not supported in libbpf.");
        need_wakeup = false;
    }
#endif

    /* Only request a reconfiguration when something actually changed. */
    if (dev->requested_n_rxq != new_n_rxq
        || dev->requested_xdp_mode != xdp_mode
        || dev->requested_need_wakeup != need_wakeup) {
        dev->requested_n_rxq = new_n_rxq;
        dev->requested_xdp_mode = xdp_mode;
        dev->requested_need_wakeup = need_wakeup;
        netdev_request_reconfigure(netdev);
    }
    ovs_mutex_unlock(&dev->mutex);
    return 0;
}
651 | ||
652 | int | |
653 | netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args) | |
654 | { | |
655 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
656 | ||
657 | ovs_mutex_lock(&dev->mutex); | |
658 | smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); | |
e8f56344 IM |
659 | smap_add_format(args, "xdp-mode", "%s", xdp_modes[dev->xdp_mode].name); |
660 | smap_add_format(args, "xdp-mode-in-use", "%s", | |
661 | xdp_modes[dev->xdp_mode_in_use].name); | |
e50547b5 WT |
662 | smap_add_format(args, "use-need-wakeup", "%s", |
663 | dev->use_need_wakeup ? "true" : "false"); | |
0de1b425 WT |
664 | ovs_mutex_unlock(&dev->mutex); |
665 | return 0; | |
666 | } | |
667 | ||
/* Applies the requested AF_XDP configuration: tears down and rebuilds all
 * sockets if n_rxq, xdp-mode, or use-need-wakeup changed (or sockets are
 * missing).  While allocating, the thread's NUMA memory policy is
 * temporarily set to prefer the netdev's NUMA node and restored afterward.
 *
 * Returns 0 on success or the error from xsk_configure_all(). */
int
netdev_afxdp_reconfigure(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
    struct bitmask *old_bm = NULL;
    int old_policy, numa_id;
    int err = 0;

    /* Allocate all the xsk related memory in the netdev's NUMA domain. */
    if (numa_available() != -1 && ovs_numa_get_n_numas() > 1) {
        numa_id = netdev_get_numa_id(netdev);
        if (numa_id != NETDEV_NUMA_UNSPEC) {
            old_bm = numa_allocate_nodemask();
            if (get_mempolicy(&old_policy, old_bm->maskp, old_bm->size + 1,
                              NULL, 0)) {
                VLOG_INFO("Failed to get NUMA memory policy: %s.",
                          ovs_strerror(errno));
                numa_bitmask_free(old_bm);
                old_bm = NULL;
            } else {
                numa_set_preferred(numa_id);
            }
        }
    }

    ovs_mutex_lock(&dev->mutex);

    /* Nothing to do if the applied configuration already matches the
     * requested one and the sockets exist. */
    if (netdev->n_rxq == dev->requested_n_rxq
        && dev->xdp_mode == dev->requested_xdp_mode
        && dev->use_need_wakeup == dev->requested_need_wakeup
        && dev->xsks) {
        goto out;
    }

    xsk_destroy_all(netdev);

    netdev->n_rxq = dev->requested_n_rxq;
    netdev->n_txq = netdev->n_rxq;

    dev->xdp_mode = dev->requested_xdp_mode;
    VLOG_INFO("%s: Setting XDP mode to %s.", netdev_get_name(netdev),
              xdp_modes[dev->xdp_mode].name);

    /* umem registration requires locked memory; lift the limit.  Failure
     * is not fatal here -- socket creation will fail later if it matters. */
    if (setrlimit(RLIMIT_MEMLOCK, &r)) {
        VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno));
    }
    dev->use_need_wakeup = dev->requested_need_wakeup;

    err = xsk_configure_all(netdev);
    if (err) {
        VLOG_ERR("%s: AF_XDP device reconfiguration failed.",
                 netdev_get_name(netdev));
    }
    netdev_change_seq_changed(netdev);
out:
    ovs_mutex_unlock(&dev->mutex);
    if (old_bm) {
        if (set_mempolicy(old_policy, old_bm->maskp, old_bm->size + 1)) {
            VLOG_WARN("Failed to restore NUMA memory policy: %s.",
                      ovs_strerror(errno));
            /* Can't restore correctly. Try to use localalloc as the most
             * likely default memory policy. */
            numa_set_localalloc();
        }
        numa_bitmask_free(old_bm);
    }
    return err;
}
737 | ||
0de1b425 | 738 | static void |
e8f56344 | 739 | xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode) |
0de1b425 | 740 | { |
e8f56344 | 741 | uint32_t flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST; |
37a24655 IM |
742 | uint32_t ret, prog_id = 0; |
743 | ||
744 | /* Check whether XDP program is loaded. */ | |
745 | ret = bpf_get_link_xdp_id(ifindex, &prog_id, flags); | |
746 | if (ret) { | |
747 | VLOG_ERR("Failed to get XDP prog id (%s)", ovs_strerror(errno)); | |
748 | return; | |
749 | } | |
750 | ||
751 | if (!prog_id) { | |
752 | VLOG_INFO("No XDP program is loaded at ifindex %d", ifindex); | |
753 | return; | |
754 | } | |
0de1b425 WT |
755 | |
756 | bpf_set_link_xdp_fd(ifindex, -1, flags); | |
757 | } | |
758 | ||
/* Fatal-signal path: forcibly detaches the XDP program from 'netdev' so
 * the interface is not left with a stale program after OVS dies. */
void
signal_remove_xdp(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    int ifindex;

    ifindex = linux_get_ifindex(netdev_get_name(netdev));

    VLOG_WARN("Force removing xdp program.");
    xsk_remove_xdp_program(ifindex, dev->xdp_mode_in_use);
}
770 | ||
771 | static struct dp_packet_afxdp * | |
772 | dp_packet_cast_afxdp(const struct dp_packet *d) | |
773 | { | |
774 | ovs_assert(d->source == DPBUF_AFXDP); | |
775 | return CONTAINER_OF(d, struct dp_packet_afxdp, packet); | |
776 | } | |
777 | ||
/* Tops up the kernel fill queue with a batch of free frames so RX can keep
 * flowing.  A no-op when the fill ring lacks room for a full batch or the
 * free-frame pool cannot supply one; if ring reservation fails after the
 * frames were popped, they are pushed back to the pool. */
static inline void
prepare_fill_queue(struct xsk_socket_info *xsk_info)
{
    struct xsk_umem_info *umem;
    void *elems[BATCH_SIZE];
    unsigned int idx_fq;
    int i, ret;

    umem = xsk_info->umem;

    if (xsk_prod_nb_free(&umem->fq, BATCH_SIZE) < BATCH_SIZE) {
        return;
    }

    ret = umem_elem_pop_n(&umem->mpool, BATCH_SIZE, elems);
    if (OVS_UNLIKELY(ret)) {
        return;
    }

    if (!xsk_ring_prod__reserve(&umem->fq, BATCH_SIZE, &idx_fq)) {
        /* Could not reserve ring slots: undo the pop and count the event. */
        umem_elem_push_n(&umem->mpool, BATCH_SIZE, elems);
        COVERAGE_INC(afxdp_fq_full);
        return;
    }

    for (i = 0; i < BATCH_SIZE; i++) {
        uint64_t index;
        void *elem;

        elem = elems[i];
        /* Fill ring entries are byte offsets into the umem buffer and must
         * be frame-aligned. */
        index = (uint64_t)((char *)elem - (char *)umem->buffer);
        ovs_assert((index & FRAME_SHIFT_MASK) == 0);
        *xsk_ring_prod__fill_addr(&umem->fq, idx_fq) = index;

        idx_fq++;
    }
    xsk_ring_prod__submit(&umem->fq, BATCH_SIZE);
    xsk_info->available_rx += BATCH_SIZE;
}
817 | ||
/* Receives up to BATCH_SIZE packets from the AF_XDP RX ring of the queue
 * behind 'rxq_' into 'batch'.  Frames are converted to dp_packets in place
 * (zero copy on the OVS side: the dp_packet data points into the umem).
 *
 * Returns 0 on success or EAGAIN when the socket is not set up or no
 * packets are available (kicking the kernel first if need_wakeup is set).
 * '*qfill', when non-null, is currently always set to 0. */
int
netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
                      int *qfill)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev = rx->up.netdev;
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct xsk_socket_info *xsk_info;
    struct xsk_umem_info *umem;
    uint32_t idx_rx = 0;
    int qid = rxq_->queue_id;
    unsigned int rcvd, i;

    xsk_info = dev->xsks[qid];
    if (!xsk_info || !xsk_info->xsk) {
        return EAGAIN;
    }

    /* Replenish the fill ring before consuming RX. */
    prepare_fill_queue(xsk_info);

    umem = xsk_info->umem;
    rx->fd = xsk_socket__fd(xsk_info->xsk);

    rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
    if (!rcvd) {
        xsk_rx_wakeup_if_needed(umem, netdev, rx->fd);
        return EAGAIN;
    }

    /* Setup a dp_packet batch from descriptors in RX queue. */
    for (i = 0; i < rcvd; i++) {
        struct dp_packet_afxdp *xpacket;
        const struct xdp_desc *desc;
        struct dp_packet *packet;
        uint64_t addr, index;
        uint32_t len;
        char *pkt;

        desc = xsk_ring_cons__rx_desc(&xsk_info->rx, idx_rx);
        addr = desc->addr;
        len = desc->len;

        pkt = xsk_umem__get_data(umem->buffer, addr);
        /* The frame index selects the pre-initialized metadata entry. */
        index = addr >> FRAME_SHIFT;
        xpacket = &umem->xpool.array[index];
        packet = &xpacket->packet;

        /* Initialize the struct dp_packet. */
        dp_packet_use_afxdp(packet, pkt,
                            FRAME_SIZE - FRAME_HEADROOM,
                            OVS_XDP_HEADROOM);
        dp_packet_set_size(packet, len);

        /* Add packet into batch, increase batch->count. */
        dp_packet_batch_add(batch, packet);

        idx_rx++;
    }
    /* Release the RX queue. */
    xsk_ring_cons__release(&xsk_info->rx, rcvd);
    xsk_info->available_rx -= rcvd;

    if (qfill) {
        /* TODO: return the number of remaining packets in the queue. */
        *qfill = 0;
    }
    return 0;
}
886 | ||
/* Kicks the kernel to transmit queued TX descriptors via a zero-length
 * sendto().  Skipped entirely when need_wakeup is in use and the kernel
 * did not request a wakeup.
 *
 * Returns 0 on success (including benign errors and exhausted retries),
 * or the errno for errors considered fatal for this socket (ENXIO,
 * ENOBUFS, EOPNOTSUPP). */
static inline int
kick_tx(struct xsk_socket_info *xsk_info, enum afxdp_mode mode,
        bool use_need_wakeup)
{
    int ret, retries;
    static const int KERNEL_TX_BATCH_SIZE = 16;

    if (use_need_wakeup && !xsk_tx_need_wakeup(xsk_info)) {
        return 0;
    }

    /* In all modes except native-with-zerocopy packet transmission is
     * synchronous, and the kernel xmits only TX_BATCH_SIZE(16) packets for a
     * single sendmsg syscall.
     * So, we have to kick the kernel (n_packets / 16) times to be sure that
     * all packets are transmitted. */
    retries = (mode != OVS_AF_XDP_MODE_NATIVE_ZC)
              ? xsk_info->outstanding_tx / KERNEL_TX_BATCH_SIZE
              : 0;
kick_retry:
    /* This causes system call into kernel's xsk_sendmsg, and xsk_generic_xmit
     * (generic and native modes) or xsk_zc_xmit (native-with-zerocopy mode).
     */
    ret = sendto(xsk_socket__fd(xsk_info->xsk), NULL, 0, MSG_DONTWAIT,
                 NULL, 0);
    if (ret < 0) {
        if (retries-- && errno == EAGAIN) {
            goto kick_retry;
        }
        if (errno == ENXIO || errno == ENOBUFS || errno == EOPNOTSUPP) {
            return errno;
        }
    }
    /* No error, or EBUSY, or too many retries on EAGAIN. */
    return 0;
}
923 | ||
924 | void | |
925 | free_afxdp_buf(struct dp_packet *p) | |
926 | { | |
927 | struct dp_packet_afxdp *xpacket; | |
928 | uintptr_t addr; | |
929 | ||
930 | xpacket = dp_packet_cast_afxdp(p); | |
931 | if (xpacket->mpool) { | |
932 | void *base = dp_packet_base(p); | |
933 | ||
934 | addr = (uintptr_t)base & (~FRAME_SHIFT_MASK); | |
935 | umem_elem_push(xpacket->mpool, (void *)addr); | |
936 | } | |
937 | } | |
938 | ||
939 | static void | |
940 | free_afxdp_buf_batch(struct dp_packet_batch *batch) | |
941 | { | |
942 | struct dp_packet_afxdp *xpacket = NULL; | |
943 | struct dp_packet *packet; | |
944 | void *elems[BATCH_SIZE]; | |
945 | uintptr_t addr; | |
946 | ||
947 | DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { | |
948 | void *base; | |
949 | ||
950 | xpacket = dp_packet_cast_afxdp(packet); | |
951 | base = dp_packet_base(packet); | |
952 | addr = (uintptr_t)base & (~FRAME_SHIFT_MASK); | |
953 | elems[i] = (void *)addr; | |
954 | } | |
940ac2ce | 955 | umem_elem_push_n(xpacket->mpool, dp_packet_batch_size(batch), elems); |
0de1b425 WT |
956 | dp_packet_batch_init(batch); |
957 | } | |
958 | ||
959 | static inline bool | |
960 | check_free_batch(struct dp_packet_batch *batch) | |
961 | { | |
962 | struct umem_pool *first_mpool = NULL; | |
963 | struct dp_packet_afxdp *xpacket; | |
964 | struct dp_packet *packet; | |
965 | ||
966 | DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { | |
967 | if (packet->source != DPBUF_AFXDP) { | |
968 | return false; | |
969 | } | |
970 | xpacket = dp_packet_cast_afxdp(packet); | |
971 | if (i == 0) { | |
972 | first_mpool = xpacket->mpool; | |
973 | continue; | |
974 | } | |
975 | if (xpacket->mpool != first_mpool) { | |
976 | return false; | |
977 | } | |
978 | } | |
979 | /* All packets are DPBUF_AFXDP and from the same mpool. */ | |
980 | return true; | |
981 | } | |
982 | ||
/* Reaps completed transmissions from the umem completion queue (CQ) of
 * 'xsk_info' and recycles their frames back to the umem memory pool in
 * BATCH_SIZE chunks, decrementing outstanding_tx accordingly.
 *
 * A CQ entry whose address is UINT64_MAX has already been recycled by an
 * earlier pass and is skipped (counted by the afxdp_cq_skip coverage
 * counter). */
static inline void
afxdp_complete_tx(struct xsk_socket_info *xsk_info)
{
    void *elems_push[BATCH_SIZE];
    struct xsk_umem_info *umem;
    uint32_t idx_cq = 0;
    int tx_to_free = 0;
    int tx_done, j;

    umem = xsk_info->umem;
    /* Peek up to CONS_NUM_DESCS completed descriptors; 'idx_cq' is set to
     * the first entry's ring index. */
    tx_done = xsk_ring_cons__peek(&umem->cq, CONS_NUM_DESCS, &idx_cq);

    /* Recycle back to umem pool. */
    for (j = 0; j < tx_done; j++) {
        uint64_t *addr;
        void *elem;

        addr = (uint64_t *)xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
        if (*addr != UINT64_MAX) {
            /* Translate the umem offset back to a virtual address. */
            elem = ALIGNED_CAST(void *, (char *)umem->buffer + *addr);
            elems_push[tx_to_free] = elem;
            *addr = UINT64_MAX; /* Mark as pushed. */
            tx_to_free++;
        } else {
            /* The elem has been pushed already. */
            COVERAGE_INC(afxdp_cq_skip);
        }

        /* Flush the staged frames when the staging array is full or this
         * is the last completed descriptor. */
        if (tx_to_free == BATCH_SIZE || j == tx_done - 1) {
            umem_elem_push_n(&umem->mpool, tx_to_free, elems_push);
            xsk_info->outstanding_tx -= tx_to_free;
            tx_to_free = 0;
        }
    }

    if (tx_done > 0) {
        /* Advance the CQ consumer index past everything we peeked. */
        xsk_ring_cons__release(&umem->cq, tx_done);
    } else {
        COVERAGE_INC(afxdp_cq_empty);
    }
}
1024 | ||
/* Transmits 'batch' on TX queue 'qid' of AF_XDP device 'netdev'.
 *
 * Packets are copied into freshly popped umem frames, described on the TX
 * ring, and the kernel is kicked to send them.  The input batch is always
 * consumed: either its buffers are bulk-recycled (when the whole batch is
 * DPBUF_AFXDP from one pool) or it is deleted packet by packet.
 *
 * Returns 0 on success or ENOMEM when the umem pool or the TX ring cannot
 * hold the batch (the packets are then dropped and accounted in
 * tx_dropped).  Caller must hold the per-queue TX lock if the queue is
 * shared between threads. */
static inline int
__netdev_afxdp_batch_send(struct netdev *netdev, int qid,
                          struct dp_packet_batch *batch)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct xsk_socket_info *xsk_info;
    void *elems_pop[BATCH_SIZE];
    struct xsk_umem_info *umem;
    struct dp_packet *packet;
    bool free_batch = false;
    unsigned long orig;
    uint32_t idx = 0;
    int error = 0;
    int ret;

    xsk_info = dev->xsks[qid];
    if (!xsk_info || !xsk_info->xsk) {
        /* Queue not (yet) configured; silently consume the batch. */
        goto out;
    }

    /* Reclaim frames of previously completed transmissions first, to make
     * room in the umem pool for this batch. */
    afxdp_complete_tx(xsk_info);

    /* Decide up front whether the batch can be bulk-recycled on exit. */
    free_batch = check_free_batch(batch);

    umem = xsk_info->umem;
    ret = umem_elem_pop_n(&umem->mpool, dp_packet_batch_size(batch),
                          elems_pop);
    if (OVS_UNLIKELY(ret)) {
        /* Not enough free umem frames: drop the whole batch. */
        atomic_add_relaxed(&xsk_info->tx_dropped, dp_packet_batch_size(batch),
                           &orig);
        VLOG_WARN_RL(&rl, "%s: send failed due to exhausted memory pool.",
                     netdev_get_name(netdev));
        error = ENOMEM;
        goto out;
    }

    /* Make sure we have enough TX descs. */
    ret = xsk_ring_prod__reserve(&xsk_info->tx, dp_packet_batch_size(batch),
                                 &idx);
    if (OVS_UNLIKELY(ret == 0)) {
        /* TX ring full: return the popped frames, drop the batch, and kick
         * the kernel so the ring drains for the next attempt. */
        umem_elem_push_n(&umem->mpool, dp_packet_batch_size(batch), elems_pop);
        atomic_add_relaxed(&xsk_info->tx_dropped, dp_packet_batch_size(batch),
                           &orig);
        COVERAGE_INC(afxdp_tx_full);
        afxdp_complete_tx(xsk_info);
        kick_tx(xsk_info, dev->xdp_mode_in_use, dev->use_need_wakeup);
        error = ENOMEM;
        goto out;
    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
        uint64_t index;
        void *elem;

        elem = elems_pop[i];
        /* Copy the packet to the umem we just pop from umem pool.
         * TODO: avoid this copy if the packet and the pop umem
         * are located in the same umem.
         */
        memcpy(elem, dp_packet_data(packet), dp_packet_size(packet));

        /* TX descriptors carry umem-relative offsets, not pointers. */
        index = (uint64_t)((char *)elem - (char *)umem->buffer);
        xsk_ring_prod__tx_desc(&xsk_info->tx, idx + i)->addr = index;
        xsk_ring_prod__tx_desc(&xsk_info->tx, idx + i)->len
            = dp_packet_size(packet);
    }
    /* Publish the descriptors to the kernel and account them as in flight. */
    xsk_ring_prod__submit(&xsk_info->tx, dp_packet_batch_size(batch));
    xsk_info->outstanding_tx += dp_packet_batch_size(batch);

    ret = kick_tx(xsk_info, dev->xdp_mode_in_use, dev->use_need_wakeup);
    if (OVS_UNLIKELY(ret)) {
        VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.",
                     netdev_get_name(netdev), ovs_strerror(ret));
    }

out:
    /* The batch is consumed on every path, success or failure. */
    if (free_batch) {
        free_afxdp_buf_batch(batch);
    } else {
        dp_packet_delete_batch(batch, true);
    }

    return error;
}
1109 | ||
1110 | int | |
1111 | netdev_afxdp_batch_send(struct netdev *netdev, int qid, | |
1112 | struct dp_packet_batch *batch, | |
1113 | bool concurrent_txq) | |
1114 | { | |
1115 | struct netdev_linux *dev; | |
1116 | int ret; | |
1117 | ||
1118 | if (concurrent_txq) { | |
1119 | dev = netdev_linux_cast(netdev); | |
1120 | qid = qid % netdev_n_txq(netdev); | |
1121 | ||
28d05016 | 1122 | ovs_spin_lock(&dev->tx_locks[qid].lock); |
0de1b425 | 1123 | ret = __netdev_afxdp_batch_send(netdev, qid, batch); |
28d05016 | 1124 | ovs_spin_unlock(&dev->tx_locks[qid].lock); |
0de1b425 WT |
1125 | } else { |
1126 | ret = __netdev_afxdp_batch_send(netdev, qid, batch); | |
1127 | } | |
1128 | ||
1129 | return ret; | |
1130 | } | |
1131 | ||
/* netdev_class rxq_construct hook for AF_XDP devices.
 *
 * Intentionally a no-op: XSK sockets and umems are created during
 * reconfiguration, not at rxq construction time.  Always returns 0. */
int
netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED)
{
    /* Done at reconfigure. */
    return 0;
}
1138 | ||
/* netdev_class rxq_destruct hook for AF_XDP devices.
 *
 * Intentionally a no-op: socket teardown happens in device destruct /
 * reconfigure paths, not per-rxq. */
void
netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED)
{
    /* Nothing. */
}
1144 | ||
7bf075d9 WT |
1145 | static int |
1146 | libbpf_print(enum libbpf_print_level level, | |
1147 | const char *format, va_list args) | |
1148 | { | |
1149 | if (level == LIBBPF_WARN) { | |
1150 | vlog_valist(&this_module, VLL_WARN, format, args); | |
1151 | } else if (level == LIBBPF_INFO) { | |
1152 | vlog_valist(&this_module, VLL_INFO, format, args); | |
1153 | } else { | |
1154 | vlog_valist(&this_module, VLL_DBG, format, args); | |
1155 | } | |
1156 | return 0; | |
1157 | } | |
1158 | ||
/* One-time AF_XDP subsystem initialization: installs the vlog-backed
 * print callback into libbpf so its diagnostics appear in the OVS log.
 * Always returns 0. */
int netdev_afxdp_init(void)
{
    libbpf_set_print(libbpf_print);
    return 0;
}
1164 | ||
f627cf1d IM |
/* Constructs an AF_XDP netdev on top of the common netdev-linux device.
 *
 * Sets all AF_XDP state to "unconfigured" defaults and requests a
 * reconfiguration, which is where queues, XDP mode, and sockets are
 * actually set up.  Returns 0 on success or the error from
 * netdev_linux_construct(). */
int
netdev_afxdp_construct(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    int ret;

    /* Configure common netdev-linux first. */
    ret = netdev_linux_construct(netdev);
    if (ret) {
        return ret;
    }

    /* Queues should not be used before the first reconfiguration. Clearing. */
    netdev->n_rxq = 0;
    netdev->n_txq = 0;
    /* No XDP mode is active until reconfigure picks one. */
    dev->xdp_mode = OVS_AF_XDP_MODE_UNSPEC;
    dev->xdp_mode_in_use = OVS_AF_XDP_MODE_UNSPEC;

    /* Requested settings applied at the next reconfiguration. */
    dev->requested_n_rxq = NR_QUEUE;
    dev->requested_xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT;
    dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT;

    dev->xsks = NULL;
    dev->tx_locks = NULL;

    netdev_request_reconfigure(netdev);
    return 0;
}
1193 | ||
0de1b425 WT |
/* Destroys an AF_XDP netdev: tears down all XSK sockets/umems and the
 * device mutex.  On the first ever destruct it also registers a
 * fatal-signal hook that sweeps unused umem pools on process exit. */
void
netdev_afxdp_destruct(struct netdev *netdev)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    struct netdev_linux *dev = netdev_linux_cast(netdev);

    /* Register the cleanup hook exactly once, process-wide. */
    if (ovsthread_once_start(&once)) {
        fatal_signal_add_hook(netdev_afxdp_sweep_unused_pools,
                              NULL, NULL, true);
        ovsthread_once_done(&once);
    }

    /* Note: tc is by-passed when using drv-mode, but when using
     * skb-mode, we might need to clean up tc. */

    xsk_destroy_all(netdev);
    ovs_mutex_destroy(&dev->mutex);
}
1212 | ||
52b5a5c0 EC |
1213 | int |
1214 | netdev_afxdp_verify_mtu_size(const struct netdev *netdev OVS_UNUSED, int mtu) | |
1215 | { | |
1216 | /* | |
1217 | * If a device is used in xdpmode skb, no driver-specific MTU size is | |
1218 | * checked and any value is allowed resulting in packet drops. | |
1219 | * This check will verify the maximum supported value based on the | |
1220 | * buffer size allocated and the additional headroom required. | |
1221 | */ | |
1222 | if (mtu > (FRAME_SIZE - OVS_XDP_HEADROOM - | |
1223 | XDP_PACKET_HEADROOM - VLAN_ETH_HEADER_LEN)) { | |
1224 | return EINVAL; | |
1225 | } | |
1226 | ||
1227 | return 0; | |
1228 | } | |
1229 | ||
d560bc1b IM |
/* Fills 'custom_stats' with per-XSK-queue XDP_STATISTICS counters
 * (rx_dropped, rx_invalid_descs, tx_invalid_descs) for every configured rx
 * queue of 'netdev'.  Counter names are "xsk_queue_<i>_<stat>".
 *
 * Allocates custom_stats->counters (ownership passes to the caller) and
 * sets custom_stats->size to the number of counters actually filled;
 * queues without a socket, or whose getsockopt() fails, are skipped.
 * Always returns 0. */
int
netdev_afxdp_get_custom_stats(const struct netdev *netdev,
                              struct netdev_custom_stats *custom_stats)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct xsk_socket_info *xsk_info;
    struct xdp_statistics stat;
    uint32_t i, c = 0;
    socklen_t optlen;

    ovs_mutex_lock(&dev->mutex);

/* X-macro list of the xdp_statistics fields we export.  Expanding
 * XDP_CSTATS with different XDP_CSTAT definitions below yields first the
 * count of stats, then the code that formats and stores each one. */
#define XDP_CSTATS \
    XDP_CSTAT(rx_dropped) \
    XDP_CSTAT(rx_invalid_descs) \
    XDP_CSTAT(tx_invalid_descs)

/* Expansion #1: each stat contributes "+ 1", giving the stat count. */
#define XDP_CSTAT(NAME) + 1
    enum { N_XDP_CSTATS = XDP_CSTATS };
#undef XDP_CSTAT

    custom_stats->counters = xcalloc(netdev_n_rxq(netdev) * N_XDP_CSTATS,
                                     sizeof *custom_stats->counters);

    /* Account the stats for each xsk. */
    for (i = 0; i < netdev_n_rxq(netdev); i++) {
        xsk_info = dev->xsks[i];
        optlen = sizeof stat;

        if (xsk_info && !getsockopt(xsk_socket__fd(xsk_info->xsk), SOL_XDP,
                                    XDP_STATISTICS, &stat, &optlen)) {
/* Expansion #2: name and record one counter per stat field. */
#define XDP_CSTAT(NAME) \
            snprintf(custom_stats->counters[c].name, \
                     NETDEV_CUSTOM_STATS_NAME_SIZE, \
                     "xsk_queue_%d_" #NAME, i); \
            custom_stats->counters[c++].value = stat.NAME;
            XDP_CSTATS;
#undef XDP_CSTAT
        }
    }
    custom_stats->size = c;
    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
1275 | ||
0de1b425 WT |
/* Fills 'stats' for an AF_XDP 'netdev' from the kernel netdev counters
 * obtained over netlink, then adds the per-XSK tx_dropped counts that OVS
 * tracks itself.
 *
 * Note the mixed semantics below: packet/byte counts overwrite the caller's
 * fields, while all error/drop counters are accumulated (+=) into them.
 * Returns 0 on success or the netlink error (in which case 'stats' is left
 * untouched). */
int
netdev_afxdp_get_stats(const struct netdev *netdev,
                       struct netdev_stats *stats)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct xsk_socket_info *xsk_info;
    struct netdev_stats dev_stats;
    int error, i;

    ovs_mutex_lock(&dev->mutex);

    error = get_stats_via_netlink(netdev, &dev_stats);
    if (error) {
        VLOG_WARN_RL(&rl, "%s: Error getting AF_XDP statistics.",
                     netdev_get_name(netdev));
    } else {
        /* Use kernel netdev's packet and byte counts. */
        stats->rx_packets = dev_stats.rx_packets;
        stats->rx_bytes = dev_stats.rx_bytes;
        stats->tx_packets = dev_stats.tx_packets;
        stats->tx_bytes = dev_stats.tx_bytes;

        /* Error and drop counters are accumulated on top of whatever the
         * caller already gathered. */
        stats->rx_errors += dev_stats.rx_errors;
        stats->tx_errors += dev_stats.tx_errors;
        stats->rx_dropped += dev_stats.rx_dropped;
        stats->tx_dropped += dev_stats.tx_dropped;
        stats->multicast += dev_stats.multicast;
        stats->collisions += dev_stats.collisions;
        stats->rx_length_errors += dev_stats.rx_length_errors;
        stats->rx_over_errors += dev_stats.rx_over_errors;
        stats->rx_crc_errors += dev_stats.rx_crc_errors;
        stats->rx_frame_errors += dev_stats.rx_frame_errors;
        stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
        stats->rx_missed_errors += dev_stats.rx_missed_errors;
        stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
        stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
        stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
        stats->tx_window_errors += dev_stats.tx_window_errors;

        /* Account the dropped in each xsk. */
        for (i = 0; i < netdev_n_rxq(netdev); i++) {
            xsk_info = dev->xsks[i];
            if (xsk_info) {
                uint64_t tx_dropped;

                atomic_read_relaxed(&xsk_info->tx_dropped, &tx_dropped);
                stats->tx_dropped += tx_dropped;
            }
        }
    }
    ovs_mutex_unlock(&dev->mutex);

    return error;
}