]>
Commit | Line | Data |
---|---|---|
0de1b425 WT |
1 | /* |
2 | * Copyright (c) 2018, 2019 Nicira, Inc. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #include <config.h> | |
18 | ||
19 | #include "netdev-linux-private.h" | |
20 | #include "netdev-linux.h" | |
21 | #include "netdev-afxdp.h" | |
22 | #include "netdev-afxdp-pool.h" | |
23 | ||
24 | #include <errno.h> | |
25 | #include <inttypes.h> | |
26 | #include <linux/rtnetlink.h> | |
27 | #include <linux/if_xdp.h> | |
28 | #include <net/if.h> | |
e50547b5 | 29 | #include <poll.h> |
0de1b425 WT |
30 | #include <stdlib.h> |
31 | #include <sys/resource.h> | |
32 | #include <sys/socket.h> | |
33 | #include <sys/types.h> | |
34 | #include <unistd.h> | |
35 | ||
36 | #include "coverage.h" | |
37 | #include "dp-packet.h" | |
38 | #include "dpif-netdev.h" | |
39 | #include "fatal-signal.h" | |
40 | #include "openvswitch/compiler.h" | |
41 | #include "openvswitch/dynamic-string.h" | |
42 | #include "openvswitch/list.h" | |
28d05016 | 43 | #include "openvswitch/thread.h" |
0de1b425 WT |
44 | #include "openvswitch/vlog.h" |
45 | #include "packets.h" | |
46 | #include "socket-util.h" | |
47 | #include "util.h" | |
48 | ||
49 | #ifndef SOL_XDP | |
50 | #define SOL_XDP 283 | |
51 | #endif | |
52 | ||
53 | COVERAGE_DEFINE(afxdp_cq_empty); | |
54 | COVERAGE_DEFINE(afxdp_fq_full); | |
55 | COVERAGE_DEFINE(afxdp_tx_full); | |
56 | COVERAGE_DEFINE(afxdp_cq_skip); | |
57 | ||
58 | VLOG_DEFINE_THIS_MODULE(netdev_afxdp); | |
59 | ||
60 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); | |
61 | ||
62 | #define MAX_XSKQ 16 | |
63 | #define FRAME_HEADROOM XDP_PACKET_HEADROOM | |
64 | #define OVS_XDP_HEADROOM 128 | |
65 | #define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE | |
66 | #define FRAME_SHIFT XSK_UMEM__DEFAULT_FRAME_SHIFT | |
67 | #define FRAME_SHIFT_MASK ((1 << FRAME_SHIFT) - 1) | |
68 | ||
69 | #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS | |
70 | #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS | |
71 | ||
e50547b5 WT |
72 | #ifdef HAVE_XDP_NEED_WAKEUP |
73 | #define NEED_WAKEUP_DEFAULT true | |
74 | #else | |
75 | #define NEED_WAKEUP_DEFAULT false | |
76 | #endif | |
77 | ||
0de1b425 WT |
78 | /* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets |
79 | * still on processing in threads. Number of packets currently in OVS | |
80 | * processing is hard to estimate because it depends on number of ports. | |
81 | * Setting NUM_FRAMES twice as large than total of ring sizes should be | |
82 | * enough for most corner cases. | |
83 | */ | |
84 | #define NUM_FRAMES (4 * (PROD_NUM_DESCS + CONS_NUM_DESCS)) | |
85 | #define BATCH_SIZE NETDEV_MAX_BURST | |
86 | ||
87 | BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES)); | |
88 | BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS); | |
89 | ||
90 | #define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base)) | |
91 | ||
92 | static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id, | |
e8f56344 IM |
93 | enum afxdp_mode mode, |
94 | bool use_need_wakeup, | |
95 | bool report_socket_failures); | |
96 | static void xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode); | |
0de1b425 WT |
97 | static void xsk_destroy(struct xsk_socket_info *xsk); |
98 | static int xsk_configure_all(struct netdev *netdev); | |
99 | static void xsk_destroy_all(struct netdev *netdev); | |
100 | ||
e8f56344 IM |
101 | static struct { |
102 | const char *name; | |
103 | uint32_t bind_flags; | |
104 | uint32_t xdp_flags; | |
105 | } xdp_modes[] = { | |
106 | [OVS_AF_XDP_MODE_UNSPEC] = { | |
107 | .name = "unspecified", | |
108 | .bind_flags = 0, | |
109 | .xdp_flags = 0, | |
110 | }, | |
111 | [OVS_AF_XDP_MODE_BEST_EFFORT] = { | |
112 | .name = "best-effort", | |
113 | .bind_flags = 0, | |
114 | .xdp_flags = 0, | |
115 | }, | |
116 | [OVS_AF_XDP_MODE_NATIVE_ZC] = { | |
117 | .name = "native-with-zerocopy", | |
118 | .bind_flags = XDP_ZEROCOPY, | |
119 | .xdp_flags = XDP_FLAGS_DRV_MODE, | |
120 | }, | |
121 | [OVS_AF_XDP_MODE_NATIVE] = { | |
122 | .name = "native", | |
123 | .bind_flags = XDP_COPY, | |
124 | .xdp_flags = XDP_FLAGS_DRV_MODE, | |
125 | }, | |
126 | [OVS_AF_XDP_MODE_GENERIC] = { | |
127 | .name = "generic", | |
128 | .bind_flags = XDP_COPY, | |
129 | .xdp_flags = XDP_FLAGS_SKB_MODE, | |
130 | }, | |
131 | }; | |
132 | ||
0de1b425 WT |
133 | struct unused_pool { |
134 | struct xsk_umem_info *umem_info; | |
135 | int lost_in_rings; /* Number of packets left in tx, rx, cq and fq. */ | |
136 | struct ovs_list list_node; | |
137 | }; | |
138 | ||
139 | static struct ovs_mutex unused_pools_mutex = OVS_MUTEX_INITIALIZER; | |
140 | static struct ovs_list unused_pools OVS_GUARDED_BY(unused_pools_mutex) = | |
141 | OVS_LIST_INITIALIZER(&unused_pools); | |
142 | ||
143 | struct xsk_umem_info { | |
144 | struct umem_pool mpool; | |
145 | struct xpacket_pool xpool; | |
146 | struct xsk_ring_prod fq; | |
147 | struct xsk_ring_cons cq; | |
148 | struct xsk_umem *umem; | |
149 | void *buffer; | |
150 | }; | |
151 | ||
152 | struct xsk_socket_info { | |
153 | struct xsk_ring_cons rx; | |
154 | struct xsk_ring_prod tx; | |
155 | struct xsk_umem_info *umem; | |
156 | struct xsk_socket *xsk; | |
157 | uint32_t outstanding_tx; /* Number of descriptors filled in tx and cq. */ | |
158 | uint32_t available_rx; /* Number of descriptors filled in rx and fq. */ | |
159 | atomic_uint64_t tx_dropped; | |
160 | }; | |
161 | ||
28d05016 IM |
162 | struct netdev_afxdp_tx_lock { |
163 | /* Padding to make netdev_afxdp_tx_lock exactly one cache line long. */ | |
164 | PADDED_MEMBERS(CACHE_LINE_SIZE, | |
165 | struct ovs_spin lock; | |
166 | ); | |
167 | }; | |
168 | ||
#ifdef HAVE_XDP_NEED_WAKEUP
/* Kicks the kernel to refill the RX path when the umem FILL ring carries
 * the "need wakeup" flag.  No-op if 'use-need-wakeup' is disabled on
 * 'netdev'.  The wakeup itself is a zero-timeout poll() on the AF_XDP
 * socket fd 'fd'; only a poll failure is reported (rate-limited). */
static inline void
xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem,
                        struct netdev *netdev, int fd)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct pollfd pfd;
    int ret;

    if (!dev->use_need_wakeup) {
        return;
    }

    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
        pfd.fd = fd;
        pfd.events = POLLIN;

        ret = poll(&pfd, 1, 0);
        if (OVS_UNLIKELY(ret < 0)) {
            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
                         netdev_get_name(netdev),
                         ovs_strerror(errno));
        }
    }
}

/* Returns true if the kernel requested a wakeup on the TX ring of
 * 'xsk_info', i.e. the caller should kick transmission via a syscall. */
static inline bool
xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
{
    return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
}

#else /* !HAVE_XDP_NEED_WAKEUP */
/* Fallback when libbpf lacks need_wakeup support: RX wakeup does nothing. */
static inline void
xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED,
                        struct netdev *netdev OVS_UNUSED,
                        int fd OVS_UNUSED)
{
    /* Nothing. */
}

/* Fallback: always report that TX needs a kick, so the caller issues the
 * sendto() unconditionally. */
static inline bool
xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
{
    return true;
}
#endif /* HAVE_XDP_NEED_WAKEUP */
216 | ||
0de1b425 WT |
/* Releases all memory belonging to 'pool': the page-aligned umem buffer,
 * the free-frame pool, the dp_packet metadata pool, and the xsk_umem_info
 * struct itself.  The caller must guarantee that neither OVS nor the
 * kernel still references any frame of this pool. */
static void
netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
{
    /* Free the packet buffer. */
    free_pagealign(pool->umem_info->buffer);

    /* Cleanup umem pool. */
    umem_pool_cleanup(&pool->umem_info->mpool);

    /* Cleanup metadata pool. */
    xpacket_pool_cleanup(&pool->umem_info->xpool);

    free(pool->umem_info);
}
231 | ||
/* Walks the global 'unused_pools' list and frees every retired umem pool
 * whose frames have all returned to its mempool.  'lost_in_rings' counts
 * frames that were still in the tx/rx/cq/fq rings when the socket was
 * closed; once 'count + lost_in_rings == NUM_FRAMES' nothing outside the
 * mempool holds a frame, so the pool can be destroyed.  'aux' is unused. */
static void
netdev_afxdp_sweep_unused_pools(void *aux OVS_UNUSED)
{
    struct unused_pool *pool, *next;
    unsigned int count;

    ovs_mutex_lock(&unused_pools_mutex);
    LIST_FOR_EACH_SAFE (pool, next, list_node, &unused_pools) {

        count = umem_pool_count(&pool->umem_info->mpool);
        ovs_assert(count + pool->lost_in_rings <= NUM_FRAMES);

        if (count + pool->lost_in_rings == NUM_FRAMES) {
            /* OVS doesn't use this memory pool anymore. Kernel doesn't
             * use it since closing the xdp socket. So, it's safe to free
             * the pool now. */
            VLOG_DBG("Freeing umem pool at 0x%"PRIxPTR,
                     (uintptr_t) pool->umem_info);
            ovs_list_remove(&pool->list_node);
            netdev_afxdp_cleanup_unused_pool(pool);
            free(pool);
        }
    }
    ovs_mutex_unlock(&unused_pools_mutex);
}
257 | ||
/* Creates an AF_XDP umem over 'buffer' ('size' bytes) and builds the two
 * OVS-side pools on top of it: 'mpool', a pool of free frame addresses,
 * and 'xpool', per-frame dp_packet metadata.  Every one of the NUM_FRAMES
 * frames starts out in 'mpool'.
 *
 * Returns the new xsk_umem_info, or NULL on failure.  On failure 'buffer'
 * itself is NOT freed -- the caller still owns it. */
static struct xsk_umem_info *
xsk_configure_umem(void *buffer, uint64_t size)
{
    struct xsk_umem_config uconfig;
    struct xsk_umem_info *umem;
    int ret;
    int i;

    umem = xzalloc(sizeof *umem);

    /* Zero the whole config first: xsk_umem_config may have more fields
     * (e.g. flags) than are explicitly set below. */
    memset(&uconfig, 0, sizeof uconfig);
    uconfig.fill_size = PROD_NUM_DESCS;
    uconfig.comp_size = CONS_NUM_DESCS;
    uconfig.frame_size = FRAME_SIZE;
    uconfig.frame_headroom = OVS_XDP_HEADROOM;

    ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
                           &uconfig);
    if (ret) {
        VLOG_ERR("xsk_umem__create failed: %s.", ovs_strerror(errno));
        free(umem);
        return NULL;
    }

    umem->buffer = buffer;

    /* Set-up umem pool. */
    if (umem_pool_init(&umem->mpool, NUM_FRAMES) < 0) {
        VLOG_ERR("umem_pool_init failed");
        if (xsk_umem__delete(umem->umem)) {
            VLOG_ERR("xsk_umem__delete failed");
        }
        free(umem);
        return NULL;
    }

    /* Push every frame address (in reverse order, so index 0 is popped
     * first) into the free-frame pool. */
    for (i = NUM_FRAMES - 1; i >= 0; i--) {
        void *elem;

        elem = ALIGNED_CAST(void *, (char *)umem->buffer + i * FRAME_SIZE);
        umem_elem_push(&umem->mpool, elem);
    }

    /* Set-up metadata. */
    if (xpacket_pool_init(&umem->xpool, NUM_FRAMES) < 0) {
        VLOG_ERR("xpacket_pool_init failed");
        umem_pool_cleanup(&umem->mpool);
        if (xsk_umem__delete(umem->umem)) {
            VLOG_ERR("xsk_umem__delete failed");
        }
        free(umem);
        return NULL;
    }

    VLOG_DBG("%s: xpacket pool from %p to %p", __func__,
             umem->xpool.array,
             (char *)umem->xpool.array +
             NUM_FRAMES * sizeof(struct dp_packet_afxdp));

    /* Pre-initialize the per-frame metadata: each dp_packet remembers its
     * mempool and is permanently marked as AF_XDP-sourced. */
    for (i = NUM_FRAMES - 1; i >= 0; i--) {
        struct dp_packet_afxdp *xpacket;
        struct dp_packet *packet;

        xpacket = &umem->xpool.array[i];
        xpacket->mpool = &umem->mpool;

        packet = &xpacket->packet;
        packet->source = DPBUF_AFXDP;
    }

    return umem;
}
330 | ||
/* Creates an AF_XDP socket bound to 'queue_id' of the interface with
 * 'ifindex', using the already-created 'umem'.  Bind and XDP flags are
 * taken from the xdp_modes[] table for 'mode'.  After the socket is bound,
 * verifies that an XDP program is attached and pre-populates the FILL ring
 * with PROD_NUM_DESCS frames taken from the umem mempool.
 *
 * Returns the new xsk_socket_info, or NULL on failure (socket creation
 * failures are logged at ERR or DBG level depending on
 * 'report_socket_failures'). */
static struct xsk_socket_info *
xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
                     uint32_t queue_id, enum afxdp_mode mode,
                     bool use_need_wakeup, bool report_socket_failures)
{
    struct xsk_socket_config cfg;
    struct xsk_socket_info *xsk;
    char devname[IF_NAMESIZE];
    uint32_t idx = 0, prog_id;
    int ret;
    int i;

    xsk = xzalloc(sizeof *xsk);
    xsk->umem = umem;
    cfg.rx_size = CONS_NUM_DESCS;
    cfg.tx_size = PROD_NUM_DESCS;
    cfg.libbpf_flags = 0;
    cfg.bind_flags = xdp_modes[mode].bind_flags;
    cfg.xdp_flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST;

#ifdef HAVE_XDP_NEED_WAKEUP
    if (use_need_wakeup) {
        cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
    }
#endif

    if (if_indextoname(ifindex, devname) == NULL) {
        VLOG_ERR("ifindex %d to devname failed (%s)",
                 ifindex, ovs_strerror(errno));
        free(xsk);
        return NULL;
    }

    ret = xsk_socket__create(&xsk->xsk, devname, queue_id, umem->umem,
                             &xsk->rx, &xsk->tx, &cfg);
    if (ret) {
        VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
             "xsk_socket__create failed (%s) mode: %s, "
             "use-need-wakeup: %s, qid: %d",
             ovs_strerror(errno), xdp_modes[mode].name,
             use_need_wakeup ? "true" : "false", queue_id);
        free(xsk);
        return NULL;
    }

    /* Make sure the built-in AF_XDP program is loaded. */
    ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
    if (ret || !prog_id) {
        if (ret) {
            VLOG_ERR("Get XDP prog ID failed (%s)", ovs_strerror(errno));
        } else {
            VLOG_ERR("No XDP program is loaded at ifindex %d", ifindex);
        }
        xsk_socket__delete(xsk->xsk);
        free(xsk);
        return NULL;
    }

    /* Busy-retry until PROD_NUM_DESCS slots can be reserved in the FILL
     * ring; the ring is freshly created, so this should succeed quickly. */
    while (!xsk_ring_prod__reserve(&xsk->umem->fq,
                                   PROD_NUM_DESCS, &idx)) {
        VLOG_WARN_RL(&rl, "Retry xsk_ring_prod__reserve to FILL queue");
    }

    /* Hand PROD_NUM_DESCS frames from the mempool to the kernel by writing
     * their umem offsets into the reserved FILL ring slots. */
    for (i = 0;
         i < PROD_NUM_DESCS * FRAME_SIZE;
         i += FRAME_SIZE) {
        void *elem;
        uint64_t addr;

        elem = umem_elem_pop(&xsk->umem->mpool);
        addr = UMEM2DESC(elem, xsk->umem->buffer);

        *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = addr;
    }

    xsk_ring_prod__submit(&xsk->umem->fq,
                          PROD_NUM_DESCS);
    return xsk;
}
410 | ||
/* Allocates the umem buffer, creates the umem and the AF_XDP socket for
 * 'xdp_queue_id' of the interface with 'ifindex'.  Also opportunistically
 * sweeps previously retired pools first.
 *
 * Returns the configured socket, or NULL on failure (in which case all
 * memory allocated here is released again). */
static struct xsk_socket_info *
xsk_configure(int ifindex, int xdp_queue_id, enum afxdp_mode mode,
              bool use_need_wakeup, bool report_socket_failures)
{
    struct xsk_socket_info *xsk;
    struct xsk_umem_info *umem;
    void *bufs;

    netdev_afxdp_sweep_unused_pools(NULL);

    /* Umem memory region. */
    bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);
    memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);

    /* Create AF_XDP socket. */
    umem = xsk_configure_umem(bufs, NUM_FRAMES * FRAME_SIZE);
    if (!umem) {
        /* xsk_configure_umem() does not free the buffer on failure. */
        free_pagealign(bufs);
        return NULL;
    }

    VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);

    xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, mode,
                               use_need_wakeup, report_socket_failures);
    if (!xsk) {
        /* Clean up umem and xpacket pool. */
        if (xsk_umem__delete(umem->umem)) {
            VLOG_ERR("xsk_umem__delete failed.");
        }
        free_pagealign(bufs);
        umem_pool_cleanup(&umem->mpool);
        xpacket_pool_cleanup(&umem->xpool);
        free(umem);
    }
    return xsk;
}
448 | ||
/* Creates an AF_XDP socket for 'queue_id' of 'dev' in 'mode' and stores it
 * in dev->xsks[queue_id], initializing the per-queue counters.  Returns 0
 * on success; on failure stores NULL in the slot and returns -1.
 * 'report_socket_failures' selects ERR vs. DBG logging on failure (used by
 * best-effort mode probing, where early failures are expected). */
static int
xsk_configure_queue(struct netdev_linux *dev, int ifindex, int queue_id,
                    enum afxdp_mode mode, bool report_socket_failures)
{
    struct xsk_socket_info *xsk_info;

    VLOG_DBG("%s: configuring queue: %d, mode: %s, use-need-wakeup: %s.",
             netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name,
             dev->use_need_wakeup ? "true" : "false");
    xsk_info = xsk_configure(ifindex, queue_id, mode, dev->use_need_wakeup,
                             report_socket_failures);
    if (!xsk_info) {
        VLOG(report_socket_failures ? VLL_ERR : VLL_DBG,
             "%s: Failed to create AF_XDP socket on queue %d in %s mode.",
             netdev_get_name(&dev->up), queue_id, xdp_modes[mode].name);
        dev->xsks[queue_id] = NULL;
        return -1;
    }
    dev->xsks[queue_id] = xsk_info;
    atomic_init(&xsk_info->tx_dropped, 0);
    xsk_info->outstanding_tx = 0;
    /* The FILL ring was fully populated during socket setup. */
    xsk_info->available_rx = PROD_NUM_DESCS;
    return 0;
}

473 | ||
474 | ||
/* Configures AF_XDP sockets for every RX queue of 'netdev' and allocates
 * the per-TX-queue spinlocks.  In best-effort mode the first queue is
 * probed with each mode from the most preferred (native-with-zerocopy)
 * down, and the first mode that works becomes 'xdp_mode_in_use' for the
 * remaining queues.  Returns 0 on success, EINVAL on failure (after
 * tearing down anything partially configured). */
static int
xsk_configure_all(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    int i, ifindex, n_rxq, n_txq;
    int qid = 0;

    ifindex = linux_get_ifindex(netdev_get_name(netdev));

    ovs_assert(dev->xsks == NULL);
    ovs_assert(dev->tx_locks == NULL);

    n_rxq = netdev_n_rxq(netdev);
    dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);

    if (dev->xdp_mode == OVS_AF_XDP_MODE_BEST_EFFORT) {
        /* Trying to configure first queue with different modes to
         * find the most suitable. */
        for (i = OVS_AF_XDP_MODE_NATIVE_ZC; i < OVS_AF_XDP_MODE_MAX; i++) {
            /* Only the last candidate mode logs failures at ERR level. */
            if (!xsk_configure_queue(dev, ifindex, qid, i,
                                     i == OVS_AF_XDP_MODE_MAX - 1)) {
                dev->xdp_mode_in_use = i;
                VLOG_INFO("%s: %s XDP mode will be in use.",
                          netdev_get_name(netdev), xdp_modes[i].name);
                break;
            }
        }
        if (i == OVS_AF_XDP_MODE_MAX) {
            VLOG_ERR("%s: Failed to detect suitable XDP mode.",
                     netdev_get_name(netdev));
            goto err;
        }
        qid++;
    } else {
        dev->xdp_mode_in_use = dev->xdp_mode;
    }

    /* Configure remaining queues. */
    for (; qid < n_rxq; qid++) {
        if (xsk_configure_queue(dev, ifindex, qid,
                                dev->xdp_mode_in_use, true)) {
            VLOG_ERR("%s: Failed to create AF_XDP socket on queue %d.",
                     netdev_get_name(netdev), qid);
            goto err;
        }
    }

    n_txq = netdev_n_txq(netdev);
    /* One cache-line-sized spinlock per TX queue. */
    dev->tx_locks = xzalloc_cacheline(n_txq * sizeof *dev->tx_locks);

    for (i = 0; i < n_txq; i++) {
        ovs_spin_init(&dev->tx_locks[i].lock);
    }

    return 0;

err:
    xsk_destroy_all(netdev);
    return EINVAL;
}
535 | ||
/* Closes the AF_XDP socket and deletes the umem of 'xsk_info'.  The umem
 * pool itself cannot be freed immediately because some frames may still be
 * referenced elsewhere (counted by outstanding_tx + available_rx), so it
 * is parked on the global 'unused_pools' list and a sweep is attempted
 * right away. */
static void
xsk_destroy(struct xsk_socket_info *xsk_info)
{
    struct xsk_umem *umem;
    struct unused_pool *pool;

    xsk_socket__delete(xsk_info->xsk);
    xsk_info->xsk = NULL;

    umem = xsk_info->umem->umem;
    if (xsk_umem__delete(umem)) {
        VLOG_ERR("xsk_umem__delete failed.");
    }

    pool = xzalloc(sizeof *pool);
    pool->umem_info = xsk_info->umem;
    /* Frames that were in flight in the rings; the pool can only be freed
     * once they are all accounted for again. */
    pool->lost_in_rings = xsk_info->outstanding_tx + xsk_info->available_rx;

    ovs_mutex_lock(&unused_pools_mutex);
    ovs_list_push_back(&unused_pools, &pool->list_node);
    ovs_mutex_unlock(&unused_pools_mutex);

    free(xsk_info);

    netdev_afxdp_sweep_unused_pools(NULL);
}
562 | ||
/* Tears down everything xsk_configure_all() set up for 'netdev': each
 * per-queue socket, the xsks array, the attached XDP program, and the
 * per-TX-queue spinlocks.  Safe to call on a partially configured device
 * (NULL slots and arrays are skipped). */
static void
xsk_destroy_all(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    int i, ifindex;

    if (dev->xsks) {
        for (i = 0; i < netdev_n_rxq(netdev); i++) {
            if (dev->xsks[i]) {
                xsk_destroy(dev->xsks[i]);
                dev->xsks[i] = NULL;
                VLOG_DBG("%s: Destroyed xsk[%d].", netdev_get_name(netdev), i);
            }
        }

        free(dev->xsks);
        dev->xsks = NULL;
    }

    VLOG_INFO("%s: Removing xdp program.", netdev_get_name(netdev));
    ifindex = linux_get_ifindex(netdev_get_name(netdev));
    xsk_remove_xdp_program(ifindex, dev->xdp_mode_in_use);

    if (dev->tx_locks) {
        for (i = 0; i < netdev_n_txq(netdev); i++) {
            ovs_spin_destroy(&dev->tx_locks[i].lock);
        }
        free_cacheline(dev->tx_locks);
        dev->tx_locks = NULL;
    }
}
594 | ||
0de1b425 WT |
595 | int |
596 | netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, | |
597 | char **errp OVS_UNUSED) | |
598 | { | |
599 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
e8f56344 IM |
600 | const char *str_xdp_mode; |
601 | enum afxdp_mode xdp_mode; | |
e50547b5 | 602 | bool need_wakeup; |
e8f56344 | 603 | int new_n_rxq; |
0de1b425 WT |
604 | |
605 | ovs_mutex_lock(&dev->mutex); | |
606 | new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1); | |
607 | if (new_n_rxq > MAX_XSKQ) { | |
608 | ovs_mutex_unlock(&dev->mutex); | |
609 | VLOG_ERR("%s: Too big 'n_rxq' (%d > %d).", | |
610 | netdev_get_name(netdev), new_n_rxq, MAX_XSKQ); | |
611 | return EINVAL; | |
612 | } | |
613 | ||
e8f56344 IM |
614 | str_xdp_mode = smap_get_def(args, "xdp-mode", "best-effort"); |
615 | for (xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT; | |
616 | xdp_mode < OVS_AF_XDP_MODE_MAX; | |
617 | xdp_mode++) { | |
618 | if (!strcasecmp(str_xdp_mode, xdp_modes[xdp_mode].name)) { | |
619 | break; | |
620 | } | |
621 | } | |
622 | if (xdp_mode == OVS_AF_XDP_MODE_MAX) { | |
623 | VLOG_ERR("%s: Incorrect xdp-mode (%s).", | |
624 | netdev_get_name(netdev), str_xdp_mode); | |
0de1b425 WT |
625 | ovs_mutex_unlock(&dev->mutex); |
626 | return EINVAL; | |
627 | } | |
628 | ||
e50547b5 WT |
629 | need_wakeup = smap_get_bool(args, "use-need-wakeup", NEED_WAKEUP_DEFAULT); |
630 | #ifndef HAVE_XDP_NEED_WAKEUP | |
631 | if (need_wakeup) { | |
632 | VLOG_WARN("XDP need_wakeup is not supported in libbpf."); | |
633 | need_wakeup = false; | |
634 | } | |
635 | #endif | |
636 | ||
0de1b425 | 637 | if (dev->requested_n_rxq != new_n_rxq |
e8f56344 | 638 | || dev->requested_xdp_mode != xdp_mode |
e50547b5 | 639 | || dev->requested_need_wakeup != need_wakeup) { |
0de1b425 | 640 | dev->requested_n_rxq = new_n_rxq; |
e8f56344 | 641 | dev->requested_xdp_mode = xdp_mode; |
e50547b5 | 642 | dev->requested_need_wakeup = need_wakeup; |
0de1b425 WT |
643 | netdev_request_reconfigure(netdev); |
644 | } | |
645 | ovs_mutex_unlock(&dev->mutex); | |
646 | return 0; | |
647 | } | |
648 | ||
649 | int | |
650 | netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args) | |
651 | { | |
652 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
653 | ||
654 | ovs_mutex_lock(&dev->mutex); | |
655 | smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); | |
e8f56344 IM |
656 | smap_add_format(args, "xdp-mode", "%s", xdp_modes[dev->xdp_mode].name); |
657 | smap_add_format(args, "xdp-mode-in-use", "%s", | |
658 | xdp_modes[dev->xdp_mode_in_use].name); | |
e50547b5 WT |
659 | smap_add_format(args, "use-need-wakeup", "%s", |
660 | dev->use_need_wakeup ? "true" : "false"); | |
0de1b425 WT |
661 | ovs_mutex_unlock(&dev->mutex); |
662 | return 0; | |
663 | } | |
664 | ||
/* Implements the netdev 'reconfigure' call: applies the requested n_rxq,
 * xdp-mode and need-wakeup settings by destroying and recreating all
 * AF_XDP sockets.  A no-op if nothing changed and sockets already exist.
 * Returns 0 on success or the error from xsk_configure_all(). */
int
netdev_afxdp_reconfigure(struct netdev *netdev)
{
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    /* Umem pages must be locked in memory; lift the MEMLOCK limit. */
    struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
    int err = 0;

    ovs_mutex_lock(&dev->mutex);

    if (netdev->n_rxq == dev->requested_n_rxq
        && dev->xdp_mode == dev->requested_xdp_mode
        && dev->use_need_wakeup == dev->requested_need_wakeup
        && dev->xsks) {
        goto out;
    }

    xsk_destroy_all(netdev);

    netdev->n_rxq = dev->requested_n_rxq;
    /* TX queue count always mirrors RX queue count for AF_XDP. */
    netdev->n_txq = netdev->n_rxq;

    dev->xdp_mode = dev->requested_xdp_mode;
    VLOG_INFO("%s: Setting XDP mode to %s.", netdev_get_name(netdev),
              xdp_modes[dev->xdp_mode].name);

    /* Failure here is non-fatal: socket creation below will fail loudly if
     * the limit actually is too low. */
    if (setrlimit(RLIMIT_MEMLOCK, &r)) {
        VLOG_ERR("setrlimit(RLIMIT_MEMLOCK) failed: %s", ovs_strerror(errno));
    }
    dev->use_need_wakeup = dev->requested_need_wakeup;

    err = xsk_configure_all(netdev);
    if (err) {
        VLOG_ERR("%s: AF_XDP device reconfiguration failed.",
                 netdev_get_name(netdev));
    }
    netdev_change_seq_changed(netdev);
out:
    ovs_mutex_unlock(&dev->mutex);
    return err;
}
705 | ||
/* Detaches the XDP program from the interface with 'ifindex', using the
 * flags matching 'mode'.  Does nothing (beyond logging) if no program is
 * currently attached or its id cannot be queried. */
static void
xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode)
{
    uint32_t flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST;
    /* NOTE(review): 'ret' is uint32_t while bpf_get_link_xdp_id() returns
     * int; the nonzero check below still works either way. */
    uint32_t ret, prog_id = 0;

    /* Check whether XDP program is loaded. */
    ret = bpf_get_link_xdp_id(ifindex, &prog_id, flags);
    if (ret) {
        VLOG_ERR("Failed to get XDP prog id (%s)", ovs_strerror(errno));
        return;
    }

    if (!prog_id) {
        VLOG_INFO("No XDP program is loaded at ifindex %d", ifindex);
        return;
    }

    /* An fd of -1 detaches the currently attached program. */
    bpf_set_link_xdp_fd(ifindex, -1, flags);
}
726 | ||
727 | void | |
728 | signal_remove_xdp(struct netdev *netdev) | |
729 | { | |
730 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
731 | int ifindex; | |
732 | ||
733 | ifindex = linux_get_ifindex(netdev_get_name(netdev)); | |
734 | ||
735 | VLOG_WARN("Force removing xdp program."); | |
e8f56344 | 736 | xsk_remove_xdp_program(ifindex, dev->xdp_mode_in_use); |
0de1b425 WT |
737 | } |
738 | ||
739 | static struct dp_packet_afxdp * | |
740 | dp_packet_cast_afxdp(const struct dp_packet *d) | |
741 | { | |
742 | ovs_assert(d->source == DPBUF_AFXDP); | |
743 | return CONTAINER_OF(d, struct dp_packet_afxdp, packet); | |
744 | } | |
745 | ||
/* Tops up the kernel FILL ring with a full BATCH_SIZE of free frames from
 * the umem mempool, so RX never starves.  Bails out silently if the ring
 * lacks BATCH_SIZE free slots or the mempool cannot supply BATCH_SIZE
 * frames; frames are returned to the mempool if the ring reservation
 * fails after the pop. */
static inline void
prepare_fill_queue(struct xsk_socket_info *xsk_info)
{
    struct xsk_umem_info *umem;
    void *elems[BATCH_SIZE];
    unsigned int idx_fq;
    int i, ret;

    umem = xsk_info->umem;

    if (xsk_prod_nb_free(&umem->fq, BATCH_SIZE) < BATCH_SIZE) {
        return;
    }

    ret = umem_elem_pop_n(&umem->mpool, BATCH_SIZE, elems);
    if (OVS_UNLIKELY(ret)) {
        return;
    }

    if (!xsk_ring_prod__reserve(&umem->fq, BATCH_SIZE, &idx_fq)) {
        /* Could not reserve ring slots: give the frames back. */
        umem_elem_push_n(&umem->mpool, BATCH_SIZE, elems);
        COVERAGE_INC(afxdp_fq_full);
        return;
    }

    for (i = 0; i < BATCH_SIZE; i++) {
        uint64_t index;
        void *elem;

        elem = elems[i];
        /* The FILL ring takes umem offsets, not pointers. */
        index = (uint64_t)((char *)elem - (char *)umem->buffer);
        ovs_assert((index & FRAME_SHIFT_MASK) == 0);
        *xsk_ring_prod__fill_addr(&umem->fq, idx_fq) = index;

        idx_fq++;
    }
    xsk_ring_prod__submit(&umem->fq, BATCH_SIZE);
    xsk_info->available_rx += BATCH_SIZE;
}
785 | ||
/* Receives a batch of packets from the AF_XDP RX ring of the queue behind
 * 'rxq_'.  Peeks up to BATCH_SIZE descriptors, wraps each frame in its
 * pre-allocated dp_packet metadata and appends it to 'batch'.
 *
 * Returns 0 on success with packets in 'batch', EAGAIN if the queue is
 * not (yet) configured or empty.  '*qfill', if nonnull, is set to 0
 * (remaining-queue-depth reporting is not implemented). */
int
netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
                      int *qfill)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev = rx->up.netdev;
    struct netdev_linux *dev = netdev_linux_cast(netdev);
    struct xsk_socket_info *xsk_info;
    struct xsk_umem_info *umem;
    uint32_t idx_rx = 0;
    int qid = rxq_->queue_id;
    unsigned int rcvd, i;

    xsk_info = dev->xsks[qid];
    if (!xsk_info || !xsk_info->xsk) {
        return EAGAIN;
    }

    /* Keep the FILL ring stocked before consuming RX. */
    prepare_fill_queue(xsk_info);

    umem = xsk_info->umem;
    rx->fd = xsk_socket__fd(xsk_info->xsk);

    rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
    if (!rcvd) {
        /* Nothing received: kick the kernel if it asked for a wakeup. */
        xsk_rx_wakeup_if_needed(umem, netdev, rx->fd);
        return EAGAIN;
    }

    /* Setup a dp_packet batch from descriptors in RX queue. */
    for (i = 0; i < rcvd; i++) {
        struct dp_packet_afxdp *xpacket;
        const struct xdp_desc *desc;
        struct dp_packet *packet;
        uint64_t addr, index;
        uint32_t len;
        char *pkt;

        desc = xsk_ring_cons__rx_desc(&xsk_info->rx, idx_rx);
        addr = desc->addr;
        len = desc->len;

        pkt = xsk_umem__get_data(umem->buffer, addr);
        /* The frame index identifies the matching metadata slot. */
        index = addr >> FRAME_SHIFT;
        xpacket = &umem->xpool.array[index];
        packet = &xpacket->packet;

        /* Initialize the struct dp_packet. */
        dp_packet_use_afxdp(packet, pkt,
                            FRAME_SIZE - FRAME_HEADROOM,
                            OVS_XDP_HEADROOM);
        dp_packet_set_size(packet, len);

        /* Add packet into batch, increase batch->count. */
        dp_packet_batch_add(batch, packet);

        idx_rx++;
    }
    /* Release the RX queue. */
    xsk_ring_cons__release(&xsk_info->rx, rcvd);
    xsk_info->available_rx -= rcvd;

    if (qfill) {
        /* TODO: return the number of remaining packets in the queue. */
        *qfill = 0;
    }
    return 0;
}
854 | ||
/* Kicks the kernel to transmit the packets queued on the TX ring of
 * 'xsk_info' by issuing sendto() on the socket.  Skipped entirely when
 * need_wakeup is in use and the kernel did not request a wakeup.
 *
 * Returns 0 on success (including EBUSY and exhausted EAGAIN retries),
 * or errno for ENXIO/ENOBUFS/EOPNOTSUPP, which callers treat as real
 * transmit failures. */
static inline int
kick_tx(struct xsk_socket_info *xsk_info, enum afxdp_mode mode,
        bool use_need_wakeup)
{
    int ret, retries;
    static const int KERNEL_TX_BATCH_SIZE = 16;

    if (use_need_wakeup && !xsk_tx_need_wakeup(xsk_info)) {
        return 0;
    }

    /* In all modes except native-with-zerocopy packet transmission is
     * synchronous, and the kernel xmits only TX_BATCH_SIZE(16) packets for a
     * single sendmsg syscall.
     * So, we have to kick the kernel (n_packets / 16) times to be sure that
     * all packets are transmitted. */
    retries = (mode != OVS_AF_XDP_MODE_NATIVE_ZC)
              ? xsk_info->outstanding_tx / KERNEL_TX_BATCH_SIZE
              : 0;
kick_retry:
    /* This causes system call into kernel's xsk_sendmsg, and xsk_generic_xmit
     * (generic and native modes) or xsk_zc_xmit (native-with-zerocopy mode).
     */
    ret = sendto(xsk_socket__fd(xsk_info->xsk), NULL, 0, MSG_DONTWAIT,
                 NULL, 0);
    if (ret < 0) {
        if (retries-- && errno == EAGAIN) {
            goto kick_retry;
        }
        if (errno == ENXIO || errno == ENOBUFS || errno == EOPNOTSUPP) {
            return errno;
        }
    }
    /* No error, or EBUSY, or too many retries on EAGAIN. */
    return 0;
}
891 | ||
892 | void | |
893 | free_afxdp_buf(struct dp_packet *p) | |
894 | { | |
895 | struct dp_packet_afxdp *xpacket; | |
896 | uintptr_t addr; | |
897 | ||
898 | xpacket = dp_packet_cast_afxdp(p); | |
899 | if (xpacket->mpool) { | |
900 | void *base = dp_packet_base(p); | |
901 | ||
902 | addr = (uintptr_t)base & (~FRAME_SHIFT_MASK); | |
903 | umem_elem_push(xpacket->mpool, (void *)addr); | |
904 | } | |
905 | } | |
906 | ||
907 | static void | |
908 | free_afxdp_buf_batch(struct dp_packet_batch *batch) | |
909 | { | |
910 | struct dp_packet_afxdp *xpacket = NULL; | |
911 | struct dp_packet *packet; | |
912 | void *elems[BATCH_SIZE]; | |
913 | uintptr_t addr; | |
914 | ||
915 | DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { | |
916 | void *base; | |
917 | ||
918 | xpacket = dp_packet_cast_afxdp(packet); | |
919 | base = dp_packet_base(packet); | |
920 | addr = (uintptr_t)base & (~FRAME_SHIFT_MASK); | |
921 | elems[i] = (void *)addr; | |
922 | } | |
940ac2ce | 923 | umem_elem_push_n(xpacket->mpool, dp_packet_batch_size(batch), elems); |
0de1b425 WT |
924 | dp_packet_batch_init(batch); |
925 | } | |
926 | ||
927 | static inline bool | |
928 | check_free_batch(struct dp_packet_batch *batch) | |
929 | { | |
930 | struct umem_pool *first_mpool = NULL; | |
931 | struct dp_packet_afxdp *xpacket; | |
932 | struct dp_packet *packet; | |
933 | ||
934 | DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { | |
935 | if (packet->source != DPBUF_AFXDP) { | |
936 | return false; | |
937 | } | |
938 | xpacket = dp_packet_cast_afxdp(packet); | |
939 | if (i == 0) { | |
940 | first_mpool = xpacket->mpool; | |
941 | continue; | |
942 | } | |
943 | if (xpacket->mpool != first_mpool) { | |
944 | return false; | |
945 | } | |
946 | } | |
947 | /* All packets are DPBUF_AFXDP and from the same mpool. */ | |
948 | return true; | |
949 | } | |
950 | ||
951 | static inline void | |
952 | afxdp_complete_tx(struct xsk_socket_info *xsk_info) | |
953 | { | |
954 | void *elems_push[BATCH_SIZE]; | |
955 | struct xsk_umem_info *umem; | |
956 | uint32_t idx_cq = 0; | |
957 | int tx_to_free = 0; | |
958 | int tx_done, j; | |
959 | ||
960 | umem = xsk_info->umem; | |
961 | tx_done = xsk_ring_cons__peek(&umem->cq, CONS_NUM_DESCS, &idx_cq); | |
962 | ||
963 | /* Recycle back to umem pool. */ | |
964 | for (j = 0; j < tx_done; j++) { | |
965 | uint64_t *addr; | |
966 | void *elem; | |
967 | ||
968 | addr = (uint64_t *)xsk_ring_cons__comp_addr(&umem->cq, idx_cq++); | |
05629ed2 EC |
969 | if (*addr != UINT64_MAX) { |
970 | elem = ALIGNED_CAST(void *, (char *)umem->buffer + *addr); | |
971 | elems_push[tx_to_free] = elem; | |
972 | *addr = UINT64_MAX; /* Mark as pushed. */ | |
973 | tx_to_free++; | |
974 | } else { | |
0de1b425 WT |
975 | /* The elem has been pushed already. */ |
976 | COVERAGE_INC(afxdp_cq_skip); | |
0de1b425 | 977 | } |
0de1b425 WT |
978 | |
979 | if (tx_to_free == BATCH_SIZE || j == tx_done - 1) { | |
980 | umem_elem_push_n(&umem->mpool, tx_to_free, elems_push); | |
981 | xsk_info->outstanding_tx -= tx_to_free; | |
982 | tx_to_free = 0; | |
983 | } | |
984 | } | |
985 | ||
986 | if (tx_done > 0) { | |
987 | xsk_ring_cons__release(&umem->cq, tx_done); | |
988 | } else { | |
989 | COVERAGE_INC(afxdp_cq_empty); | |
990 | } | |
991 | } | |
992 | ||
993 | static inline int | |
994 | __netdev_afxdp_batch_send(struct netdev *netdev, int qid, | |
995 | struct dp_packet_batch *batch) | |
996 | { | |
997 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
998 | struct xsk_socket_info *xsk_info; | |
999 | void *elems_pop[BATCH_SIZE]; | |
1000 | struct xsk_umem_info *umem; | |
1001 | struct dp_packet *packet; | |
1002 | bool free_batch = false; | |
1003 | unsigned long orig; | |
1004 | uint32_t idx = 0; | |
1005 | int error = 0; | |
1006 | int ret; | |
1007 | ||
1008 | xsk_info = dev->xsks[qid]; | |
1009 | if (!xsk_info || !xsk_info->xsk) { | |
1010 | goto out; | |
1011 | } | |
1012 | ||
1013 | afxdp_complete_tx(xsk_info); | |
1014 | ||
1015 | free_batch = check_free_batch(batch); | |
1016 | ||
1017 | umem = xsk_info->umem; | |
940ac2ce PC |
1018 | ret = umem_elem_pop_n(&umem->mpool, dp_packet_batch_size(batch), |
1019 | elems_pop); | |
0de1b425 | 1020 | if (OVS_UNLIKELY(ret)) { |
940ac2ce PC |
1021 | atomic_add_relaxed(&xsk_info->tx_dropped, dp_packet_batch_size(batch), |
1022 | &orig); | |
0de1b425 WT |
1023 | VLOG_WARN_RL(&rl, "%s: send failed due to exhausted memory pool.", |
1024 | netdev_get_name(netdev)); | |
1025 | error = ENOMEM; | |
1026 | goto out; | |
1027 | } | |
1028 | ||
1029 | /* Make sure we have enough TX descs. */ | |
940ac2ce PC |
1030 | ret = xsk_ring_prod__reserve(&xsk_info->tx, dp_packet_batch_size(batch), |
1031 | &idx); | |
0de1b425 | 1032 | if (OVS_UNLIKELY(ret == 0)) { |
940ac2ce PC |
1033 | umem_elem_push_n(&umem->mpool, dp_packet_batch_size(batch), elems_pop); |
1034 | atomic_add_relaxed(&xsk_info->tx_dropped, dp_packet_batch_size(batch), | |
1035 | &orig); | |
0de1b425 WT |
1036 | COVERAGE_INC(afxdp_tx_full); |
1037 | afxdp_complete_tx(xsk_info); | |
e8f56344 | 1038 | kick_tx(xsk_info, dev->xdp_mode_in_use, dev->use_need_wakeup); |
0de1b425 WT |
1039 | error = ENOMEM; |
1040 | goto out; | |
1041 | } | |
1042 | ||
1043 | DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { | |
1044 | uint64_t index; | |
1045 | void *elem; | |
1046 | ||
1047 | elem = elems_pop[i]; | |
1048 | /* Copy the packet to the umem we just pop from umem pool. | |
1049 | * TODO: avoid this copy if the packet and the pop umem | |
1050 | * are located in the same umem. | |
1051 | */ | |
1052 | memcpy(elem, dp_packet_data(packet), dp_packet_size(packet)); | |
1053 | ||
1054 | index = (uint64_t)((char *)elem - (char *)umem->buffer); | |
1055 | xsk_ring_prod__tx_desc(&xsk_info->tx, idx + i)->addr = index; | |
1056 | xsk_ring_prod__tx_desc(&xsk_info->tx, idx + i)->len | |
1057 | = dp_packet_size(packet); | |
1058 | } | |
940ac2ce PC |
1059 | xsk_ring_prod__submit(&xsk_info->tx, dp_packet_batch_size(batch)); |
1060 | xsk_info->outstanding_tx += dp_packet_batch_size(batch); | |
0de1b425 | 1061 | |
e8f56344 | 1062 | ret = kick_tx(xsk_info, dev->xdp_mode_in_use, dev->use_need_wakeup); |
0de1b425 WT |
1063 | if (OVS_UNLIKELY(ret)) { |
1064 | VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.", | |
1065 | netdev_get_name(netdev), ovs_strerror(ret)); | |
1066 | } | |
1067 | ||
1068 | out: | |
1069 | if (free_batch) { | |
1070 | free_afxdp_buf_batch(batch); | |
1071 | } else { | |
1072 | dp_packet_delete_batch(batch, true); | |
1073 | } | |
1074 | ||
1075 | return error; | |
1076 | } | |
1077 | ||
1078 | int | |
1079 | netdev_afxdp_batch_send(struct netdev *netdev, int qid, | |
1080 | struct dp_packet_batch *batch, | |
1081 | bool concurrent_txq) | |
1082 | { | |
1083 | struct netdev_linux *dev; | |
1084 | int ret; | |
1085 | ||
1086 | if (concurrent_txq) { | |
1087 | dev = netdev_linux_cast(netdev); | |
1088 | qid = qid % netdev_n_txq(netdev); | |
1089 | ||
28d05016 | 1090 | ovs_spin_lock(&dev->tx_locks[qid].lock); |
0de1b425 | 1091 | ret = __netdev_afxdp_batch_send(netdev, qid, batch); |
28d05016 | 1092 | ovs_spin_unlock(&dev->tx_locks[qid].lock); |
0de1b425 WT |
1093 | } else { |
1094 | ret = __netdev_afxdp_batch_send(netdev, qid, batch); | |
1095 | } | |
1096 | ||
1097 | return ret; | |
1098 | } | |
1099 | ||
1100 | int | |
1101 | netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED) | |
1102 | { | |
1103 | /* Done at reconfigure. */ | |
1104 | return 0; | |
1105 | } | |
1106 | ||
1107 | void | |
1108 | netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED) | |
1109 | { | |
1110 | /* Nothing. */ | |
1111 | } | |
1112 | ||
7bf075d9 WT |
1113 | static int |
1114 | libbpf_print(enum libbpf_print_level level, | |
1115 | const char *format, va_list args) | |
1116 | { | |
1117 | if (level == LIBBPF_WARN) { | |
1118 | vlog_valist(&this_module, VLL_WARN, format, args); | |
1119 | } else if (level == LIBBPF_INFO) { | |
1120 | vlog_valist(&this_module, VLL_INFO, format, args); | |
1121 | } else { | |
1122 | vlog_valist(&this_module, VLL_DBG, format, args); | |
1123 | } | |
1124 | return 0; | |
1125 | } | |
1126 | ||
1127 | int netdev_afxdp_init(void) | |
1128 | { | |
1129 | libbpf_set_print(libbpf_print); | |
1130 | return 0; | |
1131 | } | |
1132 | ||
f627cf1d IM |
1133 | int |
1134 | netdev_afxdp_construct(struct netdev *netdev) | |
1135 | { | |
1136 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
1137 | int ret; | |
1138 | ||
1139 | /* Configure common netdev-linux first. */ | |
1140 | ret = netdev_linux_construct(netdev); | |
1141 | if (ret) { | |
1142 | return ret; | |
1143 | } | |
1144 | ||
1145 | /* Queues should not be used before the first reconfiguration. Clearing. */ | |
1146 | netdev->n_rxq = 0; | |
1147 | netdev->n_txq = 0; | |
e8f56344 IM |
1148 | dev->xdp_mode = OVS_AF_XDP_MODE_UNSPEC; |
1149 | dev->xdp_mode_in_use = OVS_AF_XDP_MODE_UNSPEC; | |
f627cf1d IM |
1150 | |
1151 | dev->requested_n_rxq = NR_QUEUE; | |
e8f56344 | 1152 | dev->requested_xdp_mode = OVS_AF_XDP_MODE_BEST_EFFORT; |
e50547b5 | 1153 | dev->requested_need_wakeup = NEED_WAKEUP_DEFAULT; |
f627cf1d IM |
1154 | |
1155 | dev->xsks = NULL; | |
1156 | dev->tx_locks = NULL; | |
1157 | ||
1158 | netdev_request_reconfigure(netdev); | |
1159 | return 0; | |
1160 | } | |
1161 | ||
0de1b425 WT |
1162 | void |
1163 | netdev_afxdp_destruct(struct netdev *netdev) | |
1164 | { | |
1165 | static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; | |
1166 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
1167 | ||
1168 | if (ovsthread_once_start(&once)) { | |
1169 | fatal_signal_add_hook(netdev_afxdp_sweep_unused_pools, | |
1170 | NULL, NULL, true); | |
1171 | ovsthread_once_done(&once); | |
1172 | } | |
1173 | ||
1174 | /* Note: tc is by-passed when using drv-mode, but when using | |
1175 | * skb-mode, we might need to clean up tc. */ | |
1176 | ||
1177 | xsk_destroy_all(netdev); | |
1178 | ovs_mutex_destroy(&dev->mutex); | |
1179 | } | |
1180 | ||
52b5a5c0 EC |
1181 | int |
1182 | netdev_afxdp_verify_mtu_size(const struct netdev *netdev OVS_UNUSED, int mtu) | |
1183 | { | |
1184 | /* | |
1185 | * If a device is used in xdpmode skb, no driver-specific MTU size is | |
1186 | * checked and any value is allowed resulting in packet drops. | |
1187 | * This check will verify the maximum supported value based on the | |
1188 | * buffer size allocated and the additional headroom required. | |
1189 | */ | |
1190 | if (mtu > (FRAME_SIZE - OVS_XDP_HEADROOM - | |
1191 | XDP_PACKET_HEADROOM - VLAN_ETH_HEADER_LEN)) { | |
1192 | return EINVAL; | |
1193 | } | |
1194 | ||
1195 | return 0; | |
1196 | } | |
1197 | ||
d560bc1b IM |
1198 | int |
1199 | netdev_afxdp_get_custom_stats(const struct netdev *netdev, | |
1200 | struct netdev_custom_stats *custom_stats) | |
1201 | { | |
1202 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
1203 | struct xsk_socket_info *xsk_info; | |
1204 | struct xdp_statistics stat; | |
1205 | uint32_t i, c = 0; | |
1206 | socklen_t optlen; | |
1207 | ||
1208 | ovs_mutex_lock(&dev->mutex); | |
1209 | ||
1210 | #define XDP_CSTATS \ | |
1211 | XDP_CSTAT(rx_dropped) \ | |
1212 | XDP_CSTAT(rx_invalid_descs) \ | |
1213 | XDP_CSTAT(tx_invalid_descs) | |
1214 | ||
1215 | #define XDP_CSTAT(NAME) + 1 | |
1216 | enum { N_XDP_CSTATS = XDP_CSTATS }; | |
1217 | #undef XDP_CSTAT | |
1218 | ||
1219 | custom_stats->counters = xcalloc(netdev_n_rxq(netdev) * N_XDP_CSTATS, | |
1220 | sizeof *custom_stats->counters); | |
1221 | ||
1222 | /* Account the stats for each xsk. */ | |
1223 | for (i = 0; i < netdev_n_rxq(netdev); i++) { | |
1224 | xsk_info = dev->xsks[i]; | |
1225 | optlen = sizeof stat; | |
1226 | ||
1227 | if (xsk_info && !getsockopt(xsk_socket__fd(xsk_info->xsk), SOL_XDP, | |
1228 | XDP_STATISTICS, &stat, &optlen)) { | |
1229 | #define XDP_CSTAT(NAME) \ | |
1230 | snprintf(custom_stats->counters[c].name, \ | |
1231 | NETDEV_CUSTOM_STATS_NAME_SIZE, \ | |
1232 | "xsk_queue_%d_" #NAME, i); \ | |
1233 | custom_stats->counters[c++].value = stat.NAME; | |
1234 | XDP_CSTATS; | |
1235 | #undef XDP_CSTAT | |
1236 | } | |
1237 | } | |
1238 | custom_stats->size = c; | |
1239 | ovs_mutex_unlock(&dev->mutex); | |
1240 | ||
1241 | return 0; | |
1242 | } | |
1243 | ||
0de1b425 WT |
1244 | int |
1245 | netdev_afxdp_get_stats(const struct netdev *netdev, | |
1246 | struct netdev_stats *stats) | |
1247 | { | |
1248 | struct netdev_linux *dev = netdev_linux_cast(netdev); | |
1249 | struct xsk_socket_info *xsk_info; | |
1250 | struct netdev_stats dev_stats; | |
1251 | int error, i; | |
1252 | ||
1253 | ovs_mutex_lock(&dev->mutex); | |
1254 | ||
1255 | error = get_stats_via_netlink(netdev, &dev_stats); | |
1256 | if (error) { | |
1257 | VLOG_WARN_RL(&rl, "%s: Error getting AF_XDP statistics.", | |
1258 | netdev_get_name(netdev)); | |
1259 | } else { | |
1260 | /* Use kernel netdev's packet and byte counts. */ | |
1261 | stats->rx_packets = dev_stats.rx_packets; | |
1262 | stats->rx_bytes = dev_stats.rx_bytes; | |
1263 | stats->tx_packets = dev_stats.tx_packets; | |
1264 | stats->tx_bytes = dev_stats.tx_bytes; | |
1265 | ||
1266 | stats->rx_errors += dev_stats.rx_errors; | |
1267 | stats->tx_errors += dev_stats.tx_errors; | |
1268 | stats->rx_dropped += dev_stats.rx_dropped; | |
1269 | stats->tx_dropped += dev_stats.tx_dropped; | |
1270 | stats->multicast += dev_stats.multicast; | |
1271 | stats->collisions += dev_stats.collisions; | |
1272 | stats->rx_length_errors += dev_stats.rx_length_errors; | |
1273 | stats->rx_over_errors += dev_stats.rx_over_errors; | |
1274 | stats->rx_crc_errors += dev_stats.rx_crc_errors; | |
1275 | stats->rx_frame_errors += dev_stats.rx_frame_errors; | |
1276 | stats->rx_fifo_errors += dev_stats.rx_fifo_errors; | |
1277 | stats->rx_missed_errors += dev_stats.rx_missed_errors; | |
1278 | stats->tx_aborted_errors += dev_stats.tx_aborted_errors; | |
1279 | stats->tx_carrier_errors += dev_stats.tx_carrier_errors; | |
1280 | stats->tx_fifo_errors += dev_stats.tx_fifo_errors; | |
1281 | stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors; | |
1282 | stats->tx_window_errors += dev_stats.tx_window_errors; | |
1283 | ||
1284 | /* Account the dropped in each xsk. */ | |
1285 | for (i = 0; i < netdev_n_rxq(netdev); i++) { | |
1286 | xsk_info = dev->xsks[i]; | |
1287 | if (xsk_info) { | |
1288 | uint64_t tx_dropped; | |
1289 | ||
1290 | atomic_read_relaxed(&xsk_info->tx_dropped, &tx_dropped); | |
1291 | stats->tx_dropped += tx_dropped; | |
1292 | } | |
1293 | } | |
1294 | } | |
1295 | ovs_mutex_unlock(&dev->mutex); | |
1296 | ||
1297 | return error; | |
1298 | } |