/*
 * AF_XDP network backend.
 *
 * Copyright (c) 2023 Red Hat, Inc.
 *
 * Authors:
 *  Ilya Maximets <i.maximets@ovn.org>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */


#include "qemu/osdep.h"
#include <bpf/bpf.h>
#include <inttypes.h>
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <net/if.h>
#include <xdp/xsk.h>

#include "clients.h"
#include "monitor/monitor.h"
#include "net/net.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/main-loop.h"
#include "qemu/memalign.h"

typedef struct AFXDPState {
    NetClientState nc;

    struct xsk_socket *xsk;
    struct xsk_ring_cons rx;
    struct xsk_ring_prod tx;
    struct xsk_ring_cons cq;
    struct xsk_ring_prod fq;

    char ifname[IFNAMSIZ];
    int ifindex;
    bool read_poll;
    bool write_poll;
    uint32_t outstanding_tx;

    uint64_t *pool;
    uint32_t n_pool;
    char *buffer;
    struct xsk_umem *umem;

    uint32_t n_queues;
    uint32_t xdp_flags;
    bool inhibit;
} AFXDPState;

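/* Maximum number of descriptors processed in one receive or refill pass. */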
#define AF_XDP_BATCH_SIZE 64

static void af_xdp_send(void *opaque);
static void af_xdp_writable(void *opaque);

/* Set the event-loop handlers for the af-xdp backend. */
static void af_xdp_update_fd_handler(AFXDPState *s)
{
    qemu_set_fd_handler(xsk_socket__fd(s->xsk),
                        s->read_poll ? af_xdp_send : NULL,
                        s->write_poll ? af_xdp_writable : NULL,
                        s);
}

/* Update the read handler. */
static void af_xdp_read_poll(AFXDPState *s, bool enable)
{
    if (s->read_poll != enable) {
        s->read_poll = enable;
        af_xdp_update_fd_handler(s);
    }
}

/* Update the write handler. */
static void af_xdp_write_poll(AFXDPState *s, bool enable)
{
    if (s->write_poll != enable) {
        s->write_poll = enable;
        af_xdp_update_fd_handler(s);
    }
}

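/*
 * The NetClientInfo .poll callback: enable or disable both the read
 * and write handlers at once.
 */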
static void af_xdp_poll(NetClientState *nc, bool enable)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    if (s->read_poll != enable || s->write_poll != enable) {
        s->write_poll = enable;
        s->read_poll = enable;
        af_xdp_update_fd_handler(s);
    }
}

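/*
 * Reclaim buffers for packets the kernel has finished sending: drain
 * the completion queue and return the frame addresses to the pool.
 */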
static void af_xdp_complete_tx(AFXDPState *s)
{
    uint32_t idx = 0;
    uint32_t done, i;
    uint64_t *addr;

    done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx);

    for (i = 0; i < done; i++) {
        addr = (void *) xsk_ring_cons__comp_addr(&s->cq, idx++);
        s->pool[s->n_pool++] = *addr;
        s->outstanding_tx--;
    }

    if (done) {
        xsk_ring_cons__release(&s->cq, done);
    }
}

/*
 * The fd_write() callback, invoked when the fd is marked as writable
 * after a poll.
 */
static void af_xdp_writable(void *opaque)
{
    AFXDPState *s = opaque;

    /* Try to recover buffers that are already sent. */
    af_xdp_complete_tx(s);

    /*
     * Unregister the handler, unless we still have packets to transmit
     * and the kernel needs a wakeup.
     */
    if (!s->outstanding_tx || !xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, false);
    }

    /* Flush any buffered packets. */
    qemu_flush_queued_packets(&s->nc);
}

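/*
 * The NetClientInfo .receive callback: copy one packet from the peer
 * into a free umem frame and place it on the Tx ring.
 */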
static ssize_t af_xdp_receive(NetClientState *nc,
                              const uint8_t *buf, size_t size)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
    struct xdp_desc *desc;
    uint32_t idx;
    void *data;

    /* Try to recover buffers that are already sent. */
    af_xdp_complete_tx(s);

    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
        /* We can't transmit a packet this large; drop it. */
        return size;
    }

    if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &idx)) {
        /*
         * Out of buffers or out of space in the Tx ring. Poll until we
         * can write. This will also kick the Tx, if it was waiting on
         * the CQ.
         */
        af_xdp_write_poll(s, true);
        return 0;
    }

    desc = xsk_ring_prod__tx_desc(&s->tx, idx);
    desc->addr = s->pool[--s->n_pool];
    desc->len = size;

    data = xsk_umem__get_data(s->buffer, desc->addr);
    memcpy(data, buf, size);

    xsk_ring_prod__submit(&s->tx, 1);
    s->outstanding_tx++;

    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, true);
    }

    return size;
}

/*
 * Complete a previous send (backend --> guest) and enable the
 * fd_read callback.
 */
static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    af_xdp_read_poll(s, true);
}

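/*
 * Move up to 'n' free frames from the pool onto the fill queue so the
 * kernel has buffers to receive into.
 */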
static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
{
    uint32_t i, idx = 0;

    /* Leave one packet for Tx, just in case. */
    if (s->n_pool < n + 1) {
        n = s->n_pool;
    }

    if (!n || !xsk_ring_prod__reserve(&s->fq, n, &idx)) {
        return;
    }

    for (i = 0; i < n; i++) {
        *xsk_ring_prod__fill_addr(&s->fq, idx++) = s->pool[--s->n_pool];
    }
    xsk_ring_prod__submit(&s->fq, n);

    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
        /* Receive was blocked by not having enough buffers. Wake it up. */
        af_xdp_read_poll(s, true);
    }
}

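/*
 * The fd_read() callback: drain up to AF_XDP_BATCH_SIZE packets from
 * the Rx ring and forward them to the peer.
 */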
static void af_xdp_send(void *opaque)
{
    uint32_t i, n_rx, idx = 0;
    AFXDPState *s = opaque;

    n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
    if (!n_rx) {
        return;
    }

    for (i = 0; i < n_rx; i++) {
        const struct xdp_desc *desc;
        struct iovec iov;

        desc = xsk_ring_cons__rx_desc(&s->rx, idx++);

        iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
        iov.iov_len = desc->len;

        s->pool[s->n_pool++] = desc->addr;

        if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
                                     af_xdp_send_completed)) {
            /*
             * The peer is not receiving any more packets. This packet
             * is queued; stop reading from the backend until
             * af_xdp_send_completed() is called.
             */
            af_xdp_read_poll(s, false);

            /* Return unused descriptors to not break the ring cache. */
            xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
            n_rx = i + 1;
            break;
        }
    }

    /* Release actually sent descriptors and try to re-fill. */
    xsk_ring_cons__release(&s->rx, n_rx);
    af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
}

/* Flush and close. */
static void af_xdp_cleanup(NetClientState *nc)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    qemu_purge_queued_packets(nc);

    af_xdp_poll(nc, false);

    xsk_socket__delete(s->xsk);
    s->xsk = NULL;
    g_free(s->pool);
    s->pool = NULL;
    xsk_umem__delete(s->umem);
    s->umem = NULL;
    qemu_vfree(s->buffer);
    s->buffer = NULL;

    /* Remove the XDP program if this is the last open queue. */
    if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
        && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
        fprintf(stderr,
                "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
                s->ifname, s->ifindex);
    }
}

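/*
 * Allocate the shared umem area, register it with the kernel (reusing
 * an existing socket fd if one was passed in) and seed the buffer pool.
 */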
static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
{
    struct xsk_umem_config config = {
        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
        .frame_headroom = 0,
    };
    uint64_t n_descs;
    uint64_t size;
    int64_t i;
    int ret;

    /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
    n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
               + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
    size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;

    s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
    memset(s->buffer, 0, size);

    if (sock_fd < 0) {
        ret = xsk_umem__create(&s->umem, s->buffer, size,
                               &s->fq, &s->cq, &config);
    } else {
        ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
                                       &s->fq, &s->cq, &config);
    }

    if (ret) {
        qemu_vfree(s->buffer);
        error_setg_errno(errp, errno,
                         "failed to create umem for %s queue_index: %d",
                         s->ifname, s->nc.queue_index);
        return -1;
    }

    s->pool = g_new(uint64_t, n_descs);
    /*
     * Fill the pool in the opposite order, because it's a LIFO queue.
     * The pool holds n_descs entries, so the highest valid index is
     * n_descs - 1.
     */
    for (i = n_descs - 1; i >= 0; i--) {
        s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
    }
    s->n_pool = n_descs;

    af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);

    return 0;
}

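/*
 * Create the AF_XDP socket for this queue, honoring the requested
 * attach mode; with no explicit mode, try native first, then skb.
 */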
static int af_xdp_socket_create(AFXDPState *s,
                                const NetdevAFXDPOptions *opts, Error **errp)
{
    struct xsk_socket_config cfg = {
        .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .libxdp_flags = 0,
        .bind_flags = XDP_USE_NEED_WAKEUP,
        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
    };
    int queue_id, error = 0;

    s->inhibit = opts->has_inhibit && opts->inhibit;
    if (s->inhibit) {
        cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
    }

    if (opts->has_force_copy && opts->force_copy) {
        cfg.bind_flags |= XDP_COPY;
    }

    queue_id = s->nc.queue_index;
    if (opts->has_start_queue && opts->start_queue > 0) {
        queue_id += opts->start_queue;
    }

    if (opts->has_mode) {
        /* Specific mode requested. */
        cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
                         ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                               s->umem, &s->rx, &s->tx, &cfg)) {
            error = errno;
        }
    } else {
        /* No mode requested, try native first. */
        cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;

        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                               s->umem, &s->rx, &s->tx, &cfg)) {
            /* Can't use native mode, try skb. */
            cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;

            if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                                   s->umem, &s->rx, &s->tx, &cfg)) {
                error = errno;
            }
        }
    }

    if (error) {
        error_setg_errno(errp, error,
                         "failed to create AF_XDP socket for %s queue_id: %d",
                         s->ifname, queue_id);
        return -1;
    }

    s->xdp_flags = cfg.xdp_flags;

    return 0;
}

/* NetClientInfo methods. */
static NetClientInfo net_af_xdp_info = {
    .type = NET_CLIENT_DRIVER_AF_XDP,
    .size = sizeof(AFXDPState),
    .receive = af_xdp_receive,
    .poll = af_xdp_poll,
    .cleanup = af_xdp_cleanup,
};

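/*
 * Parse a colon-separated list of pre-opened socket fds (or fd names
 * registered with the monitor), one per queue.
 */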
static int *parse_socket_fds(const char *sock_fds_str,
                             int64_t n_expected, Error **errp)
{
    gchar **substrings = g_strsplit(sock_fds_str, ":", -1);
    int64_t i, n_sock_fds = g_strv_length(substrings);
    int *sock_fds = NULL;

    if (n_sock_fds != n_expected) {
        error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
                   n_expected, n_sock_fds);
        goto exit;
    }

    sock_fds = g_new(int, n_sock_fds);

    for (i = 0; i < n_sock_fds; i++) {
        sock_fds[i] = monitor_fd_param(monitor_cur(), substrings[i], errp);
        if (sock_fds[i] < 0) {
            g_free(sock_fds);
            sock_fds = NULL;
            goto exit;
        }
    }

exit:
    g_strfreev(substrings);
    return sock_fds;
}

/*
 * The exported init function.
 *
 * ... -netdev af-xdp,ifname="..."
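 *
 * A hypothetical full invocation (interface name and device model are
 * placeholders; the option names follow the QAPI schema handled below):
 *
 *   qemu-system-x86_64 ... \
 *       -netdev af-xdp,id=net0,ifname=eth0,queues=1,mode=native \
 *       -device virtio-net-pci,netdev=net0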
 */
int net_init_af_xdp(const Netdev *netdev,
                    const char *name, NetClientState *peer, Error **errp)
{
    const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
    NetClientState *nc, *nc0 = NULL;
    unsigned int ifindex;
    uint32_t prog_id = 0;
    int *sock_fds = NULL;
    int64_t i, queues;
    AFXDPState *s;

    ifindex = if_nametoindex(opts->ifname);
    if (!ifindex) {
        error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
                         opts->ifname);
        return -1;
    }

    queues = opts->has_queues ? opts->queues : 1;
    if (queues < 1) {
        error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
                   queues, opts->ifname);
        return -1;
    }

    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
        return -1;
    }

    if (opts->sock_fds) {
        sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
        if (!sock_fds) {
            return -1;
        }
    }

    for (i = 0; i < queues; i++) {
        nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
        qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
        nc->queue_index = i;

        if (!nc0) {
            nc0 = nc;
        }

        s = DO_UPCAST(AFXDPState, nc, nc);

        pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
        s->ifindex = ifindex;
        s->n_queues = queues;

        if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
            || af_xdp_socket_create(s, opts, errp)) {
            /* Make sure the XDP program will be removed. */
            s->n_queues = i;
            goto err;
        }
    }

    if (nc0) {
        s = DO_UPCAST(AFXDPState, nc, nc0);
        if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
            error_setg_errno(errp, errno,
                             "no XDP program loaded on '%s', ifindex: %d",
                             s->ifname, s->ifindex);
            goto err;
        }
    }

    af_xdp_read_poll(s, true); /* Initially only poll for reads. */

    /* The fd array is no longer needed once the sockets are created. */
    g_free(sock_fds);

    return 0;

err:
    g_free(sock_fds);
    if (nc0) {
        qemu_del_net_client(nc0);
    }

    return -1;
}