]>
Commit | Line | Data |
---|---|---|
0e88ec19 FE |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Ding Hui <dinghui@sangfor.com.cn> | |
3 | Date: Wed, 29 Jun 2022 17:40:26 +0800 | |
4 | Subject: [PATCH] e1000: set RX descriptor status in a separate operation | |
5 | ||
6 | The code that sets the RX descriptor status field may have worked fine | |
7 | previously; however, with the update of the glibc version, it shows two | |
8 | issues when the guest uses dpdk to receive packets: | |
9 | ||
10 | 1. The dpdk has a certain probability of getting a wrong buffer_addr | |
11 | ||
12 | this impact may not be obvious, such as losing a packet once in | |
13 | a while | |
14 | ||
15 | 2. The dpdk may consume a packet twice when it scans the RX desc queue | |
16 | over again | |
17 | ||
18 | this impact will lead to an infinite wait in Qemu, since the RDT | |
19 | (tail pointer) is unexpectedly increased to equal RDH, | |
20 | which is regarded as the RX desc queue being full | |
21 | ||
22 | Writing the whole RX desc with the DD flag on is not quite correct, because | |
23 | when the underlying implementation of memcpy uses XMM registers to | |
24 | copy e1000_rx_desc (when AVX or some other CPU feature is usable), | |
25 | the order in which the bytes of the desc are written to memory is indeterminate | |
26 | ||
27 | We can use the full-scale test case to reproduce issue 2, available at | |
28 | https://github.com/BASM/qemu_dpdk_e1000_test (thanks to Leonid Myravjev) | |
29 | ||
30 | I also wrote a POC test case at https://github.com/cdkey/e1000_poc | |
31 | which can reproduce both of them, and makes it easy to verify the patch effect. | |
32 | ||
33 | The hw watchpoint also shows that, when Qemu uses XMM related instructions | |
34 | to write the 16-byte e1000_rx_desc, concurrently with DPDK using movb | |
35 | to write the 1-byte status, the final result written to memory will be one | |
36 | of the two; if it is the one made by Qemu, in which the DD flag is on, DPDK | |
37 | will consume it again. | |
38 | ||
39 | Setting the DD status in a separate operation can prevent the impact of | |
40 | out-of-order memory writes by memcpy, and also avoids unexpected data when | |
41 | qemu and the guest dpdk write the status concurrently. | |
42 | ||
43 | Links: https://lore.kernel.org/qemu-devel/20200102110504.GG121208@stefanha-x1.localdomain/T/ | |
44 | ||
45 | Reported-by: Leonid Myravjev <asm@asm.pp.ru> | |
46 | Cc: Stefan Hajnoczi <stefanha@gmail.com> | |
47 | Cc: Paolo Bonzini <pbonzini@redhat.com> | |
48 | Cc: Michael S. Tsirkin <mst@redhat.com> | |
49 | Cc: qemu-stable@nongnu.org | |
50 | Tested-by: Jing Zhang <zhangjing@sangfor.com.cn> | |
51 | Reviewed-by: Frank Lee <lifan38153@sangfor.com.cn> | |
52 | Signed-off-by: Ding Hui <dinghui@sangfor.com.cn> | |
53 | Signed-off-by: Jason Wang <jasowang@redhat.com> | |
54 | (cherry-picked from commit 034d00d4858161e1d4cff82d8d230bce874a04d3) | |
55 | Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | |
56 | --- | |
57 | hw/net/e1000.c | 5 ++++- | |
58 | 1 file changed, 4 insertions(+), 1 deletion(-) | |
59 | ||
60 | diff --git a/hw/net/e1000.c b/hw/net/e1000.c | |
61 | index f5bc81296d..e26e0a64c1 100644 | |
62 | --- a/hw/net/e1000.c | |
63 | +++ b/hw/net/e1000.c | |
64 | @@ -979,7 +979,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) | |
65 | base = rx_desc_base(s) + sizeof(desc) * s->mac_reg[RDH]; | |
66 | pci_dma_read(d, base, &desc, sizeof(desc)); | |
67 | desc.special = vlan_special; | |
68 | - desc.status |= (vlan_status | E1000_RXD_STAT_DD); | |
69 | + desc.status &= ~E1000_RXD_STAT_DD; | |
70 | if (desc.buffer_addr) { | |
71 | if (desc_offset < size) { | |
72 | size_t iov_copy; | |
73 | @@ -1013,6 +1013,9 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) | |
74 | DBGOUT(RX, "Null RX descriptor!!\n"); | |
75 | } | |
76 | pci_dma_write(d, base, &desc, sizeof(desc)); | |
77 | + desc.status |= (vlan_status | E1000_RXD_STAT_DD); | |
78 | + pci_dma_write(d, base + offsetof(struct e1000_rx_desc, status), | |
79 | + &desc.status, sizeof(desc.status)); | |
80 | ||
81 | if (++s->mac_reg[RDH] * sizeof(desc) >= s->mac_reg[RDLEN]) | |
82 | s->mac_reg[RDH] = 0; |