]> git.proxmox.com Git - pve-kernel.git/commitdiff
cherry-pick vhost perf regression and mem-leak fix
authorFabian Grünbichler <f.gruenbichler@proxmox.com>
Mon, 4 Dec 2017 08:08:19 +0000 (09:08 +0100)
committerFabian Grünbichler <f.gruenbichler@proxmox.com>
Mon, 4 Dec 2017 08:27:58 +0000 (09:27 +0100)
patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch [new file with mode: 0644]
patches/kernel/0009-tun-free-skb-in-early-errors.patch [new file with mode: 0644]
patches/kernel/0010-tap-free-skb-if-flags-error.patch [new file with mode: 0644]

diff --git a/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch b/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch
new file mode 100644 (file)
index 0000000..7938167
--- /dev/null
@@ -0,0 +1,72 @@
+From 15bea51dda49a0403a9a84fb26f3376636d2a3c2 Mon Sep 17 00:00:00 2001
+From: Wei Xu <wexu@redhat.com>
+Date: Fri, 1 Dec 2017 05:10:36 -0500
+Subject: [PATCH 08/13] vhost: fix skb leak in handle_rx()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Matthew found a roughly 40% tcp throughput regression with commit
+c67df11f(vhost_net: try batch dequing from skb array) as discussed
+in the following thread:
+https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html
+
+Eventually we figured out that it was a skb leak in handle_rx()
+when sending packets to the VM. This usually happens when a guest
+can not drain out vq as fast as vhost fills in, afterwards it sets
+off the traffic jam and leaks skb(s) which occurs as no headcount
+to send on the vq from vhost side.
+
+This can be avoided by making sure we have got enough headcount
+before actually consuming a skb from the batched rx array while
+transmitting, which is simply done by moving checking the zero
+headcount a bit ahead.
+
+Signed-off-by: Wei Xu <wexu@redhat.com>
+Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/vhost/net.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
+index 1c75572f5a3f..010253847022 100644
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -781,16 +781,6 @@ static void handle_rx(struct vhost_net *net)
+               /* On error, stop handling until the next kick. */
+               if (unlikely(headcount < 0))
+                       goto out;
+-              if (nvq->rx_array)
+-                      msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
+-              /* On overrun, truncate and discard */
+-              if (unlikely(headcount > UIO_MAXIOV)) {
+-                      iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
+-                      err = sock->ops->recvmsg(sock, &msg,
+-                                               1, MSG_DONTWAIT | MSG_TRUNC);
+-                      pr_debug("Discarded rx packet: len %zd\n", sock_len);
+-                      continue;
+-              }
+               /* OK, now we need to know about added descriptors. */
+               if (!headcount) {
+                       if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+@@ -803,6 +793,16 @@ static void handle_rx(struct vhost_net *net)
+                        * they refilled. */
+                       goto out;
+               }
++              if (nvq->rx_array)
++                      msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
++              /* On overrun, truncate and discard */
++              if (unlikely(headcount > UIO_MAXIOV)) {
++                      iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
++                      err = sock->ops->recvmsg(sock, &msg,
++                                               1, MSG_DONTWAIT | MSG_TRUNC);
++                      pr_debug("Discarded rx packet: len %zd\n", sock_len);
++                      continue;
++              }
+               /* We don't need to be notified again. */
+               iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len);
+               fixup = msg.msg_iter;
+-- 
+2.14.2
+
diff --git a/patches/kernel/0009-tun-free-skb-in-early-errors.patch b/patches/kernel/0009-tun-free-skb-in-early-errors.patch
new file mode 100644 (file)
index 0000000..f76eab5
--- /dev/null
@@ -0,0 +1,86 @@
+From ccc0b8662620d562798183b77bcd30125d2d14f3 Mon Sep 17 00:00:00 2001
+From: Wei Xu <wexu@redhat.com>
+Date: Fri, 1 Dec 2017 05:10:37 -0500
+Subject: [PATCH 09/13] tun: free skb in early errors
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+tun_recvmsg() supports accepting skb by msg_control after
+commit ac77cfd4258f ("tun: support receiving skb through msg_control"),
+the skb if presented should be freed no matter how far it can go
+along, otherwise it would be leaked.
+
+This patch fixes several missed cases.
+
+Signed-off-by: Wei Xu <wexu@redhat.com>
+Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/net/tun.c | 24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index cb1f7747adad..5143e948d7d1 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1519,8 +1519,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
+       tun_debug(KERN_INFO, tun, "tun_do_read\n");
+-      if (!iov_iter_count(to))
++      if (!iov_iter_count(to)) {
++              if (skb)
++                      kfree_skb(skb);
+               return 0;
++      }
+       if (!skb) {
+               /* Read frames from ring */
+@@ -1636,22 +1639,24 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
+ {
+       struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+       struct tun_struct *tun = __tun_get(tfile);
++      struct sk_buff *skb = m->msg_control;
+       int ret;
+-      if (!tun)
+-              return -EBADFD;
++      if (!tun) {
++              ret = -EBADFD;
++              goto out_free_skb;
++      }
+       if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
+               ret = -EINVAL;
+-              goto out;
++              goto out_put_tun;
+       }
+       if (flags & MSG_ERRQUEUE) {
+               ret = sock_recv_errqueue(sock->sk, m, total_len,
+                                        SOL_PACKET, TUN_TX_TIMESTAMP);
+               goto out;
+       }
+-      ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT,
+-                        m->msg_control);
++      ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb);
+       if (ret > (ssize_t)total_len) {
+               m->msg_flags |= MSG_TRUNC;
+               ret = flags & MSG_TRUNC ? ret : total_len;
+@@ -1659,6 +1664,13 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
+ out:
+       tun_put(tun);
+       return ret;
++
++out_put_tun:
++      tun_put(tun);
++out_free_skb:
++      if (skb)
++              kfree_skb(skb);
++      return ret;
+ }
+ static int tun_peek_len(struct socket *sock)
+-- 
+2.14.2
+
diff --git a/patches/kernel/0010-tap-free-skb-if-flags-error.patch b/patches/kernel/0010-tap-free-skb-if-flags-error.patch
new file mode 100644 (file)
index 0000000..c9521b3
--- /dev/null
@@ -0,0 +1,58 @@
+From 68783aa6989756cda8e9e305292afbb9f4f5677c Mon Sep 17 00:00:00 2001
+From: Wei Xu <wexu@redhat.com>
+Date: Fri, 1 Dec 2017 05:10:38 -0500
+Subject: [PATCH 10/13] tap: free skb if flags error
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+tap_recvmsg() supports accepting skb by msg_control after
+commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"),
+the skb if presented should be freed within the function, otherwise
+it would be leaked.
+
+Signed-off-by: Wei Xu <wexu@redhat.com>
+Reported-by: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+---
+ drivers/net/tap.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/net/tap.c b/drivers/net/tap.c
+index 3570c7576993..4e04b6094f3c 100644
+--- a/drivers/net/tap.c
++++ b/drivers/net/tap.c
+@@ -829,8 +829,11 @@ static ssize_t tap_do_read(struct tap_queue *q,
+       DEFINE_WAIT(wait);
+       ssize_t ret = 0;
+-      if (!iov_iter_count(to))
++      if (!iov_iter_count(to)) {
++              if (skb)
++                      kfree_skb(skb);
+               return 0;
++      }
+       if (skb)
+               goto put;
+@@ -1155,11 +1158,14 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m,
+                      size_t total_len, int flags)
+ {
+       struct tap_queue *q = container_of(sock, struct tap_queue, sock);
++      struct sk_buff *skb = m->msg_control;
+       int ret;
+-      if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
++      if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) {
++              if (skb)
++                      kfree_skb(skb);
+               return -EINVAL;
+-      ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT,
+-                        m->msg_control);
++      }
++      ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb);
+       if (ret > total_len) {
+               m->msg_flags |= MSG_TRUNC;
+               ret = flags & MSG_TRUNC ? ret : total_len;
+-- 
+2.14.2
+