git.proxmox.com Git - pve-kernel.git/blame - patches/kernel/0016-net-tcp-close-sock-if-net-namespace-is-exiting.patch
update sources to Ubuntu-4.13.0-45.50
[pve-kernel.git] / patches / kernel / 0016-net-tcp-close-sock-if-net-namespace-is-exiting.patch
CommitLineData
38c79e81
FG
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Dan Streetman <ddstreet@ieee.org>
3Date: Thu, 18 Jan 2018 16:14:26 -0500
4Subject: [PATCH] net: tcp: close sock if net namespace is exiting
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9When a tcp socket is closed, if it detects that its net namespace is
10exiting, close immediately and do not wait for FIN sequence.
11
12For normal sockets, a reference is taken to their net namespace, so it will
13never exit while the socket is open. However, kernel sockets do not take a
14reference to their net namespace, so it may begin exiting while the kernel
15socket is still open. In this case if the kernel socket is a tcp socket,
16it will stay open trying to complete its close sequence. The sock's dst(s)
17hold references to their interfaces, which are all transferred to the
18namespace's loopback interface when the real interfaces are taken down.
19When the namespace tries to take down its loopback interface, it hangs
20waiting for all references to the loopback interface to release, which
21results in messages like:
22
23unregister_netdevice: waiting for lo to become free. Usage count = 1
24
25These messages continue until the socket finally times out and closes.
26Since the net namespace cleanup holds the net_mutex while calling its
27registered pernet callbacks, any new net namespace initialization is
28blocked until the current net namespace finishes exiting.
29
30After this change, the tcp socket notices the exiting net namespace, and
31closes immediately, releasing its dst(s) and their reference to the
32loopback interface, which lets the net namespace continue exiting.
33
34Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
35Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
36Signed-off-by: Dan Streetman <ddstreet@canonical.com>
37Signed-off-by: David S. Miller <davem@davemloft.net>
38Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
39---
40 include/net/net_namespace.h | 10 ++++++++++
41 net/ipv4/tcp.c | 3 +++
42 net/ipv4/tcp_timer.c | 15 +++++++++++++++
43 3 files changed, 28 insertions(+)
44
45diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
46index 1c401bd4c2e0..a5d023fa78db 100644
47--- a/include/net/net_namespace.h
48+++ b/include/net/net_namespace.h
49@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
50 return net1 == net2;
51 }
52
53+static inline int check_net(const struct net *net)
54+{
55+ return atomic_read(&net->count) != 0;
56+}
57+
58 void net_drop_ns(void *);
59
60 #else
61@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
62 return 1;
63 }
64
65+static inline int check_net(const struct net *net)
66+{
67+ return 1;
68+}
69+
70 #define net_drop_ns NULL
71 #endif
72
73diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
74index a3e91b552edc..fd2a086da910 100644
75--- a/net/ipv4/tcp.c
76+++ b/net/ipv4/tcp.c
77@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
78 tcp_send_active_reset(sk, GFP_ATOMIC);
79 __NET_INC_STATS(sock_net(sk),
80 LINUX_MIB_TCPABORTONMEMORY);
81+ } else if (!check_net(sock_net(sk))) {
82+ /* Not possible to send reset; just close */
83+ tcp_set_state(sk, TCP_CLOSE);
84 }
85 }
86
87diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
88index e906014890b6..ec1e5de41653 100644
89--- a/net/ipv4/tcp_timer.c
90+++ b/net/ipv4/tcp_timer.c
91@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
92 * to prevent DoS attacks. It is called when a retransmission timeout
93 * or zero probe timeout occurs on orphaned socket.
94 *
95+ * Also close if our net namespace is exiting; in that case there is no
96+ * hope of ever communicating again since all netns interfaces are already
97+ * down (or about to be down), and we need to release our dst references,
98+ * which have been moved to the netns loopback interface, so the namespace
99+ * can finish exiting. This condition is only possible if we are a kernel
100+ * socket, as those do not hold references to the namespace.
101+ *
102 * Criteria is still not confirmed experimentally and may change.
103 * We kill the socket, if:
104 * 1. If number of orphaned sockets exceeds an administratively configured
105 * limit.
106 * 2. If we have strong memory pressure.
107+ * 3. If our net namespace is exiting.
108 */
109 static int tcp_out_of_resources(struct sock *sk, bool do_reset)
110 {
111@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
112 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
113 return 1;
114 }
115+
116+ if (!check_net(sock_net(sk))) {
117+ /* Not possible to send reset; just close */
118+ tcp_done(sk);
119+ return 1;
120+ }
121+
122 return 0;
123 }
124
125--
1262.14.2
127