]>
Commit | Line | Data |
---|---|---|
38c79e81 FG |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Dan Streetman <ddstreet@ieee.org> | |
3 | Date: Thu, 18 Jan 2018 16:14:26 -0500 | |
4 | Subject: [PATCH] net: tcp: close sock if net namespace is exiting | |
5 | MIME-Version: 1.0 | |
6 | Content-Type: text/plain; charset=UTF-8 | |
7 | Content-Transfer-Encoding: 8bit | |
8 | ||
9 | When a tcp socket is closed, if it detects that its net namespace is | |
10 | exiting, close immediately and do not wait for FIN sequence. | |
11 | ||
12 | For normal sockets, a reference is taken to their net namespace, so it will | |
13 | never exit while the socket is open. However, kernel sockets do not take a | |
14 | reference to their net namespace, so it may begin exiting while the kernel | |
15 | socket is still open. In this case if the kernel socket is a tcp socket, | |
16 | it will stay open trying to complete its close sequence. The sock's dst(s) | |
17 | hold references to their interfaces, which are all transferred to the | |
18 | namespace's loopback interface when the real interfaces are taken down. | |
19 | When the namespace tries to take down its loopback interface, it hangs | |
20 | waiting for all references to the loopback interface to release, which | |
21 | results in messages like: | |
22 | ||
23 | unregister_netdevice: waiting for lo to become free. Usage count = 1 | |
24 | ||
25 | These messages continue until the socket finally times out and closes. | |
26 | Since the net namespace cleanup holds the net_mutex while calling its | |
27 | registered pernet callbacks, any new net namespace initialization is | |
28 | blocked until the current net namespace finishes exiting. | |
29 | ||
30 | After this change, the tcp socket notices the exiting net namespace, and | |
31 | closes immediately, releasing its dst(s) and their reference to the | |
32 | loopback interface, which lets the net namespace continue exiting. | |
33 | ||
34 | Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 | |
35 | Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 | |
36 | Signed-off-by: Dan Streetman <ddstreet@canonical.com> | |
37 | Signed-off-by: David S. Miller <davem@davemloft.net> | |
38 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
39 | --- | |
40 | include/net/net_namespace.h | 10 ++++++++++ | |
41 | net/ipv4/tcp.c | 3 +++ | |
42 | net/ipv4/tcp_timer.c | 15 +++++++++++++++ | |
43 | 3 files changed, 28 insertions(+) | |
44 | ||
45 | diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h | |
46 | index 1c401bd4c2e0..a5d023fa78db 100644 | |
47 | --- a/include/net/net_namespace.h | |
48 | +++ b/include/net/net_namespace.h | |
49 | @@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2) | |
50 | return net1 == net2; | |
51 | } | |
52 | ||
53 | +static inline int check_net(const struct net *net) | |
54 | +{ | |
55 | + return atomic_read(&net->count) != 0; | |
56 | +} | |
57 | + | |
58 | void net_drop_ns(void *); | |
59 | ||
60 | #else | |
61 | @@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2) | |
62 | return 1; | |
63 | } | |
64 | ||
65 | +static inline int check_net(const struct net *net) | |
66 | +{ | |
67 | + return 1; | |
68 | +} | |
69 | + | |
70 | #define net_drop_ns NULL | |
71 | #endif | |
72 | ||
73 | diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c | |
74 | index a3e91b552edc..fd2a086da910 100644 | |
75 | --- a/net/ipv4/tcp.c | |
76 | +++ b/net/ipv4/tcp.c | |
77 | @@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout) | |
78 | tcp_send_active_reset(sk, GFP_ATOMIC); | |
79 | __NET_INC_STATS(sock_net(sk), | |
80 | LINUX_MIB_TCPABORTONMEMORY); | |
81 | + } else if (!check_net(sock_net(sk))) { | |
82 | + /* Not possible to send reset; just close */ | |
83 | + tcp_set_state(sk, TCP_CLOSE); | |
84 | } | |
85 | } | |
86 | ||
87 | diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c | |
88 | index e906014890b6..ec1e5de41653 100644 | |
89 | --- a/net/ipv4/tcp_timer.c | |
90 | +++ b/net/ipv4/tcp_timer.c | |
91 | @@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk) | |
92 | * to prevent DoS attacks. It is called when a retransmission timeout | |
93 | * or zero probe timeout occurs on orphaned socket. | |
94 | * | |
95 | + * Also close if our net namespace is exiting; in that case there is no | |
96 | + * hope of ever communicating again since all netns interfaces are already | |
97 | + * down (or about to be down), and we need to release our dst references, | |
98 | + * which have been moved to the netns loopback interface, so the namespace | |
99 | + * can finish exiting. This condition is only possible if we are a kernel | |
100 | + * socket, as those do not hold references to the namespace. | |
101 | + * | |
102 | * Criteria is still not confirmed experimentally and may change. | |
103 | * We kill the socket, if: | |
104 | * 1. If number of orphaned sockets exceeds an administratively configured | |
105 | * limit. | |
106 | * 2. If we have strong memory pressure. | |
107 | + * 3. If our net namespace is exiting. | |
108 | */ | |
109 | static int tcp_out_of_resources(struct sock *sk, bool do_reset) | |
110 | { | |
111 | @@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) | |
112 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); | |
113 | return 1; | |
114 | } | |
115 | + | |
116 | + if (!check_net(sock_net(sk))) { | |
117 | + /* Not possible to send reset; just close */ | |
118 | + tcp_done(sk); | |
119 | + return 1; | |
120 | + } | |
121 | + | |
122 | return 0; | |
123 | } | |
124 | ||
125 | -- | |
126 | 2.14.2 | |
127 |