/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Support for INET connection oriented protocols.
 *
 * Authors:	See the TCP sources
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or(at your option) any later version.
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
#include <net/sock_reuseport.h>

#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
 *                          only, and any IPv4 addresses if not IPv6 only
 * match_wildcard == false: addresses must be exactly the same, i.e.
 *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
 *                          and 0.0.0.0 equals to 0.0.0.0 only
 */
static int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
				bool match_wildcard)
{
	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
	int sk2_ipv6only = inet_v6_ipv6only(sk2);
	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;

	/* if both are mapped, treat as IPv4 */
	if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
		if (!sk2_ipv6only) {
			if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
				return 1;
			if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
				return match_wildcard;
		}
		return 0;
	}

	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
		return 1;

	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
		return 1;

	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
		return 1;

	if (sk2_rcv_saddr6 &&
	    ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
		return 1;

	return 0;
}
#endif

/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
 * match_wildcard == false: addresses must be exactly the same, i.e.
 *                          0.0.0.0 only equals to 0.0.0.0
 */
static int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
				bool match_wildcard)
{
	if (!ipv6_only_sock(sk2)) {
		if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
			return 1;
		if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
			return match_wildcard;
	}
	return 0;
}

int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
			 bool match_wildcard)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (sk->sk_family == AF_INET6)
		return ipv6_rcv_saddr_equal(sk, sk2, match_wildcard);
#endif
	return ipv4_rcv_saddr_equal(sk, sk2, match_wildcard);
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);
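
/* Illustrative example (comment added for clarity, not in the original
 * source): with match_wildcard == true, an IPv4 socket bound to 0.0.0.0
 * "equals" one bound to 192.0.2.1 on the same port, so the bind conflict
 * walk below treats the pair as overlapping; with match_wildcard == false
 * the wildcard address only equals another wildcard.
 */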

void inet_get_local_port_range(struct net *net, int *low, int *high)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);

		*low = net->ipv4.ip_local_ports.range[0];
		*high = net->ipv4.ip_local_ports.range[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
EXPORT_SYMBOL(inet_get_local_port_range);
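
/* Usage sketch (added comment): callers such as inet_csk_get_port() below
 * snapshot the range once,
 *
 *	int low, high;
 *
 *	inet_get_local_port_range(net, &low, &high);
 *
 * and the seqlock loop above guarantees that low/high come from the same
 * write of the ip_local_port_range sysctl even if it changes concurrently.
 */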

static int inet_csk_bind_conflict(const struct sock *sk,
				  const struct inet_bind_bucket *tb,
				  bool relax, bool reuseport_ok)
{
	struct sock *sk2;
	bool reuse = sk->sk_reuse;
	bool reuseport = !!sk->sk_reuseport && reuseport_ok;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		if (sk != sk2 &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if ((!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) &&
			    (!reuseport || !sk2->sk_reuseport ||
			    rcu_access_pointer(sk->sk_reuseport_cb) ||
			    (sk2->sk_state != TCP_TIME_WAIT &&
			     !uid_eq(uid, sock_i_uid(sk2))))) {
				if (inet_rcv_saddr_equal(sk, sk2, true))
					break;
			}
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {
				if (inet_rcv_saddr_equal(sk, sk2, true))
					break;
			}
		}
	}
	return sk2 != NULL;
}
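
/* Worked example (added comment): with relax == true, two sockets that both
 * set SO_REUSEADDR, where the existing owner is not in TCP_LISTEN state, do
 * not conflict - the first condition block is skipped and the second one
 * requires !relax.  Two SO_REUSEPORT sockets of the same user do not
 * conflict either, as long as the new socket is not already attached to a
 * reuseport group (sk->sk_reuseport_cb).
 */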

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 * We try to allocate an odd port (and leave even ports for connect())
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, attempts = 5, port = snum;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	kuid_t uid = sock_i_uid(sk);
	u32 remaining, offset;
	bool reuseport_ok = !!snum;

	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port)
				goto tb_found;

		goto tb_not_found;
	}
again:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
	if (attempt_half) {
		int half = low + (((high - low) >> 2) << 1);

		if (attempt_half == 1)
			high = half;
		else
			low = half;
	}
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = prandom_u32() % remaining;
	/* __inet_hash_connect() favors ports having @low parity
	 * We do the opposite to not pollute connect() users.
	 */
	offset |= 1U;
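	/* Worked example (added comment): with the default ip_local_port_range
	 * of [32768, 60999], high becomes 61000 and, for a SK_CAN_REUSE socket,
	 * half = 32768 + (((61000 - 32768) >> 2) << 1) = 46884, so the first
	 * pass scans the lower half only.  remaining is forced even and offset
	 * forced odd, so "port = low + offset; port += 2" visits only odd
	 * ports; the later "offset--" switches the scan to even ports.
	 */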

other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (!inet_csk_bind_conflict(sk, tb, false, reuseport_ok))
					goto tb_found;
				goto next_port;
			}
		goto tb_not_found;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	offset--;
	if (!(offset & 1))
		goto other_parity_scan;

	if (attempt_half == 1) {
		/* OK we now try the upper half of the range */
		attempt_half = 2;
		goto other_half_scan;
	}
	return ret;

tb_not_found:
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port);
	if (!tb)
		goto fail_unlock;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if ((tb->fastreuse > 0 && reuse) ||
		    (tb->fastreuseport > 0 &&
		     !rcu_access_pointer(sk->sk_reuseport_cb) &&
		     sk->sk_reuseport && uid_eq(tb->fastuid, uid)))
			goto success;
		if (inet_csk_bind_conflict(sk, tb, true, reuseport_ok)) {
			if ((reuse ||
			     (tb->fastreuseport > 0 &&
			      sk->sk_reuseport &&
			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
			      uid_eq(tb->fastuid, uid))) && !snum &&
			    --attempts >= 0) {
				spin_unlock_bh(&head->lock);
				goto again;
			}
			goto fail_unlock;
		}
		if (!reuse)
			tb->fastreuse = 0;
		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
			tb->fastreuseport = 0;
	} else {
		tb->fastreuse = reuse;
		if (sk->sk_reuseport) {
			tb->fastreuseport = 1;
			tb->fastuid = uid;
		} else {
			tb->fastreuseport = 0;
		}
	}
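	/* Example of the fast path above (added comment): tb->fastreuse and
	 * tb->fastreuseport cache the fact that every current owner of this
	 * port was bound with SO_REUSEADDR (and is not listening) or with
	 * SO_REUSEPORT by the same uid, so a compatible later bind() can jump
	 * straight to "success" without walking tb->owners again.
	 */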
success:
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock_bh(&head->lock);
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

/*
 * Wait for an incoming connection, avoid race conditions. This must be called
 * with the socket locked.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	/*
	 * True wake-one mechanism for incoming connections: only
	 * one process gets woken up, not the 'whole herd'.
	 * Since we do not 'race & poll' for established sockets
	 * anymore, the common case will execute the loop only once.
	 *
	 * Subtle issue: "add_wait_queue_exclusive()" will be added
	 * after any current non-exclusive waiters, and we know that
	 * it will always _stay_ after any new non-exclusive waiters
	 * because all non-exclusive waiters are added at the
	 * beginning of the wait-queue. As such, it's ok to "drop"
	 * our exclusiveness temporarily when we get woken up without
	 * having to remove and re-insert us on the wait queue.
	 */
	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		sched_annotate_sleep();
		lock_sock(sk);
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}

/*
 * This will accept the next outstanding connection.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *req;
	struct sock *newsk;
	int error;

	lock_sock(sk);

	/* We need to make sure that this socket is listening,
	 * and that it has something pending.
	 */
	error = -EINVAL;
	if (sk->sk_state != TCP_LISTEN)
		goto out_err;

	/* Find already established connection */
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);

		/* If this is a non blocking socket don't sleep */
		error = -EAGAIN;
		if (!timeo)
			goto out_err;

		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue, sk);
	newsk = req->sk;

	if (sk->sk_protocol == IPPROTO_TCP &&
	    tcp_rsk(req)->tfo_listener) {
		spin_lock_bh(&queue->fastopenq.lock);
		if (tcp_rsk(req)->tfo_listener) {
			/* We are still waiting for the final ACK from 3WHS
			 * so can't free req now. Instead, we set req->sk to
			 * NULL to signify that the child socket is taken
			 * so reqsk_fastopen_remove() will free the req
			 * when 3WHS finishes (or is aborted).
			 */
			req->sk = NULL;
			req = NULL;
		}
		spin_unlock_bh(&queue->fastopenq.lock);
	}
out:
	release_sock(sk);
	if (req)
		reqsk_put(req);
	return newsk;
out_err:
	newsk = NULL;
	req = NULL;
	*err = error;
	goto out;
}
EXPORT_SYMBOL(inet_csk_accept);

/*
 * Using different timers for retransmit, delayed acks and probes
 * We may wish use just one timer maintaining a list of expire jiffies
 * to optimize.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
		    (unsigned long)sk);
	setup_timer(&icsk->icsk_delack_timer, delack_handler,
		    (unsigned long)sk);
	setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
EXPORT_SYMBOL(inet_csk_init_xmit_timers);

void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);

void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);

void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);

struct dst_entry *inet_csk_route_req(const struct sock *sk,
				     struct flowi4 *fl4,
				     const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct ip_options_rcu *opt = ireq->opt;
	struct rtable *rt;

	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num), sk->sk_uid);
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);

struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
					    struct sock *newsk,
					    const struct request_sock *req)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = read_pnet(&ireq->ireq_net);
	struct inet_sock *newinet = inet_sk(newsk);
	struct ip_options_rcu *opt;
	struct flowi4 *fl4;
	struct rtable *rt;

	fl4 = &newinet->cork.fl.u.ip4;

	rcu_read_lock();
	opt = rcu_dereference(newinet->inet_opt);
	flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, ireq->ir_rmt_port,
			   htons(ireq->ir_num), sk->sk_uid);
	security_req_classify_flow(req, flowi4_to_flowi(fl4));
	rt = ip_route_output_flow(net, fl4, sk);
	if (IS_ERR(rt))
		goto no_route;
	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto route_err;
	rcu_read_unlock();
	return &rt->dst;

route_err:
	ip_rt_put(rt);
no_route:
	rcu_read_unlock();
	__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);

#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif

/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
				  const int max_retries,
				  const u8 rskq_defer_accept,
				  int *expire, int *resend)
{
	if (!rskq_defer_accept) {
		*expire = req->num_timeout >= thresh;
		*resend = 1;
		return;
	}
	*expire = req->num_timeout >= thresh &&
		  (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
	/*
	 * Do not resend while waiting for data after ACK,
	 * start to resend on end of deferring period to give
	 * last chance for data or ACK to create established socket.
	 */
	*resend = !inet_rsk(req)->acked ||
		  req->num_timeout >= rskq_defer_accept - 1;
}
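
/* Worked example (added comment): with TCP_DEFER_ACCEPT giving
 * rskq_defer_accept == 3, a request whose ACK has already arrived
 * (inet_rsk(req)->acked) only expires once num_timeout has reached both
 * thresh and max_retries, and its SYN-ACK is not retransmitted until
 * num_timeout >= rskq_defer_accept - 1 == 2, leaving the peer a last
 * chance to send the data that completes the connection.
 */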

int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
{
	int err = req->rsk_ops->rtx_syn_ack(parent, req);

	if (!err)
		req->num_retrans++;
	return err;
}
EXPORT_SYMBOL(inet_rtx_syn_ack);

/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
			       struct request_sock *req)
{
	struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
	bool found = false;

	if (sk_hashed(req_to_sk(req))) {
		spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);

		spin_lock(lock);
		found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
		spin_unlock(lock);
	}
	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
		reqsk_put(req);
	return found;
}

void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
{
	if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		reqsk_put(req);
	}
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);

void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
	inet_csk_reqsk_queue_drop(sk, req);
	reqsk_put(req);
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);

static void reqsk_timer_handler(unsigned long data)
{
	struct request_sock *req = (struct request_sock *)data;
	struct sock *sk_listener = req->rsk_listener;
	struct net *net = sock_net(sk_listener);
	struct inet_connection_sock *icsk = inet_csk(sk_listener);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	int qlen, expire = 0, resend = 0;
	int max_retries, thresh;
	u8 defer_accept;

	if (sk_state_load(sk_listener) != TCP_LISTEN)
		goto drop;

	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
	thresh = max_retries;
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established socket) for first timeout.
	 * If synack was not acknowledged for 1 second, it means
	 * one of the following things: synack was lost, ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When server is a bit loaded, queue is populated with old
	 * open requests, reducing effective size of queue.
	 * When server is well loaded, queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries overriding normal timeout, when
	 * situation becomes dangerous.
	 *
	 * Essentially, we reserve half of room for young
	 * embrions; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	qlen = reqsk_queue_len(queue);
	if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
		int young = reqsk_queue_len_young(queue) << 1;

		while (thresh > 2) {
			if (qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
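	/* Worked arithmetic (added comment): suppose sk_max_ack_backlog is 128
	 * and qlen is 80, so (80 << 1) == 160 exceeds 128 and pruning kicks in.
	 * If 25 requests are still young, young starts at 50: the first pass
	 * lowers thresh from the default tcp_synack_retries of 5 to 4 and
	 * doubles young to 100, and since qlen (80) is now below young the
	 * loop stops.  Old requests therefore expire after 4 timeouts instead
	 * of the usual 5.
	 */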
	defer_accept = READ_ONCE(queue->rskq_defer_accept);
	if (defer_accept)
		max_retries = defer_accept;
	syn_ack_recalc(req, thresh, max_retries, defer_accept,
		       &expire, &resend);
	req->rsk_ops->syn_ack_timeout(req);
	if (!expire &&
	    (!resend ||
	     !inet_rtx_syn_ack(sk_listener, req) ||
	     inet_rsk(req)->acked)) {
		unsigned long timeo;

		if (req->num_timeout++ == 0)
			atomic_dec(&queue->young);
		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
		mod_timer(&req->rsk_timer, jiffies + timeo);
		return;
	}
drop:
	inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}

static void reqsk_queue_hash_req(struct request_sock *req,
				 unsigned long timeout)
{
	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
			   (unsigned long)req);
	mod_timer(&req->rsk_timer, jiffies + timeout);

	inet_ehash_insert(req_to_sk(req), NULL);
	/* before letting lookups find us, make sure all req fields
	 * are committed to memory and refcnt initialized.
	 */
	smp_wmb();
	atomic_set(&req->rsk_refcnt, 2 + 1);
}

void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	reqsk_queue_hash_req(req, timeout);
	inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);

/**
 * inet_csk_clone_lock - clone an inet socket, and lock its clone
 * @sk: the socket to clone
 * @req: request_sock
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *inet_csk_clone_lock(const struct sock *sk,
				 const struct request_sock *req,
				 const gfp_t priority)
{
	struct sock *newsk = sk_clone_lock(sk, priority);

	if (newsk) {
		struct inet_connection_sock *newicsk = inet_csk(newsk);

		newsk->sk_state = TCP_SYN_RECV;
		newicsk->icsk_bind_hash = NULL;

		inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
		inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
		newsk->sk_write_space = sk_stream_write_space;

		/* listeners have SOCK_RCU_FREE, not the children */
		sock_reset_flag(newsk, SOCK_RCU_FREE);

		newsk->sk_mark = inet_rsk(req)->ir_mark;
		atomic64_set(&newsk->sk_cookie,
			     atomic64_read(&inet_rsk(req)->ir_cookie));

		newicsk->icsk_retransmits = 0;
		newicsk->icsk_backoff	  = 0;
		newicsk->icsk_probes_out  = 0;

		/* Deinitialize accept_queue to trap illegal accesses. */
		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

		security_inet_csk_clone(newsk, req);
	}
	return newsk;
}
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);

/*
 * At this point, there should be no process reference to this
 * socket, and thus no user references at all. Therefore we
 * can assume the socket waitqueue is inactive and nobody will
 * try to jump onto it.
 */
void inet_csk_destroy_sock(struct sock *sk)
{
	WARN_ON(sk->sk_state != TCP_CLOSE);
	WARN_ON(!sock_flag(sk, SOCK_DEAD));

	/* It cannot be in hash table! */
	WARN_ON(!sk_unhashed(sk));

	/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
	WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);

	sk->sk_prot->destroy(sk);

	sk_stream_kill_queues(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	local_bh_disable();
	percpu_counter_dec(sk->sk_prot->orphan_count);
	local_bh_enable();
	sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);

/* This function allows to force a closure of a socket after the call to
 * tcp/dccp_create_openreq_child().
 */
void inet_csk_prepare_forced_close(struct sock *sk)
	__releases(&sk->sk_lock.slock)
{
	/* sk_clone_lock locked the socket and set refcnt to 2 */
	bh_unlock_sock(sk);
	sock_put(sk);

	/* The below has to be done to allow calling inet_csk_destroy_sock */
	sock_set_flag(sk, SOCK_DEAD);
	percpu_counter_inc(sk->sk_prot->orphan_count);
	inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);

int inet_csk_listen_start(struct sock *sk, int backlog)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int err = -EADDRINUSE;

	reqsk_queue_alloc(&icsk->icsk_accept_queue);

	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/* There is race window here: we announce ourselves listening,
	 * but this transition is still not validated by get_port().
	 * It is OK, because this socket enters to hash table only
	 * after validation is complete.
	 */
	sk_state_store(sk, TCP_LISTEN);
	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
		inet->inet_sport = htons(inet->inet_num);

		sk_dst_reset(sk);
		err = sk->sk_prot->hash(sk);

		if (likely(!err))
			return 0;
	}

	sk->sk_state = TCP_CLOSE;
	return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
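
/* Sequencing note (added comment): for a TCP socket, listen() ends up here
 * with sk_prot->get_port == inet_csk_get_port() and sk_prot->hash ==
 * inet_hash(), so the order is: move to TCP_LISTEN, re-validate the local
 * port, then publish the socket in the listening hash; any failure falls
 * back to TCP_CLOSE with -EADDRINUSE (or the hash error).
 */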

static void inet_child_forget(struct sock *sk, struct request_sock *req,
			      struct sock *child)
{
	sk->sk_prot->disconnect(child, O_NONBLOCK);

	sock_orphan(child);

	percpu_counter_inc(sk->sk_prot->orphan_count);

	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
		BUG_ON(tcp_sk(child)->fastopen_rsk != req);
		BUG_ON(sk != req->rsk_listener);

		/* Paranoid, to prevent race condition if
		 * an inbound pkt destined for child is
		 * blocked by sock lock in tcp_v4_rcv().
		 * Also to satisfy an assertion in
		 * tcp_v4_destroy_sock().
		 */
		tcp_sk(child)->fastopen_rsk = NULL;
	}
	inet_csk_destroy_sock(child);
	reqsk_put(req);
}

struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
				      struct request_sock *req,
				      struct sock *child)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;

	spin_lock(&queue->rskq_lock);
	if (unlikely(sk->sk_state != TCP_LISTEN)) {
		inet_child_forget(sk, req, child);
		child = NULL;
	} else {
		req->sk = child;
		req->dl_next = NULL;
		if (queue->rskq_accept_head == NULL)
			queue->rskq_accept_head = req;
		else
			queue->rskq_accept_tail->dl_next = req;
		queue->rskq_accept_tail = req;
		sk_acceptq_added(sk);
	}
	spin_unlock(&queue->rskq_lock);
	return child;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_add);

struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
					 struct request_sock *req, bool own_req)
{
	if (own_req) {
		inet_csk_reqsk_queue_drop(sk, req);
		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
		if (inet_csk_reqsk_queue_add(sk, req, child))
			return child;
	}
	/* Too bad, another child took ownership of the request, undo. */
	bh_unlock_sock(child);
	sock_put(child);
	return NULL;
}
EXPORT_SYMBOL(inet_csk_complete_hashdance);

/*
 * This routine closes sockets which have been at least partially
 * opened, but not yet accepted.
 */
void inet_csk_listen_stop(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *next, *req;

	/* Following specs, it would be better either to send FIN
	 * (and enter FIN-WAIT-1, it is normal close)
	 * or to send active reset (abort).
	 * Certainly, it is pretty dangerous while synflood, but it is
	 * bad justification for our negligence 8)
	 * To be honest, we are not able to make either
	 * of the variants now.			--ANK
	 */
	while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
		struct sock *child = req->sk;

		local_bh_disable();
		bh_lock_sock(child);
		WARN_ON(sock_owned_by_user(child));
		sock_hold(child);

		inet_child_forget(sk, req, child);
		bh_unlock_sock(child);
		local_bh_enable();
		sock_put(child);

		cond_resched();
	}
	if (queue->fastopenq.rskq_rst_head) {
		/* Free all the reqs queued in rskq_rst_head. */
		spin_lock_bh(&queue->fastopenq.lock);
		req = queue->fastopenq.rskq_rst_head;
		queue->fastopenq.rskq_rst_head = NULL;
		spin_unlock_bh(&queue->fastopenq.lock);
		while (req != NULL) {
			next = req->dl_next;
			reqsk_put(req);
			req = next;
		}
	}
	WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);

void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
	const struct inet_sock *inet = inet_sk(sk);

	sin->sin_family		= AF_INET;
	sin->sin_addr.s_addr	= inet->inet_daddr;
	sin->sin_port		= inet->inet_dport;
}
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);

#ifdef CONFIG_COMPAT
int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, int __user *optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_getsockopt)
		return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);

int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_setsockopt)
		return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif

static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;
	struct flowi4 *fl4;
	struct rtable *rt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	fl4 = &fl->u.ip4;
	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
				   inet->inet_saddr, inet->inet_dport,
				   inet->inet_sport, sk->sk_protocol,
				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
	if (IS_ERR(rt))
		rt = NULL;
	if (rt)
		sk_setup_caps(sk, &rt->dst);
	rcu_read_unlock();

	return &rt->dst;
}

struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);
	struct inet_sock *inet = inet_sk(sk);

	if (!dst) {
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
		if (!dst)
			goto out;
	}
	dst->ops->update_pmtu(dst, sk, NULL, mtu);

	dst = __sk_dst_check(sk, 0);
	if (!dst)
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
out:
	return dst;
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
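
/* Usage note (added comment; describes callers outside this file): protocol
 * error handlers such as TCP's tcp_v4_mtu_reduced() call
 * inet_csk_update_pmtu() when an ICMP "fragmentation needed" message
 * arrives; the helper rebuilds the cached route if it has been invalidated,
 * applies the new MTU via dst->ops->update_pmtu(), and hands back a dst the
 * caller can use to re-check the path MTU.
 */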