/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a
 *					year in coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
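/*
 * Added commentary (not in the upstream file): secure_tcp_seq() and
 * secure_tcp_ts_off() derive the initial sequence number and the per-flow
 * timestamp offset from a keyed hash over the connection 4-tuple, in the
 * spirit of RFC 6528, so neither value is predictable by off-path attackers.
 */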
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
		      get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
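/*
 * Added commentary: the "+ 65535 + 2" above places the reused connection's
 * first sequence number more than one maximum (unscaled) window beyond the
 * old incarnation's snd_nxt, so stray segments from the TIME-WAIT connection
 * cannot land inside the new connection's sequence space even without PAWS;
 * the == 0 fixup only avoids a write_seq of zero, which tcp_v4_connect()
 * would treat as "not yet initialized".
 */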
/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;
	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
						       inet->inet_daddr,
						       inet->inet_sport,
						       usin->sin_port);
		tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
						 inet->inet_saddr,
						 inet->inet_daddr);
	}

	inet->inet_id = tp->write_seq ^ jiffies;

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);
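/*
 * Added commentary: tcp_v4_connect() runs under lock_sock(), reached from
 * the connect(2) system call via inet_stream_connect(); the failure path
 * deliberately leaves sk->sk_err unset and only rolls the socket back to
 * TCP_CLOSE, since the error is returned synchronously to the caller.
 */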
/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = tcp_sk(sk)->mtu_info;
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * for the case, if this connection will not be able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
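/*
 * Added commentary: when the socket is owned by the user at ICMP time,
 * tcp_v4_err() only records tp->mtu_info and sets TCP_MTU_REDUCED_DEFERRED;
 * tcp_release_cb() then calls the function above once the lock is released,
 * which is why it must tolerate any socket state except LISTEN and CLOSE.
 */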
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
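/*
 * Added commentary: "abort" is computed by the caller (see tcp_v4_err()
 * below) and is true only for errors that are hard for an embryonic
 * connection - parameter problem, TTL exceeded, net/host unreachable.
 * Soft errors leave the request alone so the SYN-ACK retransmit timer
 * can still recover the handshake.
 */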
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	struct request_sock *fastopen;
	u32 seq, snd_una;
	s32 remaining;
	u32 delta_us;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
				       th->dest, iph->saddr, ntohs(th->source),
				       inet_iif(icmp_skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV)
		return tcp_req_err(sk, seq,
				   type == ICMP_PARAMETERPROB ||
				   type == ICMP_TIME_EXCEEDED ||
				   (type == ICMP_DEST_UNREACH &&
				    (code == ICMP_NET_UNREACH ||
				     code == ICMP_HOST_UNREACH)));

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC 1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = tp->fastopen_rsk;
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(icmp_skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always <576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			tp->mtu_info = info;
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff || fastopen)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
					       TCP_TIMEOUT_INIT;
		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		tcp_mstamp_refresh(tp);
		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
		remaining = icsk->icsk_rto -
			    usecs_to_jiffies(delta_us);

		if (remaining > 0) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket
		 * is already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters, even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
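/*
 * Added commentary on the backoff revert above: with icsk_backoff already
 * decremented, inet_csk_rto_backoff() recomputes the timeout as the base
 * RTO shifted left by icsk_backoff, capped at TCP_RTO_MAX; the retransmit
 * timer is then re-armed for whatever part of that shortened RTO has not
 * yet elapsed since the head-of-queue skb was sent, and fires immediately
 * if it has already run out.
 */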
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);
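/*
 * Added commentary: in the CHECKSUM_PARTIAL branch above, the checksum field
 * is seeded with only the (complemented) pseudo-header sum, and csum_start /
 * csum_offset tell the NIC (or skb_checksum_help() as the software fallback)
 * where to fold in the one's complement sum over the TCP header and payload.
 */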
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks:  why I NEVER use socket parameters (TOS, TTL etc.)
 *		      for reset.
 *	Answer: if a packet caused RST, it is not for a socket
 *		existing in our system, if it is matched to a socket,
 *		it is just duplicate segment or bug in other side's TCP.
 *		So we build the reply based only on parameters that
 *		arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	const __u8 *hash_location = NULL;
	unsigned char newhash[16];
	int genhash;
	struct sock *sk1 = NULL;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	hash_location = tcp_parse_md5sig_option(th);
	if (sk && sk_fullsock(sk)) {
		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
	} else if (hash_location) {
		/*
		 * active side is lost. Try to find listening socket through
		 * source port, and then find md5 key through listening socket.
		 * we do not lose security here:
		 * Incoming packet is checked with md5 hash with finding key,
		 * no RST generated if md5 hash doesn't match.
		 */
		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
					     ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), inet_iif(skb),
					     tcp_v4_sdif(skb));
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
					&ip_hdr(skb)->saddr, AF_INET);
		if (!key)
			goto out;


		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	arg.tos = ip_hdr(skb)->tos;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}
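/*
 * Added commentary: the sequence-number choice above follows RFC 793 reset
 * generation - if the offending segment carried an ACK, the RST is sent with
 * seq = SEG.ACK and no ACK bit; otherwise seq stays 0 (rep was zeroed) and
 * the ACK bit is set with ack_seq = SEG.SEQ + SEG.LEN so the peer can
 * validate the reset against its own state.
 */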
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len);

	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

	/* RFC 7323 2.3
	 * The window field (SEG.WND) of every outgoing segment, with the
	 * exception of <SYN> segments, MUST be right-shifted by
	 * Rcv.Wind.Shift bits:
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
			req->ts_recent,
			0,
			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
					  AF_INET),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}
/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}
#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
					 const union tcp_md5_addr *addr,
					 int family)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;

		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && (!best_match ||
			      key->prefixlen > best_match->prefixlen))
			best_match = key;
	}
	return best_match;
}
EXPORT_SYMBOL(tcp_md5_do_lookup);
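/*
 * Added commentary: since the TCP_MD5SIG_EXT prefix extension, several keys
 * may match one peer address; the loop above implements longest-prefix wins,
 * so a /32 key for a specific host overrides a wider /24 key covering its
 * whole subnet.
 */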
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;

	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, addr, AF_INET);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
		   gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (key) {
		/* Pre-existing entry - just update that one. */
		memcpy(key->key, newkey, newkeylen);
		key->keylen = newkeylen;
		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));
	if (!md5sig) {
		md5sig = kmalloc(sizeof(*md5sig), gfp);
		if (!md5sig)
			return -ENOMEM;

		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		INIT_HLIST_HEAD(&md5sig->head);
		rcu_assign_pointer(tp->md5sig_info, md5sig);
	}

	key = sock_kmalloc(sk, sizeof(*key), gfp);
	if (!key)
		return -ENOMEM;
	if (!tcp_alloc_md5sig_pool()) {
		sock_kfree_s(sk, key, sizeof(*key));
		return -ENOMEM;
	}

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	memcpy(&key->addr, addr,
	       (family == AF_INET6) ? sizeof(struct in6_addr) :
				      sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_add);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_SYMBOL(tcp_md5_do_del);

static void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 char __user *optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 prefixlen = 32;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
				      AF_INET, prefixlen);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
			      GFP_KERNEL);
}
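/*
 * Added commentary - an illustrative userspace counterpart (not part of
 * this file): the command structure parsed above is supplied through
 * setsockopt(), e.g.
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * and a zero tcpm_keylen deletes the key, as the !cmd.tcpm_keylen branch
 * above shows.
 */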
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->md5_req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->md5_req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct ahash_request *req;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	if (crypto_ahash_init(req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

#endif
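/*
 * Added commentary: per RFC 2385 the digest covers, in order, the IPv4
 * pseudo-header, the TCP header with its checksum field zeroed (hence the
 * _th copy in tcp_v4_md5_hash_headers()), the segment payload, and finally
 * the key itself - which is why tcp_v4_md5_hash_hdr() and
 * tcp_v4_md5_hash_skb() differ only in the payload step.
 */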
/* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
				    const struct sk_buff *skb)
{
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
					  AF_INET);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return false;

	if (hash_expected && !hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return true;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return true;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
		net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
				     &iph->saddr, ntohs(th->source),
				     &iph->daddr, ntohs(th->dest),
				     genhash ? " tcp_v4_calc_md5_hash failed"
					     : "");
		return true;
	}
	return false;
#endif
	return false;
}
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb);
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct flowi *fl,
					  const struct request_sock *req)
{
	return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
	.init_req	=	tcp_v4_init_req,
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};
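/*
 * Added commentary: these two ops tables are what lets tcp_conn_request()
 * in tcp_input.c stay address-family independent - tcp_ipv6.c installs its
 * own tables with the same shape, and the shared code only ever calls
 * through hooks such as .init_req, .route_req, .init_seq and .send_synack.
 */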
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	sk_daddr_set(newsk, ireq->ir_rmt_addr);
	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
	newsk->sk_bound_dev_if = ireq->ir_iif;
	newinet->inet_saddr   = ireq->ir_loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
				AF_INET);
	if (key) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
			       AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
	if (*own_req)
		tcp_move_syn(newtp, req);

	return newsk;

exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	const struct tcphdr *th = tcp_hdr(skb);

	if (!th->syn)
		sk = cookie_v4_check(sk, skb);
#endif
	return sk;
}
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb, tcp_hdr(skb));
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		if (!nsk)
			goto discard;
		if (nsk != sk) {
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
	const struct iphdr *iph;
	const struct tcphdr *th;
	struct sock *sk;

	if (skb->pkt_type != PACKET_HOST)
		return 0;

	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
		return 0;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	if (th->doff < sizeof(struct tcphdr) / 4)
		return 0;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;
		skb->destructor = sock_edemux;
		if (sk_fullsock(sk)) {
			struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);

			if (dst)
				dst = dst_check(dst, 0);
			if (dst &&
			    inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
				skb_dst_set_noref(skb, dst);
		}
	}
	return 0;
}
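/*
 * Added commentary: early demux is an optimization run from the IP input
 * path before routing; by finding the established socket first, the cached
 * sk->sk_rx_dst can be attached to the skb and the per-packet route lookup
 * skipped. Returning 0 in every case is benign - a miss simply falls back
 * to the normal input path.
 */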
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;

	/* Only socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few sockets' backlogs are likely to be non-empty concurrently.
	 */
	limit += 64*1024;

	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed pure SACK packets were sometimes dropped
	 * (if cooked by drivers without copybreak feature).
	 */
	skb_condense(skb);

	if (unlikely(sk_add_backlog(sk, skb, limit))) {
		bh_unlock_sock(sk);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(tcp_add_backlog);
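/*
 * Added commentary: the backlog limit above is deliberately generous
 * (rcvbuf + sndbuf + 64KB) because segments parked on the backlog are
 * charged to the socket before the owner has had a chance to collapse or
 * prune its queues; tcp_v4_rcv() drops the segment and counts
 * TCPBacklogDrop when even that limit is exceeded.
 */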
int tcp_filter(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb->data;
	unsigned int eaten = skb->len;
	int err;

	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
	if (!err) {
		eaten -= skb->len;
		TCP_SKB_CB(skb)->end_seq -= eaten;
	}
	return err;
}
EXPORT_SYMBOL(tcp_filter);
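/*
 * Added commentary: tcp_filter() lets a socket filter shrink the segment
 * (sk_filter_trim_cap() never trims below the full TCP header, th->doff * 4);
 * whatever payload the filter cut off is subtracted from end_seq so the
 * sequence bookkeeping still matches the bytes actually kept.
 */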
1da177e4
LT
1586/*
1587 * From tcp_input.c
1588 */
1589
1590int tcp_v4_rcv(struct sk_buff *skb)
1591{
3b24d854 1592 struct net *net = dev_net(skb->dev);
3fa6f616 1593 int sdif = inet_sdif(skb);
eddc9ec5 1594 const struct iphdr *iph;
cf533ea5 1595 const struct tcphdr *th;
3b24d854 1596 bool refcounted;
1da177e4
LT
1597 struct sock *sk;
1598 int ret;
1599
1600 if (skb->pkt_type != PACKET_HOST)
1601 goto discard_it;
1602
1603 /* Count it even if it's bad */
90bbcc60 1604 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1da177e4
LT
1605
1606 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1607 goto discard_it;
1608
ea1627c2 1609 th = (const struct tcphdr *)skb->data;
1da177e4 1610
ea1627c2 1611 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1da177e4
LT
1612 goto bad_packet;
1613 if (!pskb_may_pull(skb, th->doff * 4))
1614 goto discard_it;
1615
1616 /* An explanation is required here, I think.
1617 * Packet length and doff are validated by header prediction,
caa20d9a 1618 * provided case of th->doff==0 is eliminated.
1da177e4 1619 * So, we defer the checks. */
ed70fcfc
TH
1620
1621 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
6a5dc9e5 1622 goto csum_error;
1da177e4 1623
ea1627c2 1624 th = (const struct tcphdr *)skb->data;
eddc9ec5 1625 iph = ip_hdr(skb);
971f10ec
ED
1626 /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1627 * barrier() makes sure compiler wont play fool^Waliasing games.
1628 */
1629 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1630 sizeof(struct inet_skb_parm));
1631 barrier();
1632
1da177e4
LT
1633 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1634 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1635 skb->len - th->doff * 4);
1636 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
e11ecddf 1637 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
04317daf 1638 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
b82d1bb4 1639 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1da177e4 1640 TCP_SKB_CB(skb)->sacked = 0;
98aaa913
MM
1641 TCP_SKB_CB(skb)->has_rxtstamp =
1642 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1da177e4 1643
4bdc3d66 1644lookup:
a583636a 1645 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
3fa6f616 1646 th->dest, sdif, &refcounted);
1da177e4
LT
1647 if (!sk)
1648 goto no_tcp_socket;
1649
bb134d5d
ED
1650process:
1651 if (sk->sk_state == TCP_TIME_WAIT)
1652 goto do_time_wait;
1653
079096f1
ED
1654 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1655 struct request_sock *req = inet_reqsk(sk);
7716682c 1656 struct sock *nsk;
079096f1
ED
1657
1658 sk = req->rsk_listener;
72923555 1659 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
e65c332d 1660 sk_drops_add(sk, skb);
72923555
ED
1661 reqsk_put(req);
1662 goto discard_it;
1663 }
7716682c 1664 if (unlikely(sk->sk_state != TCP_LISTEN)) {
f03f2e15 1665 inet_csk_reqsk_queue_drop_and_put(sk, req);
4bdc3d66
ED
1666 goto lookup;
1667 }
3b24d854
ED
1668 /* We own a reference on the listener; increase it again,
1669 * as we might otherwise lose it too soon.
1670 */
7716682c 1671 sock_hold(sk);
3b24d854 1672 refcounted = true;
1f3b359f
ED
1673 nsk = NULL;
1674 if (!tcp_filter(sk, skb))
1675 nsk = tcp_check_req(sk, skb, req, false);
079096f1
ED
1676 if (!nsk) {
1677 reqsk_put(req);
7716682c 1678 goto discard_and_relse;
079096f1
ED
1679 }
1680 if (nsk == sk) {
079096f1
ED
1681 reqsk_put(req);
1682 } else if (tcp_child_process(sk, nsk, skb)) {
1683 tcp_v4_send_reset(nsk, skb);
7716682c 1684 goto discard_and_relse;
079096f1 1685 } else {
7716682c 1686 sock_put(sk);
079096f1
ED
1687 return 0;
1688 }
1689 }
6cce09f8 1690 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
02a1d6e7 1691 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1692 goto discard_and_relse;
6cce09f8 1693 }
d218d111 1694
1da177e4
LT
1695 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1696 goto discard_and_relse;
9ea88a15 1697
9ea88a15
DP
1698 if (tcp_v4_inbound_md5_hash(sk, skb))
1699 goto discard_and_relse;
9ea88a15 1700
b59c2701 1701 nf_reset(skb);
1da177e4 1702
ac6e7800 1703 if (tcp_filter(sk, skb))
1da177e4 1704 goto discard_and_relse;
ac6e7800
ED
1705 th = (const struct tcphdr *)skb->data;
1706 iph = ip_hdr(skb);
1da177e4
LT
1707
1708 skb->dev = NULL;
1709
e994b2f0
ED
1710 if (sk->sk_state == TCP_LISTEN) {
1711 ret = tcp_v4_do_rcv(sk, skb);
1712 goto put_and_return;
1713 }
1714
1715 sk_incoming_cpu_update(sk);
1716
c6366184 1717 bh_lock_sock_nested(sk);
a44d6eac 1718 tcp_segs_in(tcp_sk(sk), skb);
1da177e4
LT
1719 ret = 0;
1720 if (!sock_owned_by_user(sk)) {
e7942d06 1721 ret = tcp_v4_do_rcv(sk, skb);
c9c33212 1722 } else if (tcp_add_backlog(sk, skb)) {
6b03a53a
ZY
1723 goto discard_and_relse;
1724 }
1da177e4
LT
1725 bh_unlock_sock(sk);
1726
e994b2f0 1727put_and_return:
3b24d854
ED
1728 if (refcounted)
1729 sock_put(sk);
1da177e4
LT
1730
1731 return ret;
1732
1733no_tcp_socket:
1734 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1735 goto discard_it;
1736
12e25e10 1737 if (tcp_checksum_complete(skb)) {
6a5dc9e5 1738csum_error:
90bbcc60 1739 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1da177e4 1740bad_packet:
90bbcc60 1741 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1da177e4 1742 } else {
cfb6eeb4 1743 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1744 }
1745
1746discard_it:
1747 /* Discard frame. */
1748 kfree_skb(skb);
e905a9ed 1749 return 0;
1da177e4
LT
1750
1751discard_and_relse:
532182cd 1752 sk_drops_add(sk, skb);
3b24d854
ED
1753 if (refcounted)
1754 sock_put(sk);
1da177e4
LT
1755 goto discard_it;
1756
1757do_time_wait:
1758 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1759 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1760 goto discard_it;
1761 }
1762
6a5dc9e5
ED
1763 if (tcp_checksum_complete(skb)) {
1764 inet_twsk_put(inet_twsk(sk));
1765 goto csum_error;
1da177e4 1766 }
9469c7b4 1767 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1da177e4 1768 case TCP_TW_SYN: {
c346dca1 1769 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
a583636a
CG
1770 &tcp_hashinfo, skb,
1771 __tcp_hdrlen(th),
da5e3630 1772 iph->saddr, th->source,
eddc9ec5 1773 iph->daddr, th->dest,
3fa6f616
DA
1774 inet_iif(skb),
1775 sdif);
1da177e4 1776 if (sk2) {
dbe7faa4 1777 inet_twsk_deschedule_put(inet_twsk(sk));
1da177e4 1778 sk = sk2;
3b24d854 1779 refcounted = false;
1da177e4
LT
1780 goto process;
1781 }
1782 /* Fall through to ACK */
1783 }
1784 case TCP_TW_ACK:
1785 tcp_v4_timewait_ack(sk, skb);
1786 break;
1787 case TCP_TW_RST:
271c3b9b
FW
1788 tcp_v4_send_reset(sk, skb);
1789 inet_twsk_deschedule_put(inet_twsk(sk));
1790 goto discard_it;
1da177e4
LT
1791 case TCP_TW_SUCCESS:;
1792 }
1793 goto discard_it;
1794}
1795
ccb7c410
DM
1796static struct timewait_sock_ops tcp_timewait_sock_ops = {
1797 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1798 .twsk_unique = tcp_twsk_unique,
1799 .twsk_destructor= tcp_twsk_destructor,
ccb7c410 1800};
1da177e4 1801
63d02d15 1802void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
5d299f3d
ED
1803{
1804 struct dst_entry *dst = skb_dst(skb);
1805
5037e9ef 1806 if (dst && dst_hold_safe(dst)) {
ca777eff
ED
1807 sk->sk_rx_dst = dst;
1808 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1809 }
5d299f3d 1810}
63d02d15 1811EXPORT_SYMBOL(inet_sk_rx_dst_set);
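/* Descriptive note on the helper above: caching the skb's dst on
 * sk->sk_rx_dst lets early demux on subsequent packets skip a routing
 * lookup. dst_hold_safe() takes the reference only if the dst is not
 * already being released, and rx_dst_ifindex records the device the
 * cached entry is valid for, so it can be revalidated later.
 */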
5d299f3d 1812
3b401a81 1813const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1814 .queue_xmit = ip_queue_xmit,
1815 .send_check = tcp_v4_send_check,
1816 .rebuild_header = inet_sk_rebuild_header,
5d299f3d 1817 .sk_rx_dst_set = inet_sk_rx_dst_set,
543d9cfe
ACM
1818 .conn_request = tcp_v4_conn_request,
1819 .syn_recv_sock = tcp_v4_syn_recv_sock,
543d9cfe
ACM
1820 .net_header_len = sizeof(struct iphdr),
1821 .setsockopt = ip_setsockopt,
1822 .getsockopt = ip_getsockopt,
1823 .addr2sockaddr = inet_csk_addr2sockaddr,
1824 .sockaddr_len = sizeof(struct sockaddr_in),
3fdadf7d 1825#ifdef CONFIG_COMPAT
543d9cfe
ACM
1826 .compat_setsockopt = compat_ip_setsockopt,
1827 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1828#endif
4fab9071 1829 .mtu_reduced = tcp_v4_mtu_reduced,
1da177e4 1830};
4bc2f18b 1831EXPORT_SYMBOL(ipv4_specific);
1da177e4 1832
cfb6eeb4 1833#ifdef CONFIG_TCP_MD5SIG
b2e4b3de 1834static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
cfb6eeb4 1835 .md5_lookup = tcp_v4_md5_lookup,
49a72dfb 1836 .calc_md5_hash = tcp_v4_md5_hash_skb,
cfb6eeb4 1837 .md5_parse = tcp_v4_parse_md5_keys,
cfb6eeb4 1838};
b6332e6c 1839#endif
cfb6eeb4 1840
1da177e4
LT
1841/* NOTE: a lot of fields are set to zero explicitly by the call to
1842 * sk_alloc(), so they need not be initialized here.
1843 */
1844static int tcp_v4_init_sock(struct sock *sk)
1845{
6687e988 1846 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4 1847
900f65d3 1848 tcp_init_sock(sk);
1da177e4 1849
8292a17a 1850 icsk->icsk_af_ops = &ipv4_specific;
900f65d3 1851
cfb6eeb4 1852#ifdef CONFIG_TCP_MD5SIG
ac807fa8 1853 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
cfb6eeb4 1854#endif
1da177e4 1855
1da177e4
LT
1856 return 0;
1857}
1858
7d06b2e0 1859void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1860{
1861 struct tcp_sock *tp = tcp_sk(sk);
1862
1863 tcp_clear_xmit_timers(sk);
1864
6687e988 1865 tcp_cleanup_congestion_control(sk);
317a76f9 1866
734942cc
DW
1867 tcp_cleanup_ulp(sk);
1868
1da177e4 1869 /* Clean up the write buffer. */
fe067e8a 1870 tcp_write_queue_purge(sk);
1da177e4 1871
cf1ef3f0
WW
1872 /* Check if we want to disable active TFO */
1873 tcp_fastopen_active_disable_ofo_check(sk);
1874
1da177e4 1875 /* Clean up our (hopefully empty) out_of_order_queue. */
9f5afeae 1876 skb_rbtree_purge(&tp->out_of_order_queue);
1da177e4 1877
cfb6eeb4
YH
1878#ifdef CONFIG_TCP_MD5SIG
1879 /* Clean up the MD5 key list, if any */
1880 if (tp->md5sig_info) {
a915da9b 1881 tcp_clear_md5_list(sk);
a8afca03 1882 kfree_rcu(tp->md5sig_info, rcu);
cfb6eeb4
YH
1883 tp->md5sig_info = NULL;
1884 }
1885#endif
1a2449a8 1886
1da177e4 1887 /* Clean up a referenced TCP bind bucket. */
463c84b9 1888 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1889 inet_put_port(sk);
1da177e4 1890
00db4124 1891 BUG_ON(tp->fastopen_rsk);
435cf559 1892
cf60af03
YC
1893 /* If the socket was aborted during the connect operation */
1894 tcp_free_fastopen_req(tp);
cd8ae852 1895 tcp_saved_syn_free(tp);
cf60af03 1896
180d8cd9 1897 sk_sockets_allocated_dec(sk);
1da177e4 1898}
1da177e4
LT
1899EXPORT_SYMBOL(tcp_v4_destroy_sock);
1900
1901#ifdef CONFIG_PROC_FS
1902/* Proc filesystem TCP sock list dumping. */
1903
a8b690f9
TH
1904/*
1905 * Get the next listener socket after cur. If cur is NULL, get the first
1906 * socket starting from the bucket given in st->bucket; when st->bucket is
1907 * zero the very first socket in the hash table is returned.
1908 */
1da177e4
LT
1909static void *listening_get_next(struct seq_file *seq, void *cur)
1910{
5799de0b 1911 struct tcp_iter_state *st = seq->private;
a4146b1b 1912 struct net *net = seq_file_net(seq);
3b24d854 1913 struct inet_listen_hashbucket *ilb;
3b24d854 1914 struct sock *sk = cur;
1da177e4
LT
1915
1916 if (!sk) {
3b24d854 1917get_head:
a8b690f9 1918 ilb = &tcp_hashinfo.listening_hash[st->bucket];
9652dc2e 1919 spin_lock(&ilb->lock);
3b24d854 1920 sk = sk_head(&ilb->head);
a8b690f9 1921 st->offset = 0;
1da177e4
LT
1922 goto get_sk;
1923 }
5caea4ea 1924 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1925 ++st->num;
a8b690f9 1926 ++st->offset;
1da177e4 1927
3b24d854 1928 sk = sk_next(sk);
1da177e4 1929get_sk:
3b24d854 1930 sk_for_each_from(sk) {
8475ef9f
PE
1931 if (!net_eq(sock_net(sk), net))
1932 continue;
3b24d854
ED
1933 if (sk->sk_family == st->family)
1934 return sk;
1da177e4 1935 }
9652dc2e 1936 spin_unlock(&ilb->lock);
a8b690f9 1937 st->offset = 0;
3b24d854
ED
1938 if (++st->bucket < INET_LHTABLE_SIZE)
1939 goto get_head;
1940 return NULL;
1da177e4
LT
1941}
1942
1943static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1944{
a8b690f9
TH
1945 struct tcp_iter_state *st = seq->private;
1946 void *rc;
1947
1948 st->bucket = 0;
1949 st->offset = 0;
1950 rc = listening_get_next(seq, NULL);
1da177e4
LT
1951
1952 while (rc && *pos) {
1953 rc = listening_get_next(seq, rc);
1954 --*pos;
1955 }
1956 return rc;
1957}
1958
05dbc7b5 1959static inline bool empty_bucket(const struct tcp_iter_state *st)
6eac5604 1960{
05dbc7b5 1961 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
6eac5604
AK
1962}
1963
a8b690f9
TH
1964/*
1965 * Get the first established socket, starting from the bucket given in st->bucket.
1966 * If st->bucket is zero, the very first socket in the hash is returned.
1967 */
1da177e4
LT
1968static void *established_get_first(struct seq_file *seq)
1969{
5799de0b 1970 struct tcp_iter_state *st = seq->private;
a4146b1b 1971 struct net *net = seq_file_net(seq);
1da177e4
LT
1972 void *rc = NULL;
1973
a8b690f9
TH
1974 st->offset = 0;
1975 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 1976 struct sock *sk;
3ab5aee7 1977 struct hlist_nulls_node *node;
9db66bdc 1978 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 1979
6eac5604
AK
1980 /* Lockless fast path for the common case of empty buckets */
1981 if (empty_bucket(st))
1982 continue;
1983
9db66bdc 1984 spin_lock_bh(lock);
3ab5aee7 1985 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 1986 if (sk->sk_family != st->family ||
878628fb 1987 !net_eq(sock_net(sk), net)) {
1da177e4
LT
1988 continue;
1989 }
1990 rc = sk;
1991 goto out;
1992 }
9db66bdc 1993 spin_unlock_bh(lock);
1da177e4
LT
1994 }
1995out:
1996 return rc;
1997}
1998
1999static void *established_get_next(struct seq_file *seq, void *cur)
2000{
2001 struct sock *sk = cur;
3ab5aee7 2002 struct hlist_nulls_node *node;
5799de0b 2003 struct tcp_iter_state *st = seq->private;
a4146b1b 2004 struct net *net = seq_file_net(seq);
1da177e4
LT
2005
2006 ++st->num;
a8b690f9 2007 ++st->offset;
1da177e4 2008
05dbc7b5 2009 sk = sk_nulls_next(sk);
1da177e4 2010
3ab5aee7 2011 sk_nulls_for_each_from(sk, node) {
878628fb 2012 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
05dbc7b5 2013 return sk;
1da177e4
LT
2014 }
2015
05dbc7b5
ED
2016 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2017 ++st->bucket;
2018 return established_get_first(seq);
1da177e4
LT
2019}
2020
2021static void *established_get_idx(struct seq_file *seq, loff_t pos)
2022{
a8b690f9
TH
2023 struct tcp_iter_state *st = seq->private;
2024 void *rc;
2025
2026 st->bucket = 0;
2027 rc = established_get_first(seq);
1da177e4
LT
2028
2029 while (rc && pos) {
2030 rc = established_get_next(seq, rc);
2031 --pos;
7174259e 2032 }
1da177e4
LT
2033 return rc;
2034}
2035
2036static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2037{
2038 void *rc;
5799de0b 2039 struct tcp_iter_state *st = seq->private;
1da177e4 2040
1da177e4
LT
2041 st->state = TCP_SEQ_STATE_LISTENING;
2042 rc = listening_get_idx(seq, &pos);
2043
2044 if (!rc) {
1da177e4
LT
2045 st->state = TCP_SEQ_STATE_ESTABLISHED;
2046 rc = established_get_idx(seq, pos);
2047 }
2048
2049 return rc;
2050}
2051
a8b690f9
TH
2052static void *tcp_seek_last_pos(struct seq_file *seq)
2053{
2054 struct tcp_iter_state *st = seq->private;
2055 int offset = st->offset;
2056 int orig_num = st->num;
2057 void *rc = NULL;
2058
2059 switch (st->state) {
a8b690f9
TH
2060 case TCP_SEQ_STATE_LISTENING:
2061 if (st->bucket >= INET_LHTABLE_SIZE)
2062 break;
2063 st->state = TCP_SEQ_STATE_LISTENING;
2064 rc = listening_get_next(seq, NULL);
2065 while (offset-- && rc)
2066 rc = listening_get_next(seq, rc);
2067 if (rc)
2068 break;
2069 st->bucket = 0;
05dbc7b5 2070 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2071 /* Fallthrough */
2072 case TCP_SEQ_STATE_ESTABLISHED:
a8b690f9
TH
2073 if (st->bucket > tcp_hashinfo.ehash_mask)
2074 break;
2075 rc = established_get_first(seq);
2076 while (offset-- && rc)
2077 rc = established_get_next(seq, rc);
2078 }
2079
2080 st->num = orig_num;
2081
2082 return rc;
2083}
2084
1da177e4
LT
2085static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2086{
5799de0b 2087 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2088 void *rc;
2089
2090 if (*pos && *pos == st->last_pos) {
2091 rc = tcp_seek_last_pos(seq);
2092 if (rc)
2093 goto out;
2094 }
2095
1da177e4
LT
2096 st->state = TCP_SEQ_STATE_LISTENING;
2097 st->num = 0;
a8b690f9
TH
2098 st->bucket = 0;
2099 st->offset = 0;
2100 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2101
2102out:
2103 st->last_pos = *pos;
2104 return rc;
1da177e4
LT
2105}
2106
2107static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2108{
a8b690f9 2109 struct tcp_iter_state *st = seq->private;
1da177e4 2110 void *rc = NULL;
1da177e4
LT
2111
2112 if (v == SEQ_START_TOKEN) {
2113 rc = tcp_get_idx(seq, 0);
2114 goto out;
2115 }
1da177e4
LT
2116
2117 switch (st->state) {
1da177e4
LT
2118 case TCP_SEQ_STATE_LISTENING:
2119 rc = listening_get_next(seq, v);
2120 if (!rc) {
1da177e4 2121 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2122 st->bucket = 0;
2123 st->offset = 0;
1da177e4
LT
2124 rc = established_get_first(seq);
2125 }
2126 break;
2127 case TCP_SEQ_STATE_ESTABLISHED:
1da177e4
LT
2128 rc = established_get_next(seq, v);
2129 break;
2130 }
2131out:
2132 ++*pos;
a8b690f9 2133 st->last_pos = *pos;
1da177e4
LT
2134 return rc;
2135}
2136
2137static void tcp_seq_stop(struct seq_file *seq, void *v)
2138{
5799de0b 2139 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2140
2141 switch (st->state) {
1da177e4
LT
2142 case TCP_SEQ_STATE_LISTENING:
2143 if (v != SEQ_START_TOKEN)
9652dc2e 2144 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4 2145 break;
1da177e4
LT
2146 case TCP_SEQ_STATE_ESTABLISHED:
2147 if (v)
9db66bdc 2148 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2149 break;
2150 }
2151}
2152
73cb88ec 2153int tcp_seq_open(struct inode *inode, struct file *file)
1da177e4 2154{
d9dda78b 2155 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
1da177e4 2156 struct tcp_iter_state *s;
52d6f3f1 2157 int err;
1da177e4 2158
52d6f3f1
DL
2159 err = seq_open_net(inode, file, &afinfo->seq_ops,
2160 sizeof(struct tcp_iter_state));
2161 if (err < 0)
2162 return err;
f40c8174 2163
52d6f3f1 2164 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2165 s->family = afinfo->family;
688d1945 2166 s->last_pos = 0;
f40c8174
DL
2167 return 0;
2168}
73cb88ec 2169EXPORT_SYMBOL(tcp_seq_open);
f40c8174 2170
6f8b13bc 2171int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2172{
2173 int rc = 0;
2174 struct proc_dir_entry *p;
2175
9427c4b3
DL
2176 afinfo->seq_ops.start = tcp_seq_start;
2177 afinfo->seq_ops.next = tcp_seq_next;
2178 afinfo->seq_ops.stop = tcp_seq_stop;
2179
84841c3c 2180 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
73cb88ec 2181 afinfo->seq_fops, afinfo);
84841c3c 2182 if (!p)
1da177e4
LT
2183 rc = -ENOMEM;
2184 return rc;
2185}
4bc2f18b 2186EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2187
6f8b13bc 2188void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2189{
ece31ffd 2190 remove_proc_entry(afinfo->name, net->proc_net);
1da177e4 2191}
4bc2f18b 2192EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2193
d4f06873 2194static void get_openreq4(const struct request_sock *req,
aa3a0c8c 2195 struct seq_file *f, int i)
1da177e4 2196{
2e6599cb 2197 const struct inet_request_sock *ireq = inet_rsk(req);
fa76ce73 2198 long delta = req->rsk_timer.expires - jiffies;
1da177e4 2199
5e659e4c 2200 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2201 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
1da177e4 2202 i,
634fb979 2203 ireq->ir_loc_addr,
d4f06873 2204 ireq->ir_num,
634fb979
ED
2205 ireq->ir_rmt_addr,
2206 ntohs(ireq->ir_rmt_port),
1da177e4
LT
2207 TCP_SYN_RECV,
2208 0, 0, /* could print option size, but that is af dependent. */
2209 1, /* timers active (only the expire timer) */
a399a805 2210 jiffies_delta_to_clock_t(delta),
e6c022a4 2211 req->num_timeout,
aa3a0c8c
ED
2212 from_kuid_munged(seq_user_ns(f),
2213 sock_i_uid(req->rsk_listener)),
1da177e4
LT
2214 0, /* non-standard timer */
2215 0, /* open_requests have no inode */
d4f06873 2216 0,
652586df 2217 req);
1da177e4
LT
2218}
2219
652586df 2220static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
1da177e4
LT
2221{
2222 int timer_active;
2223 unsigned long timer_expires;
cf533ea5 2224 const struct tcp_sock *tp = tcp_sk(sk);
cf4c6bf8 2225 const struct inet_connection_sock *icsk = inet_csk(sk);
cf533ea5 2226 const struct inet_sock *inet = inet_sk(sk);
0536fcc0 2227 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
c720c7e8
ED
2228 __be32 dest = inet->inet_daddr;
2229 __be32 src = inet->inet_rcv_saddr;
2230 __u16 destp = ntohs(inet->inet_dport);
2231 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2232 int rx_queue;
00fd38d9 2233 int state;
1da177e4 2234
6ba8a3b1 2235 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
57dde7f7 2236 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
6ba8a3b1 2237 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
1da177e4 2238 timer_active = 1;
463c84b9
ACM
2239 timer_expires = icsk->icsk_timeout;
2240 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2241 timer_active = 4;
463c84b9 2242 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2243 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2244 timer_active = 2;
cf4c6bf8 2245 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2246 } else {
2247 timer_active = 0;
2248 timer_expires = jiffies;
2249 }
2250
00fd38d9
ED
2251 state = sk_state_load(sk);
2252 if (state == TCP_LISTEN)
49d09007
ED
2253 rx_queue = sk->sk_ack_backlog;
2254 else
00fd38d9
ED
2255 /* Because we don't lock the socket,
2256 * we might find a transient negative value.
49d09007
ED
2257 */
2258 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2259
5e659e4c 2260 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
652586df 2261 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
00fd38d9 2262 i, src, srcp, dest, destp, state,
47da8ee6 2263 tp->write_seq - tp->snd_una,
49d09007 2264 rx_queue,
1da177e4 2265 timer_active,
a399a805 2266 jiffies_delta_to_clock_t(timer_expires - jiffies),
463c84b9 2267 icsk->icsk_retransmits,
a7cb5a49 2268 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
6687e988 2269 icsk->icsk_probes_out,
cf4c6bf8 2270 sock_i_ino(sk),
41c6d650 2271 refcount_read(&sk->sk_refcnt), sk,
7be87351
SH
2272 jiffies_to_clock_t(icsk->icsk_rto),
2273 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2274 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2275 tp->snd_cwnd,
00fd38d9
ED
2276 state == TCP_LISTEN ?
2277 fastopenq->max_qlen :
652586df 2278 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
1da177e4
LT
2279}
2280
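/* An illustrative /proc/net/tcp entry produced by the printf above
 * (addresses, inode and pointer are made up):
 *
 *   0: 0100007F:1F90 00000000:0000 0A 00000000:00000000 00:00000000
 *      00000000  1000        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * i.e. a socket in state 0x0A (TCP_LISTEN) bound to 127.0.0.1:8080,
 * owned by uid 1000, with empty tx/rx queues and no timer pending.
 * Addresses print as the raw %08X of the __be32, hence 0100007F for
 * 127.0.0.1 on a little-endian machine.
 */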
cf533ea5 2281static void get_timewait4_sock(const struct inet_timewait_sock *tw,
652586df 2282 struct seq_file *f, int i)
1da177e4 2283{
789f558c 2284 long delta = tw->tw_timer.expires - jiffies;
23f33c2d 2285 __be32 dest, src;
1da177e4 2286 __u16 destp, srcp;
1da177e4
LT
2287
2288 dest = tw->tw_daddr;
2289 src = tw->tw_rcv_saddr;
2290 destp = ntohs(tw->tw_dport);
2291 srcp = ntohs(tw->tw_sport);
2292
5e659e4c 2293 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
652586df 2294 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
1da177e4 2295 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
a399a805 2296 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
41c6d650 2297 refcount_read(&tw->tw_refcnt), tw);
1da177e4
LT
2298}
2299
2300#define TMPSZ 150
2301
2302static int tcp4_seq_show(struct seq_file *seq, void *v)
2303{
5799de0b 2304 struct tcp_iter_state *st;
05dbc7b5 2305 struct sock *sk = v;
1da177e4 2306
652586df 2307 seq_setwidth(seq, TMPSZ - 1);
1da177e4 2308 if (v == SEQ_START_TOKEN) {
652586df 2309 seq_puts(seq, " sl local_address rem_address st tx_queue "
1da177e4
LT
2310 "rx_queue tr tm->when retrnsmt uid timeout "
2311 "inode");
2312 goto out;
2313 }
2314 st = seq->private;
2315
079096f1
ED
2316 if (sk->sk_state == TCP_TIME_WAIT)
2317 get_timewait4_sock(v, seq, st->num);
2318 else if (sk->sk_state == TCP_NEW_SYN_RECV)
aa3a0c8c 2319 get_openreq4(v, seq, st->num);
079096f1
ED
2320 else
2321 get_tcp4_sock(v, seq, st->num);
1da177e4 2322out:
652586df 2323 seq_pad(seq, '\n');
1da177e4
LT
2324 return 0;
2325}
2326
73cb88ec
AV
2327static const struct file_operations tcp_afinfo_seq_fops = {
2328 .owner = THIS_MODULE,
2329 .open = tcp_seq_open,
2330 .read = seq_read,
2331 .llseek = seq_lseek,
2332 .release = seq_release_net
2333};
2334
1da177e4 2335static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2336 .name = "tcp",
2337 .family = AF_INET,
73cb88ec 2338 .seq_fops = &tcp_afinfo_seq_fops,
9427c4b3
DL
2339 .seq_ops = {
2340 .show = tcp4_seq_show,
2341 },
1da177e4
LT
2342};
2343
2c8c1e72 2344static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2345{
2346 return tcp_proc_register(net, &tcp4_seq_afinfo);
2347}
2348
2c8c1e72 2349static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2350{
2351 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2352}
2353
2354static struct pernet_operations tcp4_net_ops = {
2355 .init = tcp4_proc_init_net,
2356 .exit = tcp4_proc_exit_net,
2357};
2358
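/* When the ops above are registered (see tcp4_proc_init() below),
 * tcp4_proc_init_net() runs once per network namespace, so every
 * netns gets its own /proc/net/tcp view backed by tcp4_seq_afinfo,
 * and tcp4_proc_exit_net() removes the entry when the namespace
 * goes away.
 */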
1da177e4
LT
2359int __init tcp4_proc_init(void)
2360{
757764f6 2361 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2362}
2363
2364void tcp4_proc_exit(void)
2365{
757764f6 2366 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2367}
2368#endif /* CONFIG_PROC_FS */
2369
2370struct proto tcp_prot = {
2371 .name = "TCP",
2372 .owner = THIS_MODULE,
2373 .close = tcp_close,
2374 .connect = tcp_v4_connect,
2375 .disconnect = tcp_disconnect,
463c84b9 2376 .accept = inet_csk_accept,
1da177e4
LT
2377 .ioctl = tcp_ioctl,
2378 .init = tcp_v4_init_sock,
2379 .destroy = tcp_v4_destroy_sock,
2380 .shutdown = tcp_shutdown,
2381 .setsockopt = tcp_setsockopt,
2382 .getsockopt = tcp_getsockopt,
4b9d07a4 2383 .keepalive = tcp_set_keepalive,
1da177e4 2384 .recvmsg = tcp_recvmsg,
7ba42910
CG
2385 .sendmsg = tcp_sendmsg,
2386 .sendpage = tcp_sendpage,
1da177e4 2387 .backlog_rcv = tcp_v4_do_rcv,
46d3ceab 2388 .release_cb = tcp_release_cb,
ab1e0a13
ACM
2389 .hash = inet_hash,
2390 .unhash = inet_unhash,
2391 .get_port = inet_csk_get_port,
1da177e4 2392 .enter_memory_pressure = tcp_enter_memory_pressure,
06044751 2393 .leave_memory_pressure = tcp_leave_memory_pressure,
c9bee3b7 2394 .stream_memory_free = tcp_stream_memory_free,
1da177e4 2395 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2396 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2397 .memory_allocated = &tcp_memory_allocated,
2398 .memory_pressure = &tcp_memory_pressure,
a4fe34bf 2399 .sysctl_mem = sysctl_tcp_mem,
1da177e4
LT
2400 .sysctl_wmem = sysctl_tcp_wmem,
2401 .sysctl_rmem = sysctl_tcp_rmem,
2402 .max_header = MAX_TCP_HEADER,
2403 .obj_size = sizeof(struct tcp_sock),
5f0d5a3a 2404 .slab_flags = SLAB_TYPESAFE_BY_RCU,
6d6ee43e 2405 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2406 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2407 .h.hashinfo = &tcp_hashinfo,
7ba42910 2408 .no_autobind = true,
543d9cfe
ACM
2409#ifdef CONFIG_COMPAT
2410 .compat_setsockopt = compat_tcp_setsockopt,
2411 .compat_getsockopt = compat_tcp_getsockopt,
d1a4c0b3 2412#endif
c1e64e29 2413 .diag_destroy = tcp_abort,
1da177e4 2414};
4bc2f18b 2415EXPORT_SYMBOL(tcp_prot);
1da177e4 2416
bdbbb852
ED
2417static void __net_exit tcp_sk_exit(struct net *net)
2418{
2419 int cpu;
2420
2421 for_each_possible_cpu(cpu)
2422 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2423 free_percpu(net->ipv4.tcp_sk);
2424}
2425
046ee902
DL
2426static int __net_init tcp_sk_init(struct net *net)
2427{
fee83d09 2428 int res, cpu, cnt;
bdbbb852
ED
2429
2430 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2431 if (!net->ipv4.tcp_sk)
2432 return -ENOMEM;
2433
2434 for_each_possible_cpu(cpu) {
2435 struct sock *sk;
2436
2437 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2438 IPPROTO_TCP, net);
2439 if (res)
2440 goto fail;
a9d6532b 2441 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
bdbbb852
ED
2442 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2443 }
49213555 2444
5d134f1c 2445 net->ipv4.sysctl_tcp_ecn = 2;
49213555
DB
2446 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2447
b0f9ca53 2448 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
6b58e0a5 2449 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
05cbc0db 2450 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
046ee902 2451
13b287e8 2452 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
9bd6861b 2453 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
b840d15d 2454 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
13b287e8 2455
6fa25166 2456 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
7c083ecb 2457 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
0aca737d 2458 net->ipv4.sysctl_tcp_syncookies = 1;
1043e25f 2459 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
ae5c3f40 2460 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
c6214a97 2461 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
c402d9be 2462 net->ipv4.sysctl_tcp_orphan_retries = 0;
1e579caa 2463 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
4979f2d9 2464 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
56ab6b93 2465 net->ipv4.sysctl_tcp_tw_reuse = 0;
12ed8244 2466
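/* Each field initialized in this function backs a per-namespace sysctl
 * under /proc/sys/net/ipv4/. A sketch of the mapping for the block
 * above (defaults come from the TCP_* constants; e.g. TCP_SYN_RETRIES
 * is 6 and TCP_FIN_TIMEOUT is 60*HZ, reported to userspace in
 * seconds):
 *
 *	sysctl_tcp_syn_retries    ->  net.ipv4.tcp_syn_retries    (6)
 *	sysctl_tcp_synack_retries ->  net.ipv4.tcp_synack_retries (5)
 *	sysctl_tcp_fin_timeout    ->  net.ipv4.tcp_fin_timeout    (60)
 *	sysctl_tcp_tw_reuse       ->  net.ipv4.tcp_tw_reuse       (0)
 */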
fee83d09 2467 cnt = tcp_hashinfo.ehash_mask + 1;
fee83d09 2468 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
1946e672
HY
2469 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2470
fee83d09 2471 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
f9301034 2472 net->ipv4.sysctl_tcp_sack = 1;
9bb37ef0 2473 net->ipv4.sysctl_tcp_window_scaling = 1;
5d2ed052 2474 net->ipv4.sysctl_tcp_timestamps = 1;
fee83d09 2475
e1cfcbe8 2476 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
43713848 2477 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
3733be14
HY
2478 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2479 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
e1cfcbe8 2480
49213555 2481 return 0;
bdbbb852
ED
2482fail:
2483 tcp_sk_exit(net);
2484
2485 return res;
b099ce26
EB
2486}
2487
2488static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2489{
43713848
HY
2490 struct net *net;
2491
1946e672 2492 inet_twsk_purge(&tcp_hashinfo, AF_INET);
43713848
HY
2493
2494 list_for_each_entry(net, net_exit_list, exit_list)
2495 tcp_fastopen_ctx_destroy(net);
046ee902
DL
2496}
2497
2498static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2499 .init = tcp_sk_init,
2500 .exit = tcp_sk_exit,
2501 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2502};
2503
9b0f976f 2504void __init tcp_v4_init(void)
1da177e4 2505{
6a1b3054 2506 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2507 panic("Failed to create the TCP control socket.\n");
1da177e4 2508}