]> git.proxmox.com Git - mirror_ubuntu-hirsute-kernel.git/blame - net/ipv4/tcp_ipv4.c
xfrm: Return dst directly from xfrm_lookup()
[mirror_ubuntu-hirsute-kernel.git] / net / ipv4 / tcp_ipv4.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
1da177e4
LT
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
60236fdd 37 * request_sock handling and moved
1da177e4
LT
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
caa20d9a 40 * Added new listen semantics.
1da177e4
LT
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
1da177e4 53
eb4dea58 54#include <linux/bottom_half.h>
1da177e4
LT
55#include <linux/types.h>
56#include <linux/fcntl.h>
57#include <linux/module.h>
58#include <linux/random.h>
59#include <linux/cache.h>
60#include <linux/jhash.h>
61#include <linux/init.h>
62#include <linux/times.h>
5a0e3ad6 63#include <linux/slab.h>
1da177e4 64
457c4cbc 65#include <net/net_namespace.h>
1da177e4 66#include <net/icmp.h>
304a1618 67#include <net/inet_hashtables.h>
1da177e4 68#include <net/tcp.h>
20380731 69#include <net/transp_v6.h>
1da177e4
LT
70#include <net/ipv6.h>
71#include <net/inet_common.h>
6d6ee43e 72#include <net/timewait_sock.h>
1da177e4 73#include <net/xfrm.h>
1a2449a8 74#include <net/netdma.h>
1da177e4
LT
75
76#include <linux/inet.h>
77#include <linux/ipv6.h>
78#include <linux/stddef.h>
79#include <linux/proc_fs.h>
80#include <linux/seq_file.h>
81
cfb6eeb4
YH
82#include <linux/crypto.h>
83#include <linux/scatterlist.h>
84
ab32ea5d
BH
85int sysctl_tcp_tw_reuse __read_mostly;
86int sysctl_tcp_low_latency __read_mostly;
4bc2f18b 87EXPORT_SYMBOL(sysctl_tcp_low_latency);
1da177e4 88
1da177e4 89
cfb6eeb4 90#ifdef CONFIG_TCP_MD5SIG
7174259e
ACM
91static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92 __be32 addr);
49a72dfb
AL
93static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94 __be32 daddr, __be32 saddr, struct tcphdr *th);
9501f972
YH
95#else
96static inline
97struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98{
99 return NULL;
100}
cfb6eeb4
YH
101#endif
102
5caea4ea 103struct inet_hashinfo tcp_hashinfo;
4bc2f18b 104EXPORT_SYMBOL(tcp_hashinfo);
1da177e4 105
a94f723d 106static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
1da177e4 107{
eddc9ec5
ACM
108 return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 ip_hdr(skb)->saddr,
aa8223c7
ACM
110 tcp_hdr(skb)->dest,
111 tcp_hdr(skb)->source);
1da177e4
LT
112}
113
6d6ee43e
ACM
114int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115{
116 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 struct tcp_sock *tp = tcp_sk(sk);
118
119 /* With PAWS, it is safe from the viewpoint
120 of data integrity. Even without PAWS it is safe provided sequence
121 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122
123 Actually, the idea is close to VJ's one, only timestamp cache is
124 held not per host, but per port pair and TW bucket is used as state
125 holder.
126
127 If TW bucket has been already destroyed we fall back to VJ's scheme
128 and use initial timestamp retrieved from peer table.
129 */
130 if (tcptw->tw_ts_recent_stamp &&
131 (twp == NULL || (sysctl_tcp_tw_reuse &&
9d729f72 132 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
6d6ee43e
ACM
133 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 if (tp->write_seq == 0)
135 tp->write_seq = 1;
136 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
137 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 sock_hold(sktw);
139 return 1;
140 }
141
142 return 0;
143}
6d6ee43e
ACM
144EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145
1da177e4
LT
146/* This will initiate an outgoing connection. */
147int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148{
149 struct inet_sock *inet = inet_sk(sk);
150 struct tcp_sock *tp = tcp_sk(sk);
151 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
dca8b089 152 __be16 orig_sport, orig_dport;
1da177e4 153 struct rtable *rt;
bada8adc 154 __be32 daddr, nexthop;
1da177e4
LT
155 int tmp;
156 int err;
157
158 if (addr_len < sizeof(struct sockaddr_in))
159 return -EINVAL;
160
161 if (usin->sin_family != AF_INET)
162 return -EAFNOSUPPORT;
163
164 nexthop = daddr = usin->sin_addr.s_addr;
165 if (inet->opt && inet->opt->srr) {
166 if (!daddr)
167 return -EINVAL;
168 nexthop = inet->opt->faddr;
169 }
170
dca8b089
DM
171 orig_sport = inet->inet_sport;
172 orig_dport = usin->sin_port;
c720c7e8 173 tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
1da177e4
LT
174 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175 IPPROTO_TCP,
abdf7e72 176 orig_sport, orig_dport, sk, true);
584bdf8c
WD
177 if (tmp < 0) {
178 if (tmp == -ENETUNREACH)
7c73a6fa 179 IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
1da177e4 180 return tmp;
584bdf8c 181 }
1da177e4
LT
182
183 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 ip_rt_put(rt);
185 return -ENETUNREACH;
186 }
187
188 if (!inet->opt || !inet->opt->srr)
189 daddr = rt->rt_dst;
190
c720c7e8
ED
191 if (!inet->inet_saddr)
192 inet->inet_saddr = rt->rt_src;
193 inet->inet_rcv_saddr = inet->inet_saddr;
1da177e4 194
c720c7e8 195 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
1da177e4
LT
196 /* Reset inherited state */
197 tp->rx_opt.ts_recent = 0;
198 tp->rx_opt.ts_recent_stamp = 0;
199 tp->write_seq = 0;
200 }
201
295ff7ed 202 if (tcp_death_row.sysctl_tw_recycle &&
1da177e4
LT
203 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
204 struct inet_peer *peer = rt_get_peer(rt);
7174259e
ACM
205 /*
206 * VJ's idea. We save last timestamp seen from
207 * the destination in peer table, when entering state
208 * TIME-WAIT * and initialize rx_opt.ts_recent from it,
209 * when trying new connection.
1da177e4 210 */
317fe0e6
ED
211 if (peer) {
212 inet_peer_refcheck(peer);
213 if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
214 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
215 tp->rx_opt.ts_recent = peer->tcp_ts;
216 }
1da177e4
LT
217 }
218 }
219
c720c7e8
ED
220 inet->inet_dport = usin->sin_port;
221 inet->inet_daddr = daddr;
1da177e4 222
d83d8461 223 inet_csk(sk)->icsk_ext_hdr_len = 0;
1da177e4 224 if (inet->opt)
d83d8461 225 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
1da177e4 226
bee7ca9e 227 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
1da177e4
LT
228
229 /* Socket identity is still unknown (sport may be zero).
230 * However we set state to SYN-SENT and not releasing socket
231 * lock select source port, enter ourselves into the hash tables and
232 * complete initialization after this.
233 */
234 tcp_set_state(sk, TCP_SYN_SENT);
a7f5e7f1 235 err = inet_hash_connect(&tcp_death_row, sk);
1da177e4
LT
236 if (err)
237 goto failure;
238
7174259e 239 err = ip_route_newports(&rt, IPPROTO_TCP,
dca8b089 240 orig_sport, orig_dport,
c720c7e8 241 inet->inet_sport, inet->inet_dport, sk);
1da177e4
LT
242 if (err)
243 goto failure;
244
245 /* OK, now commit destination to socket. */
bcd76111 246 sk->sk_gso_type = SKB_GSO_TCPV4;
d8d1f30b 247 sk_setup_caps(sk, &rt->dst);
1da177e4
LT
248
249 if (!tp->write_seq)
c720c7e8
ED
250 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
251 inet->inet_daddr,
252 inet->inet_sport,
1da177e4
LT
253 usin->sin_port);
254
c720c7e8 255 inet->inet_id = tp->write_seq ^ jiffies;
1da177e4
LT
256
257 err = tcp_connect(sk);
258 rt = NULL;
259 if (err)
260 goto failure;
261
262 return 0;
263
264failure:
7174259e
ACM
265 /*
266 * This unhashes the socket and releases the local port,
267 * if necessary.
268 */
1da177e4
LT
269 tcp_set_state(sk, TCP_CLOSE);
270 ip_rt_put(rt);
271 sk->sk_route_caps = 0;
c720c7e8 272 inet->inet_dport = 0;
1da177e4
LT
273 return err;
274}
4bc2f18b 275EXPORT_SYMBOL(tcp_v4_connect);
1da177e4 276
1da177e4
LT
277/*
278 * This routine does path mtu discovery as defined in RFC1191.
279 */
40efc6fa 280static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
1da177e4
LT
281{
282 struct dst_entry *dst;
283 struct inet_sock *inet = inet_sk(sk);
1da177e4
LT
284
285 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
286 * send out by Linux are always <576bytes so they should go through
287 * unfragmented).
288 */
289 if (sk->sk_state == TCP_LISTEN)
290 return;
291
292 /* We don't check in the destentry if pmtu discovery is forbidden
293 * on this route. We just assume that no packet_to_big packets
294 * are send back when pmtu discovery is not active.
e905a9ed 295 * There is a small race when the user changes this flag in the
1da177e4
LT
296 * route, but I think that's acceptable.
297 */
298 if ((dst = __sk_dst_check(sk, 0)) == NULL)
299 return;
300
301 dst->ops->update_pmtu(dst, mtu);
302
303 /* Something is about to be wrong... Remember soft error
304 * for the case, if this connection will not able to recover.
305 */
306 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
307 sk->sk_err_soft = EMSGSIZE;
308
309 mtu = dst_mtu(dst);
310
311 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
d83d8461 312 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
1da177e4
LT
313 tcp_sync_mss(sk, mtu);
314
315 /* Resend the TCP packet because it's
316 * clear that the old packet has been
317 * dropped. This is the new "fast" path mtu
318 * discovery.
319 */
320 tcp_simple_retransmit(sk);
321 } /* else let the usual retransmit timer handle it */
322}
323
324/*
325 * This routine is called by the ICMP module when it gets some
326 * sort of error condition. If err < 0 then the socket should
327 * be closed and the error returned to the user. If err > 0
328 * it's just the icmp type << 8 | icmp code. After adjustment
329 * header points to the first 8 bytes of the tcp header. We need
330 * to find the appropriate port.
331 *
332 * The locking strategy used here is very "optimistic". When
333 * someone else accesses the socket the ICMP is just dropped
334 * and for some paths there is no check at all.
335 * A more general error queue to queue errors for later handling
336 * is probably better.
337 *
338 */
339
4d1a2d9e 340void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
1da177e4 341{
4d1a2d9e
DL
342 struct iphdr *iph = (struct iphdr *)icmp_skb->data;
343 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
f1ecd5d9 344 struct inet_connection_sock *icsk;
1da177e4
LT
345 struct tcp_sock *tp;
346 struct inet_sock *inet;
4d1a2d9e
DL
347 const int type = icmp_hdr(icmp_skb)->type;
348 const int code = icmp_hdr(icmp_skb)->code;
1da177e4 349 struct sock *sk;
f1ecd5d9 350 struct sk_buff *skb;
1da177e4 351 __u32 seq;
f1ecd5d9 352 __u32 remaining;
1da177e4 353 int err;
4d1a2d9e 354 struct net *net = dev_net(icmp_skb->dev);
1da177e4 355
4d1a2d9e 356 if (icmp_skb->len < (iph->ihl << 2) + 8) {
dcfc23ca 357 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
1da177e4
LT
358 return;
359 }
360
fd54d716 361 sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
4d1a2d9e 362 iph->saddr, th->source, inet_iif(icmp_skb));
1da177e4 363 if (!sk) {
dcfc23ca 364 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
1da177e4
LT
365 return;
366 }
367 if (sk->sk_state == TCP_TIME_WAIT) {
9469c7b4 368 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
369 return;
370 }
371
372 bh_lock_sock(sk);
373 /* If too many ICMPs get dropped on busy
374 * servers this needs to be solved differently.
375 */
376 if (sock_owned_by_user(sk))
de0744af 377 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
1da177e4
LT
378
379 if (sk->sk_state == TCP_CLOSE)
380 goto out;
381
97e3ecd1 382 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
383 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
384 goto out;
385 }
386
f1ecd5d9 387 icsk = inet_csk(sk);
1da177e4
LT
388 tp = tcp_sk(sk);
389 seq = ntohl(th->seq);
390 if (sk->sk_state != TCP_LISTEN &&
391 !between(seq, tp->snd_una, tp->snd_nxt)) {
de0744af 392 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
393 goto out;
394 }
395
396 switch (type) {
397 case ICMP_SOURCE_QUENCH:
398 /* Just silently ignore these. */
399 goto out;
400 case ICMP_PARAMETERPROB:
401 err = EPROTO;
402 break;
403 case ICMP_DEST_UNREACH:
404 if (code > NR_ICMP_UNREACH)
405 goto out;
406
407 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
408 if (!sock_owned_by_user(sk))
409 do_pmtu_discovery(sk, iph, info);
410 goto out;
411 }
412
413 err = icmp_err_convert[code].errno;
f1ecd5d9
DL
414 /* check if icmp_skb allows revert of backoff
415 * (see draft-zimmermann-tcp-lcd) */
416 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
417 break;
418 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
419 !icsk->icsk_backoff)
420 break;
421
8f49c270
DM
422 if (sock_owned_by_user(sk))
423 break;
424
f1ecd5d9
DL
425 icsk->icsk_backoff--;
426 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
427 icsk->icsk_backoff;
428 tcp_bound_rto(sk);
429
430 skb = tcp_write_queue_head(sk);
431 BUG_ON(!skb);
432
433 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
434 tcp_time_stamp - TCP_SKB_CB(skb)->when);
435
436 if (remaining) {
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
f1ecd5d9
DL
439 } else {
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now */
442 tcp_retransmit_timer(sk);
443 }
444
1da177e4
LT
445 break;
446 case ICMP_TIME_EXCEEDED:
447 err = EHOSTUNREACH;
448 break;
449 default:
450 goto out;
451 }
452
453 switch (sk->sk_state) {
60236fdd 454 struct request_sock *req, **prev;
1da177e4
LT
455 case TCP_LISTEN:
456 if (sock_owned_by_user(sk))
457 goto out;
458
463c84b9
ACM
459 req = inet_csk_search_req(sk, &prev, th->dest,
460 iph->daddr, iph->saddr);
1da177e4
LT
461 if (!req)
462 goto out;
463
464 /* ICMPs are not backlogged, hence we cannot get
465 an established socket here.
466 */
547b792c 467 WARN_ON(req->sk);
1da177e4 468
2e6599cb 469 if (seq != tcp_rsk(req)->snt_isn) {
de0744af 470 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
1da177e4
LT
471 goto out;
472 }
473
474 /*
475 * Still in SYN_RECV, just remove it silently.
476 * There is no good way to pass the error to the newly
477 * created socket, and POSIX does not want network
478 * errors returned from accept().
479 */
463c84b9 480 inet_csk_reqsk_queue_drop(sk, req, prev);
1da177e4
LT
481 goto out;
482
483 case TCP_SYN_SENT:
484 case TCP_SYN_RECV: /* Cannot happen.
485 It can f.e. if SYNs crossed.
486 */
487 if (!sock_owned_by_user(sk)) {
1da177e4
LT
488 sk->sk_err = err;
489
490 sk->sk_error_report(sk);
491
492 tcp_done(sk);
493 } else {
494 sk->sk_err_soft = err;
495 }
496 goto out;
497 }
498
499 /* If we've already connected we will keep trying
500 * until we time out, or the user gives up.
501 *
502 * rfc1122 4.2.3.9 allows to consider as hard errors
503 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
504 * but it is obsoleted by pmtu discovery).
505 *
506 * Note, that in modern internet, where routing is unreliable
507 * and in each dark corner broken firewalls sit, sending random
508 * errors ordered by their masters even this two messages finally lose
509 * their original sense (even Linux sends invalid PORT_UNREACHs)
510 *
511 * Now we are in compliance with RFCs.
512 * --ANK (980905)
513 */
514
515 inet = inet_sk(sk);
516 if (!sock_owned_by_user(sk) && inet->recverr) {
517 sk->sk_err = err;
518 sk->sk_error_report(sk);
519 } else { /* Only an error on timeout */
520 sk->sk_err_soft = err;
521 }
522
523out:
524 bh_unlock_sock(sk);
525 sock_put(sk);
526}
527
419f9f89
HX
528static void __tcp_v4_send_check(struct sk_buff *skb,
529 __be32 saddr, __be32 daddr)
1da177e4 530{
aa8223c7 531 struct tcphdr *th = tcp_hdr(skb);
1da177e4 532
84fa7933 533 if (skb->ip_summed == CHECKSUM_PARTIAL) {
419f9f89 534 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
663ead3b 535 skb->csum_start = skb_transport_header(skb) - skb->head;
ff1dcadb 536 skb->csum_offset = offsetof(struct tcphdr, check);
1da177e4 537 } else {
419f9f89 538 th->check = tcp_v4_check(skb->len, saddr, daddr,
07f0757a 539 csum_partial(th,
1da177e4
LT
540 th->doff << 2,
541 skb->csum));
542 }
543}
544
419f9f89 545/* This routine computes an IPv4 TCP checksum. */
bb296246 546void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
419f9f89
HX
547{
548 struct inet_sock *inet = inet_sk(sk);
549
550 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
551}
4bc2f18b 552EXPORT_SYMBOL(tcp_v4_send_check);
419f9f89 553
a430a43d
HX
554int tcp_v4_gso_send_check(struct sk_buff *skb)
555{
eddc9ec5 556 const struct iphdr *iph;
a430a43d
HX
557 struct tcphdr *th;
558
559 if (!pskb_may_pull(skb, sizeof(*th)))
560 return -EINVAL;
561
eddc9ec5 562 iph = ip_hdr(skb);
aa8223c7 563 th = tcp_hdr(skb);
a430a43d
HX
564
565 th->check = 0;
84fa7933 566 skb->ip_summed = CHECKSUM_PARTIAL;
419f9f89 567 __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
a430a43d
HX
568 return 0;
569}
570
1da177e4
LT
571/*
572 * This routine will send an RST to the other tcp.
573 *
574 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
575 * for reset.
576 * Answer: if a packet caused RST, it is not for a socket
577 * existing in our system, if it is matched to a socket,
578 * it is just duplicate segment or bug in other side's TCP.
579 * So that we build reply only basing on parameters
580 * arrived with segment.
581 * Exception: precedence violation. We do not implement it in any case.
582 */
583
cfb6eeb4 584static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
1da177e4 585{
aa8223c7 586 struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
587 struct {
588 struct tcphdr th;
589#ifdef CONFIG_TCP_MD5SIG
714e85be 590 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
cfb6eeb4
YH
591#endif
592 } rep;
1da177e4 593 struct ip_reply_arg arg;
cfb6eeb4
YH
594#ifdef CONFIG_TCP_MD5SIG
595 struct tcp_md5sig_key *key;
596#endif
a86b1e30 597 struct net *net;
1da177e4
LT
598
599 /* Never send a reset in response to a reset. */
600 if (th->rst)
601 return;
602
511c3f92 603 if (skb_rtable(skb)->rt_type != RTN_LOCAL)
1da177e4
LT
604 return;
605
606 /* Swap the send and the receive. */
cfb6eeb4
YH
607 memset(&rep, 0, sizeof(rep));
608 rep.th.dest = th->source;
609 rep.th.source = th->dest;
610 rep.th.doff = sizeof(struct tcphdr) / 4;
611 rep.th.rst = 1;
1da177e4
LT
612
613 if (th->ack) {
cfb6eeb4 614 rep.th.seq = th->ack_seq;
1da177e4 615 } else {
cfb6eeb4
YH
616 rep.th.ack = 1;
617 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
618 skb->len - (th->doff << 2));
1da177e4
LT
619 }
620
7174259e 621 memset(&arg, 0, sizeof(arg));
cfb6eeb4
YH
622 arg.iov[0].iov_base = (unsigned char *)&rep;
623 arg.iov[0].iov_len = sizeof(rep.th);
624
625#ifdef CONFIG_TCP_MD5SIG
eddc9ec5 626 key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
cfb6eeb4
YH
627 if (key) {
628 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
629 (TCPOPT_NOP << 16) |
630 (TCPOPT_MD5SIG << 8) |
631 TCPOLEN_MD5SIG);
632 /* Update length and the length the header thinks exists */
633 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
634 rep.th.doff = arg.iov[0].iov_len / 4;
635
49a72dfb 636 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
78e645cb
IJ
637 key, ip_hdr(skb)->saddr,
638 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
639 }
640#endif
eddc9ec5
ACM
641 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
642 ip_hdr(skb)->saddr, /* XXX */
52cd5750 643 arg.iov[0].iov_len, IPPROTO_TCP, 0);
1da177e4 644 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
88ef4a5a 645 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
1da177e4 646
adf30907 647 net = dev_net(skb_dst(skb)->dev);
a86b1e30 648 ip_send_reply(net->ipv4.tcp_sock, skb,
7feb49c8 649 &arg, arg.iov[0].iov_len);
1da177e4 650
63231bdd
PE
651 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
652 TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
1da177e4
LT
653}
654
655/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
656 outside socket context is ugly, certainly. What can I do?
657 */
658
9501f972
YH
659static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
660 u32 win, u32 ts, int oif,
88ef4a5a
KK
661 struct tcp_md5sig_key *key,
662 int reply_flags)
1da177e4 663{
aa8223c7 664 struct tcphdr *th = tcp_hdr(skb);
1da177e4
LT
665 struct {
666 struct tcphdr th;
714e85be 667 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
cfb6eeb4 668#ifdef CONFIG_TCP_MD5SIG
714e85be 669 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
cfb6eeb4
YH
670#endif
671 ];
1da177e4
LT
672 } rep;
673 struct ip_reply_arg arg;
adf30907 674 struct net *net = dev_net(skb_dst(skb)->dev);
1da177e4
LT
675
676 memset(&rep.th, 0, sizeof(struct tcphdr));
7174259e 677 memset(&arg, 0, sizeof(arg));
1da177e4
LT
678
679 arg.iov[0].iov_base = (unsigned char *)&rep;
680 arg.iov[0].iov_len = sizeof(rep.th);
681 if (ts) {
cfb6eeb4
YH
682 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
683 (TCPOPT_TIMESTAMP << 8) |
684 TCPOLEN_TIMESTAMP);
685 rep.opt[1] = htonl(tcp_time_stamp);
686 rep.opt[2] = htonl(ts);
cb48cfe8 687 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
1da177e4
LT
688 }
689
690 /* Swap the send and the receive. */
691 rep.th.dest = th->source;
692 rep.th.source = th->dest;
693 rep.th.doff = arg.iov[0].iov_len / 4;
694 rep.th.seq = htonl(seq);
695 rep.th.ack_seq = htonl(ack);
696 rep.th.ack = 1;
697 rep.th.window = htons(win);
698
cfb6eeb4 699#ifdef CONFIG_TCP_MD5SIG
cfb6eeb4
YH
700 if (key) {
701 int offset = (ts) ? 3 : 0;
702
703 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
704 (TCPOPT_NOP << 16) |
705 (TCPOPT_MD5SIG << 8) |
706 TCPOLEN_MD5SIG);
707 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
708 rep.th.doff = arg.iov[0].iov_len/4;
709
49a72dfb 710 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
90b7e112
AL
711 key, ip_hdr(skb)->saddr,
712 ip_hdr(skb)->daddr, &rep.th);
cfb6eeb4
YH
713 }
714#endif
88ef4a5a 715 arg.flags = reply_flags;
eddc9ec5
ACM
716 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
717 ip_hdr(skb)->saddr, /* XXX */
1da177e4
LT
718 arg.iov[0].iov_len, IPPROTO_TCP, 0);
719 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
9501f972
YH
720 if (oif)
721 arg.bound_dev_if = oif;
1da177e4 722
a86b1e30 723 ip_send_reply(net->ipv4.tcp_sock, skb,
7feb49c8 724 &arg, arg.iov[0].iov_len);
1da177e4 725
63231bdd 726 TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
1da177e4
LT
727}
728
729static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
730{
8feaf0c0 731 struct inet_timewait_sock *tw = inet_twsk(sk);
cfb6eeb4 732 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
1da177e4 733
9501f972 734 tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
7174259e 735 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
9501f972
YH
736 tcptw->tw_ts_recent,
737 tw->tw_bound_dev_if,
88ef4a5a
KK
738 tcp_twsk_md5_key(tcptw),
739 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
9501f972 740 );
1da177e4 741
8feaf0c0 742 inet_twsk_put(tw);
1da177e4
LT
743}
744
6edafaaf 745static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
7174259e 746 struct request_sock *req)
1da177e4 747{
9501f972 748 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
cfb6eeb4 749 tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
9501f972
YH
750 req->ts_recent,
751 0,
88ef4a5a
KK
752 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
753 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
1da177e4
LT
754}
755
1da177e4 756/*
9bf1d83e 757 * Send a SYN-ACK after having received a SYN.
60236fdd 758 * This still operates on a request_sock only, not on a big
1da177e4
LT
759 * socket.
760 */
72659ecc
OP
761static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
762 struct request_sock *req,
763 struct request_values *rvp)
1da177e4 764{
2e6599cb 765 const struct inet_request_sock *ireq = inet_rsk(req);
1da177e4
LT
766 int err = -1;
767 struct sk_buff * skb;
768
769 /* First, grab a route. */
463c84b9 770 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
fd80eb94 771 return -1;
1da177e4 772
e6b4d113 773 skb = tcp_make_synack(sk, dst, req, rvp);
1da177e4
LT
774
775 if (skb) {
419f9f89 776 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
1da177e4 777
2e6599cb
ACM
778 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
779 ireq->rmt_addr,
780 ireq->opt);
b9df3cb8 781 err = net_xmit_eval(err);
1da177e4
LT
782 }
783
1da177e4
LT
784 dst_release(dst);
785 return err;
786}
787
72659ecc 788static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
e6b4d113 789 struct request_values *rvp)
fd80eb94 790{
72659ecc
OP
791 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
792 return tcp_v4_send_synack(sk, NULL, req, rvp);
fd80eb94
DL
793}
794
1da177e4 795/*
60236fdd 796 * IPv4 request_sock destructor.
1da177e4 797 */
60236fdd 798static void tcp_v4_reqsk_destructor(struct request_sock *req)
1da177e4 799{
a51482bd 800 kfree(inet_rsk(req)->opt);
1da177e4
LT
801}
802
2a1d4bd4 803static void syn_flood_warning(const struct sk_buff *skb)
1da177e4 804{
2a1d4bd4 805 const char *msg;
1da177e4 806
2a1d4bd4
FW
807#ifdef CONFIG_SYN_COOKIES
808 if (sysctl_tcp_syncookies)
809 msg = "Sending cookies";
810 else
80e40daa 811#endif
2a1d4bd4
FW
812 msg = "Dropping request";
813
814 pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
815 ntohs(tcp_hdr(skb)->dest), msg);
816}
1da177e4
LT
817
818/*
60236fdd 819 * Save and compile IPv4 options into the request_sock if needed.
1da177e4 820 */
40efc6fa
SH
821static struct ip_options *tcp_v4_save_options(struct sock *sk,
822 struct sk_buff *skb)
1da177e4
LT
823{
824 struct ip_options *opt = &(IPCB(skb)->opt);
825 struct ip_options *dopt = NULL;
826
827 if (opt && opt->optlen) {
828 int opt_size = optlength(opt);
829 dopt = kmalloc(opt_size, GFP_ATOMIC);
830 if (dopt) {
831 if (ip_options_echo(dopt, skb)) {
832 kfree(dopt);
833 dopt = NULL;
834 }
835 }
836 }
837 return dopt;
838}
839
cfb6eeb4
YH
840#ifdef CONFIG_TCP_MD5SIG
841/*
842 * RFC2385 MD5 checksumming requires a mapping of
843 * IP address->MD5 Key.
844 * We need to maintain these in the sk structure.
845 */
846
847/* Find the Key structure for an address. */
7174259e
ACM
848static struct tcp_md5sig_key *
849 tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
cfb6eeb4
YH
850{
851 struct tcp_sock *tp = tcp_sk(sk);
852 int i;
853
854 if (!tp->md5sig_info || !tp->md5sig_info->entries4)
855 return NULL;
856 for (i = 0; i < tp->md5sig_info->entries4; i++) {
857 if (tp->md5sig_info->keys4[i].addr == addr)
f8ab18d2 858 return &tp->md5sig_info->keys4[i].base;
cfb6eeb4
YH
859 }
860 return NULL;
861}
862
863struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
864 struct sock *addr_sk)
865{
c720c7e8 866 return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
cfb6eeb4 867}
cfb6eeb4
YH
868EXPORT_SYMBOL(tcp_v4_md5_lookup);
869
f5b99bcd
AB
870static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
871 struct request_sock *req)
cfb6eeb4
YH
872{
873 return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
874}
875
876/* This can be called on a newly created socket, from other files */
877int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
878 u8 *newkey, u8 newkeylen)
879{
880 /* Add Key to the list */
b0a713e9 881 struct tcp_md5sig_key *key;
cfb6eeb4
YH
882 struct tcp_sock *tp = tcp_sk(sk);
883 struct tcp4_md5sig_key *keys;
884
b0a713e9 885 key = tcp_v4_md5_do_lookup(sk, addr);
cfb6eeb4
YH
886 if (key) {
887 /* Pre-existing entry - just update that one. */
b0a713e9
MD
888 kfree(key->key);
889 key->key = newkey;
890 key->keylen = newkeylen;
cfb6eeb4 891 } else {
f6685938
ACM
892 struct tcp_md5sig_info *md5sig;
893
cfb6eeb4 894 if (!tp->md5sig_info) {
f6685938
ACM
895 tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
896 GFP_ATOMIC);
cfb6eeb4
YH
897 if (!tp->md5sig_info) {
898 kfree(newkey);
899 return -ENOMEM;
900 }
a465419b 901 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
cfb6eeb4 902 }
aa133076 903 if (tcp_alloc_md5sig_pool(sk) == NULL) {
cfb6eeb4
YH
904 kfree(newkey);
905 return -ENOMEM;
906 }
f6685938
ACM
907 md5sig = tp->md5sig_info;
908
909 if (md5sig->alloced4 == md5sig->entries4) {
910 keys = kmalloc((sizeof(*keys) *
e905a9ed 911 (md5sig->entries4 + 1)), GFP_ATOMIC);
cfb6eeb4
YH
912 if (!keys) {
913 kfree(newkey);
914 tcp_free_md5sig_pool();
915 return -ENOMEM;
916 }
917
f6685938
ACM
918 if (md5sig->entries4)
919 memcpy(keys, md5sig->keys4,
920 sizeof(*keys) * md5sig->entries4);
cfb6eeb4
YH
921
922 /* Free old key list, and reference new one */
a80cc20d 923 kfree(md5sig->keys4);
f6685938
ACM
924 md5sig->keys4 = keys;
925 md5sig->alloced4++;
cfb6eeb4 926 }
f6685938 927 md5sig->entries4++;
f8ab18d2
DM
928 md5sig->keys4[md5sig->entries4 - 1].addr = addr;
929 md5sig->keys4[md5sig->entries4 - 1].base.key = newkey;
930 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
cfb6eeb4
YH
931 }
932 return 0;
933}
cfb6eeb4
YH
934EXPORT_SYMBOL(tcp_v4_md5_do_add);
935
936static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
937 u8 *newkey, u8 newkeylen)
938{
c720c7e8 939 return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
cfb6eeb4
YH
940 newkey, newkeylen);
941}
942
943int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
944{
945 struct tcp_sock *tp = tcp_sk(sk);
946 int i;
947
948 for (i = 0; i < tp->md5sig_info->entries4; i++) {
949 if (tp->md5sig_info->keys4[i].addr == addr) {
950 /* Free the key */
f8ab18d2 951 kfree(tp->md5sig_info->keys4[i].base.key);
cfb6eeb4
YH
952 tp->md5sig_info->entries4--;
953
954 if (tp->md5sig_info->entries4 == 0) {
955 kfree(tp->md5sig_info->keys4);
956 tp->md5sig_info->keys4 = NULL;
8228a18d 957 tp->md5sig_info->alloced4 = 0;
7174259e 958 } else if (tp->md5sig_info->entries4 != i) {
cfb6eeb4 959 /* Need to do some manipulation */
354faf09
YH
960 memmove(&tp->md5sig_info->keys4[i],
961 &tp->md5sig_info->keys4[i+1],
962 (tp->md5sig_info->entries4 - i) *
963 sizeof(struct tcp4_md5sig_key));
cfb6eeb4
YH
964 }
965 tcp_free_md5sig_pool();
966 return 0;
967 }
968 }
969 return -ENOENT;
970}
cfb6eeb4
YH
971EXPORT_SYMBOL(tcp_v4_md5_do_del);
972
7174259e 973static void tcp_v4_clear_md5_list(struct sock *sk)
cfb6eeb4
YH
974{
975 struct tcp_sock *tp = tcp_sk(sk);
976
977 /* Free each key, then the set of key keys,
978 * the crypto element, and then decrement our
979 * hold on the last resort crypto.
980 */
981 if (tp->md5sig_info->entries4) {
982 int i;
983 for (i = 0; i < tp->md5sig_info->entries4; i++)
f8ab18d2 984 kfree(tp->md5sig_info->keys4[i].base.key);
cfb6eeb4
YH
985 tp->md5sig_info->entries4 = 0;
986 tcp_free_md5sig_pool();
987 }
988 if (tp->md5sig_info->keys4) {
989 kfree(tp->md5sig_info->keys4);
990 tp->md5sig_info->keys4 = NULL;
991 tp->md5sig_info->alloced4 = 0;
992 }
993}
994
7174259e
ACM
995static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996 int optlen)
cfb6eeb4
YH
997{
998 struct tcp_md5sig cmd;
999 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000 u8 *newkey;
1001
1002 if (optlen < sizeof(cmd))
1003 return -EINVAL;
1004
7174259e 1005 if (copy_from_user(&cmd, optval, sizeof(cmd)))
cfb6eeb4
YH
1006 return -EFAULT;
1007
1008 if (sin->sin_family != AF_INET)
1009 return -EINVAL;
1010
1011 if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1012 if (!tcp_sk(sk)->md5sig_info)
1013 return -ENOENT;
1014 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1015 }
1016
1017 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1018 return -EINVAL;
1019
1020 if (!tcp_sk(sk)->md5sig_info) {
1021 struct tcp_sock *tp = tcp_sk(sk);
aa133076 1022 struct tcp_md5sig_info *p;
cfb6eeb4 1023
aa133076 1024 p = kzalloc(sizeof(*p), sk->sk_allocation);
cfb6eeb4
YH
1025 if (!p)
1026 return -EINVAL;
1027
1028 tp->md5sig_info = p;
a465419b 1029 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1030 }
1031
aa133076 1032 newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
cfb6eeb4
YH
1033 if (!newkey)
1034 return -ENOMEM;
cfb6eeb4
YH
1035 return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1036 newkey, cmd.tcpm_keylen);
1037}
1038
49a72dfb
AL
1039static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1040 __be32 daddr, __be32 saddr, int nbytes)
cfb6eeb4 1041{
cfb6eeb4 1042 struct tcp4_pseudohdr *bp;
49a72dfb 1043 struct scatterlist sg;
cfb6eeb4
YH
1044
1045 bp = &hp->md5_blk.ip4;
cfb6eeb4
YH
1046
1047 /*
49a72dfb 1048 * 1. the TCP pseudo-header (in the order: source IP address,
cfb6eeb4
YH
1049 * destination IP address, zero-padded protocol number, and
1050 * segment length)
1051 */
1052 bp->saddr = saddr;
1053 bp->daddr = daddr;
1054 bp->pad = 0;
076fb722 1055 bp->protocol = IPPROTO_TCP;
49a72dfb 1056 bp->len = cpu_to_be16(nbytes);
c7da57a1 1057
49a72dfb
AL
1058 sg_init_one(&sg, bp, sizeof(*bp));
1059 return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1060}
1061
1062static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1063 __be32 daddr, __be32 saddr, struct tcphdr *th)
1064{
1065 struct tcp_md5sig_pool *hp;
1066 struct hash_desc *desc;
1067
1068 hp = tcp_get_md5sig_pool();
1069 if (!hp)
1070 goto clear_hash_noput;
1071 desc = &hp->md5_desc;
1072
1073 if (crypto_hash_init(desc))
1074 goto clear_hash;
1075 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1076 goto clear_hash;
1077 if (tcp_md5_hash_header(hp, th))
1078 goto clear_hash;
1079 if (tcp_md5_hash_key(hp, key))
1080 goto clear_hash;
1081 if (crypto_hash_final(desc, md5_hash))
cfb6eeb4
YH
1082 goto clear_hash;
1083
cfb6eeb4 1084 tcp_put_md5sig_pool();
cfb6eeb4 1085 return 0;
49a72dfb 1086
cfb6eeb4
YH
1087clear_hash:
1088 tcp_put_md5sig_pool();
1089clear_hash_noput:
1090 memset(md5_hash, 0, 16);
49a72dfb 1091 return 1;
cfb6eeb4
YH
1092}
1093
49a72dfb
AL
1094int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1095 struct sock *sk, struct request_sock *req,
1096 struct sk_buff *skb)
cfb6eeb4 1097{
49a72dfb
AL
1098 struct tcp_md5sig_pool *hp;
1099 struct hash_desc *desc;
1100 struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4
YH
1101 __be32 saddr, daddr;
1102
1103 if (sk) {
c720c7e8
ED
1104 saddr = inet_sk(sk)->inet_saddr;
1105 daddr = inet_sk(sk)->inet_daddr;
49a72dfb
AL
1106 } else if (req) {
1107 saddr = inet_rsk(req)->loc_addr;
1108 daddr = inet_rsk(req)->rmt_addr;
cfb6eeb4 1109 } else {
49a72dfb
AL
1110 const struct iphdr *iph = ip_hdr(skb);
1111 saddr = iph->saddr;
1112 daddr = iph->daddr;
cfb6eeb4 1113 }
49a72dfb
AL
1114
1115 hp = tcp_get_md5sig_pool();
1116 if (!hp)
1117 goto clear_hash_noput;
1118 desc = &hp->md5_desc;
1119
1120 if (crypto_hash_init(desc))
1121 goto clear_hash;
1122
1123 if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1124 goto clear_hash;
1125 if (tcp_md5_hash_header(hp, th))
1126 goto clear_hash;
1127 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1128 goto clear_hash;
1129 if (tcp_md5_hash_key(hp, key))
1130 goto clear_hash;
1131 if (crypto_hash_final(desc, md5_hash))
1132 goto clear_hash;
1133
1134 tcp_put_md5sig_pool();
1135 return 0;
1136
1137clear_hash:
1138 tcp_put_md5sig_pool();
1139clear_hash_noput:
1140 memset(md5_hash, 0, 16);
1141 return 1;
cfb6eeb4 1142}
49a72dfb 1143EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
cfb6eeb4 1144
7174259e 1145static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
cfb6eeb4
YH
1146{
1147 /*
1148 * This gets called for each TCP segment that arrives
1149 * so we want to be efficient.
1150 * We have 3 drop cases:
1151 * o No MD5 hash and one expected.
1152 * o MD5 hash and we're not expecting one.
1153 * o MD5 hash and its wrong.
1154 */
1155 __u8 *hash_location = NULL;
1156 struct tcp_md5sig_key *hash_expected;
eddc9ec5 1157 const struct iphdr *iph = ip_hdr(skb);
aa8223c7 1158 struct tcphdr *th = tcp_hdr(skb);
cfb6eeb4 1159 int genhash;
cfb6eeb4
YH
1160 unsigned char newhash[16];
1161
1162 hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
7d5d5525 1163 hash_location = tcp_parse_md5sig_option(th);
cfb6eeb4 1164
cfb6eeb4
YH
1165 /* We've parsed the options - do we have a hash? */
1166 if (!hash_expected && !hash_location)
1167 return 0;
1168
1169 if (hash_expected && !hash_location) {
785957d3 1170 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
cfb6eeb4
YH
1171 return 1;
1172 }
1173
1174 if (!hash_expected && hash_location) {
785957d3 1175 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
cfb6eeb4
YH
1176 return 1;
1177 }
1178
1179 /* Okay, so this is hash_expected and hash_location -
1180 * so we need to calculate the checksum.
1181 */
49a72dfb
AL
1182 genhash = tcp_v4_md5_hash_skb(newhash,
1183 hash_expected,
1184 NULL, NULL, skb);
cfb6eeb4
YH
1185
1186 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1187 if (net_ratelimit()) {
673d57e7
HH
1188 printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1189 &iph->saddr, ntohs(th->source),
1190 &iph->daddr, ntohs(th->dest),
cfb6eeb4 1191 genhash ? " tcp_v4_calc_md5_hash failed" : "");
cfb6eeb4
YH
1192 }
1193 return 1;
1194 }
1195 return 0;
1196}
1197
1198#endif
1199
72a3effa 1200struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1da177e4 1201 .family = PF_INET,
2e6599cb 1202 .obj_size = sizeof(struct tcp_request_sock),
72659ecc 1203 .rtx_syn_ack = tcp_v4_rtx_synack,
60236fdd
ACM
1204 .send_ack = tcp_v4_reqsk_send_ack,
1205 .destructor = tcp_v4_reqsk_destructor,
1da177e4 1206 .send_reset = tcp_v4_send_reset,
72659ecc 1207 .syn_ack_timeout = tcp_syn_ack_timeout,
1da177e4
LT
1208};
1209
#ifdef CONFIG_TCP_MD5SIG
/* MD5 helpers attached to IPv4 request socks (see tcp_v4_conn_request()). */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	= tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
};
#endif
cfb6eeb4 1216
1da177e4
LT
/*
 * Handle a connection request (SYN) received in @skb on listening socket @sk.
 *
 * Outline, in the order the code runs:
 *  - refuse requests routed as broadcast/multicast;
 *  - when the request queue is full and no ISN was supplied in
 *    TCP_SKB_CB(skb)->when, either fall back to syncookies (if enabled)
 *    or drop;
 *  - drop when the accept queue is full and more than one young request
 *    is already pending;
 *  - allocate a request sock, parse the TCP options, process the cookie
 *    extension, run the security hook and pick the initial sequence number;
 *  - send the SYN-ACK and, unless a syncookie was used, hash the request
 *    into the listener's request queue.
 *
 * Every path returns 0; failures only differ in what is released first
 * (drop_and_release: dst and req, drop_and_free: req only, drop: nothing).
 */
1217int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1218{
4957faad 1219 struct tcp_extend_values tmp_ext;
1da177e4 1220 struct tcp_options_received tmp_opt;
4957faad 1221 u8 *hash_location;
60236fdd 1222 struct request_sock *req;
e6b4d113 1223 struct inet_request_sock *ireq;
4957faad 1224 struct tcp_sock *tp = tcp_sk(sk);
e6b4d113 1225 struct dst_entry *dst = NULL;
eddc9ec5
ACM
1226 __be32 saddr = ip_hdr(skb)->saddr;
1227 __be32 daddr = ip_hdr(skb)->daddr;
1da177e4 1228 __u32 isn = TCP_SKB_CB(skb)->when;
1da177e4
LT
1229#ifdef CONFIG_SYN_COOKIES
1230 int want_cookie = 0;
1231#else
1232#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1233#endif
1234
1235 /* Never answer to SYNs send to broadcast or multicast */
511c3f92 1236 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1da177e4
LT
1237 goto drop;
1238
1239 /* TW buckets are converted to open requests without
1240 * limitations, they conserve resources and peer is
1241 * evidently real one.
1242 */
463c84b9 1243 if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
2a1d4bd4
FW
1244 if (net_ratelimit())
1245 syn_flood_warning(skb);
1da177e4
LT
1246#ifdef CONFIG_SYN_COOKIES
1247 if (sysctl_tcp_syncookies) {
1248 want_cookie = 1;
1249 } else
1250#endif
1251 goto drop;
1252 }
1253
1254 /* Accept backlog is full. If we have already queued enough
1255 * of warm entries in syn queue, drop request. It is better than
1256 * clogging syn queue with openreqs with exponentially increasing
1257 * timeout.
1258 */
463c84b9 1259 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1da177e4
LT
1260 goto drop;
1261
ce4a7d0d 1262 req = inet_reqsk_alloc(&tcp_request_sock_ops);
1da177e4
LT
1263 if (!req)
1264 goto drop;
1265
cfb6eeb4
YH
1266#ifdef CONFIG_TCP_MD5SIG
1267 tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1268#endif
1269
1da177e4 1270 tcp_clear_options(&tmp_opt);
bee7ca9e 1271 tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4957faad 1272 tmp_opt.user_mss = tp->rx_opt.user_mss;
bb5b7c11 1273 tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
4957faad
WAS
1274
/* Cookie extension: taken only when the SYN carried a cookie option and a
 * timestamp, the listener does not forbid outgoing cookies, and cookies are
 * enabled either globally or on this socket.
 */
1275 if (tmp_opt.cookie_plus > 0 &&
1276 tmp_opt.saw_tstamp &&
1277 !tp->rx_opt.cookie_out_never &&
1278 (sysctl_tcp_cookie_size > 0 ||
1279 (tp->cookie_values != NULL &&
1280 tp->cookie_values->cookie_desired > 0))) {
1281 u8 *c;
1282 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1283 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1284
1285 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1286 goto drop_and_release;
1287
1288 /* Secret recipe starts with IP addresses */
0eae88f3
ED
1289 *mess++ ^= (__force u32)daddr;
1290 *mess++ ^= (__force u32)saddr;
1da177e4 1291
4957faad
WAS
1292 /* plus variable length Initiator Cookie */
1293 c = (u8 *)mess;
1294 while (l-- > 0)
1295 *c++ ^= *hash_location++;
1296
1297#ifdef CONFIG_SYN_COOKIES
1298 want_cookie = 0; /* not our kind of cookie */
1299#endif
1300 tmp_ext.cookie_out_never = 0; /* false */
1301 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1302 } else if (!tp->rx_opt.cookie_in_always) {
1303 /* redundant indications, but ensure initialization. */
1304 tmp_ext.cookie_out_never = 1; /* true */
1305 tmp_ext.cookie_plus = 0;
1306 } else {
1307 goto drop_and_release;
1308 }
1309 tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1da177e4 1310
/* With a syncookie and no timestamp, all parsed options are discarded. */
4dfc2817 1311 if (want_cookie && !tmp_opt.saw_tstamp)
1da177e4 1312 tcp_clear_options(&tmp_opt);
1da177e4 1313
1da177e4 1314 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1da177e4
LT
1315 tcp_openreq_init(req, &tmp_opt, skb);
1316
bb5b7c11
DM
1317 ireq = inet_rsk(req);
1318 ireq->loc_addr = daddr;
1319 ireq->rmt_addr = saddr;
1320 ireq->no_srccheck = inet_sk(sk)->transparent;
1321 ireq->opt = tcp_v4_save_options(sk, skb);
1322
284904aa 1323 if (security_inet_conn_request(sk, skb, req))
bb5b7c11 1324 goto drop_and_free;
284904aa 1325
172d69e6 1326 if (!want_cookie || tmp_opt.tstamp_ok)
aa8223c7 1327 TCP_ECN_create_request(req, tcp_hdr(skb));
1da177e4
LT
1328
/* ISN selection: syncookie value, the ISN handed in via
 * TCP_SKB_CB(skb)->when, or a freshly generated one.
 */
1329 if (want_cookie) {
1da177e4 1330 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
172d69e6 1331 req->cookie_ts = tmp_opt.tstamp_ok;
1da177e4
LT
1332 } else if (!isn) {
1333 struct inet_peer *peer = NULL;
1334
1335 /* VJ's idea. We save last timestamp seen
1336 * from the destination in peer table, when entering
1337 * state TIME-WAIT, and check against it before
1338 * accepting new connection request.
1339 *
1340 * If "isn" is not zero, this request hit alive
1341 * timewait bucket, so that all the necessary checks
1342 * are made in the function processing timewait state.
1343 */
1344 if (tmp_opt.saw_tstamp &&
295ff7ed 1345 tcp_death_row.sysctl_tw_recycle &&
bb5b7c11 1346 (dst = inet_csk_route_req(sk, req)) != NULL &&
1da177e4 1347 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
7a71ed89 1348 peer->daddr.addr.a4 == saddr) {
317fe0e6 1349 inet_peer_refcheck(peer);
2c1409a0 1350 if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1da177e4
LT
1351 (s32)(peer->tcp_ts - req->ts_recent) >
1352 TCP_PAWS_WINDOW) {
de0744af 1353 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
7cd04fa7 1354 goto drop_and_release;
1da177e4
LT
1355 }
1356 }
1357 /* Kill the following clause, if you dislike this way. */
1358 else if (!sysctl_tcp_syncookies &&
463c84b9 1359 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1da177e4
LT
1360 (sysctl_max_syn_backlog >> 2)) &&
1361 (!peer || !peer->tcp_ts_stamp) &&
1362 (!dst || !dst_metric(dst, RTAX_RTT))) {
1363 /* Without syncookies last quarter of
1364 * backlog is filled with destinations,
1365 * proven to be alive.
1366 * It means that we continue to communicate
1367 * to destinations, already remembered
1368 * to the moment of synflood.
1369 */
673d57e7
HH
1370 LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1371 &saddr, ntohs(tcp_hdr(skb)->source));
7cd04fa7 1372 goto drop_and_release;
1da177e4
LT
1373 }
1374
a94f723d 1375 isn = tcp_v4_init_sequence(skb);
1da177e4 1376 }
2e6599cb 1377 tcp_rsk(req)->snt_isn = isn;
1da177e4 1378
/* A syncookie request is freed right after the SYN-ACK attempt, whether or
 * not the send succeeded; it is never added to the request queue.
 */
72659ecc
OP
1379 if (tcp_v4_send_synack(sk, dst, req,
1380 (struct request_values *)&tmp_ext) ||
4957faad 1381 want_cookie)
1da177e4
LT
1382 goto drop_and_free;
1383
7cd04fa7 1384 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1da177e4
LT
1385 return 0;
1386
7cd04fa7
DL
1387drop_and_release:
1388 dst_release(dst);
1da177e4 1389drop_and_free:
60236fdd 1390 reqsk_free(req);
1da177e4 1391drop:
1da177e4
LT
1392 return 0;
1393}
4bc2f18b 1394EXPORT_SYMBOL(tcp_v4_conn_request);
1da177e4
LT
1395
1396
1397/*
1398 * The three way handshake has completed - we got a valid synack -
1399 * now create the new socket.
1400 */
1401struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
60236fdd 1402 struct request_sock *req,
1da177e4
LT
1403 struct dst_entry *dst)
1404{
2e6599cb 1405 struct inet_request_sock *ireq;
1da177e4
LT
1406 struct inet_sock *newinet;
1407 struct tcp_sock *newtp;
1408 struct sock *newsk;
cfb6eeb4
YH
1409#ifdef CONFIG_TCP_MD5SIG
1410 struct tcp_md5sig_key *key;
1411#endif
1da177e4
LT
1412
1413 if (sk_acceptq_is_full(sk))
1414 goto exit_overflow;
1415
463c84b9 1416 if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1da177e4
LT
1417 goto exit;
1418
1419 newsk = tcp_create_openreq_child(sk, req, skb);
1420 if (!newsk)
093d2823 1421 goto exit_nonewsk;
1da177e4 1422
bcd76111 1423 newsk->sk_gso_type = SKB_GSO_TCPV4;
6cbb0df7 1424 sk_setup_caps(newsk, dst);
1da177e4
LT
1425
1426 newtp = tcp_sk(newsk);
1427 newinet = inet_sk(newsk);
2e6599cb 1428 ireq = inet_rsk(req);
c720c7e8
ED
1429 newinet->inet_daddr = ireq->rmt_addr;
1430 newinet->inet_rcv_saddr = ireq->loc_addr;
1431 newinet->inet_saddr = ireq->loc_addr;
2e6599cb
ACM
1432 newinet->opt = ireq->opt;
1433 ireq->opt = NULL;
463c84b9 1434 newinet->mc_index = inet_iif(skb);
eddc9ec5 1435 newinet->mc_ttl = ip_hdr(skb)->ttl;
d83d8461 1436 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1da177e4 1437 if (newinet->opt)
d83d8461 1438 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
c720c7e8 1439 newinet->inet_id = newtp->write_seq ^ jiffies;
1da177e4 1440
5d424d5a 1441 tcp_mtup_init(newsk);
1da177e4 1442 tcp_sync_mss(newsk, dst_mtu(dst));
0dbaee3b 1443 newtp->advmss = dst_metric_advmss(dst);
f5fff5dc
TQ
1444 if (tcp_sk(sk)->rx_opt.user_mss &&
1445 tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1446 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1447
1da177e4
LT
1448 tcp_initialize_rcv_mss(newsk);
1449
cfb6eeb4
YH
1450#ifdef CONFIG_TCP_MD5SIG
1451 /* Copy over the MD5 key from the original socket */
c720c7e8
ED
1452 key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1453 if (key != NULL) {
cfb6eeb4
YH
1454 /*
1455 * We're using one, so create a matching key
1456 * on the newsk structure. If we fail to get
1457 * memory, then we end up not copying the key
1458 * across. Shucks.
1459 */
f6685938
ACM
1460 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1461 if (newkey != NULL)
c720c7e8 1462 tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
cfb6eeb4 1463 newkey, key->keylen);
a465419b 1464 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
cfb6eeb4
YH
1465 }
1466#endif
1467
093d2823
BS
1468 if (__inet_inherit_port(sk, newsk) < 0) {
1469 sock_put(newsk);
1470 goto exit;
1471 }
9327f705 1472 __inet_hash_nolisten(newsk, NULL);
1da177e4
LT
1473
1474 return newsk;
1475
1476exit_overflow:
de0744af 1477 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
093d2823
BS
1478exit_nonewsk:
1479 dst_release(dst);
1da177e4 1480exit:
de0744af 1481 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1da177e4
LT
1482 return NULL;
1483}
4bc2f18b 1484EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1da177e4
LT
1485
1486static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1487{
aa8223c7 1488 struct tcphdr *th = tcp_hdr(skb);
eddc9ec5 1489 const struct iphdr *iph = ip_hdr(skb);
1da177e4 1490 struct sock *nsk;
60236fdd 1491 struct request_sock **prev;
1da177e4 1492 /* Find possible connection requests. */
463c84b9
ACM
1493 struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1494 iph->saddr, iph->daddr);
1da177e4
LT
1495 if (req)
1496 return tcp_check_req(sk, skb, req, prev);
1497
3b1e0a65 1498 nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
c67499c0 1499 th->source, iph->daddr, th->dest, inet_iif(skb));
1da177e4
LT
1500
1501 if (nsk) {
1502 if (nsk->sk_state != TCP_TIME_WAIT) {
1503 bh_lock_sock(nsk);
1504 return nsk;
1505 }
9469c7b4 1506 inet_twsk_put(inet_twsk(nsk));
1da177e4
LT
1507 return NULL;
1508 }
1509
1510#ifdef CONFIG_SYN_COOKIES
af9b4738 1511 if (!th->syn)
1da177e4
LT
1512 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1513#endif
1514 return sk;
1515}
1516
b51655b9 1517static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1da177e4 1518{
eddc9ec5
ACM
1519 const struct iphdr *iph = ip_hdr(skb);
1520
84fa7933 1521 if (skb->ip_summed == CHECKSUM_COMPLETE) {
eddc9ec5
ACM
1522 if (!tcp_v4_check(skb->len, iph->saddr,
1523 iph->daddr, skb->csum)) {
fb286bb2 1524 skb->ip_summed = CHECKSUM_UNNECESSARY;
1da177e4 1525 return 0;
fb286bb2 1526 }
1da177e4 1527 }
fb286bb2 1528
eddc9ec5 1529 skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
fb286bb2
HX
1530 skb->len, IPPROTO_TCP, 0);
1531
1da177e4 1532 if (skb->len <= 76) {
fb286bb2 1533 return __skb_checksum_complete(skb);
1da177e4
LT
1534 }
1535 return 0;
1536}
1537
1538
1539/* The socket must have it's spinlock held when we get
1540 * here.
1541 *
1542 * We have a potential double-lock case here, so even when
1543 * doing backlog processing we use the BH locking scheme.
1544 * This is because we cannot sleep with the original spinlock
1545 * held.
1546 */
1547int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1548{
cfb6eeb4
YH
1549 struct sock *rsk;
1550#ifdef CONFIG_TCP_MD5SIG
1551 /*
1552 * We really want to reject the packet as early as possible
1553 * if:
1554 * o We're expecting an MD5'd packet and this is no MD5 tcp option
1555 * o There is an MD5 option and we're not expecting one
1556 */
7174259e 1557 if (tcp_v4_inbound_md5_hash(sk, skb))
cfb6eeb4
YH
1558 goto discard;
1559#endif
1560
1da177e4 1561 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
ca55158c 1562 sock_rps_save_rxhash(sk, skb->rxhash);
aa8223c7 1563 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1564 rsk = sk;
1da177e4 1565 goto reset;
cfb6eeb4 1566 }
1da177e4
LT
1567 return 0;
1568 }
1569
ab6a5bb6 1570 if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1da177e4
LT
1571 goto csum_err;
1572
1573 if (sk->sk_state == TCP_LISTEN) {
1574 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1575 if (!nsk)
1576 goto discard;
1577
1578 if (nsk != sk) {
cfb6eeb4
YH
1579 if (tcp_child_process(sk, nsk, skb)) {
1580 rsk = nsk;
1da177e4 1581 goto reset;
cfb6eeb4 1582 }
1da177e4
LT
1583 return 0;
1584 }
ca55158c
ED
1585 } else
1586 sock_rps_save_rxhash(sk, skb->rxhash);
1587
aa8223c7 1588 if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
cfb6eeb4 1589 rsk = sk;
1da177e4 1590 goto reset;
cfb6eeb4 1591 }
1da177e4
LT
1592 return 0;
1593
1594reset:
cfb6eeb4 1595 tcp_v4_send_reset(rsk, skb);
1da177e4
LT
1596discard:
1597 kfree_skb(skb);
1598 /* Be careful here. If this function gets more complicated and
1599 * gcc suffers from register pressure on the x86, sk (in %ebx)
1600 * might be destroyed here. This current version compiles correctly,
1601 * but you have been warned.
1602 */
1603 return 0;
1604
1605csum_err:
63231bdd 1606 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4
LT
1607 goto discard;
1608}
4bc2f18b 1609EXPORT_SYMBOL(tcp_v4_do_rcv);
1da177e4
LT
1610
1611/*
1612 * From tcp_input.c
1613 */
1614
/*
 * Receive one TCP segment in @skb.
 *
 * Outline, in the order the code runs:
 *  - drop segments whose pkt_type is not PACKET_HOST (not even counted);
 *  - make sure the full TCP header is in the linear area and doff is sane;
 *  - initialise/verify the checksum unless it is already known good;
 *  - fill in TCP_SKB_CB() and look the socket up;
 *  - TIME_WAIT sockets are handled under do_time_wait;
 *  - otherwise apply the min_ttl, xfrm policy and socket filter checks,
 *    then, under the BH socket lock, either process the segment now
 *    (directly or via the prequeue) or put it on the backlog when the
 *    socket is owned by user context.
 *
 * Returns the value of tcp_v4_do_rcv() when the segment was processed
 * directly, 0 in every other case.
 */
1615int tcp_v4_rcv(struct sk_buff *skb)
1616{
eddc9ec5 1617 const struct iphdr *iph;
1da177e4
LT
1618 struct tcphdr *th;
1619 struct sock *sk;
1620 int ret;
a86b1e30 1621 struct net *net = dev_net(skb->dev);
1da177e4
LT
1622
1623 if (skb->pkt_type != PACKET_HOST)
1624 goto discard_it;
1625
1626 /* Count it even if it's bad */
63231bdd 1627 TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1da177e4
LT
1628
1629 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1630 goto discard_it;
1631
aa8223c7 1632 th = tcp_hdr(skb);
1da177e4
LT
1633
1634 if (th->doff < sizeof(struct tcphdr) / 4)
1635 goto bad_packet;
1636 if (!pskb_may_pull(skb, th->doff * 4))
1637 goto discard_it;
1638
1639 /* An explanation is required here, I think.
1640 * Packet length and doff are validated by header prediction,
caa20d9a 1641 * provided case of th->doff==0 is eliminated.
1da177e4 1642 * So, we defer the checks. */
60476372 1643 if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1da177e4
LT
1644 goto bad_packet;
1645
/* Header pointers are fetched again here, after the pulls above -
 * presumably because pskb_may_pull() may have moved the header data.
 */
aa8223c7 1646 th = tcp_hdr(skb);
eddc9ec5 1647 iph = ip_hdr(skb);
1da177e4
LT
1648 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1649 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1650 skb->len - th->doff * 4);
1651 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1652 TCP_SKB_CB(skb)->when = 0;
eddc9ec5 1653 TCP_SKB_CB(skb)->flags = iph->tos;
1da177e4
LT
1654 TCP_SKB_CB(skb)->sacked = 0;
1655
9a1f27c4 1656 sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1da177e4
LT
1657 if (!sk)
1658 goto no_tcp_socket;
1659
/* Also re-entered from do_time_wait with sk replaced by a listener. */
bb134d5d
ED
1660process:
1661 if (sk->sk_state == TCP_TIME_WAIT)
1662 goto do_time_wait;
1663
6cce09f8
ED
1664 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1665 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
d218d111 1666 goto discard_and_relse;
6cce09f8 1667 }
d218d111 1668
1da177e4
LT
1669 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1670 goto discard_and_relse;
b59c2701 1671 nf_reset(skb);
1da177e4 1672
fda9ef5d 1673 if (sk_filter(sk, skb))
1da177e4
LT
1674 goto discard_and_relse;
1675
1676 skb->dev = NULL;
1677
c6366184 1678 bh_lock_sock_nested(sk);
1da177e4
LT
1679 ret = 0;
1680 if (!sock_owned_by_user(sk)) {
1a2449a8
CL
1681#ifdef CONFIG_NET_DMA
1682 struct tcp_sock *tp = tcp_sk(sk);
1683 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
f67b4599 1684 tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1a2449a8 1685 if (tp->ucopy.dma_chan)
1da177e4 1686 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8
CL
1687 else
1688#endif
1689 {
1690 if (!tcp_prequeue(sk, skb))
ae8d7f88 1691 ret = tcp_v4_do_rcv(sk, skb);
1a2449a8 1692 }
6cce09f8 1693 } else if (unlikely(sk_add_backlog(sk, skb))) {
/* Backlog refused the skb: unlock here because discard_and_relse only
 * drops the socket reference and frees the skb.
 */
6b03a53a 1694 bh_unlock_sock(sk);
6cce09f8 1695 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6b03a53a
ZY
1696 goto discard_and_relse;
1697 }
1da177e4
LT
1698 bh_unlock_sock(sk);
1699
1700 sock_put(sk);
1701
1702 return ret;
1703
1704no_tcp_socket:
1705 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1706 goto discard_it;
1707
/* NOTE: bad_packet is a label inside the if-body below; jumping to it
 * counts the error and skips sending a reset.
 */
1708 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1709bad_packet:
63231bdd 1710 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1da177e4 1711 } else {
cfb6eeb4 1712 tcp_v4_send_reset(NULL, skb);
1da177e4
LT
1713 }
1714
1715discard_it:
1716 /* Discard frame. */
1717 kfree_skb(skb);
e905a9ed 1718 return 0;
1da177e4
LT
1719
1720discard_and_relse:
1721 sock_put(sk);
1722 goto discard_it;
1723
1724do_time_wait:
1725 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
9469c7b4 1726 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1727 goto discard_it;
1728 }
1729
1730 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
63231bdd 1731 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
9469c7b4 1732 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1733 goto discard_it;
1734 }
9469c7b4 1735 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
/* TCP_TW_SYN: if a listener exists for the destination, retire the
 * timewait socket and restart processing with the listener; otherwise
 * fall through and answer with an ACK.
 */
1da177e4 1736 case TCP_TW_SYN: {
c346dca1 1737 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
c67499c0 1738 &tcp_hashinfo,
eddc9ec5 1739 iph->daddr, th->dest,
463c84b9 1740 inet_iif(skb));
1da177e4 1741 if (sk2) {
9469c7b4
YH
1742 inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743 inet_twsk_put(inet_twsk(sk));
1da177e4
LT
1744 sk = sk2;
1745 goto process;
1746 }
1747 /* Fall through to ACK */
1748 }
1749 case TCP_TW_ACK:
1750 tcp_v4_timewait_ack(sk, skb);
1751 break;
1752 case TCP_TW_RST:
1753 goto no_tcp_socket;
1754 case TCP_TW_SUCCESS:;
1755 }
1756 goto discard_it;
1757}
1758
3f419d2d 1759struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1da177e4 1760{
3f419d2d 1761 struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1da177e4 1762 struct inet_sock *inet = inet_sk(sk);
3f419d2d 1763 struct inet_peer *peer;
1da177e4 1764
c720c7e8 1765 if (!rt || rt->rt_dst != inet->inet_daddr) {
b534ecf1 1766 peer = inet_getpeer_v4(inet->inet_daddr, 1);
3f419d2d 1767 *release_it = true;
1da177e4
LT
1768 } else {
1769 if (!rt->peer)
1770 rt_bind_peer(rt, 1);
1771 peer = rt->peer;
3f419d2d 1772 *release_it = false;
1da177e4
LT
1773 }
1774
3f419d2d 1775 return peer;
1da177e4 1776}
3f419d2d 1777EXPORT_SYMBOL(tcp_v4_get_peer);
1da177e4 1778
ccb7c410 1779void *tcp_v4_tw_get_peer(struct sock *sk)
1da177e4 1780{
ccb7c410 1781 struct inet_timewait_sock *tw = inet_twsk(sk);
1da177e4 1782
ccb7c410 1783 return inet_getpeer_v4(tw->tw_daddr, 1);
1da177e4 1784}
ccb7c410
DM
1785EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1786
1787static struct timewait_sock_ops tcp_timewait_sock_ops = {
1788 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1789 .twsk_unique = tcp_twsk_unique,
1790 .twsk_destructor= tcp_twsk_destructor,
1791 .twsk_getpeer = tcp_v4_tw_get_peer,
1792};
1da177e4 1793
3b401a81 1794const struct inet_connection_sock_af_ops ipv4_specific = {
543d9cfe
ACM
1795 .queue_xmit = ip_queue_xmit,
1796 .send_check = tcp_v4_send_check,
1797 .rebuild_header = inet_sk_rebuild_header,
1798 .conn_request = tcp_v4_conn_request,
1799 .syn_recv_sock = tcp_v4_syn_recv_sock,
3f419d2d 1800 .get_peer = tcp_v4_get_peer,
543d9cfe
ACM
1801 .net_header_len = sizeof(struct iphdr),
1802 .setsockopt = ip_setsockopt,
1803 .getsockopt = ip_getsockopt,
1804 .addr2sockaddr = inet_csk_addr2sockaddr,
1805 .sockaddr_len = sizeof(struct sockaddr_in),
ab1e0a13 1806 .bind_conflict = inet_csk_bind_conflict,
3fdadf7d 1807#ifdef CONFIG_COMPAT
543d9cfe
ACM
1808 .compat_setsockopt = compat_ip_setsockopt,
1809 .compat_getsockopt = compat_ip_getsockopt,
3fdadf7d 1810#endif
1da177e4 1811};
4bc2f18b 1812EXPORT_SYMBOL(ipv4_specific);
1da177e4 1813
#ifdef CONFIG_TCP_MD5SIG
/* MD5 signature operations installed on IPv4 TCP sockets
 * (see tcp_v4_init_sock()).
 */
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup	= tcp_v4_md5_lookup,
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_add	= tcp_v4_md5_add_func,
	.md5_parse	= tcp_v4_parse_md5_keys,
};
#endif
cfb6eeb4 1822
1da177e4
LT
1823/* NOTE: A lot of things set to zero explicitly by call to
1824 * sk_alloc() so need not be done here.
1825 */
1826static int tcp_v4_init_sock(struct sock *sk)
1827{
6687e988 1828 struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4
LT
1829 struct tcp_sock *tp = tcp_sk(sk);
1830
1831 skb_queue_head_init(&tp->out_of_order_queue);
1832 tcp_init_xmit_timers(sk);
1833 tcp_prequeue_init(tp);
1834
6687e988 1835 icsk->icsk_rto = TCP_TIMEOUT_INIT;
1da177e4
LT
1836 tp->mdev = TCP_TIMEOUT_INIT;
1837
1838 /* So many TCP implementations out there (incorrectly) count the
1839 * initial SYN frame in their delayed-ACK and congestion control
1840 * algorithms that we must have the following bandaid to talk
1841 * efficiently to them. -DaveM
1842 */
1843 tp->snd_cwnd = 2;
1844
1845 /* See draft-stevens-tcpca-spec-01 for discussion of the
1846 * initialization of these values.
1847 */
0b6a05c1 1848 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1da177e4 1849 tp->snd_cwnd_clamp = ~0;
bee7ca9e 1850 tp->mss_cache = TCP_MSS_DEFAULT;
1da177e4
LT
1851
1852 tp->reordering = sysctl_tcp_reordering;
6687e988 1853 icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1da177e4
LT
1854
1855 sk->sk_state = TCP_CLOSE;
1856
1857 sk->sk_write_space = sk_stream_write_space;
1858 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1859
8292a17a 1860 icsk->icsk_af_ops = &ipv4_specific;
d83d8461 1861 icsk->icsk_sync_mss = tcp_sync_mss;
cfb6eeb4
YH
1862#ifdef CONFIG_TCP_MD5SIG
1863 tp->af_specific = &tcp_sock_ipv4_specific;
1864#endif
1da177e4 1865
435cf559
WAS
1866 /* TCP Cookie Transactions */
1867 if (sysctl_tcp_cookie_size > 0) {
1868 /* Default, cookies without s_data_payload. */
1869 tp->cookie_values =
1870 kzalloc(sizeof(*tp->cookie_values),
1871 sk->sk_allocation);
1872 if (tp->cookie_values != NULL)
1873 kref_init(&tp->cookie_values->kref);
1874 }
1875 /* Presumed zeroed, in order of appearance:
1876 * cookie_in_always, cookie_out_never,
1877 * s_data_constant, s_data_in, s_data_out
1878 */
1da177e4
LT
1879 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1880 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1881
eb4dea58 1882 local_bh_disable();
1748376b 1883 percpu_counter_inc(&tcp_sockets_allocated);
eb4dea58 1884 local_bh_enable();
1da177e4
LT
1885
1886 return 0;
1887}
1888
7d06b2e0 1889void tcp_v4_destroy_sock(struct sock *sk)
1da177e4
LT
1890{
1891 struct tcp_sock *tp = tcp_sk(sk);
1892
1893 tcp_clear_xmit_timers(sk);
1894
6687e988 1895 tcp_cleanup_congestion_control(sk);
317a76f9 1896
1da177e4 1897 /* Cleanup up the write buffer. */
fe067e8a 1898 tcp_write_queue_purge(sk);
1da177e4
LT
1899
1900 /* Cleans up our, hopefully empty, out_of_order_queue. */
e905a9ed 1901 __skb_queue_purge(&tp->out_of_order_queue);
1da177e4 1902
cfb6eeb4
YH
1903#ifdef CONFIG_TCP_MD5SIG
1904 /* Clean up the MD5 key list, if any */
1905 if (tp->md5sig_info) {
1906 tcp_v4_clear_md5_list(sk);
1907 kfree(tp->md5sig_info);
1908 tp->md5sig_info = NULL;
1909 }
1910#endif
1911
1a2449a8
CL
1912#ifdef CONFIG_NET_DMA
1913 /* Cleans up our sk_async_wait_queue */
e905a9ed 1914 __skb_queue_purge(&sk->sk_async_wait_queue);
1a2449a8
CL
1915#endif
1916
1da177e4
LT
1917 /* Clean prequeue, it must be empty really */
1918 __skb_queue_purge(&tp->ucopy.prequeue);
1919
1920 /* Clean up a referenced TCP bind bucket. */
463c84b9 1921 if (inet_csk(sk)->icsk_bind_hash)
ab1e0a13 1922 inet_put_port(sk);
1da177e4
LT
1923
1924 /*
1925 * If sendmsg cached page exists, toss it.
1926 */
1927 if (sk->sk_sndmsg_page) {
1928 __free_page(sk->sk_sndmsg_page);
1929 sk->sk_sndmsg_page = NULL;
1930 }
1931
435cf559
WAS
1932 /* TCP Cookie Transactions */
1933 if (tp->cookie_values != NULL) {
1934 kref_put(&tp->cookie_values->kref,
1935 tcp_cookie_values_release);
1936 tp->cookie_values = NULL;
1937 }
1938
1748376b 1939 percpu_counter_dec(&tcp_sockets_allocated);
1da177e4 1940}
1da177e4
LT
1941EXPORT_SYMBOL(tcp_v4_destroy_sock);
1942
1943#ifdef CONFIG_PROC_FS
1944/* Proc filesystem TCP sock list dumping. */
1945
3ab5aee7 1946static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1da177e4 1947{
3ab5aee7 1948 return hlist_nulls_empty(head) ? NULL :
8feaf0c0 1949 list_entry(head->first, struct inet_timewait_sock, tw_node);
1da177e4
LT
1950}
1951
8feaf0c0 1952static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1da177e4 1953{
3ab5aee7
ED
1954 return !is_a_nulls(tw->tw_node.next) ?
1955 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1da177e4
LT
1956}
1957
a8b690f9
TH
1958/*
1959 * Get the next listener socket following cur. If cur is NULL, get the first socket
1960 * starting from bucket given in st->bucket; when st->bucket is zero the
1961 * very first socket in the hash table is returned.
1962 */
1da177e4
LT
1963static void *listening_get_next(struct seq_file *seq, void *cur)
1964{
463c84b9 1965 struct inet_connection_sock *icsk;
c25eb3bf 1966 struct hlist_nulls_node *node;
1da177e4 1967 struct sock *sk = cur;
5caea4ea 1968 struct inet_listen_hashbucket *ilb;
5799de0b 1969 struct tcp_iter_state *st = seq->private;
a4146b1b 1970 struct net *net = seq_file_net(seq);
1da177e4
LT
1971
1972 if (!sk) {
a8b690f9 1973 ilb = &tcp_hashinfo.listening_hash[st->bucket];
5caea4ea 1974 spin_lock_bh(&ilb->lock);
c25eb3bf 1975 sk = sk_nulls_head(&ilb->head);
a8b690f9 1976 st->offset = 0;
1da177e4
LT
1977 goto get_sk;
1978 }
5caea4ea 1979 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1da177e4 1980 ++st->num;
a8b690f9 1981 ++st->offset;
1da177e4
LT
1982
1983 if (st->state == TCP_SEQ_STATE_OPENREQ) {
60236fdd 1984 struct request_sock *req = cur;
1da177e4 1985
72a3effa 1986 icsk = inet_csk(st->syn_wait_sk);
1da177e4
LT
1987 req = req->dl_next;
1988 while (1) {
1989 while (req) {
bdccc4ca 1990 if (req->rsk_ops->family == st->family) {
1da177e4
LT
1991 cur = req;
1992 goto out;
1993 }
1994 req = req->dl_next;
1995 }
72a3effa 1996 if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1da177e4
LT
1997 break;
1998get_req:
463c84b9 1999 req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1da177e4 2000 }
1bde5ac4 2001 sk = sk_nulls_next(st->syn_wait_sk);
1da177e4 2002 st->state = TCP_SEQ_STATE_LISTENING;
463c84b9 2003 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2004 } else {
e905a9ed 2005 icsk = inet_csk(sk);
463c84b9
ACM
2006 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2007 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1da177e4 2008 goto start_req;
463c84b9 2009 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1bde5ac4 2010 sk = sk_nulls_next(sk);
1da177e4
LT
2011 }
2012get_sk:
c25eb3bf 2013 sk_nulls_for_each_from(sk, node) {
8475ef9f
PE
2014 if (!net_eq(sock_net(sk), net))
2015 continue;
2016 if (sk->sk_family == st->family) {
1da177e4
LT
2017 cur = sk;
2018 goto out;
2019 }
e905a9ed 2020 icsk = inet_csk(sk);
463c84b9
ACM
2021 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2022 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1da177e4
LT
2023start_req:
2024 st->uid = sock_i_uid(sk);
2025 st->syn_wait_sk = sk;
2026 st->state = TCP_SEQ_STATE_OPENREQ;
2027 st->sbucket = 0;
2028 goto get_req;
2029 }
463c84b9 2030 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4 2031 }
5caea4ea 2032 spin_unlock_bh(&ilb->lock);
a8b690f9 2033 st->offset = 0;
0f7ff927 2034 if (++st->bucket < INET_LHTABLE_SIZE) {
5caea4ea
ED
2035 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2036 spin_lock_bh(&ilb->lock);
c25eb3bf 2037 sk = sk_nulls_head(&ilb->head);
1da177e4
LT
2038 goto get_sk;
2039 }
2040 cur = NULL;
2041out:
2042 return cur;
2043}
2044
2045static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2046{
a8b690f9
TH
2047 struct tcp_iter_state *st = seq->private;
2048 void *rc;
2049
2050 st->bucket = 0;
2051 st->offset = 0;
2052 rc = listening_get_next(seq, NULL);
1da177e4
LT
2053
2054 while (rc && *pos) {
2055 rc = listening_get_next(seq, rc);
2056 --*pos;
2057 }
2058 return rc;
2059}
2060
6eac5604
AK
2061static inline int empty_bucket(struct tcp_iter_state *st)
2062{
3ab5aee7
ED
2063 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2064 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
6eac5604
AK
2065}
2066
a8b690f9
TH
2067/*
2068 * Get first established socket starting from bucket given in st->bucket.
2069 * If st->bucket is zero, the very first socket in the hash is returned.
2070 */
1da177e4
LT
2071static void *established_get_first(struct seq_file *seq)
2072{
5799de0b 2073 struct tcp_iter_state *st = seq->private;
a4146b1b 2074 struct net *net = seq_file_net(seq);
1da177e4
LT
2075 void *rc = NULL;
2076
a8b690f9
TH
2077 st->offset = 0;
2078 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
1da177e4 2079 struct sock *sk;
3ab5aee7 2080 struct hlist_nulls_node *node;
8feaf0c0 2081 struct inet_timewait_sock *tw;
9db66bdc 2082 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1da177e4 2083
6eac5604
AK
2084 /* Lockless fast path for the common case of empty buckets */
2085 if (empty_bucket(st))
2086 continue;
2087
9db66bdc 2088 spin_lock_bh(lock);
3ab5aee7 2089 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
f40c8174 2090 if (sk->sk_family != st->family ||
878628fb 2091 !net_eq(sock_net(sk), net)) {
1da177e4
LT
2092 continue;
2093 }
2094 rc = sk;
2095 goto out;
2096 }
2097 st->state = TCP_SEQ_STATE_TIME_WAIT;
8feaf0c0 2098 inet_twsk_for_each(tw, node,
dbca9b27 2099 &tcp_hashinfo.ehash[st->bucket].twchain) {
28518fc1 2100 if (tw->tw_family != st->family ||
878628fb 2101 !net_eq(twsk_net(tw), net)) {
1da177e4
LT
2102 continue;
2103 }
2104 rc = tw;
2105 goto out;
2106 }
9db66bdc 2107 spin_unlock_bh(lock);
1da177e4
LT
2108 st->state = TCP_SEQ_STATE_ESTABLISHED;
2109 }
2110out:
2111 return rc;
2112}
2113
2114static void *established_get_next(struct seq_file *seq, void *cur)
2115{
2116 struct sock *sk = cur;
8feaf0c0 2117 struct inet_timewait_sock *tw;
3ab5aee7 2118 struct hlist_nulls_node *node;
5799de0b 2119 struct tcp_iter_state *st = seq->private;
a4146b1b 2120 struct net *net = seq_file_net(seq);
1da177e4
LT
2121
2122 ++st->num;
a8b690f9 2123 ++st->offset;
1da177e4
LT
2124
2125 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2126 tw = cur;
2127 tw = tw_next(tw);
2128get_tw:
878628fb 2129 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
1da177e4
LT
2130 tw = tw_next(tw);
2131 }
2132 if (tw) {
2133 cur = tw;
2134 goto out;
2135 }
9db66bdc 2136 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2137 st->state = TCP_SEQ_STATE_ESTABLISHED;
2138
6eac5604 2139 /* Look for next non empty bucket */
a8b690f9 2140 st->offset = 0;
f373b53b 2141 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
6eac5604
AK
2142 empty_bucket(st))
2143 ;
f373b53b 2144 if (st->bucket > tcp_hashinfo.ehash_mask)
6eac5604
AK
2145 return NULL;
2146
9db66bdc 2147 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
3ab5aee7 2148 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
1da177e4 2149 } else
3ab5aee7 2150 sk = sk_nulls_next(sk);
1da177e4 2151
3ab5aee7 2152 sk_nulls_for_each_from(sk, node) {
878628fb 2153 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
1da177e4
LT
2154 goto found;
2155 }
2156
2157 st->state = TCP_SEQ_STATE_TIME_WAIT;
dbca9b27 2158 tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
1da177e4
LT
2159 goto get_tw;
2160found:
2161 cur = sk;
2162out:
2163 return cur;
2164}
2165
2166static void *established_get_idx(struct seq_file *seq, loff_t pos)
2167{
a8b690f9
TH
2168 struct tcp_iter_state *st = seq->private;
2169 void *rc;
2170
2171 st->bucket = 0;
2172 rc = established_get_first(seq);
1da177e4
LT
2173
2174 while (rc && pos) {
2175 rc = established_get_next(seq, rc);
2176 --pos;
7174259e 2177 }
1da177e4
LT
2178 return rc;
2179}
2180
2181static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2182{
2183 void *rc;
5799de0b 2184 struct tcp_iter_state *st = seq->private;
1da177e4 2185
1da177e4
LT
2186 st->state = TCP_SEQ_STATE_LISTENING;
2187 rc = listening_get_idx(seq, &pos);
2188
2189 if (!rc) {
1da177e4
LT
2190 st->state = TCP_SEQ_STATE_ESTABLISHED;
2191 rc = established_get_idx(seq, pos);
2192 }
2193
2194 return rc;
2195}
2196
a8b690f9
TH
2197static void *tcp_seek_last_pos(struct seq_file *seq)
2198{
2199 struct tcp_iter_state *st = seq->private;
2200 int offset = st->offset;
2201 int orig_num = st->num;
2202 void *rc = NULL;
2203
2204 switch (st->state) {
2205 case TCP_SEQ_STATE_OPENREQ:
2206 case TCP_SEQ_STATE_LISTENING:
2207 if (st->bucket >= INET_LHTABLE_SIZE)
2208 break;
2209 st->state = TCP_SEQ_STATE_LISTENING;
2210 rc = listening_get_next(seq, NULL);
2211 while (offset-- && rc)
2212 rc = listening_get_next(seq, rc);
2213 if (rc)
2214 break;
2215 st->bucket = 0;
2216 /* Fallthrough */
2217 case TCP_SEQ_STATE_ESTABLISHED:
2218 case TCP_SEQ_STATE_TIME_WAIT:
2219 st->state = TCP_SEQ_STATE_ESTABLISHED;
2220 if (st->bucket > tcp_hashinfo.ehash_mask)
2221 break;
2222 rc = established_get_first(seq);
2223 while (offset-- && rc)
2224 rc = established_get_next(seq, rc);
2225 }
2226
2227 st->num = orig_num;
2228
2229 return rc;
2230}
2231
1da177e4
LT
2232static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2233{
5799de0b 2234 struct tcp_iter_state *st = seq->private;
a8b690f9
TH
2235 void *rc;
2236
2237 if (*pos && *pos == st->last_pos) {
2238 rc = tcp_seek_last_pos(seq);
2239 if (rc)
2240 goto out;
2241 }
2242
1da177e4
LT
2243 st->state = TCP_SEQ_STATE_LISTENING;
2244 st->num = 0;
a8b690f9
TH
2245 st->bucket = 0;
2246 st->offset = 0;
2247 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2248
2249out:
2250 st->last_pos = *pos;
2251 return rc;
1da177e4
LT
2252}
2253
2254static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2255{
a8b690f9 2256 struct tcp_iter_state *st = seq->private;
1da177e4 2257 void *rc = NULL;
1da177e4
LT
2258
2259 if (v == SEQ_START_TOKEN) {
2260 rc = tcp_get_idx(seq, 0);
2261 goto out;
2262 }
1da177e4
LT
2263
2264 switch (st->state) {
2265 case TCP_SEQ_STATE_OPENREQ:
2266 case TCP_SEQ_STATE_LISTENING:
2267 rc = listening_get_next(seq, v);
2268 if (!rc) {
1da177e4 2269 st->state = TCP_SEQ_STATE_ESTABLISHED;
a8b690f9
TH
2270 st->bucket = 0;
2271 st->offset = 0;
1da177e4
LT
2272 rc = established_get_first(seq);
2273 }
2274 break;
2275 case TCP_SEQ_STATE_ESTABLISHED:
2276 case TCP_SEQ_STATE_TIME_WAIT:
2277 rc = established_get_next(seq, v);
2278 break;
2279 }
2280out:
2281 ++*pos;
a8b690f9 2282 st->last_pos = *pos;
1da177e4
LT
2283 return rc;
2284}
2285
2286static void tcp_seq_stop(struct seq_file *seq, void *v)
2287{
5799de0b 2288 struct tcp_iter_state *st = seq->private;
1da177e4
LT
2289
2290 switch (st->state) {
2291 case TCP_SEQ_STATE_OPENREQ:
2292 if (v) {
463c84b9
ACM
2293 struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2294 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1da177e4
LT
2295 }
2296 case TCP_SEQ_STATE_LISTENING:
2297 if (v != SEQ_START_TOKEN)
5caea4ea 2298 spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
1da177e4
LT
2299 break;
2300 case TCP_SEQ_STATE_TIME_WAIT:
2301 case TCP_SEQ_STATE_ESTABLISHED:
2302 if (v)
9db66bdc 2303 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
1da177e4
LT
2304 break;
2305 }
2306}
2307
2308static int tcp_seq_open(struct inode *inode, struct file *file)
2309{
2310 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1da177e4 2311 struct tcp_iter_state *s;
52d6f3f1 2312 int err;
1da177e4 2313
52d6f3f1
DL
2314 err = seq_open_net(inode, file, &afinfo->seq_ops,
2315 sizeof(struct tcp_iter_state));
2316 if (err < 0)
2317 return err;
f40c8174 2318
52d6f3f1 2319 s = ((struct seq_file *)file->private_data)->private;
1da177e4 2320 s->family = afinfo->family;
a8b690f9 2321 s->last_pos = 0;
f40c8174
DL
2322 return 0;
2323}
2324
6f8b13bc 2325int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4
LT
2326{
2327 int rc = 0;
2328 struct proc_dir_entry *p;
2329
68fcadd1
DL
2330 afinfo->seq_fops.open = tcp_seq_open;
2331 afinfo->seq_fops.read = seq_read;
2332 afinfo->seq_fops.llseek = seq_lseek;
2333 afinfo->seq_fops.release = seq_release_net;
7174259e 2334
9427c4b3
DL
2335 afinfo->seq_ops.start = tcp_seq_start;
2336 afinfo->seq_ops.next = tcp_seq_next;
2337 afinfo->seq_ops.stop = tcp_seq_stop;
2338
84841c3c
DL
2339 p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2340 &afinfo->seq_fops, afinfo);
2341 if (!p)
1da177e4
LT
2342 rc = -ENOMEM;
2343 return rc;
2344}
4bc2f18b 2345EXPORT_SYMBOL(tcp_proc_register);
1da177e4 2346
6f8b13bc 2347void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
1da177e4 2348{
6f8b13bc 2349 proc_net_remove(net, afinfo->name);
1da177e4 2350}
4bc2f18b 2351EXPORT_SYMBOL(tcp_proc_unregister);
1da177e4 2352
60236fdd 2353static void get_openreq4(struct sock *sk, struct request_sock *req,
5e659e4c 2354 struct seq_file *f, int i, int uid, int *len)
1da177e4 2355{
2e6599cb 2356 const struct inet_request_sock *ireq = inet_rsk(req);
1da177e4
LT
2357 int ttd = req->expires - jiffies;
2358
5e659e4c
PE
2359 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2360 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
1da177e4 2361 i,
2e6599cb 2362 ireq->loc_addr,
c720c7e8 2363 ntohs(inet_sk(sk)->inet_sport),
2e6599cb
ACM
2364 ireq->rmt_addr,
2365 ntohs(ireq->rmt_port),
1da177e4
LT
2366 TCP_SYN_RECV,
2367 0, 0, /* could print option size, but that is af dependent. */
2368 1, /* timers active (only the expire timer) */
2369 jiffies_to_clock_t(ttd),
2370 req->retrans,
2371 uid,
2372 0, /* non standard timer */
2373 0, /* open_requests have no inode */
2374 atomic_read(&sk->sk_refcnt),
5e659e4c
PE
2375 req,
2376 len);
1da177e4
LT
2377}
2378
5e659e4c 2379static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
1da177e4
LT
2380{
2381 int timer_active;
2382 unsigned long timer_expires;
cf4c6bf8
IJ
2383 struct tcp_sock *tp = tcp_sk(sk);
2384 const struct inet_connection_sock *icsk = inet_csk(sk);
2385 struct inet_sock *inet = inet_sk(sk);
c720c7e8
ED
2386 __be32 dest = inet->inet_daddr;
2387 __be32 src = inet->inet_rcv_saddr;
2388 __u16 destp = ntohs(inet->inet_dport);
2389 __u16 srcp = ntohs(inet->inet_sport);
49d09007 2390 int rx_queue;
1da177e4 2391
463c84b9 2392 if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1da177e4 2393 timer_active = 1;
463c84b9
ACM
2394 timer_expires = icsk->icsk_timeout;
2395 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1da177e4 2396 timer_active = 4;
463c84b9 2397 timer_expires = icsk->icsk_timeout;
cf4c6bf8 2398 } else if (timer_pending(&sk->sk_timer)) {
1da177e4 2399 timer_active = 2;
cf4c6bf8 2400 timer_expires = sk->sk_timer.expires;
1da177e4
LT
2401 } else {
2402 timer_active = 0;
2403 timer_expires = jiffies;
2404 }
2405
49d09007
ED
2406 if (sk->sk_state == TCP_LISTEN)
2407 rx_queue = sk->sk_ack_backlog;
2408 else
2409 /*
2410 * because we dont lock socket, we might find a transient negative value
2411 */
2412 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2413
5e659e4c 2414 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
7be87351 2415 "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
cf4c6bf8 2416 i, src, srcp, dest, destp, sk->sk_state,
47da8ee6 2417 tp->write_seq - tp->snd_una,
49d09007 2418 rx_queue,
1da177e4
LT
2419 timer_active,
2420 jiffies_to_clock_t(timer_expires - jiffies),
463c84b9 2421 icsk->icsk_retransmits,
cf4c6bf8 2422 sock_i_uid(sk),
6687e988 2423 icsk->icsk_probes_out,
cf4c6bf8
IJ
2424 sock_i_ino(sk),
2425 atomic_read(&sk->sk_refcnt), sk,
7be87351
SH
2426 jiffies_to_clock_t(icsk->icsk_rto),
2427 jiffies_to_clock_t(icsk->icsk_ack.ato),
463c84b9 2428 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1da177e4 2429 tp->snd_cwnd,
0b6a05c1 2430 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
5e659e4c 2431 len);
1da177e4
LT
2432}
2433
7174259e 2434static void get_timewait4_sock(struct inet_timewait_sock *tw,
5e659e4c 2435 struct seq_file *f, int i, int *len)
1da177e4 2436{
23f33c2d 2437 __be32 dest, src;
1da177e4
LT
2438 __u16 destp, srcp;
2439 int ttd = tw->tw_ttd - jiffies;
2440
2441 if (ttd < 0)
2442 ttd = 0;
2443
2444 dest = tw->tw_daddr;
2445 src = tw->tw_rcv_saddr;
2446 destp = ntohs(tw->tw_dport);
2447 srcp = ntohs(tw->tw_sport);
2448
5e659e4c
PE
2449 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2450 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
1da177e4
LT
2451 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2452 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
5e659e4c 2453 atomic_read(&tw->tw_refcnt), tw, len);
1da177e4
LT
2454}
2455
2456#define TMPSZ 150
2457
2458static int tcp4_seq_show(struct seq_file *seq, void *v)
2459{
5799de0b 2460 struct tcp_iter_state *st;
5e659e4c 2461 int len;
1da177e4
LT
2462
2463 if (v == SEQ_START_TOKEN) {
2464 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2465 " sl local_address rem_address st tx_queue "
2466 "rx_queue tr tm->when retrnsmt uid timeout "
2467 "inode");
2468 goto out;
2469 }
2470 st = seq->private;
2471
2472 switch (st->state) {
2473 case TCP_SEQ_STATE_LISTENING:
2474 case TCP_SEQ_STATE_ESTABLISHED:
5e659e4c 2475 get_tcp4_sock(v, seq, st->num, &len);
1da177e4
LT
2476 break;
2477 case TCP_SEQ_STATE_OPENREQ:
5e659e4c 2478 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
1da177e4
LT
2479 break;
2480 case TCP_SEQ_STATE_TIME_WAIT:
5e659e4c 2481 get_timewait4_sock(v, seq, st->num, &len);
1da177e4
LT
2482 break;
2483 }
5e659e4c 2484 seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
1da177e4
LT
2485out:
2486 return 0;
2487}
2488
1da177e4 2489static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1da177e4
LT
2490 .name = "tcp",
2491 .family = AF_INET,
5f4472c5
DL
2492 .seq_fops = {
2493 .owner = THIS_MODULE,
2494 },
9427c4b3
DL
2495 .seq_ops = {
2496 .show = tcp4_seq_show,
2497 },
1da177e4
LT
2498};
2499
2c8c1e72 2500static int __net_init tcp4_proc_init_net(struct net *net)
757764f6
PE
2501{
2502 return tcp_proc_register(net, &tcp4_seq_afinfo);
2503}
2504
2c8c1e72 2505static void __net_exit tcp4_proc_exit_net(struct net *net)
757764f6
PE
2506{
2507 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2508}
2509
2510static struct pernet_operations tcp4_net_ops = {
2511 .init = tcp4_proc_init_net,
2512 .exit = tcp4_proc_exit_net,
2513};
2514
1da177e4
LT
2515int __init tcp4_proc_init(void)
2516{
757764f6 2517 return register_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2518}
2519
2520void tcp4_proc_exit(void)
2521{
757764f6 2522 unregister_pernet_subsys(&tcp4_net_ops);
1da177e4
LT
2523}
2524#endif /* CONFIG_PROC_FS */
2525
bf296b12
HX
2526struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2527{
36e7b1b8 2528 struct iphdr *iph = skb_gro_network_header(skb);
bf296b12
HX
2529
2530 switch (skb->ip_summed) {
2531 case CHECKSUM_COMPLETE:
86911732 2532 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
bf296b12
HX
2533 skb->csum)) {
2534 skb->ip_summed = CHECKSUM_UNNECESSARY;
2535 break;
2536 }
2537
2538 /* fall through */
2539 case CHECKSUM_NONE:
2540 NAPI_GRO_CB(skb)->flush = 1;
2541 return NULL;
2542 }
2543
2544 return tcp_gro_receive(head, skb);
2545}
bf296b12
HX
2546
2547int tcp4_gro_complete(struct sk_buff *skb)
2548{
2549 struct iphdr *iph = ip_hdr(skb);
2550 struct tcphdr *th = tcp_hdr(skb);
2551
2552 th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2553 iph->saddr, iph->daddr, 0);
2554 skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2555
2556 return tcp_gro_complete(skb);
2557}
bf296b12 2558
1da177e4
LT
2559struct proto tcp_prot = {
2560 .name = "TCP",
2561 .owner = THIS_MODULE,
2562 .close = tcp_close,
2563 .connect = tcp_v4_connect,
2564 .disconnect = tcp_disconnect,
463c84b9 2565 .accept = inet_csk_accept,
1da177e4
LT
2566 .ioctl = tcp_ioctl,
2567 .init = tcp_v4_init_sock,
2568 .destroy = tcp_v4_destroy_sock,
2569 .shutdown = tcp_shutdown,
2570 .setsockopt = tcp_setsockopt,
2571 .getsockopt = tcp_getsockopt,
1da177e4 2572 .recvmsg = tcp_recvmsg,
7ba42910
CG
2573 .sendmsg = tcp_sendmsg,
2574 .sendpage = tcp_sendpage,
1da177e4 2575 .backlog_rcv = tcp_v4_do_rcv,
ab1e0a13
ACM
2576 .hash = inet_hash,
2577 .unhash = inet_unhash,
2578 .get_port = inet_csk_get_port,
1da177e4
LT
2579 .enter_memory_pressure = tcp_enter_memory_pressure,
2580 .sockets_allocated = &tcp_sockets_allocated,
0a5578cf 2581 .orphan_count = &tcp_orphan_count,
1da177e4
LT
2582 .memory_allocated = &tcp_memory_allocated,
2583 .memory_pressure = &tcp_memory_pressure,
2584 .sysctl_mem = sysctl_tcp_mem,
2585 .sysctl_wmem = sysctl_tcp_wmem,
2586 .sysctl_rmem = sysctl_tcp_rmem,
2587 .max_header = MAX_TCP_HEADER,
2588 .obj_size = sizeof(struct tcp_sock),
3ab5aee7 2589 .slab_flags = SLAB_DESTROY_BY_RCU,
6d6ee43e 2590 .twsk_prot = &tcp_timewait_sock_ops,
60236fdd 2591 .rsk_prot = &tcp_request_sock_ops,
39d8cda7 2592 .h.hashinfo = &tcp_hashinfo,
7ba42910 2593 .no_autobind = true,
543d9cfe
ACM
2594#ifdef CONFIG_COMPAT
2595 .compat_setsockopt = compat_tcp_setsockopt,
2596 .compat_getsockopt = compat_tcp_getsockopt,
2597#endif
1da177e4 2598};
4bc2f18b 2599EXPORT_SYMBOL(tcp_prot);
1da177e4 2600
046ee902
DL
2601
2602static int __net_init tcp_sk_init(struct net *net)
2603{
2604 return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2605 PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2606}
2607
2608static void __net_exit tcp_sk_exit(struct net *net)
2609{
2610 inet_ctl_sock_destroy(net->ipv4.tcp_sock);
b099ce26
EB
2611}
2612
2613static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2614{
2615 inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
046ee902
DL
2616}
2617
2618static struct pernet_operations __net_initdata tcp_sk_ops = {
b099ce26
EB
2619 .init = tcp_sk_init,
2620 .exit = tcp_sk_exit,
2621 .exit_batch = tcp_sk_exit_batch,
046ee902
DL
2622};
2623
9b0f976f 2624void __init tcp_v4_init(void)
1da177e4 2625{
5caea4ea 2626 inet_hashinfo_init(&tcp_hashinfo);
6a1b3054 2627 if (register_pernet_subsys(&tcp_sk_ops))
1da177e4 2628 panic("Failed to create the TCP control socket.\n");
1da177e4 2629}