/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
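
/*
 * Usage sketch (illustrative only, not a new API): ip_fast_csum() sums
 * the whole ihl-word header, so any later rewrite of a header field
 * must be followed by a fresh ip_send_check().  A forwarding-style
 * path that decrements the TTL, assuming skb already carries a valid
 * IPv4 header, would do:
 *
 *	struct iphdr *iph = ip_hdr(skb);
 *
 *	iph->ttl--;
 *	ip_send_check(iph);
 */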
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}
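
/*
 * Convention sketch (for illustration): inet->uc_ttl stays at -1 unless
 * the application pinned a value, so a per-socket TTL takes precedence
 * and the route's hop limit (backed by sysctl_ip_default_ttl) is only
 * the fallback.  From userspace the override would look like:
 *
 *	int ttl = 5;
 *	setsockopt(fd, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl));
 */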
/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen >> 2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
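
/*
 * Caller sketch (illustrative; exact arguments depend on the caller):
 * a transport holding an already-routed skb can emit it with nothing
 * more than the addresses and options, the way a TCP SYN-ACK reply
 * path might:
 *
 *	err = ip_build_and_send_pkt(skb, sk, saddr, daddr, opt);
 *	if (err > 0)
 *		err = net_xmit_errno(err);
 */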
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	rcu_read_lock_bh();
	nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		int res = dst_neigh_output(dst, neigh, skb);

		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
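
/*
 * Worked example (assumed numbers): device MTU 1500, route PMTU 1400.
 * A socket in IP_PMTUDISC_PROBE mode is sized against the device (1500)
 * and probes the path itself; every other socket is clamped to the
 * destination cache value (1400).  Userspace opts in with:
 *
 *	int val = IP_PMTUDISC_PROBE;
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */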
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}
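
/*
 * Why the BUILD_BUG_ON above holds (illustrative): both struct flowi4
 * and struct iphdr lay the source address immediately before the
 * destination address, so a single 8-byte memcpy can move the pair.
 * The same idea in miniature, with a hypothetical struct:
 *
 *	struct addr_pair { __be32 saddr, daddr; };
 *
 *	BUILD_BUG_ON(offsetof(struct addr_pair, daddr) !=
 *		     offsetof(struct addr_pair, saddr) + sizeof(__be32));
 *	memcpy(&dst->saddr, &src->saddr, 2 * sizeof(__be32));
 */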
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
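
/*
 * Caller sketch (illustrative): connected transports pass their cached
 * flow so the route can be rebuilt here if the socket's dst has been
 * invalidated, e.g. from a TCP transmit path:
 *
 *	err = ip_queue_xmit(skb, &inet->cork.fl);
 *	if (err > 0)
 *		err = net_xmit_errno(err);
 */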
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}
		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	consume_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
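
/*
 * Worked example (assumed numbers): a 4000-byte datagram payload with a
 * 20-byte header over a 1500-byte MTU leaves 1480 bytes of data space,
 * conveniently a multiple of 8.  Offsets travel in 8-byte units:
 *
 *	frag 0: bytes 0..1479,    frag_off = htons((0 >> 3) | IP_MF)
 *	frag 1: bytes 1480..2959, frag_off = htons((1480 >> 3) | IP_MF)
 *	frag 2: bytes 2960..3999, frag_off = htons(2960 >> 3)
 */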
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);
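
/*
 * Contract sketch (illustrative): a getfrag callback copies len payload
 * bytes, starting at offset within the source, to the kernel buffer at
 * to, and folds a running checksum into skb->csum whenever the device
 * cannot checksum for us (i.e. not CHECKSUM_PARTIAL).  odd is the byte
 * parity of the destination so csum_block_add() can rotate correctly.
 * A minimal kernel-space variant that skips checksumming might be:
 *
 *	static int plain_getfrag(void *from, char *to, int offset,
 *				 int len, int odd, struct sk_buff *skb)
 *	{
 *		memcpy(to, (char *)from + offset, len);
 *		return 0;
 *	}
 */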
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;

	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
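
/*
 * Worked example (assumed numbers): with a 1500-byte MTU and a plain
 * 20-byte header, maxfraglen is 1500 and fragheaderlen is 20, so the
 * single UFO skb is tagged gso_size = 1480.  The NIC, or the software
 * GSO fallback, later slices the oversized datagram into on-wire IP
 * fragments by itself.
 */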
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}
	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen,
						offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}
		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;
error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}
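
/*
 * Usage sketch (illustrative; error handling and setup elided): a
 * UDP-style sender corks pieces onto the queue and pushes them as one
 * datagram, assuming fl4, ipc and rt were prepared by the caller:
 *
 *	err = ip_append_data(sk, fl4, ip_generic_getfrag, msg->msg_iov,
 *			     len, sizeof(struct udphdr), &ipc, &rt,
 *			     msg->msg_flags);
 *	if (!err && !(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk, fl4);
 */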
ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}
	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {
			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket as one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here.  No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, not yet fragmented skb. */
	return ip_send_skb(skb);
}
/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}
struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr + offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
			   __be32 saddr, const struct ip_reply_arg *arg,
			   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(arg->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, saddr,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reenterable, hence spinlock.
	   Note that it uses the fact, that this function is called
	   with locally disabled BH and that sk cannot be already spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = arg->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}
	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}