]>
git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blob - net/netfilter/ipvs/ip_vs_xmit.c
2 * ip_vs_xmit.c: various packet transmitters for IPVS
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
14 * Description of forwarding methods:
15 * - all transmitters are called from LOCAL_IN (remote clients) and
16 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
17 * - not all connections have destination server, for example,
18 * connections in backup server when fwmark is used
19 * - bypass connections use daddr from packet
21 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
22 * - skb->pkt_type is not set yet
23 * - the only place where we can see skb->sk != NULL
26 #define KMSG_COMPONENT "IPVS"
27 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29 #include <linux/kernel.h>
30 #include <linux/slab.h>
31 #include <linux/tcp.h> /* for tcphdr */
33 #include <net/tcp.h> /* for csum_tcpudp_magic */
35 #include <net/icmp.h> /* for icmp_send */
36 #include <net/route.h> /* for ip_route_output */
38 #include <net/ip6_route.h>
39 #include <net/addrconf.h>
40 #include <linux/icmpv6.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv4.h>
44 #include <net/ip_vs.h>
47 IP_VS_RT_MODE_LOCAL
= 1, /* Allow local dest */
48 IP_VS_RT_MODE_NON_LOCAL
= 2, /* Allow non-local dest */
49 IP_VS_RT_MODE_RDR
= 4, /* Allow redirect from remote daddr to
52 IP_VS_RT_MODE_CONNECT
= 8, /* Always bind route to saddr */
53 IP_VS_RT_MODE_KNOWN_NH
= 16,/* Route via remote addr */
57 * Destination cache to speed up outgoing route lookup
60 __ip_vs_dst_set(struct ip_vs_dest
*dest
, struct dst_entry
*dst
, u32 dst_cookie
)
62 struct dst_entry
*old_dst
;
64 old_dst
= dest
->dst_cache
;
65 dest
->dst_cache
= dst
;
66 dest
->dst_cookie
= dst_cookie
;
70 static inline struct dst_entry
*
71 __ip_vs_dst_check(struct ip_vs_dest
*dest
)
73 struct dst_entry
*dst
= dest
->dst_cache
;
77 if (dst
->obsolete
&& dst
->ops
->check(dst
, dest
->dst_cookie
) == NULL
) {
78 dest
->dst_cache
= NULL
;
87 __mtu_check_toobig_v6(const struct sk_buff
*skb
, u32 mtu
)
89 if (IP6CB(skb
)->frag_max_size
) {
90 /* frag_max_size tells us that this packet has been
91 * defragmented by netfilter IPv6 conntrack module.
93 if (IP6CB(skb
)->frag_max_size
> mtu
)
94 return true; /* largest fragment violate MTU */
96 else if (skb
->len
> mtu
&& !skb_is_gso(skb
)) {
97 return true; /* Packet size violate MTU size */
102 /* Get route to daddr, update *saddr, optionally bind route to saddr */
103 static struct rtable
*do_output_route4(struct net
*net
, __be32 daddr
,
104 int rt_mode
, __be32
*saddr
)
110 memset(&fl4
, 0, sizeof(fl4
));
112 fl4
.saddr
= (rt_mode
& IP_VS_RT_MODE_CONNECT
) ? *saddr
: 0;
113 fl4
.flowi4_flags
= (rt_mode
& IP_VS_RT_MODE_KNOWN_NH
) ?
114 FLOWI_FLAG_KNOWN_NH
: 0;
117 rt
= ip_route_output_key(net
, &fl4
);
119 /* Invalid saddr ? */
120 if (PTR_ERR(rt
) == -EINVAL
&& *saddr
&&
121 rt_mode
& IP_VS_RT_MODE_CONNECT
&& !loop
) {
123 flowi4_update_output(&fl4
, 0, 0, daddr
, 0);
126 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr
);
128 } else if (!*saddr
&& rt_mode
& IP_VS_RT_MODE_CONNECT
&& fl4
.saddr
) {
131 flowi4_update_output(&fl4
, 0, 0, daddr
, fl4
.saddr
);
139 /* Get route to destination or remote server */
140 static struct rtable
*
141 __ip_vs_get_out_rt(struct sk_buff
*skb
, struct ip_vs_dest
*dest
,
142 __be32 daddr
, int rt_mode
, __be32
*ret_saddr
)
144 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
145 struct rtable
*rt
; /* Route to the other host */
146 struct rtable
*ort
; /* Original route */
150 spin_lock(&dest
->dst_lock
);
151 rt
= (struct rtable
*) __ip_vs_dst_check(dest
);
153 rt
= do_output_route4(net
, dest
->addr
.ip
, rt_mode
,
154 &dest
->dst_saddr
.ip
);
156 spin_unlock(&dest
->dst_lock
);
159 __ip_vs_dst_set(dest
, dst_clone(&rt
->dst
), 0);
160 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
161 &dest
->addr
.ip
, &dest
->dst_saddr
.ip
,
162 atomic_read(&rt
->dst
.__refcnt
));
164 daddr
= dest
->addr
.ip
;
166 *ret_saddr
= dest
->dst_saddr
.ip
;
167 spin_unlock(&dest
->dst_lock
);
169 __be32 saddr
= htonl(INADDR_ANY
);
171 /* For such unconfigured boxes avoid many route lookups
172 * for performance reasons because we do not remember saddr
174 rt_mode
&= ~IP_VS_RT_MODE_CONNECT
;
175 rt
= do_output_route4(net
, daddr
, rt_mode
, &saddr
);
182 local
= rt
->rt_flags
& RTCF_LOCAL
;
183 if (!((local
? IP_VS_RT_MODE_LOCAL
: IP_VS_RT_MODE_NON_LOCAL
) &
185 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
186 (rt
->rt_flags
& RTCF_LOCAL
) ?
187 "local":"non-local", &daddr
);
191 if (local
&& !(rt_mode
& IP_VS_RT_MODE_RDR
) &&
192 !((ort
= skb_rtable(skb
)) && ort
->rt_flags
& RTCF_LOCAL
)) {
193 IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
194 "requires NAT method, dest: %pI4\n",
195 &ip_hdr(skb
)->daddr
, &daddr
);
199 if (unlikely(!local
&& ipv4_is_loopback(ip_hdr(skb
)->saddr
))) {
200 IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
201 "to non-local address, dest: %pI4\n",
202 &ip_hdr(skb
)->saddr
, &daddr
);
210 #ifdef CONFIG_IP_VS_IPV6
212 static inline int __ip_vs_is_local_route6(struct rt6_info
*rt
)
214 return rt
->dst
.dev
&& rt
->dst
.dev
->flags
& IFF_LOOPBACK
;
217 static struct dst_entry
*
218 __ip_vs_route_output_v6(struct net
*net
, struct in6_addr
*daddr
,
219 struct in6_addr
*ret_saddr
, int do_xfrm
)
221 struct dst_entry
*dst
;
222 struct flowi6 fl6
= {
226 dst
= ip6_route_output(net
, NULL
, &fl6
);
231 if (ipv6_addr_any(&fl6
.saddr
) &&
232 ipv6_dev_get_saddr(net
, ip6_dst_idev(dst
)->dev
,
233 &fl6
.daddr
, 0, &fl6
.saddr
) < 0)
236 dst
= xfrm_lookup(net
, dst
, flowi6_to_flowi(&fl6
), NULL
, 0);
242 *ret_saddr
= fl6
.saddr
;
247 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr
);
252 * Get route to destination or remote server
254 static struct rt6_info
*
255 __ip_vs_get_out_rt_v6(struct sk_buff
*skb
, struct ip_vs_dest
*dest
,
256 struct in6_addr
*daddr
, struct in6_addr
*ret_saddr
,
257 int do_xfrm
, int rt_mode
)
259 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
260 struct rt6_info
*rt
; /* Route to the other host */
261 struct rt6_info
*ort
; /* Original route */
262 struct dst_entry
*dst
;
266 spin_lock(&dest
->dst_lock
);
267 rt
= (struct rt6_info
*)__ip_vs_dst_check(dest
);
271 dst
= __ip_vs_route_output_v6(net
, &dest
->addr
.in6
,
272 &dest
->dst_saddr
.in6
,
275 spin_unlock(&dest
->dst_lock
);
278 rt
= (struct rt6_info
*) dst
;
279 cookie
= rt
->rt6i_node
? rt
->rt6i_node
->fn_sernum
: 0;
280 __ip_vs_dst_set(dest
, dst_clone(&rt
->dst
), cookie
);
281 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
282 &dest
->addr
.in6
, &dest
->dst_saddr
.in6
,
283 atomic_read(&rt
->dst
.__refcnt
));
286 *ret_saddr
= dest
->dst_saddr
.in6
;
287 spin_unlock(&dest
->dst_lock
);
289 dst
= __ip_vs_route_output_v6(net
, daddr
, ret_saddr
, do_xfrm
);
292 rt
= (struct rt6_info
*) dst
;
295 local
= __ip_vs_is_local_route6(rt
);
296 if (!((local
? IP_VS_RT_MODE_LOCAL
: IP_VS_RT_MODE_NON_LOCAL
) &
298 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
299 local
? "local":"non-local", daddr
);
300 dst_release(&rt
->dst
);
303 if (local
&& !(rt_mode
& IP_VS_RT_MODE_RDR
) &&
304 !((ort
= (struct rt6_info
*) skb_dst(skb
)) &&
305 __ip_vs_is_local_route6(ort
))) {
306 IP_VS_DBG_RL("Redirect from non-local address %pI6c to local "
307 "requires NAT method, dest: %pI6c\n",
308 &ipv6_hdr(skb
)->daddr
, daddr
);
309 dst_release(&rt
->dst
);
312 if (unlikely(!local
&& (!skb
->dev
|| skb
->dev
->flags
& IFF_LOOPBACK
) &&
313 ipv6_addr_type(&ipv6_hdr(skb
)->saddr
) &
314 IPV6_ADDR_LOOPBACK
)) {
315 IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c "
316 "to non-local address, dest: %pI6c\n",
317 &ipv6_hdr(skb
)->saddr
, daddr
);
318 dst_release(&rt
->dst
);
327 /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
328 static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff
*skb
,
329 struct ip_vs_conn
*cp
)
333 skb
->ipvs_property
= 1;
334 if (unlikely(cp
->flags
& IP_VS_CONN_F_NFCT
))
335 ret
= ip_vs_confirm_conntrack(skb
);
336 if (ret
== NF_ACCEPT
) {
338 skb_forward_csum(skb
);
343 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
344 static inline int ip_vs_nat_send_or_cont(int pf
, struct sk_buff
*skb
,
345 struct ip_vs_conn
*cp
, int local
)
349 skb
->ipvs_property
= 1;
350 if (likely(!(cp
->flags
& IP_VS_CONN_F_NFCT
)))
353 ip_vs_update_conntrack(skb
, cp
, 1);
355 skb_forward_csum(skb
);
356 NF_HOOK(pf
, NF_INET_LOCAL_OUT
, skb
, NULL
, skb_dst(skb
)->dev
,
363 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
364 static inline int ip_vs_send_or_cont(int pf
, struct sk_buff
*skb
,
365 struct ip_vs_conn
*cp
, int local
)
369 skb
->ipvs_property
= 1;
370 if (likely(!(cp
->flags
& IP_VS_CONN_F_NFCT
)))
373 skb_forward_csum(skb
);
374 NF_HOOK(pf
, NF_INET_LOCAL_OUT
, skb
, NULL
, skb_dst(skb
)->dev
,
383 * NULL transmitter (do nothing except return NF_ACCEPT)
386 ip_vs_null_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
387 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
389 /* we do not touch skb and do not need pskb ptr */
390 return ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 1);
396 * Let packets bypass the destination when the destination is not
397 * available, it may be only used in transparent cache cluster.
400 ip_vs_bypass_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
401 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
403 struct rtable
*rt
; /* Route to the other host */
404 struct iphdr
*iph
= ip_hdr(skb
);
409 rt
= __ip_vs_get_out_rt(skb
, NULL
, iph
->daddr
, IP_VS_RT_MODE_NON_LOCAL
,
415 mtu
= dst_mtu(&rt
->dst
);
416 if ((skb
->len
> mtu
) && (iph
->frag_off
& htons(IP_DF
)) &&
419 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
420 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
425 * Call ip_send_check because we are not sure it is called
426 * after ip_defrag. Is copy-on-write needed?
428 if (unlikely((skb
= skb_share_check(skb
, GFP_ATOMIC
)) == NULL
)) {
432 ip_send_check(ip_hdr(skb
));
436 skb_dst_set(skb
, &rt
->dst
);
438 /* Another hack: avoid icmp_send in ip_fragment */
441 ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 0);
447 dst_link_failure(skb
);
454 #ifdef CONFIG_IP_VS_IPV6
456 ip_vs_bypass_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
457 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*iph
)
459 struct rt6_info
*rt
; /* Route to the other host */
464 rt
= __ip_vs_get_out_rt_v6(skb
, NULL
, &iph
->daddr
.in6
, NULL
, 0,
465 IP_VS_RT_MODE_NON_LOCAL
);
470 mtu
= dst_mtu(&rt
->dst
);
471 if (__mtu_check_toobig_v6(skb
, mtu
)) {
473 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
475 skb
->dev
= net
->loopback_dev
;
477 /* only send ICMP too big on first fragment */
479 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
480 dst_release(&rt
->dst
);
481 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
486 * Call ip_send_check because we are not sure it is called
487 * after ip_defrag. Is copy-on-write needed?
489 skb
= skb_share_check(skb
, GFP_ATOMIC
);
490 if (unlikely(skb
== NULL
)) {
491 dst_release(&rt
->dst
);
497 skb_dst_set(skb
, &rt
->dst
);
499 /* Another hack: avoid icmp_send in ip_fragment */
502 ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 0);
508 dst_link_failure(skb
);
517 * NAT transmitter (only for outside-to-inside nat forwarding)
518 * Not used for related ICMP
521 ip_vs_nat_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
522 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
524 struct rtable
*rt
; /* Route to the other host */
526 struct iphdr
*iph
= ip_hdr(skb
);
531 /* check if it is a connection of no-client-port */
532 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
)) {
534 p
= skb_header_pointer(skb
, iph
->ihl
*4, sizeof(_pt
), &_pt
);
537 ip_vs_conn_fill_cport(cp
, *p
);
538 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p
));
541 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
542 IP_VS_RT_MODE_LOCAL
|
543 IP_VS_RT_MODE_NON_LOCAL
|
544 IP_VS_RT_MODE_RDR
, NULL
)))
546 local
= rt
->rt_flags
& RTCF_LOCAL
;
548 * Avoid duplicate tuple in reply direction for NAT traffic
549 * to local address when connection is sync-ed
551 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
552 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
553 enum ip_conntrack_info ctinfo
;
554 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
556 if (ct
&& !nf_ct_is_untracked(ct
)) {
557 IP_VS_DBG_RL_PKT(10, AF_INET
, pp
, skb
, 0,
559 "stopping DNAT to local address");
565 /* From world but DNAT to loopback address? */
566 if (local
&& ipv4_is_loopback(cp
->daddr
.ip
) &&
567 rt_is_input_route(skb_rtable(skb
))) {
568 IP_VS_DBG_RL_PKT(1, AF_INET
, pp
, skb
, 0, "ip_vs_nat_xmit(): "
569 "stopping DNAT to loopback address");
574 mtu
= dst_mtu(&rt
->dst
);
575 if ((skb
->len
> mtu
) && (iph
->frag_off
& htons(IP_DF
)) &&
577 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
578 IP_VS_DBG_RL_PKT(0, AF_INET
, pp
, skb
, 0,
579 "ip_vs_nat_xmit(): frag needed for");
583 /* copy-on-write the packet before mangling it */
584 if (!skb_make_writable(skb
, sizeof(struct iphdr
)))
587 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
590 /* mangle the packet */
591 if (pp
->dnat_handler
&& !pp
->dnat_handler(skb
, pp
, cp
, ipvsh
))
593 ip_hdr(skb
)->daddr
= cp
->daddr
.ip
;
594 ip_send_check(ip_hdr(skb
));
599 skb_dst_set(skb
, &rt
->dst
);
603 IP_VS_DBG_PKT(10, AF_INET
, pp
, skb
, 0, "After DNAT");
605 /* FIXME: when application helper enlarges the packet and the length
606 is larger than the MTU of outgoing device, there will be still
609 /* Another hack: avoid icmp_send in ip_fragment */
612 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV4
, skb
, cp
, local
);
618 dst_link_failure(skb
);
628 #ifdef CONFIG_IP_VS_IPV6
630 ip_vs_nat_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
631 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*iph
)
633 struct rt6_info
*rt
; /* Route to the other host */
639 /* check if it is a connection of no-client-port */
640 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
&& !iph
->fragoffs
)) {
642 p
= skb_header_pointer(skb
, iph
->len
, sizeof(_pt
), &_pt
);
645 ip_vs_conn_fill_cport(cp
, *p
);
646 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p
));
649 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
650 0, (IP_VS_RT_MODE_LOCAL
|
651 IP_VS_RT_MODE_NON_LOCAL
|
652 IP_VS_RT_MODE_RDR
))))
654 local
= __ip_vs_is_local_route6(rt
);
656 * Avoid duplicate tuple in reply direction for NAT traffic
657 * to local address when connection is sync-ed
659 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
660 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
661 enum ip_conntrack_info ctinfo
;
662 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
664 if (ct
&& !nf_ct_is_untracked(ct
)) {
665 IP_VS_DBG_RL_PKT(10, AF_INET6
, pp
, skb
, 0,
666 "ip_vs_nat_xmit_v6(): "
667 "stopping DNAT to local address");
673 /* From world but DNAT to loopback address? */
674 if (local
&& skb
->dev
&& !(skb
->dev
->flags
& IFF_LOOPBACK
) &&
675 ipv6_addr_type(&rt
->rt6i_dst
.addr
) & IPV6_ADDR_LOOPBACK
) {
676 IP_VS_DBG_RL_PKT(1, AF_INET6
, pp
, skb
, 0,
677 "ip_vs_nat_xmit_v6(): "
678 "stopping DNAT to loopback address");
683 mtu
= dst_mtu(&rt
->dst
);
684 if (__mtu_check_toobig_v6(skb
, mtu
)) {
686 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
688 skb
->dev
= net
->loopback_dev
;
690 /* only send ICMP too big on first fragment */
692 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
693 IP_VS_DBG_RL_PKT(0, AF_INET6
, pp
, skb
, 0,
694 "ip_vs_nat_xmit_v6(): frag needed for");
698 /* copy-on-write the packet before mangling it */
699 if (!skb_make_writable(skb
, sizeof(struct ipv6hdr
)))
702 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
705 /* mangle the packet */
706 if (pp
->dnat_handler
&& !pp
->dnat_handler(skb
, pp
, cp
, iph
))
708 ipv6_hdr(skb
)->daddr
= cp
->daddr
.in6
;
710 if (!local
|| !skb
->dev
) {
711 /* drop the old route when skb is not shared */
713 skb_dst_set(skb
, &rt
->dst
);
715 /* destined to loopback, do we need to change route? */
716 dst_release(&rt
->dst
);
719 IP_VS_DBG_PKT(10, AF_INET6
, pp
, skb
, 0, "After DNAT");
721 /* FIXME: when application helper enlarges the packet and the length
722 is larger than the MTU of outgoing device, there will be still
725 /* Another hack: avoid icmp_send in ip_fragment */
728 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV6
, skb
, cp
, local
);
734 dst_link_failure(skb
);
740 dst_release(&rt
->dst
);
747 * IP Tunneling transmitter
749 * This function encapsulates the packet in a new IP packet, its
750 * destination will be set to cp->daddr. Most code of this function
751 * is taken from ipip.c.
753 * It is used in VS/TUN cluster. The load balancer selects a real
754 * server from a cluster based on a scheduling algorithm,
755 * encapsulates the request packet and forwards it to the selected
756 * server. For example, all real servers are configured with
757 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
758 * the encapsulated packet, it will decapsulate the packet, process
759 * the request and return the response packets directly to the client
760 * without passing the load balancer. This can greatly increase the
761 * scalability of virtual server.
763 * Used for ANY protocol
766 ip_vs_tunnel_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
767 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
769 struct netns_ipvs
*ipvs
= net_ipvs(skb_net(skb
));
770 struct rtable
*rt
; /* Route to the other host */
771 __be32 saddr
; /* Source for tunnel */
772 struct net_device
*tdev
; /* Device to other host */
773 struct iphdr
*old_iph
= ip_hdr(skb
);
774 u8 tos
= old_iph
->tos
;
776 struct iphdr
*iph
; /* Our new IP header */
777 unsigned int max_headroom
; /* The extra header space needed */
783 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
784 IP_VS_RT_MODE_LOCAL
|
785 IP_VS_RT_MODE_NON_LOCAL
|
786 IP_VS_RT_MODE_CONNECT
, &saddr
)))
788 if (rt
->rt_flags
& RTCF_LOCAL
) {
790 return ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 1);
795 mtu
= dst_mtu(&rt
->dst
) - sizeof(struct iphdr
);
797 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__
);
800 if (rt_is_output_route(skb_rtable(skb
)))
801 skb_dst(skb
)->ops
->update_pmtu(skb_dst(skb
), NULL
, skb
, mtu
);
803 /* Copy DF, reset fragment offset and MF */
804 df
= sysctl_pmtu_disc(ipvs
) ? old_iph
->frag_off
& htons(IP_DF
) : 0;
806 if (df
&& mtu
< ntohs(old_iph
->tot_len
) && !skb_is_gso(skb
)) {
807 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
808 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
813 * Okay, now see if we can stuff it in the buffer as-is.
815 max_headroom
= LL_RESERVED_SPACE(tdev
) + sizeof(struct iphdr
);
817 if (skb_headroom(skb
) < max_headroom
818 || skb_cloned(skb
) || skb_shared(skb
)) {
819 struct sk_buff
*new_skb
=
820 skb_realloc_headroom(skb
, max_headroom
);
824 IP_VS_ERR_RL("%s(): no memory\n", __func__
);
829 old_iph
= ip_hdr(skb
);
832 skb
->transport_header
= skb
->network_header
;
834 /* fix old IP header checksum */
835 ip_send_check(old_iph
);
837 skb_push(skb
, sizeof(struct iphdr
));
838 skb_reset_network_header(skb
);
839 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
843 skb_dst_set(skb
, &rt
->dst
);
846 * Push down and install the IPIP header.
850 iph
->ihl
= sizeof(struct iphdr
)>>2;
852 iph
->protocol
= IPPROTO_IPIP
;
854 iph
->daddr
= cp
->daddr
.ip
;
856 iph
->ttl
= old_iph
->ttl
;
857 ip_select_ident(iph
, &rt
->dst
, NULL
);
859 /* Another hack: avoid icmp_send in ip_fragment */
862 ret
= ip_vs_tunnel_xmit_prepare(skb
, cp
);
863 if (ret
== NF_ACCEPT
)
865 else if (ret
== NF_DROP
)
873 dst_link_failure(skb
);
883 #ifdef CONFIG_IP_VS_IPV6
885 ip_vs_tunnel_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
886 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
888 struct rt6_info
*rt
; /* Route to the other host */
889 struct in6_addr saddr
; /* Source for tunnel */
890 struct net_device
*tdev
; /* Device to other host */
891 struct ipv6hdr
*old_iph
= ipv6_hdr(skb
);
892 struct ipv6hdr
*iph
; /* Our new IP header */
893 unsigned int max_headroom
; /* The extra header space needed */
899 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
,
900 &saddr
, 1, (IP_VS_RT_MODE_LOCAL
|
901 IP_VS_RT_MODE_NON_LOCAL
))))
903 if (__ip_vs_is_local_route6(rt
)) {
904 dst_release(&rt
->dst
);
905 return ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 1);
910 mtu
= dst_mtu(&rt
->dst
) - sizeof(struct ipv6hdr
);
911 if (mtu
< IPV6_MIN_MTU
) {
912 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__
,
917 skb_dst(skb
)->ops
->update_pmtu(skb_dst(skb
), NULL
, skb
, mtu
);
919 /* MTU checking: Notice that 'mtu' has been adjusted beforehand */
920 if (__mtu_check_toobig_v6(skb
, mtu
)) {
922 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
924 skb
->dev
= net
->loopback_dev
;
926 /* only send ICMP too big on first fragment */
927 if (!ipvsh
->fragoffs
)
928 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
929 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
934 * Okay, now see if we can stuff it in the buffer as-is.
936 max_headroom
= LL_RESERVED_SPACE(tdev
) + sizeof(struct ipv6hdr
);
938 if (skb_headroom(skb
) < max_headroom
939 || skb_cloned(skb
) || skb_shared(skb
)) {
940 struct sk_buff
*new_skb
=
941 skb_realloc_headroom(skb
, max_headroom
);
943 dst_release(&rt
->dst
);
945 IP_VS_ERR_RL("%s(): no memory\n", __func__
);
950 old_iph
= ipv6_hdr(skb
);
953 skb
->transport_header
= skb
->network_header
;
955 skb_push(skb
, sizeof(struct ipv6hdr
));
956 skb_reset_network_header(skb
);
957 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
961 skb_dst_set(skb
, &rt
->dst
);
964 * Push down and install the IPIP header.
968 iph
->nexthdr
= IPPROTO_IPV6
;
969 iph
->payload_len
= old_iph
->payload_len
;
970 be16_add_cpu(&iph
->payload_len
, sizeof(*old_iph
));
971 iph
->priority
= old_iph
->priority
;
972 memset(&iph
->flow_lbl
, 0, sizeof(iph
->flow_lbl
));
973 iph
->daddr
= cp
->daddr
.in6
;
975 iph
->hop_limit
= old_iph
->hop_limit
;
977 /* Another hack: avoid icmp_send in ip_fragment */
980 ret
= ip_vs_tunnel_xmit_prepare(skb
, cp
);
981 if (ret
== NF_ACCEPT
)
983 else if (ret
== NF_DROP
)
991 dst_link_failure(skb
);
997 dst_release(&rt
->dst
);
1004 * Direct Routing transmitter
1005 * Used for ANY protocol
1008 ip_vs_dr_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1009 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
1011 struct rtable
*rt
; /* Route to the other host */
1012 struct iphdr
*iph
= ip_hdr(skb
);
1017 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
1018 IP_VS_RT_MODE_LOCAL
|
1019 IP_VS_RT_MODE_NON_LOCAL
|
1020 IP_VS_RT_MODE_KNOWN_NH
, NULL
)))
1022 if (rt
->rt_flags
& RTCF_LOCAL
) {
1024 return ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 1);
1028 mtu
= dst_mtu(&rt
->dst
);
1029 if ((iph
->frag_off
& htons(IP_DF
)) && skb
->len
> mtu
&&
1031 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
1033 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1038 * Call ip_send_check because we are not sure it is called
1039 * after ip_defrag. Is copy-on-write needed?
1041 if (unlikely((skb
= skb_share_check(skb
, GFP_ATOMIC
)) == NULL
)) {
1045 ip_send_check(ip_hdr(skb
));
1047 /* drop old route */
1049 skb_dst_set(skb
, &rt
->dst
);
1051 /* Another hack: avoid icmp_send in ip_fragment */
1054 ip_vs_send_or_cont(NFPROTO_IPV4
, skb
, cp
, 0);
1060 dst_link_failure(skb
);
1067 #ifdef CONFIG_IP_VS_IPV6
1069 ip_vs_dr_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1070 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*iph
)
1072 struct rt6_info
*rt
; /* Route to the other host */
1077 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
1078 0, (IP_VS_RT_MODE_LOCAL
|
1079 IP_VS_RT_MODE_NON_LOCAL
))))
1081 if (__ip_vs_is_local_route6(rt
)) {
1082 dst_release(&rt
->dst
);
1083 return ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 1);
1087 mtu
= dst_mtu(&rt
->dst
);
1088 if (__mtu_check_toobig_v6(skb
, mtu
)) {
1090 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
1092 skb
->dev
= net
->loopback_dev
;
1094 /* only send ICMP too big on first fragment */
1096 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
1097 dst_release(&rt
->dst
);
1098 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1103 * Call ip_send_check because we are not sure it is called
1104 * after ip_defrag. Is copy-on-write needed?
1106 skb
= skb_share_check(skb
, GFP_ATOMIC
);
1107 if (unlikely(skb
== NULL
)) {
1108 dst_release(&rt
->dst
);
1112 /* drop old route */
1114 skb_dst_set(skb
, &rt
->dst
);
1116 /* Another hack: avoid icmp_send in ip_fragment */
1119 ip_vs_send_or_cont(NFPROTO_IPV6
, skb
, cp
, 0);
1125 dst_link_failure(skb
);
1135 * ICMP packet transmitter
1136 * called by the ip_vs_in_icmp
1139 ip_vs_icmp_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1140 struct ip_vs_protocol
*pp
, int offset
, unsigned int hooknum
,
1141 struct ip_vs_iphdr
*iph
)
1143 struct rtable
*rt
; /* Route to the other host */
1151 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1152 forwarded directly here, because there is no need to
1153 translate address/port back */
1154 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
1155 if (cp
->packet_xmit
)
1156 rc
= cp
->packet_xmit(skb
, cp
, pp
, iph
);
1159 /* do not touch skb anymore */
1160 atomic_inc(&cp
->in_pkts
);
1165 * mangle and send the packet here (only for VS/NAT)
1168 /* LOCALNODE from FORWARD hook is not supported */
1169 rt_mode
= (hooknum
!= NF_INET_FORWARD
) ?
1170 IP_VS_RT_MODE_LOCAL
| IP_VS_RT_MODE_NON_LOCAL
|
1171 IP_VS_RT_MODE_RDR
: IP_VS_RT_MODE_NON_LOCAL
;
1172 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
1175 local
= rt
->rt_flags
& RTCF_LOCAL
;
1178 * Avoid duplicate tuple in reply direction for NAT traffic
1179 * to local address when connection is sync-ed
1181 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1182 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
1183 enum ip_conntrack_info ctinfo
;
1184 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
1186 if (ct
&& !nf_ct_is_untracked(ct
)) {
1187 IP_VS_DBG(10, "%s(): "
1188 "stopping DNAT to local address %pI4\n",
1189 __func__
, &cp
->daddr
.ip
);
1195 /* From world but DNAT to loopback address? */
1196 if (local
&& ipv4_is_loopback(cp
->daddr
.ip
) &&
1197 rt_is_input_route(skb_rtable(skb
))) {
1198 IP_VS_DBG(1, "%s(): "
1199 "stopping DNAT to loopback %pI4\n",
1200 __func__
, &cp
->daddr
.ip
);
1205 mtu
= dst_mtu(&rt
->dst
);
1206 if ((skb
->len
> mtu
) && (ip_hdr(skb
)->frag_off
& htons(IP_DF
)) &&
1208 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
, htonl(mtu
));
1209 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1213 /* copy-on-write the packet before mangling it */
1214 if (!skb_make_writable(skb
, offset
))
1217 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
1220 ip_vs_nat_icmp(skb
, pp
, cp
, 0);
1223 /* drop the old route when skb is not shared */
1225 skb_dst_set(skb
, &rt
->dst
);
1229 /* Another hack: avoid icmp_send in ip_fragment */
1232 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV4
, skb
, cp
, local
);
1236 dst_link_failure(skb
);
1248 #ifdef CONFIG_IP_VS_IPV6
1250 ip_vs_icmp_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1251 struct ip_vs_protocol
*pp
, int offset
, unsigned int hooknum
,
1252 struct ip_vs_iphdr
*iph
)
1254 struct rt6_info
*rt
; /* Route to the other host */
1262 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1263 forwarded directly here, because there is no need to
1264 translate address/port back */
1265 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
1266 if (cp
->packet_xmit
)
1267 rc
= cp
->packet_xmit(skb
, cp
, pp
, iph
);
1270 /* do not touch skb anymore */
1271 atomic_inc(&cp
->in_pkts
);
1276 * mangle and send the packet here (only for VS/NAT)
1279 /* LOCALNODE from FORWARD hook is not supported */
1280 rt_mode
= (hooknum
!= NF_INET_FORWARD
) ?
1281 IP_VS_RT_MODE_LOCAL
| IP_VS_RT_MODE_NON_LOCAL
|
1282 IP_VS_RT_MODE_RDR
: IP_VS_RT_MODE_NON_LOCAL
;
1283 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
1287 local
= __ip_vs_is_local_route6(rt
);
1289 * Avoid duplicate tuple in reply direction for NAT traffic
1290 * to local address when connection is sync-ed
1292 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1293 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
1294 enum ip_conntrack_info ctinfo
;
1295 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
1297 if (ct
&& !nf_ct_is_untracked(ct
)) {
1298 IP_VS_DBG(10, "%s(): "
1299 "stopping DNAT to local address %pI6\n",
1300 __func__
, &cp
->daddr
.in6
);
1306 /* From world but DNAT to loopback address? */
1307 if (local
&& skb
->dev
&& !(skb
->dev
->flags
& IFF_LOOPBACK
) &&
1308 ipv6_addr_type(&rt
->rt6i_dst
.addr
) & IPV6_ADDR_LOOPBACK
) {
1309 IP_VS_DBG(1, "%s(): "
1310 "stopping DNAT to loopback %pI6\n",
1311 __func__
, &cp
->daddr
.in6
);
1316 mtu
= dst_mtu(&rt
->dst
);
1317 if (__mtu_check_toobig_v6(skb
, mtu
)) {
1319 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
1321 skb
->dev
= net
->loopback_dev
;
1323 /* only send ICMP too big on first fragment */
1325 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
1326 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1330 /* copy-on-write the packet before mangling it */
1331 if (!skb_make_writable(skb
, offset
))
1334 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
1337 ip_vs_nat_icmp_v6(skb
, pp
, cp
, 0);
1339 if (!local
|| !skb
->dev
) {
1340 /* drop the old route when skb is not shared */
1342 skb_dst_set(skb
, &rt
->dst
);
1344 /* destined to loopback, do we need to change route? */
1345 dst_release(&rt
->dst
);
1348 /* Another hack: avoid icmp_send in ip_fragment */
1351 rc
= ip_vs_nat_send_or_cont(NFPROTO_IPV6
, skb
, cp
, local
);
1355 dst_link_failure(skb
);
1363 dst_release(&rt
->dst
);