]>
git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blob - net/netfilter/ipvs/ip_vs_xmit.c
2 * ip_vs_xmit.c: various packet transmitters for IPVS
4 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
5 * Julian Anastasov <ja@ssi.bg>
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
14 * Description of forwarding methods:
15 * - all transmitters are called from LOCAL_IN (remote clients) and
16 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
17 * - not all connections have destination server, for example,
18 * connections in backup server when fwmark is used
19 * - bypass connections use daddr from packet
21 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
22 * - skb->pkt_type is not set yet
23 * - the only place where we can see skb->sk != NULL
26 #define KMSG_COMPONENT "IPVS"
27 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29 #include <linux/kernel.h>
30 #include <linux/slab.h>
31 #include <linux/tcp.h> /* for tcphdr */
33 #include <net/tcp.h> /* for csum_tcpudp_magic */
35 #include <net/icmp.h> /* for icmp_send */
36 #include <net/route.h> /* for ip_route_output */
38 #include <net/ip6_route.h>
39 #include <net/addrconf.h>
40 #include <linux/icmpv6.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv4.h>
44 #include <net/ip_vs.h>
47 IP_VS_RT_MODE_LOCAL
= 1, /* Allow local dest */
48 IP_VS_RT_MODE_NON_LOCAL
= 2, /* Allow non-local dest */
49 IP_VS_RT_MODE_RDR
= 4, /* Allow redirect from remote daddr to
52 IP_VS_RT_MODE_CONNECT
= 8, /* Always bind route to saddr */
53 IP_VS_RT_MODE_KNOWN_NH
= 16,/* Route via remote addr */
57 * Destination cache to speed up outgoing route lookup
60 __ip_vs_dst_set(struct ip_vs_dest
*dest
, u32 rtos
, struct dst_entry
*dst
,
63 struct dst_entry
*old_dst
;
65 old_dst
= dest
->dst_cache
;
66 dest
->dst_cache
= dst
;
67 dest
->dst_rtos
= rtos
;
68 dest
->dst_cookie
= dst_cookie
;
72 static inline struct dst_entry
*
73 __ip_vs_dst_check(struct ip_vs_dest
*dest
, u32 rtos
)
75 struct dst_entry
*dst
= dest
->dst_cache
;
79 if ((dst
->obsolete
|| rtos
!= dest
->dst_rtos
) &&
80 dst
->ops
->check(dst
, dest
->dst_cookie
) == NULL
) {
81 dest
->dst_cache
= NULL
;
90 __mtu_check_toobig_v6(const struct sk_buff
*skb
, u32 mtu
)
92 if (IP6CB(skb
)->frag_max_size
) {
93 /* frag_max_size tells us that this packet has been
94 * defragmented by the netfilter IPv6 conntrack module.
96 if (IP6CB(skb
)->frag_max_size
> mtu
)
97 return true; /* largest fragment violate MTU */
99 else if (skb
->len
> mtu
&& !skb_is_gso(skb
)) {
100 return true; /* Packet size violate MTU size */
105 /* Get route to daddr, update *saddr, optionally bind route to saddr */
106 static struct rtable
*do_output_route4(struct net
*net
, __be32 daddr
,
107 u32 rtos
, int rt_mode
, __be32
*saddr
)
113 memset(&fl4
, 0, sizeof(fl4
));
115 fl4
.saddr
= (rt_mode
& IP_VS_RT_MODE_CONNECT
) ? *saddr
: 0;
116 fl4
.flowi4_tos
= rtos
;
117 fl4
.flowi4_flags
= (rt_mode
& IP_VS_RT_MODE_KNOWN_NH
) ?
118 FLOWI_FLAG_KNOWN_NH
: 0;
121 rt
= ip_route_output_key(net
, &fl4
);
123 /* Invalid saddr ? */
124 if (PTR_ERR(rt
) == -EINVAL
&& *saddr
&&
125 rt_mode
& IP_VS_RT_MODE_CONNECT
&& !loop
) {
127 flowi4_update_output(&fl4
, 0, rtos
, daddr
, 0);
130 IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr
);
132 } else if (!*saddr
&& rt_mode
& IP_VS_RT_MODE_CONNECT
&& fl4
.saddr
) {
135 flowi4_update_output(&fl4
, 0, rtos
, daddr
, fl4
.saddr
);
143 /* Get route to destination or remote server */
144 static struct rtable
*
145 __ip_vs_get_out_rt(struct sk_buff
*skb
, struct ip_vs_dest
*dest
,
146 __be32 daddr
, u32 rtos
, int rt_mode
, __be32
*ret_saddr
)
148 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
149 struct rtable
*rt
; /* Route to the other host */
150 struct rtable
*ort
; /* Original route */
154 spin_lock(&dest
->dst_lock
);
155 if (!(rt
= (struct rtable
*)
156 __ip_vs_dst_check(dest
, rtos
))) {
157 rt
= do_output_route4(net
, dest
->addr
.ip
, rtos
,
158 rt_mode
, &dest
->dst_saddr
.ip
);
160 spin_unlock(&dest
->dst_lock
);
163 __ip_vs_dst_set(dest
, rtos
, dst_clone(&rt
->dst
), 0);
164 IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
166 &dest
->addr
.ip
, &dest
->dst_saddr
.ip
,
167 atomic_read(&rt
->dst
.__refcnt
), rtos
);
169 daddr
= dest
->addr
.ip
;
171 *ret_saddr
= dest
->dst_saddr
.ip
;
172 spin_unlock(&dest
->dst_lock
);
174 __be32 saddr
= htonl(INADDR_ANY
);
176 /* For such unconfigured boxes avoid many route lookups
177 * for performance reasons because we do not remember saddr
179 rt_mode
&= ~IP_VS_RT_MODE_CONNECT
;
180 rt
= do_output_route4(net
, daddr
, rtos
, rt_mode
, &saddr
);
187 local
= rt
->rt_flags
& RTCF_LOCAL
;
188 if (!((local
? IP_VS_RT_MODE_LOCAL
: IP_VS_RT_MODE_NON_LOCAL
) &
190 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
191 (rt
->rt_flags
& RTCF_LOCAL
) ?
192 "local":"non-local", &daddr
);
196 if (local
&& !(rt_mode
& IP_VS_RT_MODE_RDR
) &&
197 !((ort
= skb_rtable(skb
)) && ort
->rt_flags
& RTCF_LOCAL
)) {
198 IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
199 "requires NAT method, dest: %pI4\n",
200 &ip_hdr(skb
)->daddr
, &daddr
);
204 if (unlikely(!local
&& ipv4_is_loopback(ip_hdr(skb
)->saddr
))) {
205 IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
206 "to non-local address, dest: %pI4\n",
207 &ip_hdr(skb
)->saddr
, &daddr
);
215 /* Reroute packet to local IPv4 stack after DNAT */
217 __ip_vs_reroute_locally(struct sk_buff
*skb
)
219 struct rtable
*rt
= skb_rtable(skb
);
220 struct net_device
*dev
= rt
->dst
.dev
;
221 struct net
*net
= dev_net(dev
);
222 struct iphdr
*iph
= ip_hdr(skb
);
224 if (rt_is_input_route(rt
)) {
225 unsigned long orefdst
= skb
->_skb_refdst
;
227 if (ip_route_input(skb
, iph
->daddr
, iph
->saddr
,
230 refdst_drop(orefdst
);
232 struct flowi4 fl4
= {
235 .flowi4_tos
= RT_TOS(iph
->tos
),
236 .flowi4_mark
= skb
->mark
,
239 rt
= ip_route_output_key(net
, &fl4
);
242 if (!(rt
->rt_flags
& RTCF_LOCAL
)) {
246 /* Drop old route. */
248 skb_dst_set(skb
, &rt
->dst
);
253 #ifdef CONFIG_IP_VS_IPV6
255 static inline int __ip_vs_is_local_route6(struct rt6_info
*rt
)
257 return rt
->dst
.dev
&& rt
->dst
.dev
->flags
& IFF_LOOPBACK
;
260 static struct dst_entry
*
261 __ip_vs_route_output_v6(struct net
*net
, struct in6_addr
*daddr
,
262 struct in6_addr
*ret_saddr
, int do_xfrm
)
264 struct dst_entry
*dst
;
265 struct flowi6 fl6
= {
269 dst
= ip6_route_output(net
, NULL
, &fl6
);
274 if (ipv6_addr_any(&fl6
.saddr
) &&
275 ipv6_dev_get_saddr(net
, ip6_dst_idev(dst
)->dev
,
276 &fl6
.daddr
, 0, &fl6
.saddr
) < 0)
279 dst
= xfrm_lookup(net
, dst
, flowi6_to_flowi(&fl6
), NULL
, 0);
285 *ret_saddr
= fl6
.saddr
;
290 IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr
);
295 * Get route to destination or remote server
297 static struct rt6_info
*
298 __ip_vs_get_out_rt_v6(struct sk_buff
*skb
, struct ip_vs_dest
*dest
,
299 struct in6_addr
*daddr
, struct in6_addr
*ret_saddr
,
300 int do_xfrm
, int rt_mode
)
302 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
303 struct rt6_info
*rt
; /* Route to the other host */
304 struct rt6_info
*ort
; /* Original route */
305 struct dst_entry
*dst
;
309 spin_lock(&dest
->dst_lock
);
310 rt
= (struct rt6_info
*)__ip_vs_dst_check(dest
, 0);
314 dst
= __ip_vs_route_output_v6(net
, &dest
->addr
.in6
,
315 &dest
->dst_saddr
.in6
,
318 spin_unlock(&dest
->dst_lock
);
321 rt
= (struct rt6_info
*) dst
;
322 cookie
= rt
->rt6i_node
? rt
->rt6i_node
->fn_sernum
: 0;
323 __ip_vs_dst_set(dest
, 0, dst_clone(&rt
->dst
), cookie
);
324 IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
325 &dest
->addr
.in6
, &dest
->dst_saddr
.in6
,
326 atomic_read(&rt
->dst
.__refcnt
));
329 *ret_saddr
= dest
->dst_saddr
.in6
;
330 spin_unlock(&dest
->dst_lock
);
332 dst
= __ip_vs_route_output_v6(net
, daddr
, ret_saddr
, do_xfrm
);
335 rt
= (struct rt6_info
*) dst
;
338 local
= __ip_vs_is_local_route6(rt
);
339 if (!((local
? IP_VS_RT_MODE_LOCAL
: IP_VS_RT_MODE_NON_LOCAL
) &
341 IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
342 local
? "local":"non-local", daddr
);
343 dst_release(&rt
->dst
);
346 if (local
&& !(rt_mode
& IP_VS_RT_MODE_RDR
) &&
347 !((ort
= (struct rt6_info
*) skb_dst(skb
)) &&
348 __ip_vs_is_local_route6(ort
))) {
349 IP_VS_DBG_RL("Redirect from non-local address %pI6c to local "
350 "requires NAT method, dest: %pI6c\n",
351 &ipv6_hdr(skb
)->daddr
, daddr
);
352 dst_release(&rt
->dst
);
355 if (unlikely(!local
&& (!skb
->dev
|| skb
->dev
->flags
& IFF_LOOPBACK
) &&
356 ipv6_addr_type(&ipv6_hdr(skb
)->saddr
) &
357 IPV6_ADDR_LOOPBACK
)) {
358 IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c "
359 "to non-local address, dest: %pI6c\n",
360 &ipv6_hdr(skb
)->saddr
, daddr
);
361 dst_release(&rt
->dst
);
371 * Release dest->dst_cache before a dest is removed
374 ip_vs_dst_reset(struct ip_vs_dest
*dest
)
376 struct dst_entry
*old_dst
;
378 old_dst
= dest
->dst_cache
;
379 dest
->dst_cache
= NULL
;
380 dst_release(old_dst
);
381 dest
->dst_saddr
.ip
= 0;
384 #define IP_VS_XMIT_TUNNEL(skb, cp) \
386 int __ret = NF_ACCEPT; \
388 (skb)->ipvs_property = 1; \
389 if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
390 __ret = ip_vs_confirm_conntrack(skb); \
391 if (__ret == NF_ACCEPT) { \
393 skb_forward_csum(skb); \
398 #define IP_VS_XMIT_NAT(pf, skb, cp, local) \
400 (skb)->ipvs_property = 1; \
401 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
402 ip_vs_notrack(skb); \
404 ip_vs_update_conntrack(skb, cp, 1); \
407 skb_forward_csum(skb); \
408 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
409 skb_dst(skb)->dev, dst_output); \
412 #define IP_VS_XMIT(pf, skb, cp, local) \
414 (skb)->ipvs_property = 1; \
415 if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
416 ip_vs_notrack(skb); \
419 skb_forward_csum(skb); \
420 NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
421 skb_dst(skb)->dev, dst_output); \
426 * NULL transmitter (do nothing except return NF_ACCEPT)
429 ip_vs_null_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
430 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
432 /* we do not touch skb and do not need pskb ptr */
433 IP_VS_XMIT(NFPROTO_IPV4
, skb
, cp
, 1);
439 * Let packets bypass the destination when the destination is not
440 * available, it may be only used in transparent cache cluster.
443 ip_vs_bypass_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
444 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
446 struct rtable
*rt
; /* Route to the other host */
447 struct iphdr
*iph
= ip_hdr(skb
);
452 if (!(rt
= __ip_vs_get_out_rt(skb
, NULL
, iph
->daddr
, RT_TOS(iph
->tos
),
453 IP_VS_RT_MODE_NON_LOCAL
, NULL
)))
457 mtu
= dst_mtu(&rt
->dst
);
458 if ((skb
->len
> mtu
) && (iph
->frag_off
& htons(IP_DF
)) &&
461 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
462 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
467 * Call ip_send_check because we are not sure it is called
468 * after ip_defrag. Is copy-on-write needed?
470 if (unlikely((skb
= skb_share_check(skb
, GFP_ATOMIC
)) == NULL
)) {
474 ip_send_check(ip_hdr(skb
));
478 skb_dst_set(skb
, &rt
->dst
);
480 /* Another hack: avoid icmp_send in ip_fragment */
483 IP_VS_XMIT(NFPROTO_IPV4
, skb
, cp
, 0);
489 dst_link_failure(skb
);
496 #ifdef CONFIG_IP_VS_IPV6
498 ip_vs_bypass_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
499 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*iph
)
501 struct rt6_info
*rt
; /* Route to the other host */
506 rt
= __ip_vs_get_out_rt_v6(skb
, NULL
, &iph
->daddr
.in6
, NULL
, 0,
507 IP_VS_RT_MODE_NON_LOCAL
);
512 mtu
= dst_mtu(&rt
->dst
);
513 if (__mtu_check_toobig_v6(skb
, mtu
)) {
515 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
517 skb
->dev
= net
->loopback_dev
;
519 /* only send ICMP too big on first fragment */
521 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
522 dst_release(&rt
->dst
);
523 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
528 * Call ip_send_check because we are not sure it is called
529 * after ip_defrag. Is copy-on-write needed?
531 skb
= skb_share_check(skb
, GFP_ATOMIC
);
532 if (unlikely(skb
== NULL
)) {
533 dst_release(&rt
->dst
);
539 skb_dst_set(skb
, &rt
->dst
);
541 /* Another hack: avoid icmp_send in ip_fragment */
544 IP_VS_XMIT(NFPROTO_IPV6
, skb
, cp
, 0);
550 dst_link_failure(skb
);
559 * NAT transmitter (only for outside-to-inside nat forwarding)
560 * Not used for related ICMP
563 ip_vs_nat_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
564 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
566 struct rtable
*rt
; /* Route to the other host */
568 struct iphdr
*iph
= ip_hdr(skb
);
573 /* check if it is a connection of no-client-port */
574 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
)) {
576 p
= skb_header_pointer(skb
, iph
->ihl
*4, sizeof(_pt
), &_pt
);
579 ip_vs_conn_fill_cport(cp
, *p
);
580 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p
));
583 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
585 IP_VS_RT_MODE_LOCAL
|
586 IP_VS_RT_MODE_NON_LOCAL
|
587 IP_VS_RT_MODE_RDR
, NULL
)))
589 local
= rt
->rt_flags
& RTCF_LOCAL
;
591 * Avoid duplicate tuple in reply direction for NAT traffic
592 * to local address when connection is sync-ed
594 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
595 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
596 enum ip_conntrack_info ctinfo
;
597 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
599 if (ct
&& !nf_ct_is_untracked(ct
)) {
600 IP_VS_DBG_RL_PKT(10, AF_INET
, pp
, skb
, 0,
602 "stopping DNAT to local address");
608 /* From world but DNAT to loopback address? */
609 if (local
&& ipv4_is_loopback(cp
->daddr
.ip
) &&
610 rt_is_input_route(skb_rtable(skb
))) {
611 IP_VS_DBG_RL_PKT(1, AF_INET
, pp
, skb
, 0, "ip_vs_nat_xmit(): "
612 "stopping DNAT to loopback address");
617 mtu
= dst_mtu(&rt
->dst
);
618 if ((skb
->len
> mtu
) && (iph
->frag_off
& htons(IP_DF
)) &&
620 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
621 IP_VS_DBG_RL_PKT(0, AF_INET
, pp
, skb
, 0,
622 "ip_vs_nat_xmit(): frag needed for");
626 /* copy-on-write the packet before mangling it */
627 if (!skb_make_writable(skb
, sizeof(struct iphdr
)))
630 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
633 /* mangle the packet */
634 if (pp
->dnat_handler
&& !pp
->dnat_handler(skb
, pp
, cp
, ipvsh
))
636 ip_hdr(skb
)->daddr
= cp
->daddr
.ip
;
637 ip_send_check(ip_hdr(skb
));
642 skb_dst_set(skb
, &rt
->dst
);
646 * Some IPv4 replies get local address from routes,
647 * not from iph, so while we DNAT after routing
648 * we need this second input/output route.
650 if (!__ip_vs_reroute_locally(skb
))
654 IP_VS_DBG_PKT(10, AF_INET
, pp
, skb
, 0, "After DNAT");
656 /* FIXME: when application helper enlarges the packet and the length
657 is larger than the MTU of outgoing device, there will be still
660 /* Another hack: avoid icmp_send in ip_fragment */
663 IP_VS_XMIT_NAT(NFPROTO_IPV4
, skb
, cp
, local
);
669 dst_link_failure(skb
);
679 #ifdef CONFIG_IP_VS_IPV6
681 ip_vs_nat_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
682 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*iph
)
684 struct rt6_info
*rt
; /* Route to the other host */
690 /* check if it is a connection of no-client-port */
691 if (unlikely(cp
->flags
& IP_VS_CONN_F_NO_CPORT
&& !iph
->fragoffs
)) {
693 p
= skb_header_pointer(skb
, iph
->len
, sizeof(_pt
), &_pt
);
696 ip_vs_conn_fill_cport(cp
, *p
);
697 IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p
));
700 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
701 0, (IP_VS_RT_MODE_LOCAL
|
702 IP_VS_RT_MODE_NON_LOCAL
|
703 IP_VS_RT_MODE_RDR
))))
705 local
= __ip_vs_is_local_route6(rt
);
707 * Avoid duplicate tuple in reply direction for NAT traffic
708 * to local address when connection is sync-ed
710 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
711 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
712 enum ip_conntrack_info ctinfo
;
713 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
715 if (ct
&& !nf_ct_is_untracked(ct
)) {
716 IP_VS_DBG_RL_PKT(10, AF_INET6
, pp
, skb
, 0,
717 "ip_vs_nat_xmit_v6(): "
718 "stopping DNAT to local address");
724 /* From world but DNAT to loopback address? */
725 if (local
&& skb
->dev
&& !(skb
->dev
->flags
& IFF_LOOPBACK
) &&
726 ipv6_addr_type(&rt
->rt6i_dst
.addr
) & IPV6_ADDR_LOOPBACK
) {
727 IP_VS_DBG_RL_PKT(1, AF_INET6
, pp
, skb
, 0,
728 "ip_vs_nat_xmit_v6(): "
729 "stopping DNAT to loopback address");
734 mtu
= dst_mtu(&rt
->dst
);
735 if (__mtu_check_toobig_v6(skb
, mtu
)) {
737 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
739 skb
->dev
= net
->loopback_dev
;
741 /* only send ICMP too big on first fragment */
743 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
744 IP_VS_DBG_RL_PKT(0, AF_INET6
, pp
, skb
, 0,
745 "ip_vs_nat_xmit_v6(): frag needed for");
749 /* copy-on-write the packet before mangling it */
750 if (!skb_make_writable(skb
, sizeof(struct ipv6hdr
)))
753 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
756 /* mangle the packet */
757 if (pp
->dnat_handler
&& !pp
->dnat_handler(skb
, pp
, cp
, iph
))
759 ipv6_hdr(skb
)->daddr
= cp
->daddr
.in6
;
761 if (!local
|| !skb
->dev
) {
762 /* drop the old route when skb is not shared */
764 skb_dst_set(skb
, &rt
->dst
);
766 /* destined to loopback, do we need to change route? */
767 dst_release(&rt
->dst
);
770 IP_VS_DBG_PKT(10, AF_INET6
, pp
, skb
, 0, "After DNAT");
772 /* FIXME: when application helper enlarges the packet and the length
773 is larger than the MTU of outgoing device, there will be still
776 /* Another hack: avoid icmp_send in ip_fragment */
779 IP_VS_XMIT_NAT(NFPROTO_IPV6
, skb
, cp
, local
);
785 dst_link_failure(skb
);
791 dst_release(&rt
->dst
);
798 * IP Tunneling transmitter
800 * This function encapsulates the packet in a new IP packet, its
801 * destination will be set to cp->daddr. Most code of this function
802 * is taken from ipip.c.
804 * It is used in VS/TUN cluster. The load balancer selects a real
805 * server from a cluster based on a scheduling algorithm,
806 * encapsulates the request packet and forwards it to the selected
807 * server. For example, all real servers are configured with
808 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
809 * the encapsulated packet, it will decapsulate the packet, process
810 * the request and return the response packets directly to the client
811 * without passing the load balancer. This can greatly increase the
812 * scalability of virtual server.
814 * Used for ANY protocol
817 ip_vs_tunnel_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
818 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
820 struct netns_ipvs
*ipvs
= net_ipvs(skb_net(skb
));
821 struct rtable
*rt
; /* Route to the other host */
822 __be32 saddr
; /* Source for tunnel */
823 struct net_device
*tdev
; /* Device to other host */
824 struct iphdr
*old_iph
= ip_hdr(skb
);
825 u8 tos
= old_iph
->tos
;
827 struct iphdr
*iph
; /* Our new IP header */
828 unsigned int max_headroom
; /* The extra header space needed */
834 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
835 RT_TOS(tos
), IP_VS_RT_MODE_LOCAL
|
836 IP_VS_RT_MODE_NON_LOCAL
|
837 IP_VS_RT_MODE_CONNECT
,
840 if (rt
->rt_flags
& RTCF_LOCAL
) {
842 IP_VS_XMIT(NFPROTO_IPV4
, skb
, cp
, 1);
847 mtu
= dst_mtu(&rt
->dst
) - sizeof(struct iphdr
);
849 IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__
);
852 if (rt_is_output_route(skb_rtable(skb
)))
853 skb_dst(skb
)->ops
->update_pmtu(skb_dst(skb
), NULL
, skb
, mtu
);
855 /* Copy DF, reset fragment offset and MF */
856 df
= sysctl_pmtu_disc(ipvs
) ? old_iph
->frag_off
& htons(IP_DF
) : 0;
858 if (df
&& mtu
< ntohs(old_iph
->tot_len
) && !skb_is_gso(skb
)) {
859 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
860 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
865 * Okay, now see if we can stuff it in the buffer as-is.
867 max_headroom
= LL_RESERVED_SPACE(tdev
) + sizeof(struct iphdr
);
869 if (skb_headroom(skb
) < max_headroom
870 || skb_cloned(skb
) || skb_shared(skb
)) {
871 struct sk_buff
*new_skb
=
872 skb_realloc_headroom(skb
, max_headroom
);
876 IP_VS_ERR_RL("%s(): no memory\n", __func__
);
881 old_iph
= ip_hdr(skb
);
884 skb
->transport_header
= skb
->network_header
;
886 /* fix old IP header checksum */
887 ip_send_check(old_iph
);
889 skb_push(skb
, sizeof(struct iphdr
));
890 skb_reset_network_header(skb
);
891 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
895 skb_dst_set(skb
, &rt
->dst
);
898 * Push down and install the IPIP header.
902 iph
->ihl
= sizeof(struct iphdr
)>>2;
904 iph
->protocol
= IPPROTO_IPIP
;
906 iph
->daddr
= cp
->daddr
.ip
;
908 iph
->ttl
= old_iph
->ttl
;
909 ip_select_ident(iph
, &rt
->dst
, NULL
);
911 /* Another hack: avoid icmp_send in ip_fragment */
914 ret
= IP_VS_XMIT_TUNNEL(skb
, cp
);
915 if (ret
== NF_ACCEPT
)
917 else if (ret
== NF_DROP
)
925 dst_link_failure(skb
);
935 #ifdef CONFIG_IP_VS_IPV6
937 ip_vs_tunnel_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
938 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
940 struct rt6_info
*rt
; /* Route to the other host */
941 struct in6_addr saddr
; /* Source for tunnel */
942 struct net_device
*tdev
; /* Device to other host */
943 struct ipv6hdr
*old_iph
= ipv6_hdr(skb
);
944 struct ipv6hdr
*iph
; /* Our new IP header */
945 unsigned int max_headroom
; /* The extra header space needed */
951 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
,
952 &saddr
, 1, (IP_VS_RT_MODE_LOCAL
|
953 IP_VS_RT_MODE_NON_LOCAL
))))
955 if (__ip_vs_is_local_route6(rt
)) {
956 dst_release(&rt
->dst
);
957 IP_VS_XMIT(NFPROTO_IPV6
, skb
, cp
, 1);
962 mtu
= dst_mtu(&rt
->dst
) - sizeof(struct ipv6hdr
);
963 if (mtu
< IPV6_MIN_MTU
) {
964 IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__
,
969 skb_dst(skb
)->ops
->update_pmtu(skb_dst(skb
), NULL
, skb
, mtu
);
971 /* MTU checking: note that 'mtu' has already been adjusted beforehand */
972 if (__mtu_check_toobig_v6(skb
, mtu
)) {
974 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
976 skb
->dev
= net
->loopback_dev
;
978 /* only send ICMP too big on first fragment */
979 if (!ipvsh
->fragoffs
)
980 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
981 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
986 * Okay, now see if we can stuff it in the buffer as-is.
988 max_headroom
= LL_RESERVED_SPACE(tdev
) + sizeof(struct ipv6hdr
);
990 if (skb_headroom(skb
) < max_headroom
991 || skb_cloned(skb
) || skb_shared(skb
)) {
992 struct sk_buff
*new_skb
=
993 skb_realloc_headroom(skb
, max_headroom
);
995 dst_release(&rt
->dst
);
997 IP_VS_ERR_RL("%s(): no memory\n", __func__
);
1002 old_iph
= ipv6_hdr(skb
);
1005 skb
->transport_header
= skb
->network_header
;
1007 skb_push(skb
, sizeof(struct ipv6hdr
));
1008 skb_reset_network_header(skb
);
1009 memset(&(IPCB(skb
)->opt
), 0, sizeof(IPCB(skb
)->opt
));
1011 /* drop old route */
1013 skb_dst_set(skb
, &rt
->dst
);
1016 * Push down and install the IPIP header.
1018 iph
= ipv6_hdr(skb
);
1020 iph
->nexthdr
= IPPROTO_IPV6
;
1021 iph
->payload_len
= old_iph
->payload_len
;
1022 be16_add_cpu(&iph
->payload_len
, sizeof(*old_iph
));
1023 iph
->priority
= old_iph
->priority
;
1024 memset(&iph
->flow_lbl
, 0, sizeof(iph
->flow_lbl
));
1025 iph
->daddr
= cp
->daddr
.in6
;
1027 iph
->hop_limit
= old_iph
->hop_limit
;
1029 /* Another hack: avoid icmp_send in ip_fragment */
1032 ret
= IP_VS_XMIT_TUNNEL(skb
, cp
);
1033 if (ret
== NF_ACCEPT
)
1035 else if (ret
== NF_DROP
)
1043 dst_link_failure(skb
);
1049 dst_release(&rt
->dst
);
1056 * Direct Routing transmitter
1057 * Used for ANY protocol
1060 ip_vs_dr_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1061 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*ipvsh
)
1063 struct rtable
*rt
; /* Route to the other host */
1064 struct iphdr
*iph
= ip_hdr(skb
);
1069 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
1071 IP_VS_RT_MODE_LOCAL
|
1072 IP_VS_RT_MODE_NON_LOCAL
|
1073 IP_VS_RT_MODE_KNOWN_NH
, NULL
)))
1075 if (rt
->rt_flags
& RTCF_LOCAL
) {
1077 IP_VS_XMIT(NFPROTO_IPV4
, skb
, cp
, 1);
1081 mtu
= dst_mtu(&rt
->dst
);
1082 if ((iph
->frag_off
& htons(IP_DF
)) && skb
->len
> mtu
&&
1084 icmp_send(skb
, ICMP_DEST_UNREACH
,ICMP_FRAG_NEEDED
, htonl(mtu
));
1086 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1091 * Call ip_send_check because we are not sure it is called
1092 * after ip_defrag. Is copy-on-write needed?
1094 if (unlikely((skb
= skb_share_check(skb
, GFP_ATOMIC
)) == NULL
)) {
1098 ip_send_check(ip_hdr(skb
));
1100 /* drop old route */
1102 skb_dst_set(skb
, &rt
->dst
);
1104 /* Another hack: avoid icmp_send in ip_fragment */
1107 IP_VS_XMIT(NFPROTO_IPV4
, skb
, cp
, 0);
1113 dst_link_failure(skb
);
1120 #ifdef CONFIG_IP_VS_IPV6
1122 ip_vs_dr_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1123 struct ip_vs_protocol
*pp
, struct ip_vs_iphdr
*iph
)
1125 struct rt6_info
*rt
; /* Route to the other host */
1130 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
1131 0, (IP_VS_RT_MODE_LOCAL
|
1132 IP_VS_RT_MODE_NON_LOCAL
))))
1134 if (__ip_vs_is_local_route6(rt
)) {
1135 dst_release(&rt
->dst
);
1136 IP_VS_XMIT(NFPROTO_IPV6
, skb
, cp
, 1);
1140 mtu
= dst_mtu(&rt
->dst
);
1141 if (__mtu_check_toobig_v6(skb
, mtu
)) {
1143 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
1145 skb
->dev
= net
->loopback_dev
;
1147 /* only send ICMP too big on first fragment */
1149 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
1150 dst_release(&rt
->dst
);
1151 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1156 * Call ip_send_check because we are not sure it is called
1157 * after ip_defrag. Is copy-on-write needed?
1159 skb
= skb_share_check(skb
, GFP_ATOMIC
);
1160 if (unlikely(skb
== NULL
)) {
1161 dst_release(&rt
->dst
);
1165 /* drop old route */
1167 skb_dst_set(skb
, &rt
->dst
);
1169 /* Another hack: avoid icmp_send in ip_fragment */
1172 IP_VS_XMIT(NFPROTO_IPV6
, skb
, cp
, 0);
1178 dst_link_failure(skb
);
1188 * ICMP packet transmitter
1189 * called by the ip_vs_in_icmp
1192 ip_vs_icmp_xmit(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1193 struct ip_vs_protocol
*pp
, int offset
, unsigned int hooknum
,
1194 struct ip_vs_iphdr
*iph
)
1196 struct rtable
*rt
; /* Route to the other host */
1204 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1205 forwarded directly here, because there is no need to
1206 translate address/port back */
1207 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
1208 if (cp
->packet_xmit
)
1209 rc
= cp
->packet_xmit(skb
, cp
, pp
, iph
);
1212 /* do not touch skb anymore */
1213 atomic_inc(&cp
->in_pkts
);
1218 * mangle and send the packet here (only for VS/NAT)
1221 /* LOCALNODE from FORWARD hook is not supported */
1222 rt_mode
= (hooknum
!= NF_INET_FORWARD
) ?
1223 IP_VS_RT_MODE_LOCAL
| IP_VS_RT_MODE_NON_LOCAL
|
1224 IP_VS_RT_MODE_RDR
: IP_VS_RT_MODE_NON_LOCAL
;
1225 if (!(rt
= __ip_vs_get_out_rt(skb
, cp
->dest
, cp
->daddr
.ip
,
1226 RT_TOS(ip_hdr(skb
)->tos
),
1229 local
= rt
->rt_flags
& RTCF_LOCAL
;
1232 * Avoid duplicate tuple in reply direction for NAT traffic
1233 * to local address when connection is sync-ed
1235 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1236 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
1237 enum ip_conntrack_info ctinfo
;
1238 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
1240 if (ct
&& !nf_ct_is_untracked(ct
)) {
1241 IP_VS_DBG(10, "%s(): "
1242 "stopping DNAT to local address %pI4\n",
1243 __func__
, &cp
->daddr
.ip
);
1249 /* From world but DNAT to loopback address? */
1250 if (local
&& ipv4_is_loopback(cp
->daddr
.ip
) &&
1251 rt_is_input_route(skb_rtable(skb
))) {
1252 IP_VS_DBG(1, "%s(): "
1253 "stopping DNAT to loopback %pI4\n",
1254 __func__
, &cp
->daddr
.ip
);
1259 mtu
= dst_mtu(&rt
->dst
);
1260 if ((skb
->len
> mtu
) && (ip_hdr(skb
)->frag_off
& htons(IP_DF
)) &&
1262 icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_FRAG_NEEDED
, htonl(mtu
));
1263 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1267 /* copy-on-write the packet before mangling it */
1268 if (!skb_make_writable(skb
, offset
))
1271 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
1274 ip_vs_nat_icmp(skb
, pp
, cp
, 0);
1277 /* drop the old route when skb is not shared */
1279 skb_dst_set(skb
, &rt
->dst
);
1283 * Some IPv4 replies get local address from routes,
1284 * not from iph, so while we DNAT after routing
1285 * we need this second input/output route.
1287 if (!__ip_vs_reroute_locally(skb
))
1291 /* Another hack: avoid icmp_send in ip_fragment */
1294 IP_VS_XMIT_NAT(NFPROTO_IPV4
, skb
, cp
, local
);
1300 dst_link_failure(skb
);
1312 #ifdef CONFIG_IP_VS_IPV6
1314 ip_vs_icmp_xmit_v6(struct sk_buff
*skb
, struct ip_vs_conn
*cp
,
1315 struct ip_vs_protocol
*pp
, int offset
, unsigned int hooknum
,
1316 struct ip_vs_iphdr
*iph
)
1318 struct rt6_info
*rt
; /* Route to the other host */
1326 /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1327 forwarded directly here, because there is no need to
1328 translate address/port back */
1329 if (IP_VS_FWD_METHOD(cp
) != IP_VS_CONN_F_MASQ
) {
1330 if (cp
->packet_xmit
)
1331 rc
= cp
->packet_xmit(skb
, cp
, pp
, iph
);
1334 /* do not touch skb anymore */
1335 atomic_inc(&cp
->in_pkts
);
1340 * mangle and send the packet here (only for VS/NAT)
1343 /* LOCALNODE from FORWARD hook is not supported */
1344 rt_mode
= (hooknum
!= NF_INET_FORWARD
) ?
1345 IP_VS_RT_MODE_LOCAL
| IP_VS_RT_MODE_NON_LOCAL
|
1346 IP_VS_RT_MODE_RDR
: IP_VS_RT_MODE_NON_LOCAL
;
1347 if (!(rt
= __ip_vs_get_out_rt_v6(skb
, cp
->dest
, &cp
->daddr
.in6
, NULL
,
1351 local
= __ip_vs_is_local_route6(rt
);
1353 * Avoid duplicate tuple in reply direction for NAT traffic
1354 * to local address when connection is sync-ed
1356 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1357 if (cp
->flags
& IP_VS_CONN_F_SYNC
&& local
) {
1358 enum ip_conntrack_info ctinfo
;
1359 struct nf_conn
*ct
= nf_ct_get(skb
, &ctinfo
);
1361 if (ct
&& !nf_ct_is_untracked(ct
)) {
1362 IP_VS_DBG(10, "%s(): "
1363 "stopping DNAT to local address %pI6\n",
1364 __func__
, &cp
->daddr
.in6
);
1370 /* From world but DNAT to loopback address? */
1371 if (local
&& skb
->dev
&& !(skb
->dev
->flags
& IFF_LOOPBACK
) &&
1372 ipv6_addr_type(&rt
->rt6i_dst
.addr
) & IPV6_ADDR_LOOPBACK
) {
1373 IP_VS_DBG(1, "%s(): "
1374 "stopping DNAT to loopback %pI6\n",
1375 __func__
, &cp
->daddr
.in6
);
1380 mtu
= dst_mtu(&rt
->dst
);
1381 if (__mtu_check_toobig_v6(skb
, mtu
)) {
1383 struct net
*net
= dev_net(skb_dst(skb
)->dev
);
1385 skb
->dev
= net
->loopback_dev
;
1387 /* only send ICMP too big on first fragment */
1389 icmpv6_send(skb
, ICMPV6_PKT_TOOBIG
, 0, mtu
);
1390 IP_VS_DBG_RL("%s(): frag needed\n", __func__
);
1394 /* copy-on-write the packet before mangling it */
1395 if (!skb_make_writable(skb
, offset
))
1398 if (skb_cow(skb
, rt
->dst
.dev
->hard_header_len
))
1401 ip_vs_nat_icmp_v6(skb
, pp
, cp
, 0);
1403 if (!local
|| !skb
->dev
) {
1404 /* drop the old route when skb is not shared */
1406 skb_dst_set(skb
, &rt
->dst
);
1408 /* destined to loopback, do we need to change route? */
1409 dst_release(&rt
->dst
);
1412 /* Another hack: avoid icmp_send in ip_fragment */
1415 IP_VS_XMIT_NAT(NFPROTO_IPV6
, skb
, cp
, local
);
1421 dst_link_failure(skb
);
1429 dst_release(&rt
->dst
);