2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version 1, is capable of handling both version 0 and 1 messages.
9 * Version 0 is the plain old format.
10 * Note Version 0 receivers will just drop Ver 1 messages.
11 * Version 1 is capable of handle IPv6, Persistence data,
12 * time-outs, and firewall marks.
13 * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
14 * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
16 * Definitions Message: is a complete datagram
17 * Sync_conn: is a part of a Message
18 * Param Data is an option to a Sync_conn.
20 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
22 * ip_vs_sync: sync connection info from master load balancer to backups
26 * Alexandre Cassen : Added master & backup support at a time.
27 * Alexandre Cassen : Added SyncID support for incoming sync
29 * Justin Ossevoort : Fix endian problem on sync message size.
30 * Hans Schillstrom : Added Version 1: i.e. IPv6,
31 * Persistence support, fwmark and time-out.
34 #define KMSG_COMPONENT "IPVS"
35 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
37 #include <linux/module.h>
38 #include <linux/slab.h>
39 #include <linux/inetdevice.h>
40 #include <linux/net.h>
41 #include <linux/completion.h>
42 #include <linux/delay.h>
43 #include <linux/skbuff.h>
45 #include <linux/igmp.h> /* for ip_mc_join_group */
46 #include <linux/udp.h>
47 #include <linux/err.h>
48 #include <linux/kthread.h>
49 #include <linux/wait.h>
50 #include <linux/kernel.h>
52 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
57 #include <net/ip_vs.h>
59 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
60 #define IP_VS_SYNC_PORT 8848 /* multicast port */
62 #define SYNC_PROTO_VER 1 /* Protocol version in header */
64 static struct lock_class_key __ipvs_sync_key
;
66 * IPVS sync connection entry
67 * Version 0, i.e. original version.
69 struct ip_vs_sync_conn_v0
{
72 /* Protocol, addresses and port numbers */
73 __u8 protocol
; /* Which protocol (TCP/UDP) */
77 __be32 caddr
; /* client address */
78 __be32 vaddr
; /* virtual address */
79 __be32 daddr
; /* destination address */
81 /* Flags and state transition */
82 __be16 flags
; /* status flags */
83 __be16 state
; /* state info */
85 /* The sequence options start here */
88 struct ip_vs_sync_conn_options
{
89 struct ip_vs_seq in_seq
; /* incoming seq. struct */
90 struct ip_vs_seq out_seq
; /* outgoing seq. struct */
94 Sync Connection format (sync_conn)
97 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 | Type | Protocol | Ver. | Size |
100 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
109 | timeout (in sec.) |
110 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
112 | IP-Addresses (v4 or v6) |
114 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
116 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
117 | Param. Type | Param. Length | Param. data |
118 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
120 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
121 | | Param Type | Param. Length |
122 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
124 | Last Param data should be padded for 32 bit alignment |
125 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
129 * Type 0, IPv4 sync connection format
131 struct ip_vs_sync_v4
{
133 __u8 protocol
; /* Which protocol (TCP/UDP) */
134 __be16 ver_size
; /* Version msb 4 bits */
135 /* Flags and state transition */
136 __be32 flags
; /* status flags */
137 __be16 state
; /* state info */
138 /* Protocol, addresses and port numbers */
142 __be32 fwmark
; /* Firewall mark from skb */
143 __be32 timeout
; /* cp timeout */
144 __be32 caddr
; /* client address */
145 __be32 vaddr
; /* virtual address */
146 __be32 daddr
; /* destination address */
147 /* The sequence options start here */
148 /* PE data padded to 32bit alignment after seq. options */
151 * Type 2 messages IPv6
153 struct ip_vs_sync_v6
{
155 __u8 protocol
; /* Which protocol (TCP/UDP) */
156 __be16 ver_size
; /* Version msb 4 bits */
157 /* Flags and state transition */
158 __be32 flags
; /* status flags */
159 __be16 state
; /* state info */
160 /* Protocol, addresses and port numbers */
164 __be32 fwmark
; /* Firewall mark from skb */
165 __be32 timeout
; /* cp timeout */
166 struct in6_addr caddr
; /* client address */
167 struct in6_addr vaddr
; /* virtual address */
168 struct in6_addr daddr
; /* destination address */
169 /* The sequence options start here */
170 /* PE data padded to 32bit alignment after seq. options */
173 union ip_vs_sync_conn
{
174 struct ip_vs_sync_v4 v4
;
175 struct ip_vs_sync_v6 v6
;
178 /* Bits in Type field in above */
179 #define STYPE_INET6 0
180 #define STYPE_F_INET6 (1 << STYPE_INET6)
182 #define SVER_SHIFT 12 /* Shift to get version */
183 #define SVER_MASK 0x0fff /* Mask to strip version */
185 #define IPVS_OPT_SEQ_DATA 1
186 #define IPVS_OPT_PE_DATA 2
187 #define IPVS_OPT_PE_NAME 3
188 #define IPVS_OPT_PARAM 7
190 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
191 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
192 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
193 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
195 struct ip_vs_sync_thread_data
{
202 /* Version 0 definition of packet sizes */
203 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
204 #define FULL_CONN_SIZE \
205 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
209 The master multicasts messages (Datagrams) to the backup load balancers
210 in the following format.
213 Note, first byte should be Zero, so ver 0 receivers will drop the packet.
216 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
217 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
218 | 0 | SyncID | Size |
219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
220 | Count Conns | Version | Reserved, set to Zero |
221 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
223 | IPVS Sync Connection (1) |
224 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
228 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
230 | IPVS Sync Connection (n) |
231 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
235 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
236 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
237 | Count Conns | SyncID | Size |
238 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
239 | IPVS Sync Connection (1) |
242 #define SYNC_MESG_HEADER_LEN 4
243 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
245 /* Version 0 header */
246 struct ip_vs_sync_mesg_v0
{
251 /* ip_vs_sync_conn entries start here */
254 /* Version 1 header */
255 struct ip_vs_sync_mesg
{
256 __u8 reserved
; /* must be zero */
260 __s8 version
; /* SYNC_PROTO_VER */
262 /* ip_vs_sync_conn entries start here */
265 struct ip_vs_sync_buff
{
266 struct list_head list
;
267 unsigned long firstuse
;
269 /* pointers for the message data */
270 struct ip_vs_sync_mesg
*mesg
;
276 * Copy of struct ip_vs_seq
277 * From unaligned network order to aligned host order
279 static void ntoh_seq(struct ip_vs_seq
*no
, struct ip_vs_seq
*ho
)
281 ho
->init_seq
= get_unaligned_be32(&no
->init_seq
);
282 ho
->delta
= get_unaligned_be32(&no
->delta
);
283 ho
->previous_delta
= get_unaligned_be32(&no
->previous_delta
);
287 * Copy of struct ip_vs_seq
288 * From Aligned host order to unaligned network order
290 static void hton_seq(struct ip_vs_seq
*ho
, struct ip_vs_seq
*no
)
292 put_unaligned_be32(ho
->init_seq
, &no
->init_seq
);
293 put_unaligned_be32(ho
->delta
, &no
->delta
);
294 put_unaligned_be32(ho
->previous_delta
, &no
->previous_delta
);
297 static inline struct ip_vs_sync_buff
*
298 sb_dequeue(struct netns_ipvs
*ipvs
, struct ipvs_master_sync_state
*ms
)
300 struct ip_vs_sync_buff
*sb
;
302 spin_lock_bh(&ipvs
->sync_lock
);
303 if (list_empty(&ms
->sync_queue
)) {
305 __set_current_state(TASK_INTERRUPTIBLE
);
307 sb
= list_entry(ms
->sync_queue
.next
, struct ip_vs_sync_buff
,
310 ms
->sync_queue_len
--;
311 if (!ms
->sync_queue_len
)
312 ms
->sync_queue_delay
= 0;
314 spin_unlock_bh(&ipvs
->sync_lock
);
320 * Create a new sync buffer for Version 1 proto.
322 static inline struct ip_vs_sync_buff
*
323 ip_vs_sync_buff_create(struct netns_ipvs
*ipvs
)
325 struct ip_vs_sync_buff
*sb
;
327 if (!(sb
=kmalloc(sizeof(struct ip_vs_sync_buff
), GFP_ATOMIC
)))
330 sb
->mesg
= kmalloc(ipvs
->send_mesg_maxlen
, GFP_ATOMIC
);
335 sb
->mesg
->reserved
= 0; /* old nr_conns i.e. must be zero now */
336 sb
->mesg
->version
= SYNC_PROTO_VER
;
337 sb
->mesg
->syncid
= ipvs
->master_syncid
;
338 sb
->mesg
->size
= htons(sizeof(struct ip_vs_sync_mesg
));
339 sb
->mesg
->nr_conns
= 0;
341 sb
->head
= (unsigned char *)sb
->mesg
+ sizeof(struct ip_vs_sync_mesg
);
342 sb
->end
= (unsigned char *)sb
->mesg
+ ipvs
->send_mesg_maxlen
;
344 sb
->firstuse
= jiffies
;
348 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff
*sb
)
354 static inline void sb_queue_tail(struct netns_ipvs
*ipvs
,
355 struct ipvs_master_sync_state
*ms
)
357 struct ip_vs_sync_buff
*sb
= ms
->sync_buff
;
359 spin_lock(&ipvs
->sync_lock
);
360 if (ipvs
->sync_state
& IP_VS_STATE_MASTER
&&
361 ms
->sync_queue_len
< sysctl_sync_qlen_max(ipvs
)) {
362 if (!ms
->sync_queue_len
)
363 schedule_delayed_work(&ms
->master_wakeup_work
,
364 max(IPVS_SYNC_SEND_DELAY
, 1));
365 ms
->sync_queue_len
++;
366 list_add_tail(&sb
->list
, &ms
->sync_queue
);
367 if ((++ms
->sync_queue_delay
) == IPVS_SYNC_WAKEUP_RATE
)
368 wake_up_process(ms
->master_thread
);
370 ip_vs_sync_buff_release(sb
);
371 spin_unlock(&ipvs
->sync_lock
);
375 * Get the current sync buffer if it has been created for more
376 * than the specified time or the specified time is zero.
378 static inline struct ip_vs_sync_buff
*
379 get_curr_sync_buff(struct netns_ipvs
*ipvs
, struct ipvs_master_sync_state
*ms
,
382 struct ip_vs_sync_buff
*sb
;
384 spin_lock_bh(&ipvs
->sync_buff_lock
);
386 if (sb
&& time_after_eq(jiffies
- sb
->firstuse
, time
)) {
387 ms
->sync_buff
= NULL
;
388 __set_current_state(TASK_RUNNING
);
391 spin_unlock_bh(&ipvs
->sync_buff_lock
);
396 select_master_thread_id(struct netns_ipvs
*ipvs
, struct ip_vs_conn
*cp
)
398 return ((long) cp
>> (1 + ilog2(sizeof(*cp
)))) & ipvs
->threads_mask
;
402 * Create a new sync buffer for Version 0 proto.
404 static inline struct ip_vs_sync_buff
*
405 ip_vs_sync_buff_create_v0(struct netns_ipvs
*ipvs
)
407 struct ip_vs_sync_buff
*sb
;
408 struct ip_vs_sync_mesg_v0
*mesg
;
410 if (!(sb
=kmalloc(sizeof(struct ip_vs_sync_buff
), GFP_ATOMIC
)))
413 sb
->mesg
= kmalloc(ipvs
->send_mesg_maxlen
, GFP_ATOMIC
);
418 mesg
= (struct ip_vs_sync_mesg_v0
*)sb
->mesg
;
420 mesg
->syncid
= ipvs
->master_syncid
;
421 mesg
->size
= htons(sizeof(struct ip_vs_sync_mesg_v0
));
422 sb
->head
= (unsigned char *)mesg
+ sizeof(struct ip_vs_sync_mesg_v0
);
423 sb
->end
= (unsigned char *)mesg
+ ipvs
->send_mesg_maxlen
;
424 sb
->firstuse
= jiffies
;
428 /* Check if connection is controlled by persistence */
429 static inline bool in_persistence(struct ip_vs_conn
*cp
)
431 for (cp
= cp
->control
; cp
; cp
= cp
->control
) {
432 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
)
438 /* Check if conn should be synced.
439 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
440 * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
441 * sync_retries times with period of sync_refresh_period/8
442 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
443 * for state changes or only once when pkts matches sync_threshold
444 * - (3) templates: rate can be reduced only with sync_refresh_period or
447 static int ip_vs_sync_conn_needed(struct netns_ipvs
*ipvs
,
448 struct ip_vs_conn
*cp
, int pkts
)
450 unsigned long orig
= ACCESS_ONCE(cp
->sync_endtime
);
451 unsigned long now
= jiffies
;
452 unsigned long n
= (now
+ cp
->timeout
) & ~3UL;
453 unsigned int sync_refresh_period
;
457 /* Check if we sync in current state */
458 if (unlikely(cp
->flags
& IP_VS_CONN_F_TEMPLATE
))
460 else if (unlikely(sysctl_sync_persist_mode(ipvs
) && in_persistence(cp
)))
462 else if (likely(cp
->protocol
== IPPROTO_TCP
)) {
463 if (!((1 << cp
->state
) &
464 ((1 << IP_VS_TCP_S_ESTABLISHED
) |
465 (1 << IP_VS_TCP_S_FIN_WAIT
) |
466 (1 << IP_VS_TCP_S_CLOSE
) |
467 (1 << IP_VS_TCP_S_CLOSE_WAIT
) |
468 (1 << IP_VS_TCP_S_TIME_WAIT
))))
470 force
= cp
->state
!= cp
->old_state
;
471 if (force
&& cp
->state
!= IP_VS_TCP_S_ESTABLISHED
)
473 } else if (unlikely(cp
->protocol
== IPPROTO_SCTP
)) {
474 if (!((1 << cp
->state
) &
475 ((1 << IP_VS_SCTP_S_ESTABLISHED
) |
476 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT
) |
477 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED
) |
478 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
) |
479 (1 << IP_VS_SCTP_S_CLOSED
))))
481 force
= cp
->state
!= cp
->old_state
;
482 if (force
&& cp
->state
!= IP_VS_SCTP_S_ESTABLISHED
)
485 /* UDP or another protocol with single state */
489 sync_refresh_period
= sysctl_sync_refresh_period(ipvs
);
490 if (sync_refresh_period
> 0) {
491 long diff
= n
- orig
;
492 long min_diff
= max(cp
->timeout
>> 1, 10UL * HZ
);
494 /* Avoid sync if difference is below sync_refresh_period
495 * and below the half timeout.
497 if (abs(diff
) < min_t(long, sync_refresh_period
, min_diff
)) {
498 int retries
= orig
& 3;
500 if (retries
>= sysctl_sync_retries(ipvs
))
502 if (time_before(now
, orig
- cp
->timeout
+
503 (sync_refresh_period
>> 3)))
508 sync_period
= sysctl_sync_period(ipvs
);
509 if (sync_period
> 0) {
510 if (!(cp
->flags
& IP_VS_CONN_F_TEMPLATE
) &&
511 pkts
% sync_period
!= sysctl_sync_threshold(ipvs
))
513 } else if (sync_refresh_period
<= 0 &&
514 pkts
!= sysctl_sync_threshold(ipvs
))
518 cp
->old_state
= cp
->state
;
519 n
= cmpxchg(&cp
->sync_endtime
, orig
, n
);
520 return n
== orig
|| force
;
524 * Version 0 , could be switched in by sys_ctl.
525 * Add an ip_vs_conn information into the current sync_buff.
527 static void ip_vs_sync_conn_v0(struct net
*net
, struct ip_vs_conn
*cp
,
530 struct netns_ipvs
*ipvs
= net_ipvs(net
);
531 struct ip_vs_sync_mesg_v0
*m
;
532 struct ip_vs_sync_conn_v0
*s
;
533 struct ip_vs_sync_buff
*buff
;
534 struct ipvs_master_sync_state
*ms
;
538 if (unlikely(cp
->af
!= AF_INET
))
540 /* Do not sync ONE PACKET */
541 if (cp
->flags
& IP_VS_CONN_F_ONE_PACKET
)
544 if (!ip_vs_sync_conn_needed(ipvs
, cp
, pkts
))
547 spin_lock_bh(&ipvs
->sync_buff_lock
);
548 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
)) {
549 spin_unlock_bh(&ipvs
->sync_buff_lock
);
553 id
= select_master_thread_id(ipvs
, cp
);
555 buff
= ms
->sync_buff
;
557 m
= (struct ip_vs_sync_mesg_v0
*) buff
->mesg
;
558 /* Send buffer if it is for v1 */
560 sb_queue_tail(ipvs
, ms
);
561 ms
->sync_buff
= NULL
;
566 buff
= ip_vs_sync_buff_create_v0(ipvs
);
568 spin_unlock_bh(&ipvs
->sync_buff_lock
);
569 pr_err("ip_vs_sync_buff_create failed.\n");
572 ms
->sync_buff
= buff
;
575 len
= (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) ? FULL_CONN_SIZE
:
577 m
= (struct ip_vs_sync_mesg_v0
*) buff
->mesg
;
578 s
= (struct ip_vs_sync_conn_v0
*) buff
->head
;
582 s
->protocol
= cp
->protocol
;
583 s
->cport
= cp
->cport
;
584 s
->vport
= cp
->vport
;
585 s
->dport
= cp
->dport
;
586 s
->caddr
= cp
->caddr
.ip
;
587 s
->vaddr
= cp
->vaddr
.ip
;
588 s
->daddr
= cp
->daddr
.ip
;
589 s
->flags
= htons(cp
->flags
& ~IP_VS_CONN_F_HASHED
);
590 s
->state
= htons(cp
->state
);
591 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) {
592 struct ip_vs_sync_conn_options
*opt
=
593 (struct ip_vs_sync_conn_options
*)&s
[1];
594 memcpy(opt
, &cp
->in_seq
, sizeof(*opt
));
598 m
->size
= htons(ntohs(m
->size
) + len
);
601 /* check if there is a space for next one */
602 if (buff
->head
+ FULL_CONN_SIZE
> buff
->end
) {
603 sb_queue_tail(ipvs
, ms
);
604 ms
->sync_buff
= NULL
;
606 spin_unlock_bh(&ipvs
->sync_buff_lock
);
608 /* synchronize its controller if it has */
611 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
)
612 pkts
= atomic_add_return(1, &cp
->in_pkts
);
614 pkts
= sysctl_sync_threshold(ipvs
);
615 ip_vs_sync_conn(net
, cp
, pkts
);
620 * Add an ip_vs_conn information into the current sync_buff.
621 * Called by ip_vs_in.
622 * Sending Version 1 messages
624 void ip_vs_sync_conn(struct net
*net
, struct ip_vs_conn
*cp
, int pkts
)
626 struct netns_ipvs
*ipvs
= net_ipvs(net
);
627 struct ip_vs_sync_mesg
*m
;
628 union ip_vs_sync_conn
*s
;
629 struct ip_vs_sync_buff
*buff
;
630 struct ipvs_master_sync_state
*ms
;
633 unsigned int len
, pe_name_len
, pad
;
635 /* Handle old version of the protocol */
636 if (sysctl_sync_ver(ipvs
) == 0) {
637 ip_vs_sync_conn_v0(net
, cp
, pkts
);
640 /* Do not sync ONE PACKET */
641 if (cp
->flags
& IP_VS_CONN_F_ONE_PACKET
)
644 if (!ip_vs_sync_conn_needed(ipvs
, cp
, pkts
))
649 if (cp
->pe_data_len
) {
650 if (!cp
->pe_data
|| !cp
->dest
) {
651 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
654 pe_name_len
= strnlen(cp
->pe
->name
, IP_VS_PENAME_MAXLEN
);
657 spin_lock_bh(&ipvs
->sync_buff_lock
);
658 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
)) {
659 spin_unlock_bh(&ipvs
->sync_buff_lock
);
663 id
= select_master_thread_id(ipvs
, cp
);
666 #ifdef CONFIG_IP_VS_IPV6
667 if (cp
->af
== AF_INET6
)
668 len
= sizeof(struct ip_vs_sync_v6
);
671 len
= sizeof(struct ip_vs_sync_v4
);
673 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
)
674 len
+= sizeof(struct ip_vs_sync_conn_options
) + 2;
677 len
+= cp
->pe_data_len
+ 2; /* + Param hdr field */
679 len
+= pe_name_len
+ 2;
681 /* check if there is a space for this one */
683 buff
= ms
->sync_buff
;
686 pad
= (4 - (size_t) buff
->head
) & 3;
687 /* Send buffer if it is for v0 */
688 if (buff
->head
+ len
+ pad
> buff
->end
|| m
->reserved
) {
689 sb_queue_tail(ipvs
, ms
);
690 ms
->sync_buff
= NULL
;
697 buff
= ip_vs_sync_buff_create(ipvs
);
699 spin_unlock_bh(&ipvs
->sync_buff_lock
);
700 pr_err("ip_vs_sync_buff_create failed.\n");
703 ms
->sync_buff
= buff
;
708 buff
->head
+= pad
+ len
;
709 m
->size
= htons(ntohs(m
->size
) + pad
+ len
);
710 /* Add ev. padding from prev. sync_conn */
714 s
= (union ip_vs_sync_conn
*)p
;
716 /* Set message type & copy members */
717 s
->v4
.type
= (cp
->af
== AF_INET6
? STYPE_F_INET6
: 0);
718 s
->v4
.ver_size
= htons(len
& SVER_MASK
); /* Version 0 */
719 s
->v4
.flags
= htonl(cp
->flags
& ~IP_VS_CONN_F_HASHED
);
720 s
->v4
.state
= htons(cp
->state
);
721 s
->v4
.protocol
= cp
->protocol
;
722 s
->v4
.cport
= cp
->cport
;
723 s
->v4
.vport
= cp
->vport
;
724 s
->v4
.dport
= cp
->dport
;
725 s
->v4
.fwmark
= htonl(cp
->fwmark
);
726 s
->v4
.timeout
= htonl(cp
->timeout
/ HZ
);
729 #ifdef CONFIG_IP_VS_IPV6
730 if (cp
->af
== AF_INET6
) {
731 p
+= sizeof(struct ip_vs_sync_v6
);
732 s
->v6
.caddr
= cp
->caddr
.in6
;
733 s
->v6
.vaddr
= cp
->vaddr
.in6
;
734 s
->v6
.daddr
= cp
->daddr
.in6
;
738 p
+= sizeof(struct ip_vs_sync_v4
); /* options ptr */
739 s
->v4
.caddr
= cp
->caddr
.ip
;
740 s
->v4
.vaddr
= cp
->vaddr
.ip
;
741 s
->v4
.daddr
= cp
->daddr
.ip
;
743 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) {
744 *(p
++) = IPVS_OPT_SEQ_DATA
;
745 *(p
++) = sizeof(struct ip_vs_sync_conn_options
);
746 hton_seq((struct ip_vs_seq
*)p
, &cp
->in_seq
);
747 p
+= sizeof(struct ip_vs_seq
);
748 hton_seq((struct ip_vs_seq
*)p
, &cp
->out_seq
);
749 p
+= sizeof(struct ip_vs_seq
);
752 if (cp
->pe_data_len
&& cp
->pe_data
) {
753 *(p
++) = IPVS_OPT_PE_DATA
;
754 *(p
++) = cp
->pe_data_len
;
755 memcpy(p
, cp
->pe_data
, cp
->pe_data_len
);
756 p
+= cp
->pe_data_len
;
759 *(p
++) = IPVS_OPT_PE_NAME
;
760 *(p
++) = pe_name_len
;
761 memcpy(p
, cp
->pe
->name
, pe_name_len
);
766 spin_unlock_bh(&ipvs
->sync_buff_lock
);
769 /* synchronize its controller if it has */
773 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
)
774 pkts
= atomic_add_return(1, &cp
->in_pkts
);
776 pkts
= sysctl_sync_threshold(ipvs
);
781 * fill_param used by version 1
784 ip_vs_conn_fill_param_sync(struct net
*net
, int af
, union ip_vs_sync_conn
*sc
,
785 struct ip_vs_conn_param
*p
,
786 __u8
*pe_data
, unsigned int pe_data_len
,
787 __u8
*pe_name
, unsigned int pe_name_len
)
789 #ifdef CONFIG_IP_VS_IPV6
791 ip_vs_conn_fill_param(net
, af
, sc
->v6
.protocol
,
792 (const union nf_inet_addr
*)&sc
->v6
.caddr
,
794 (const union nf_inet_addr
*)&sc
->v6
.vaddr
,
798 ip_vs_conn_fill_param(net
, af
, sc
->v4
.protocol
,
799 (const union nf_inet_addr
*)&sc
->v4
.caddr
,
801 (const union nf_inet_addr
*)&sc
->v4
.vaddr
,
806 char buff
[IP_VS_PENAME_MAXLEN
+1];
808 memcpy(buff
, pe_name
, pe_name_len
);
810 p
->pe
= __ip_vs_pe_getbyname(buff
);
812 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
817 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
821 p
->pe_data
= kmemdup(pe_data
, pe_data_len
, GFP_ATOMIC
);
823 module_put(p
->pe
->module
);
826 p
->pe_data_len
= pe_data_len
;
832 * Connection Add / Update.
833 * Common for version 0 and 1 reception of backup sync_conns.
837 static void ip_vs_proc_conn(struct net
*net
, struct ip_vs_conn_param
*param
,
838 unsigned int flags
, unsigned int state
,
839 unsigned int protocol
, unsigned int type
,
840 const union nf_inet_addr
*daddr
, __be16 dport
,
841 unsigned long timeout
, __u32 fwmark
,
842 struct ip_vs_sync_conn_options
*opt
)
844 struct ip_vs_dest
*dest
;
845 struct ip_vs_conn
*cp
;
846 struct netns_ipvs
*ipvs
= net_ipvs(net
);
848 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
849 cp
= ip_vs_conn_in_get(param
);
850 if (cp
&& ((cp
->dport
!= dport
) ||
851 !ip_vs_addr_equal(cp
->daf
, &cp
->daddr
, daddr
))) {
852 if (!(flags
& IP_VS_CONN_F_INACTIVE
)) {
853 ip_vs_conn_expire_now(cp
);
854 __ip_vs_conn_put(cp
);
857 /* This is the expiration message for the
858 * connection that was already replaced, so we
861 __ip_vs_conn_put(cp
);
862 kfree(param
->pe_data
);
867 cp
= ip_vs_ct_in_get(param
);
872 kfree(param
->pe_data
);
875 spin_lock_bh(&cp
->lock
);
876 if ((cp
->flags
^ flags
) & IP_VS_CONN_F_INACTIVE
&&
877 !(flags
& IP_VS_CONN_F_TEMPLATE
) && dest
) {
878 if (flags
& IP_VS_CONN_F_INACTIVE
) {
879 atomic_dec(&dest
->activeconns
);
880 atomic_inc(&dest
->inactconns
);
882 atomic_inc(&dest
->activeconns
);
883 atomic_dec(&dest
->inactconns
);
886 flags
&= IP_VS_CONN_F_BACKUP_UPD_MASK
;
887 flags
|= cp
->flags
& ~IP_VS_CONN_F_BACKUP_UPD_MASK
;
889 spin_unlock_bh(&cp
->lock
);
891 ip_vs_try_bind_dest(cp
);
894 * Find the appropriate destination for the connection.
895 * If it is not found the connection will remain unbound
899 /* This function is only invoked by the synchronization
900 * code. We do not currently support heterogeneous pools
901 * with synchronization, so we can make the assumption that
902 * the svc_af is the same as the dest_af
904 dest
= ip_vs_find_dest(net
, type
, type
, daddr
, dport
,
905 param
->vaddr
, param
->vport
, protocol
,
908 cp
= ip_vs_conn_new(param
, type
, daddr
, dport
, flags
, dest
,
912 kfree(param
->pe_data
);
913 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
916 if (!(flags
& IP_VS_CONN_F_TEMPLATE
))
917 kfree(param
->pe_data
);
921 memcpy(&cp
->in_seq
, opt
, sizeof(*opt
));
922 atomic_set(&cp
->in_pkts
, sysctl_sync_threshold(ipvs
));
924 cp
->old_state
= cp
->state
;
926 * For Ver 0 messages style
927 * - Not possible to recover the right timeout for templates
928 * - can not find the right fwmark
929 * virtual service. If needed, we can do it for
930 * non-fwmark persistent services.
931 * Ver 1 messages style.
935 if (timeout
> MAX_SCHEDULE_TIMEOUT
/ HZ
)
936 timeout
= MAX_SCHEDULE_TIMEOUT
/ HZ
;
937 cp
->timeout
= timeout
*HZ
;
939 struct ip_vs_proto_data
*pd
;
941 pd
= ip_vs_proto_data_get(net
, protocol
);
942 if (!(flags
& IP_VS_CONN_F_TEMPLATE
) && pd
&& pd
->timeout_table
)
943 cp
->timeout
= pd
->timeout_table
[state
];
945 cp
->timeout
= (3*60*HZ
);
951 * Process received multicast message for Version 0
953 static void ip_vs_process_message_v0(struct net
*net
, const char *buffer
,
956 struct ip_vs_sync_mesg_v0
*m
= (struct ip_vs_sync_mesg_v0
*)buffer
;
957 struct ip_vs_sync_conn_v0
*s
;
958 struct ip_vs_sync_conn_options
*opt
;
959 struct ip_vs_protocol
*pp
;
960 struct ip_vs_conn_param param
;
964 p
= (char *)buffer
+ sizeof(struct ip_vs_sync_mesg_v0
);
965 for (i
=0; i
<m
->nr_conns
; i
++) {
966 unsigned int flags
, state
;
968 if (p
+ SIMPLE_CONN_SIZE
> buffer
+buflen
) {
969 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
972 s
= (struct ip_vs_sync_conn_v0
*) p
;
973 flags
= ntohs(s
->flags
) | IP_VS_CONN_F_SYNC
;
974 flags
&= ~IP_VS_CONN_F_HASHED
;
975 if (flags
& IP_VS_CONN_F_SEQ_MASK
) {
976 opt
= (struct ip_vs_sync_conn_options
*)&s
[1];
978 if (p
> buffer
+buflen
) {
979 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
984 p
+= SIMPLE_CONN_SIZE
;
987 state
= ntohs(s
->state
);
988 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
989 pp
= ip_vs_proto_get(s
->protocol
);
991 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
995 if (state
>= pp
->num_states
) {
996 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
1001 /* protocol in templates is not used for state/timeout */
1003 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
1009 ip_vs_conn_fill_param(net
, AF_INET
, s
->protocol
,
1010 (const union nf_inet_addr
*)&s
->caddr
,
1012 (const union nf_inet_addr
*)&s
->vaddr
,
1015 /* Send timeout as Zero */
1016 ip_vs_proc_conn(net
, ¶m
, flags
, state
, s
->protocol
, AF_INET
,
1017 (union nf_inet_addr
*)&s
->daddr
, s
->dport
,
1025 static inline int ip_vs_proc_seqopt(__u8
*p
, unsigned int plen
,
1027 struct ip_vs_sync_conn_options
*opt
)
1029 struct ip_vs_sync_conn_options
*topt
;
1031 topt
= (struct ip_vs_sync_conn_options
*)p
;
1033 if (plen
!= sizeof(struct ip_vs_sync_conn_options
)) {
1034 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1037 if (*opt_flags
& IPVS_OPT_F_SEQ_DATA
) {
1038 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1041 ntoh_seq(&topt
->in_seq
, &opt
->in_seq
);
1042 ntoh_seq(&topt
->out_seq
, &opt
->out_seq
);
1043 *opt_flags
|= IPVS_OPT_F_SEQ_DATA
;
1047 static int ip_vs_proc_str(__u8
*p
, unsigned int plen
, unsigned int *data_len
,
1048 __u8
**data
, unsigned int maxlen
,
1049 __u32
*opt_flags
, __u32 flag
)
1051 if (plen
> maxlen
) {
1052 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen
);
1055 if (*opt_flags
& flag
) {
1056 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag
);
1065 * Process a Version 1 sync. connection
1067 static inline int ip_vs_proc_sync_conn(struct net
*net
, __u8
*p
, __u8
*msg_end
)
1069 struct ip_vs_sync_conn_options opt
;
1070 union ip_vs_sync_conn
*s
;
1071 struct ip_vs_protocol
*pp
;
1072 struct ip_vs_conn_param param
;
1074 unsigned int af
, state
, pe_data_len
=0, pe_name_len
=0;
1075 __u8
*pe_data
=NULL
, *pe_name
=NULL
;
1079 s
= (union ip_vs_sync_conn
*) p
;
1081 if (s
->v6
.type
& STYPE_F_INET6
) {
1082 #ifdef CONFIG_IP_VS_IPV6
1084 p
+= sizeof(struct ip_vs_sync_v6
);
1086 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
1090 } else if (!s
->v4
.type
) {
1092 p
+= sizeof(struct ip_vs_sync_v4
);
1099 /* Process optional params check Type & Len. */
1100 while (p
< msg_end
) {
1109 if (!plen
|| ((p
+ plen
) > msg_end
))
1111 /* Handle seq option p = param data */
1112 switch (ptype
& ~IPVS_OPT_F_PARAM
) {
1113 case IPVS_OPT_SEQ_DATA
:
1114 if (ip_vs_proc_seqopt(p
, plen
, &opt_flags
, &opt
))
1118 case IPVS_OPT_PE_DATA
:
1119 if (ip_vs_proc_str(p
, plen
, &pe_data_len
, &pe_data
,
1120 IP_VS_PEDATA_MAXLEN
, &opt_flags
,
1121 IPVS_OPT_F_PE_DATA
))
1125 case IPVS_OPT_PE_NAME
:
1126 if (ip_vs_proc_str(p
, plen
,&pe_name_len
, &pe_name
,
1127 IP_VS_PENAME_MAXLEN
, &opt_flags
,
1128 IPVS_OPT_F_PE_NAME
))
1133 /* Param data mandatory ? */
1134 if (!(ptype
& IPVS_OPT_F_PARAM
)) {
1135 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1136 ptype
& ~IPVS_OPT_F_PARAM
);
1141 p
+= plen
; /* Next option */
1144 /* Get flags and Mask off unsupported */
1145 flags
= ntohl(s
->v4
.flags
) & IP_VS_CONN_F_BACKUP_MASK
;
1146 flags
|= IP_VS_CONN_F_SYNC
;
1147 state
= ntohs(s
->v4
.state
);
1149 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
1150 pp
= ip_vs_proto_get(s
->v4
.protocol
);
1152 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1157 if (state
>= pp
->num_states
) {
1158 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1164 /* protocol in templates is not used for state/timeout */
1166 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
1171 if (ip_vs_conn_fill_param_sync(net
, af
, s
, ¶m
, pe_data
,
1172 pe_data_len
, pe_name
, pe_name_len
)) {
1176 /* If only IPv4, just silent skip IPv6 */
1178 ip_vs_proc_conn(net
, ¶m
, flags
, state
, s
->v4
.protocol
, af
,
1179 (union nf_inet_addr
*)&s
->v4
.daddr
, s
->v4
.dport
,
1180 ntohl(s
->v4
.timeout
), ntohl(s
->v4
.fwmark
),
1181 (opt_flags
& IPVS_OPT_F_SEQ_DATA
? &opt
: NULL
)
1183 #ifdef CONFIG_IP_VS_IPV6
1185 ip_vs_proc_conn(net
, ¶m
, flags
, state
, s
->v6
.protocol
, af
,
1186 (union nf_inet_addr
*)&s
->v6
.daddr
, s
->v6
.dport
,
1187 ntohl(s
->v6
.timeout
), ntohl(s
->v6
.fwmark
),
1188 (opt_flags
& IPVS_OPT_F_SEQ_DATA
? &opt
: NULL
)
1191 ip_vs_pe_put(param
.pe
);
1195 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc
);
1200 * Process received multicast message and create the corresponding
1201 * ip_vs_conn entries.
1202 * Handles Version 0 & 1
1204 static void ip_vs_process_message(struct net
*net
, __u8
*buffer
,
1205 const size_t buflen
)
1207 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1208 struct ip_vs_sync_mesg
*m2
= (struct ip_vs_sync_mesg
*)buffer
;
1212 if (buflen
< sizeof(struct ip_vs_sync_mesg_v0
)) {
1213 IP_VS_DBG(2, "BACKUP, message header too short\n");
1217 if (buflen
!= ntohs(m2
->size
)) {
1218 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1221 /* SyncID sanity check */
1222 if (ipvs
->backup_syncid
!= 0 && m2
->syncid
!= ipvs
->backup_syncid
) {
1223 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2
->syncid
);
1226 /* Handle version 1 message */
1227 if ((m2
->version
== SYNC_PROTO_VER
) && (m2
->reserved
== 0)
1228 && (m2
->spare
== 0)) {
1230 msg_end
= buffer
+ sizeof(struct ip_vs_sync_mesg
);
1231 nr_conns
= m2
->nr_conns
;
1233 for (i
=0; i
<nr_conns
; i
++) {
1234 union ip_vs_sync_conn
*s
;
1239 if (p
+ sizeof(s
->v4
) > buffer
+buflen
) {
1240 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1243 s
= (union ip_vs_sync_conn
*)p
;
1244 size
= ntohs(s
->v4
.ver_size
) & SVER_MASK
;
1246 /* Basic sanity checks */
1247 if (msg_end
> buffer
+buflen
) {
1248 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1251 if (ntohs(s
->v4
.ver_size
) >> SVER_SHIFT
) {
1252 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1253 ntohs(s
->v4
.ver_size
) >> SVER_SHIFT
);
1256 /* Process a single sync_conn */
1257 retc
= ip_vs_proc_sync_conn(net
, p
, msg_end
);
1259 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1263 /* Make sure we have 32 bit alignment */
1264 msg_end
= p
+ ((size
+ 3) & ~3);
1267 /* Old type of message */
1268 ip_vs_process_message_v0(net
, buffer
, buflen
);
1275 * Setup sndbuf (mode=1) or rcvbuf (mode=0)
1277 static void set_sock_size(struct sock
*sk
, int mode
, int val
)
1279 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
1280 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
1283 val
= clamp_t(int, val
, (SOCK_MIN_SNDBUF
+ 1) / 2,
1285 sk
->sk_sndbuf
= val
* 2;
1286 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
1288 val
= clamp_t(int, val
, (SOCK_MIN_RCVBUF
+ 1) / 2,
1290 sk
->sk_rcvbuf
= val
* 2;
1291 sk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
1297 * Setup loopback of outgoing multicasts on a sending socket
1299 static void set_mcast_loop(struct sock
*sk
, u_char loop
)
1301 struct inet_sock
*inet
= inet_sk(sk
);
1303 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
1305 inet
->mc_loop
= loop
? 1 : 0;
1310 * Specify TTL for outgoing multicasts on a sending socket
1312 static void set_mcast_ttl(struct sock
*sk
, u_char ttl
)
1314 struct inet_sock
*inet
= inet_sk(sk
);
1316 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
1323 * Specifiy default interface for outgoing multicasts
1325 static int set_mcast_if(struct sock
*sk
, char *ifname
)
1327 struct net_device
*dev
;
1328 struct inet_sock
*inet
= inet_sk(sk
);
1329 struct net
*net
= sock_net(sk
);
1331 dev
= __dev_get_by_name(net
, ifname
);
1335 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1339 inet
->mc_index
= dev
->ifindex
;
1340 /* inet->mc_addr = 0; */
1348 * Set the maximum length of sync message according to the
1349 * specified interface's MTU.
1351 static int set_sync_mesg_maxlen(struct net
*net
, int sync_state
)
1353 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1354 struct net_device
*dev
;
1357 if (sync_state
== IP_VS_STATE_MASTER
) {
1358 dev
= __dev_get_by_name(net
, ipvs
->master_mcast_ifn
);
1362 num
= (dev
->mtu
- sizeof(struct iphdr
) -
1363 sizeof(struct udphdr
) -
1364 SYNC_MESG_HEADER_LEN
- 20) / SIMPLE_CONN_SIZE
;
1365 ipvs
->send_mesg_maxlen
= SYNC_MESG_HEADER_LEN
+
1366 SIMPLE_CONN_SIZE
* min(num
, MAX_CONNS_PER_SYNCBUFF
);
1367 IP_VS_DBG(7, "setting the maximum length of sync sending "
1368 "message %d.\n", ipvs
->send_mesg_maxlen
);
1369 } else if (sync_state
== IP_VS_STATE_BACKUP
) {
1370 dev
= __dev_get_by_name(net
, ipvs
->backup_mcast_ifn
);
1374 ipvs
->recv_mesg_maxlen
= dev
->mtu
-
1375 sizeof(struct iphdr
) - sizeof(struct udphdr
);
1376 IP_VS_DBG(7, "setting the maximum length of sync receiving "
1377 "message %d.\n", ipvs
->recv_mesg_maxlen
);
1385 * Join a multicast group.
1386 * the group is specified by a class D multicast address 224.0.0.0/8
1387 * in the in_addr structure passed in as a parameter.
1390 join_mcast_group(struct sock
*sk
, struct in_addr
*addr
, char *ifname
)
1392 struct net
*net
= sock_net(sk
);
1393 struct ip_mreqn mreq
;
1394 struct net_device
*dev
;
1397 memset(&mreq
, 0, sizeof(mreq
));
1398 memcpy(&mreq
.imr_multiaddr
, addr
, sizeof(struct in_addr
));
1400 dev
= __dev_get_by_name(net
, ifname
);
1403 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1406 mreq
.imr_ifindex
= dev
->ifindex
;
1409 ret
= ip_mc_join_group(sk
, &mreq
);
1416 static int bind_mcastif_addr(struct socket
*sock
, char *ifname
)
1418 struct net
*net
= sock_net(sock
->sk
);
1419 struct net_device
*dev
;
1421 struct sockaddr_in sin
;
1423 dev
= __dev_get_by_name(net
, ifname
);
1427 addr
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
1429 pr_err("You probably need to specify IP address on "
1430 "multicast interface.\n");
1432 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1435 /* Now bind the socket with the address of multicast interface */
1436 sin
.sin_family
= AF_INET
;
1437 sin
.sin_addr
.s_addr
= addr
;
1440 return sock
->ops
->bind(sock
, (struct sockaddr
*)&sin
, sizeof(sin
));
1444 * Set up sending multicast socket over UDP
1446 static struct socket
*make_send_sock(struct net
*net
, int id
)
1448 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1449 /* multicast addr */
1450 struct sockaddr_in mcast_addr
= {
1451 .sin_family
= AF_INET
,
1452 .sin_port
= cpu_to_be16(IP_VS_SYNC_PORT
+ id
),
1453 .sin_addr
.s_addr
= cpu_to_be32(IP_VS_SYNC_GROUP
),
1455 struct socket
*sock
;
1458 /* First create a socket */
1459 result
= sock_create_kern(net
, PF_INET
, SOCK_DGRAM
, IPPROTO_UDP
, &sock
);
1461 pr_err("Error during creation of socket; terminating\n");
1462 return ERR_PTR(result
);
1464 result
= set_mcast_if(sock
->sk
, ipvs
->master_mcast_ifn
);
1466 pr_err("Error setting outbound mcast interface\n");
1470 set_mcast_loop(sock
->sk
, 0);
1471 set_mcast_ttl(sock
->sk
, 1);
1472 result
= sysctl_sync_sock_size(ipvs
);
1474 set_sock_size(sock
->sk
, 1, result
);
1476 result
= bind_mcastif_addr(sock
, ipvs
->master_mcast_ifn
);
1478 pr_err("Error binding address of the mcast interface\n");
1482 result
= sock
->ops
->connect(sock
, (struct sockaddr
*) &mcast_addr
,
1483 sizeof(struct sockaddr
), 0);
1485 pr_err("Error connecting to the multicast addr\n");
1493 return ERR_PTR(result
);
1498 * Set up receiving multicast socket over UDP
1500 static struct socket
*make_receive_sock(struct net
*net
, int id
)
1502 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1503 /* multicast addr */
1504 struct sockaddr_in mcast_addr
= {
1505 .sin_family
= AF_INET
,
1506 .sin_port
= cpu_to_be16(IP_VS_SYNC_PORT
+ id
),
1507 .sin_addr
.s_addr
= cpu_to_be32(IP_VS_SYNC_GROUP
),
1509 struct socket
*sock
;
1512 /* First create a socket */
1513 result
= sock_create_kern(net
, PF_INET
, SOCK_DGRAM
, IPPROTO_UDP
, &sock
);
1515 pr_err("Error during creation of socket; terminating\n");
1516 return ERR_PTR(result
);
1518 /* it is equivalent to the REUSEADDR option in user-space */
1519 sock
->sk
->sk_reuse
= SK_CAN_REUSE
;
1520 result
= sysctl_sync_sock_size(ipvs
);
1522 set_sock_size(sock
->sk
, 0, result
);
1524 result
= sock
->ops
->bind(sock
, (struct sockaddr
*) &mcast_addr
,
1525 sizeof(struct sockaddr
));
1527 pr_err("Error binding to the multicast addr\n");
1531 /* join the multicast group */
1532 result
= join_mcast_group(sock
->sk
,
1533 (struct in_addr
*) &mcast_addr
.sin_addr
,
1534 ipvs
->backup_mcast_ifn
);
1536 pr_err("Error joining to the multicast group\n");
1544 return ERR_PTR(result
);
1549 ip_vs_send_async(struct socket
*sock
, const char *buffer
, const size_t length
)
1551 struct msghdr msg
= {.msg_flags
= MSG_DONTWAIT
|MSG_NOSIGNAL
};
1556 iov
.iov_base
= (void *)buffer
;
1557 iov
.iov_len
= length
;
1559 len
= kernel_sendmsg(sock
, &msg
, &iov
, 1, (size_t)(length
));
1566 ip_vs_send_sync_msg(struct socket
*sock
, struct ip_vs_sync_mesg
*msg
)
1571 msize
= ntohs(msg
->size
);
1573 ret
= ip_vs_send_async(sock
, (char *)msg
, msize
);
1574 if (ret
>= 0 || ret
== -EAGAIN
)
1576 pr_err("ip_vs_send_async error %d\n", ret
);
1581 ip_vs_receive(struct socket
*sock
, char *buffer
, const size_t buflen
)
1583 struct msghdr msg
= {NULL
,};
1589 /* Receive a packet */
1590 iov
.iov_base
= buffer
;
1591 iov
.iov_len
= (size_t)buflen
;
1593 len
= kernel_recvmsg(sock
, &msg
, &iov
, 1, buflen
, MSG_DONTWAIT
);
1602 /* Wakeup the master thread for sending */
1603 static void master_wakeup_work_handler(struct work_struct
*work
)
1605 struct ipvs_master_sync_state
*ms
=
1606 container_of(work
, struct ipvs_master_sync_state
,
1607 master_wakeup_work
.work
);
1608 struct netns_ipvs
*ipvs
= ms
->ipvs
;
1610 spin_lock_bh(&ipvs
->sync_lock
);
1611 if (ms
->sync_queue_len
&&
1612 ms
->sync_queue_delay
< IPVS_SYNC_WAKEUP_RATE
) {
1613 ms
->sync_queue_delay
= IPVS_SYNC_WAKEUP_RATE
;
1614 wake_up_process(ms
->master_thread
);
1616 spin_unlock_bh(&ipvs
->sync_lock
);
1619 /* Get next buffer to send */
1620 static inline struct ip_vs_sync_buff
*
1621 next_sync_buff(struct netns_ipvs
*ipvs
, struct ipvs_master_sync_state
*ms
)
1623 struct ip_vs_sync_buff
*sb
;
1625 sb
= sb_dequeue(ipvs
, ms
);
1628 /* Do not delay entries in buffer for more than 2 seconds */
1629 return get_curr_sync_buff(ipvs
, ms
, IPVS_SYNC_FLUSH_TIME
);
1632 static int sync_thread_master(void *data
)
1634 struct ip_vs_sync_thread_data
*tinfo
= data
;
1635 struct netns_ipvs
*ipvs
= net_ipvs(tinfo
->net
);
1636 struct ipvs_master_sync_state
*ms
= &ipvs
->ms
[tinfo
->id
];
1637 struct sock
*sk
= tinfo
->sock
->sk
;
1638 struct ip_vs_sync_buff
*sb
;
1640 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
1641 "syncid = %d, id = %d\n",
1642 ipvs
->master_mcast_ifn
, ipvs
->master_syncid
, tinfo
->id
);
1645 sb
= next_sync_buff(ipvs
, ms
);
1646 if (unlikely(kthread_should_stop()))
1649 schedule_timeout(IPVS_SYNC_CHECK_PERIOD
);
1652 while (ip_vs_send_sync_msg(tinfo
->sock
, sb
->mesg
) < 0) {
1653 /* (Ab)use interruptible sleep to avoid increasing
1656 __wait_event_interruptible(*sk_sleep(sk
),
1657 sock_writeable(sk
) ||
1658 kthread_should_stop());
1659 if (unlikely(kthread_should_stop()))
1662 ip_vs_sync_buff_release(sb
);
1666 __set_current_state(TASK_RUNNING
);
1668 ip_vs_sync_buff_release(sb
);
1670 /* clean up the sync_buff queue */
1671 while ((sb
= sb_dequeue(ipvs
, ms
)))
1672 ip_vs_sync_buff_release(sb
);
1673 __set_current_state(TASK_RUNNING
);
1675 /* clean up the current sync_buff */
1676 sb
= get_curr_sync_buff(ipvs
, ms
, 0);
1678 ip_vs_sync_buff_release(sb
);
1680 /* release the sending multicast socket */
1681 sock_release(tinfo
->sock
);
1688 static int sync_thread_backup(void *data
)
1690 struct ip_vs_sync_thread_data
*tinfo
= data
;
1691 struct netns_ipvs
*ipvs
= net_ipvs(tinfo
->net
);
1694 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1695 "syncid = %d, id = %d\n",
1696 ipvs
->backup_mcast_ifn
, ipvs
->backup_syncid
, tinfo
->id
);
1698 while (!kthread_should_stop()) {
1699 wait_event_interruptible(*sk_sleep(tinfo
->sock
->sk
),
1700 !skb_queue_empty(&tinfo
->sock
->sk
->sk_receive_queue
)
1701 || kthread_should_stop());
1703 /* do we have data now? */
1704 while (!skb_queue_empty(&(tinfo
->sock
->sk
->sk_receive_queue
))) {
1705 len
= ip_vs_receive(tinfo
->sock
, tinfo
->buf
,
1706 ipvs
->recv_mesg_maxlen
);
1709 pr_err("receiving message error\n");
1713 ip_vs_process_message(tinfo
->net
, tinfo
->buf
, len
);
1717 /* release the sending multicast socket */
1718 sock_release(tinfo
->sock
);
1726 int start_sync_thread(struct net
*net
, int state
, char *mcast_ifn
, __u8 syncid
)
1728 struct ip_vs_sync_thread_data
*tinfo
;
1729 struct task_struct
**array
= NULL
, *task
;
1730 struct socket
*sock
;
1731 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1733 int (*threadfn
)(void *data
);
1735 int result
= -ENOMEM
;
1737 IP_VS_DBG(7, "%s(): pid %d\n", __func__
, task_pid_nr(current
));
1738 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
1739 sizeof(struct ip_vs_sync_conn_v0
));
1741 if (!ipvs
->sync_state
) {
1742 count
= clamp(sysctl_sync_ports(ipvs
), 1, IPVS_SYNC_PORTS_MAX
);
1743 ipvs
->threads_mask
= count
- 1;
1745 count
= ipvs
->threads_mask
+ 1;
1747 if (state
== IP_VS_STATE_MASTER
) {
1751 strlcpy(ipvs
->master_mcast_ifn
, mcast_ifn
,
1752 sizeof(ipvs
->master_mcast_ifn
));
1753 ipvs
->master_syncid
= syncid
;
1754 name
= "ipvs-m:%d:%d";
1755 threadfn
= sync_thread_master
;
1756 } else if (state
== IP_VS_STATE_BACKUP
) {
1757 if (ipvs
->backup_threads
)
1760 strlcpy(ipvs
->backup_mcast_ifn
, mcast_ifn
,
1761 sizeof(ipvs
->backup_mcast_ifn
));
1762 ipvs
->backup_syncid
= syncid
;
1763 name
= "ipvs-b:%d:%d";
1764 threadfn
= sync_thread_backup
;
1769 if (state
== IP_VS_STATE_MASTER
) {
1770 struct ipvs_master_sync_state
*ms
;
1772 ipvs
->ms
= kzalloc(count
* sizeof(ipvs
->ms
[0]), GFP_KERNEL
);
1776 for (id
= 0; id
< count
; id
++, ms
++) {
1777 INIT_LIST_HEAD(&ms
->sync_queue
);
1778 ms
->sync_queue_len
= 0;
1779 ms
->sync_queue_delay
= 0;
1780 INIT_DELAYED_WORK(&ms
->master_wakeup_work
,
1781 master_wakeup_work_handler
);
1785 array
= kzalloc(count
* sizeof(struct task_struct
*),
1790 set_sync_mesg_maxlen(net
, state
);
1793 for (id
= 0; id
< count
; id
++) {
1794 if (state
== IP_VS_STATE_MASTER
)
1795 sock
= make_send_sock(net
, id
);
1797 sock
= make_receive_sock(net
, id
);
1799 result
= PTR_ERR(sock
);
1802 tinfo
= kmalloc(sizeof(*tinfo
), GFP_KERNEL
);
1807 if (state
== IP_VS_STATE_BACKUP
) {
1808 tinfo
->buf
= kmalloc(ipvs
->recv_mesg_maxlen
,
1817 task
= kthread_run(threadfn
, tinfo
, name
, ipvs
->gen
, id
);
1819 result
= PTR_ERR(task
);
1823 if (state
== IP_VS_STATE_MASTER
)
1824 ipvs
->ms
[id
].master_thread
= task
;
1829 /* mark as active */
1831 if (state
== IP_VS_STATE_BACKUP
)
1832 ipvs
->backup_threads
= array
;
1833 spin_lock_bh(&ipvs
->sync_buff_lock
);
1834 ipvs
->sync_state
|= state
;
1835 spin_unlock_bh(&ipvs
->sync_buff_lock
);
1837 /* increase the module use count */
1838 ip_vs_use_count_inc();
1847 sock_release(tinfo
->sock
);
1852 while (count
-- > 0) {
1853 if (state
== IP_VS_STATE_MASTER
)
1854 kthread_stop(ipvs
->ms
[count
].master_thread
);
1856 kthread_stop(array
[count
]);
1861 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
)) {
1869 int stop_sync_thread(struct net
*net
, int state
)
1871 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1872 struct task_struct
**array
;
1876 IP_VS_DBG(7, "%s(): pid %d\n", __func__
, task_pid_nr(current
));
1878 if (state
== IP_VS_STATE_MASTER
) {
1883 * The lock synchronizes with sb_queue_tail(), so that we don't
1884 * add sync buffers to the queue, when we are already in
1885 * progress of stopping the master sync daemon.
1888 spin_lock_bh(&ipvs
->sync_buff_lock
);
1889 spin_lock(&ipvs
->sync_lock
);
1890 ipvs
->sync_state
&= ~IP_VS_STATE_MASTER
;
1891 spin_unlock(&ipvs
->sync_lock
);
1892 spin_unlock_bh(&ipvs
->sync_buff_lock
);
1895 for (id
= ipvs
->threads_mask
; id
>= 0; id
--) {
1896 struct ipvs_master_sync_state
*ms
= &ipvs
->ms
[id
];
1899 pr_info("stopping master sync thread %d ...\n",
1900 task_pid_nr(ms
->master_thread
));
1901 cancel_delayed_work_sync(&ms
->master_wakeup_work
);
1902 ret
= kthread_stop(ms
->master_thread
);
1908 } else if (state
== IP_VS_STATE_BACKUP
) {
1909 if (!ipvs
->backup_threads
)
1912 ipvs
->sync_state
&= ~IP_VS_STATE_BACKUP
;
1913 array
= ipvs
->backup_threads
;
1915 for (id
= ipvs
->threads_mask
; id
>= 0; id
--) {
1918 pr_info("stopping backup sync thread %d ...\n",
1919 task_pid_nr(array
[id
]));
1920 ret
= kthread_stop(array
[id
]);
1925 ipvs
->backup_threads
= NULL
;
1928 /* decrease the module use count */
1929 ip_vs_use_count_dec();
1935 * Initialize data struct for each netns
1937 int __net_init
ip_vs_sync_net_init(struct net
*net
)
1939 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1941 __mutex_init(&ipvs
->sync_mutex
, "ipvs->sync_mutex", &__ipvs_sync_key
);
1942 spin_lock_init(&ipvs
->sync_lock
);
1943 spin_lock_init(&ipvs
->sync_buff_lock
);
1947 void ip_vs_sync_net_cleanup(struct net
*net
)
1950 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1952 mutex_lock(&ipvs
->sync_mutex
);
1953 retc
= stop_sync_thread(net
, IP_VS_STATE_MASTER
);
1954 if (retc
&& retc
!= -ESRCH
)
1955 pr_err("Failed to stop Master Daemon\n");
1957 retc
= stop_sync_thread(net
, IP_VS_STATE_BACKUP
);
1958 if (retc
&& retc
!= -ESRCH
)
1959 pr_err("Failed to stop Backup Daemon\n");
1960 mutex_unlock(&ipvs
->sync_mutex
);