1 // SPDX-License-Identifier: GPL-2.0
3 * IPVS An implementation of the IP virtual server support for the
4 * LINUX operating system. IPVS is now implemented as a module
5 * over the NetFilter framework. IPVS can be used to build a
6 * high-performance and highly available server based on a
9 * Version 1, is capable of handling both version 0 and 1 messages.
10 * Version 0 is the plain old format.
11 * Note Version 0 receivers will just drop Ver 1 messages.
12 * Version 1 is capable of handling IPv6, Persistence data,
13 * time-outs, and firewall marks.
14 * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
15 * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
17 * Definitions Message: is a complete datagram
18 * Sync_conn: is a part of a Message
19 * Param Data is an option to a Sync_conn.
21 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
23 * ip_vs_sync: sync connection info from master load balancer to backups
27 * Alexandre Cassen : Added master & backup support at a time.
28 * Alexandre Cassen : Added SyncID support for incoming sync
30 * Justin Ossevoort : Fix endian problem on sync message size.
31 * Hans Schillstrom : Added Version 1: i.e. IPv6,
32 * Persistence support, fwmark and time-out.
35 #define KMSG_COMPONENT "IPVS"
36 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
38 #include <linux/module.h>
39 #include <linux/slab.h>
40 #include <linux/inetdevice.h>
41 #include <linux/net.h>
42 #include <linux/completion.h>
43 #include <linux/delay.h>
44 #include <linux/skbuff.h>
46 #include <linux/igmp.h> /* for ip_mc_join_group */
47 #include <linux/udp.h>
48 #include <linux/err.h>
49 #include <linux/kthread.h>
50 #include <linux/wait.h>
51 #include <linux/kernel.h>
52 #include <linux/sched/signal.h>
54 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
59 #include <net/ip_vs.h>
61 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
62 #define IP_VS_SYNC_PORT 8848 /* multicast port */
64 #define SYNC_PROTO_VER 1 /* Protocol version in header */
66 static struct lock_class_key __ipvs_sync_key
;
68 * IPVS sync connection entry
69 * Version 0, i.e. original version.
71 struct ip_vs_sync_conn_v0
{
74 /* Protocol, addresses and port numbers */
75 __u8 protocol
; /* Which protocol (TCP/UDP) */
79 __be32 caddr
; /* client address */
80 __be32 vaddr
; /* virtual address */
81 __be32 daddr
; /* destination address */
83 /* Flags and state transition */
84 __be16 flags
; /* status flags */
85 __be16 state
; /* state info */
87 /* The sequence options start here */
90 struct ip_vs_sync_conn_options
{
91 struct ip_vs_seq in_seq
; /* incoming seq. struct */
92 struct ip_vs_seq out_seq
; /* outgoing seq. struct */
96 Sync Connection format (sync_conn)
99 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
100 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 | Type | Protocol | Ver. | Size |
102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
108 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
110 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
111 | timeout (in sec.) |
112 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
114 | IP-Addresses (v4 or v6) |
116 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
118 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
119 | Param. Type | Param. Length | Param. data |
120 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
122 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
123 | | Param Type | Param. Length |
124 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
126 | Last Param data should be padded for 32 bit alignment |
127 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
131 * Type 0, IPv4 sync connection format
133 struct ip_vs_sync_v4
{
135 __u8 protocol
; /* Which protocol (TCP/UDP) */
136 __be16 ver_size
; /* Version msb 4 bits */
137 /* Flags and state transition */
138 __be32 flags
; /* status flags */
139 __be16 state
; /* state info */
140 /* Protocol, addresses and port numbers */
144 __be32 fwmark
; /* Firewall mark from skb */
145 __be32 timeout
; /* cp timeout */
146 __be32 caddr
; /* client address */
147 __be32 vaddr
; /* virtual address */
148 __be32 daddr
; /* destination address */
149 /* The sequence options start here */
150 /* PE data padded to 32bit alignment after seq. options */
153 * Type 2 messages IPv6
155 struct ip_vs_sync_v6
{
157 __u8 protocol
; /* Which protocol (TCP/UDP) */
158 __be16 ver_size
; /* Version msb 4 bits */
159 /* Flags and state transition */
160 __be32 flags
; /* status flags */
161 __be16 state
; /* state info */
162 /* Protocol, addresses and port numbers */
166 __be32 fwmark
; /* Firewall mark from skb */
167 __be32 timeout
; /* cp timeout */
168 struct in6_addr caddr
; /* client address */
169 struct in6_addr vaddr
; /* virtual address */
170 struct in6_addr daddr
; /* destination address */
171 /* The sequence options start here */
172 /* PE data padded to 32bit alignment after seq. options */
175 union ip_vs_sync_conn
{
176 struct ip_vs_sync_v4 v4
;
177 struct ip_vs_sync_v6 v6
;
180 /* Bits in Type field in above */
181 #define STYPE_INET6 0
182 #define STYPE_F_INET6 (1 << STYPE_INET6)
184 #define SVER_SHIFT 12 /* Shift to get version */
185 #define SVER_MASK 0x0fff /* Mask to strip version */
187 #define IPVS_OPT_SEQ_DATA 1
188 #define IPVS_OPT_PE_DATA 2
189 #define IPVS_OPT_PE_NAME 3
190 #define IPVS_OPT_PARAM 7
192 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
193 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
194 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
195 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
197 struct ip_vs_sync_thread_data
{
198 struct task_struct
*task
;
199 struct netns_ipvs
*ipvs
;
205 /* Version 0 definition of packet sizes */
206 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
207 #define FULL_CONN_SIZE \
208 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
212 The master multicasts messages (Datagrams) to the backup load balancers
213 in the following format.
216 Note, first byte should be Zero, so ver 0 receivers will drop the packet.
219 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
220 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
221 | 0 | SyncID | Size |
222 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
223 | Count Conns | Version | Reserved, set to Zero |
224 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
226 | IPVS Sync Connection (1) |
227 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
231 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
233 | IPVS Sync Connection (n) |
234 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
238 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
239 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
240 | Count Conns | SyncID | Size |
241 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
242 | IPVS Sync Connection (1) |
245 #define SYNC_MESG_HEADER_LEN 4
246 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
248 /* Version 0 header */
249 struct ip_vs_sync_mesg_v0
{
254 /* ip_vs_sync_conn entries start here */
257 /* Version 1 header */
258 struct ip_vs_sync_mesg
{
259 __u8 reserved
; /* must be zero */
263 __s8 version
; /* SYNC_PROTO_VER */
265 /* ip_vs_sync_conn entries start here */
268 union ipvs_sockaddr
{
269 struct sockaddr_in in
;
270 struct sockaddr_in6 in6
;
273 struct ip_vs_sync_buff
{
274 struct list_head list
;
275 unsigned long firstuse
;
277 /* pointers for the message data */
278 struct ip_vs_sync_mesg
*mesg
;
284 * Copy of struct ip_vs_seq
285 * From unaligned network order to aligned host order
287 static void ntoh_seq(struct ip_vs_seq
*no
, struct ip_vs_seq
*ho
)
289 memset(ho
, 0, sizeof(*ho
));
290 ho
->init_seq
= get_unaligned_be32(&no
->init_seq
);
291 ho
->delta
= get_unaligned_be32(&no
->delta
);
292 ho
->previous_delta
= get_unaligned_be32(&no
->previous_delta
);
296 * Copy of struct ip_vs_seq
297 * From Aligned host order to unaligned network order
299 static void hton_seq(struct ip_vs_seq
*ho
, struct ip_vs_seq
*no
)
301 put_unaligned_be32(ho
->init_seq
, &no
->init_seq
);
302 put_unaligned_be32(ho
->delta
, &no
->delta
);
303 put_unaligned_be32(ho
->previous_delta
, &no
->previous_delta
);
306 static inline struct ip_vs_sync_buff
*
307 sb_dequeue(struct netns_ipvs
*ipvs
, struct ipvs_master_sync_state
*ms
)
309 struct ip_vs_sync_buff
*sb
;
311 spin_lock_bh(&ipvs
->sync_lock
);
312 if (list_empty(&ms
->sync_queue
)) {
314 __set_current_state(TASK_INTERRUPTIBLE
);
316 sb
= list_entry(ms
->sync_queue
.next
, struct ip_vs_sync_buff
,
319 ms
->sync_queue_len
--;
320 if (!ms
->sync_queue_len
)
321 ms
->sync_queue_delay
= 0;
323 spin_unlock_bh(&ipvs
->sync_lock
);
329 * Create a new sync buffer for Version 1 proto.
331 static inline struct ip_vs_sync_buff
*
332 ip_vs_sync_buff_create(struct netns_ipvs
*ipvs
, unsigned int len
)
334 struct ip_vs_sync_buff
*sb
;
336 if (!(sb
=kmalloc(sizeof(struct ip_vs_sync_buff
), GFP_ATOMIC
)))
339 len
= max_t(unsigned int, len
+ sizeof(struct ip_vs_sync_mesg
),
340 ipvs
->mcfg
.sync_maxlen
);
341 sb
->mesg
= kmalloc(len
, GFP_ATOMIC
);
346 sb
->mesg
->reserved
= 0; /* old nr_conns i.e. must be zero now */
347 sb
->mesg
->version
= SYNC_PROTO_VER
;
348 sb
->mesg
->syncid
= ipvs
->mcfg
.syncid
;
349 sb
->mesg
->size
= htons(sizeof(struct ip_vs_sync_mesg
));
350 sb
->mesg
->nr_conns
= 0;
352 sb
->head
= (unsigned char *)sb
->mesg
+ sizeof(struct ip_vs_sync_mesg
);
353 sb
->end
= (unsigned char *)sb
->mesg
+ len
;
355 sb
->firstuse
= jiffies
;
359 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff
*sb
)
365 static inline void sb_queue_tail(struct netns_ipvs
*ipvs
,
366 struct ipvs_master_sync_state
*ms
)
368 struct ip_vs_sync_buff
*sb
= ms
->sync_buff
;
370 spin_lock(&ipvs
->sync_lock
);
371 if (ipvs
->sync_state
& IP_VS_STATE_MASTER
&&
372 ms
->sync_queue_len
< sysctl_sync_qlen_max(ipvs
)) {
373 if (!ms
->sync_queue_len
)
374 schedule_delayed_work(&ms
->master_wakeup_work
,
375 max(IPVS_SYNC_SEND_DELAY
, 1));
376 ms
->sync_queue_len
++;
377 list_add_tail(&sb
->list
, &ms
->sync_queue
);
378 if ((++ms
->sync_queue_delay
) == IPVS_SYNC_WAKEUP_RATE
) {
379 int id
= (int)(ms
- ipvs
->ms
);
381 wake_up_process(ipvs
->master_tinfo
[id
].task
);
384 ip_vs_sync_buff_release(sb
);
385 spin_unlock(&ipvs
->sync_lock
);
389 * Get the current sync buffer if it has been created for more
390 * than the specified time or the specified time is zero.
392 static inline struct ip_vs_sync_buff
*
393 get_curr_sync_buff(struct netns_ipvs
*ipvs
, struct ipvs_master_sync_state
*ms
,
396 struct ip_vs_sync_buff
*sb
;
398 spin_lock_bh(&ipvs
->sync_buff_lock
);
400 if (sb
&& time_after_eq(jiffies
- sb
->firstuse
, time
)) {
401 ms
->sync_buff
= NULL
;
402 __set_current_state(TASK_RUNNING
);
405 spin_unlock_bh(&ipvs
->sync_buff_lock
);
410 select_master_thread_id(struct netns_ipvs
*ipvs
, struct ip_vs_conn
*cp
)
412 return ((long) cp
>> (1 + ilog2(sizeof(*cp
)))) & ipvs
->threads_mask
;
416 * Create a new sync buffer for Version 0 proto.
418 static inline struct ip_vs_sync_buff
*
419 ip_vs_sync_buff_create_v0(struct netns_ipvs
*ipvs
, unsigned int len
)
421 struct ip_vs_sync_buff
*sb
;
422 struct ip_vs_sync_mesg_v0
*mesg
;
424 if (!(sb
=kmalloc(sizeof(struct ip_vs_sync_buff
), GFP_ATOMIC
)))
427 len
= max_t(unsigned int, len
+ sizeof(struct ip_vs_sync_mesg_v0
),
428 ipvs
->mcfg
.sync_maxlen
);
429 sb
->mesg
= kmalloc(len
, GFP_ATOMIC
);
434 mesg
= (struct ip_vs_sync_mesg_v0
*)sb
->mesg
;
436 mesg
->syncid
= ipvs
->mcfg
.syncid
;
437 mesg
->size
= htons(sizeof(struct ip_vs_sync_mesg_v0
));
438 sb
->head
= (unsigned char *)mesg
+ sizeof(struct ip_vs_sync_mesg_v0
);
439 sb
->end
= (unsigned char *)mesg
+ len
;
440 sb
->firstuse
= jiffies
;
444 /* Check if connection is controlled by persistence */
445 static inline bool in_persistence(struct ip_vs_conn
*cp
)
447 for (cp
= cp
->control
; cp
; cp
= cp
->control
) {
448 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
)
454 /* Check if conn should be synced.
455 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
456 * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
457 * sync_retries times with period of sync_refresh_period/8
458 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
459 * for state changes or only once when pkts matches sync_threshold
460 * - (3) templates: rate can be reduced only with sync_refresh_period or
463 static int ip_vs_sync_conn_needed(struct netns_ipvs
*ipvs
,
464 struct ip_vs_conn
*cp
, int pkts
)
466 unsigned long orig
= READ_ONCE(cp
->sync_endtime
);
467 unsigned long now
= jiffies
;
468 unsigned long n
= (now
+ cp
->timeout
) & ~3UL;
469 unsigned int sync_refresh_period
;
473 /* Check if we sync in current state */
474 if (unlikely(cp
->flags
& IP_VS_CONN_F_TEMPLATE
))
476 else if (unlikely(sysctl_sync_persist_mode(ipvs
) && in_persistence(cp
)))
478 else if (likely(cp
->protocol
== IPPROTO_TCP
)) {
479 if (!((1 << cp
->state
) &
480 ((1 << IP_VS_TCP_S_ESTABLISHED
) |
481 (1 << IP_VS_TCP_S_FIN_WAIT
) |
482 (1 << IP_VS_TCP_S_CLOSE
) |
483 (1 << IP_VS_TCP_S_CLOSE_WAIT
) |
484 (1 << IP_VS_TCP_S_TIME_WAIT
))))
486 force
= cp
->state
!= cp
->old_state
;
487 if (force
&& cp
->state
!= IP_VS_TCP_S_ESTABLISHED
)
489 } else if (unlikely(cp
->protocol
== IPPROTO_SCTP
)) {
490 if (!((1 << cp
->state
) &
491 ((1 << IP_VS_SCTP_S_ESTABLISHED
) |
492 (1 << IP_VS_SCTP_S_SHUTDOWN_SENT
) |
493 (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED
) |
494 (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
) |
495 (1 << IP_VS_SCTP_S_CLOSED
))))
497 force
= cp
->state
!= cp
->old_state
;
498 if (force
&& cp
->state
!= IP_VS_SCTP_S_ESTABLISHED
)
501 /* UDP or another protocol with single state */
505 sync_refresh_period
= sysctl_sync_refresh_period(ipvs
);
506 if (sync_refresh_period
> 0) {
507 long diff
= n
- orig
;
508 long min_diff
= max(cp
->timeout
>> 1, 10UL * HZ
);
510 /* Avoid sync if difference is below sync_refresh_period
511 * and below the half timeout.
513 if (abs(diff
) < min_t(long, sync_refresh_period
, min_diff
)) {
514 int retries
= orig
& 3;
516 if (retries
>= sysctl_sync_retries(ipvs
))
518 if (time_before(now
, orig
- cp
->timeout
+
519 (sync_refresh_period
>> 3)))
524 sync_period
= sysctl_sync_period(ipvs
);
525 if (sync_period
> 0) {
526 if (!(cp
->flags
& IP_VS_CONN_F_TEMPLATE
) &&
527 pkts
% sync_period
!= sysctl_sync_threshold(ipvs
))
529 } else if (!sync_refresh_period
&&
530 pkts
!= sysctl_sync_threshold(ipvs
))
534 cp
->old_state
= cp
->state
;
535 n
= cmpxchg(&cp
->sync_endtime
, orig
, n
);
536 return n
== orig
|| force
;
540 * Version 0, could be switched in by sysctl.
541 * Add an ip_vs_conn information into the current sync_buff.
543 static void ip_vs_sync_conn_v0(struct netns_ipvs
*ipvs
, struct ip_vs_conn
*cp
,
546 struct ip_vs_sync_mesg_v0
*m
;
547 struct ip_vs_sync_conn_v0
*s
;
548 struct ip_vs_sync_buff
*buff
;
549 struct ipvs_master_sync_state
*ms
;
553 if (unlikely(cp
->af
!= AF_INET
))
555 /* Do not sync ONE PACKET */
556 if (cp
->flags
& IP_VS_CONN_F_ONE_PACKET
)
559 if (!ip_vs_sync_conn_needed(ipvs
, cp
, pkts
))
562 spin_lock_bh(&ipvs
->sync_buff_lock
);
563 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
)) {
564 spin_unlock_bh(&ipvs
->sync_buff_lock
);
568 id
= select_master_thread_id(ipvs
, cp
);
570 buff
= ms
->sync_buff
;
571 len
= (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) ? FULL_CONN_SIZE
:
574 m
= (struct ip_vs_sync_mesg_v0
*) buff
->mesg
;
575 /* Send buffer if it is for v1 */
576 if (buff
->head
+ len
> buff
->end
|| !m
->nr_conns
) {
577 sb_queue_tail(ipvs
, ms
);
578 ms
->sync_buff
= NULL
;
583 buff
= ip_vs_sync_buff_create_v0(ipvs
, len
);
585 spin_unlock_bh(&ipvs
->sync_buff_lock
);
586 pr_err("ip_vs_sync_buff_create failed.\n");
589 ms
->sync_buff
= buff
;
592 m
= (struct ip_vs_sync_mesg_v0
*) buff
->mesg
;
593 s
= (struct ip_vs_sync_conn_v0
*) buff
->head
;
597 s
->protocol
= cp
->protocol
;
598 s
->cport
= cp
->cport
;
599 s
->vport
= cp
->vport
;
600 s
->dport
= cp
->dport
;
601 s
->caddr
= cp
->caddr
.ip
;
602 s
->vaddr
= cp
->vaddr
.ip
;
603 s
->daddr
= cp
->daddr
.ip
;
604 s
->flags
= htons(cp
->flags
& ~IP_VS_CONN_F_HASHED
);
605 s
->state
= htons(cp
->state
);
606 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) {
607 struct ip_vs_sync_conn_options
*opt
=
608 (struct ip_vs_sync_conn_options
*)&s
[1];
609 memcpy(opt
, &cp
->in_seq
, sizeof(*opt
));
613 m
->size
= htons(ntohs(m
->size
) + len
);
615 spin_unlock_bh(&ipvs
->sync_buff_lock
);
617 /* synchronize its controller if it has */
620 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
)
621 pkts
= atomic_add_return(1, &cp
->in_pkts
);
623 pkts
= sysctl_sync_threshold(ipvs
);
624 ip_vs_sync_conn(ipvs
, cp
, pkts
);
629 * Add an ip_vs_conn information into the current sync_buff.
630 * Called by ip_vs_in.
631 * Sending Version 1 messages
633 void ip_vs_sync_conn(struct netns_ipvs
*ipvs
, struct ip_vs_conn
*cp
, int pkts
)
635 struct ip_vs_sync_mesg
*m
;
636 union ip_vs_sync_conn
*s
;
637 struct ip_vs_sync_buff
*buff
;
638 struct ipvs_master_sync_state
*ms
;
641 unsigned int len
, pe_name_len
, pad
;
643 /* Handle old version of the protocol */
644 if (sysctl_sync_ver(ipvs
) == 0) {
645 ip_vs_sync_conn_v0(ipvs
, cp
, pkts
);
648 /* Do not sync ONE PACKET */
649 if (cp
->flags
& IP_VS_CONN_F_ONE_PACKET
)
652 if (!ip_vs_sync_conn_needed(ipvs
, cp
, pkts
))
657 if (cp
->pe_data_len
) {
658 if (!cp
->pe_data
|| !cp
->dest
) {
659 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
662 pe_name_len
= strnlen(cp
->pe
->name
, IP_VS_PENAME_MAXLEN
);
665 spin_lock_bh(&ipvs
->sync_buff_lock
);
666 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
)) {
667 spin_unlock_bh(&ipvs
->sync_buff_lock
);
671 id
= select_master_thread_id(ipvs
, cp
);
674 #ifdef CONFIG_IP_VS_IPV6
675 if (cp
->af
== AF_INET6
)
676 len
= sizeof(struct ip_vs_sync_v6
);
679 len
= sizeof(struct ip_vs_sync_v4
);
681 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
)
682 len
+= sizeof(struct ip_vs_sync_conn_options
) + 2;
685 len
+= cp
->pe_data_len
+ 2; /* + Param hdr field */
687 len
+= pe_name_len
+ 2;
689 /* check if there is a space for this one */
691 buff
= ms
->sync_buff
;
694 pad
= (4 - (size_t) buff
->head
) & 3;
695 /* Send buffer if it is for v0 */
696 if (buff
->head
+ len
+ pad
> buff
->end
|| m
->reserved
) {
697 sb_queue_tail(ipvs
, ms
);
698 ms
->sync_buff
= NULL
;
705 buff
= ip_vs_sync_buff_create(ipvs
, len
);
707 spin_unlock_bh(&ipvs
->sync_buff_lock
);
708 pr_err("ip_vs_sync_buff_create failed.\n");
711 ms
->sync_buff
= buff
;
716 buff
->head
+= pad
+ len
;
717 m
->size
= htons(ntohs(m
->size
) + pad
+ len
);
718 /* Add ev. padding from prev. sync_conn */
722 s
= (union ip_vs_sync_conn
*)p
;
724 /* Set message type & copy members */
725 s
->v4
.type
= (cp
->af
== AF_INET6
? STYPE_F_INET6
: 0);
726 s
->v4
.ver_size
= htons(len
& SVER_MASK
); /* Version 0 */
727 s
->v4
.flags
= htonl(cp
->flags
& ~IP_VS_CONN_F_HASHED
);
728 s
->v4
.state
= htons(cp
->state
);
729 s
->v4
.protocol
= cp
->protocol
;
730 s
->v4
.cport
= cp
->cport
;
731 s
->v4
.vport
= cp
->vport
;
732 s
->v4
.dport
= cp
->dport
;
733 s
->v4
.fwmark
= htonl(cp
->fwmark
);
734 s
->v4
.timeout
= htonl(cp
->timeout
/ HZ
);
737 #ifdef CONFIG_IP_VS_IPV6
738 if (cp
->af
== AF_INET6
) {
739 p
+= sizeof(struct ip_vs_sync_v6
);
740 s
->v6
.caddr
= cp
->caddr
.in6
;
741 s
->v6
.vaddr
= cp
->vaddr
.in6
;
742 s
->v6
.daddr
= cp
->daddr
.in6
;
746 p
+= sizeof(struct ip_vs_sync_v4
); /* options ptr */
747 s
->v4
.caddr
= cp
->caddr
.ip
;
748 s
->v4
.vaddr
= cp
->vaddr
.ip
;
749 s
->v4
.daddr
= cp
->daddr
.ip
;
751 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) {
752 *(p
++) = IPVS_OPT_SEQ_DATA
;
753 *(p
++) = sizeof(struct ip_vs_sync_conn_options
);
754 hton_seq((struct ip_vs_seq
*)p
, &cp
->in_seq
);
755 p
+= sizeof(struct ip_vs_seq
);
756 hton_seq((struct ip_vs_seq
*)p
, &cp
->out_seq
);
757 p
+= sizeof(struct ip_vs_seq
);
760 if (cp
->pe_data_len
&& cp
->pe_data
) {
761 *(p
++) = IPVS_OPT_PE_DATA
;
762 *(p
++) = cp
->pe_data_len
;
763 memcpy(p
, cp
->pe_data
, cp
->pe_data_len
);
764 p
+= cp
->pe_data_len
;
767 *(p
++) = IPVS_OPT_PE_NAME
;
768 *(p
++) = pe_name_len
;
769 memcpy(p
, cp
->pe
->name
, pe_name_len
);
774 spin_unlock_bh(&ipvs
->sync_buff_lock
);
777 /* synchronize its controller if it has */
781 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
)
782 pkts
= atomic_add_return(1, &cp
->in_pkts
);
784 pkts
= sysctl_sync_threshold(ipvs
);
789 * fill_param used by version 1
792 ip_vs_conn_fill_param_sync(struct netns_ipvs
*ipvs
, int af
, union ip_vs_sync_conn
*sc
,
793 struct ip_vs_conn_param
*p
,
794 __u8
*pe_data
, unsigned int pe_data_len
,
795 __u8
*pe_name
, unsigned int pe_name_len
)
797 #ifdef CONFIG_IP_VS_IPV6
799 ip_vs_conn_fill_param(ipvs
, af
, sc
->v6
.protocol
,
800 (const union nf_inet_addr
*)&sc
->v6
.caddr
,
802 (const union nf_inet_addr
*)&sc
->v6
.vaddr
,
806 ip_vs_conn_fill_param(ipvs
, af
, sc
->v4
.protocol
,
807 (const union nf_inet_addr
*)&sc
->v4
.caddr
,
809 (const union nf_inet_addr
*)&sc
->v4
.vaddr
,
814 char buff
[IP_VS_PENAME_MAXLEN
+1];
816 memcpy(buff
, pe_name
, pe_name_len
);
818 p
->pe
= __ip_vs_pe_getbyname(buff
);
820 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
825 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
829 p
->pe_data
= kmemdup(pe_data
, pe_data_len
, GFP_ATOMIC
);
831 module_put(p
->pe
->module
);
834 p
->pe_data_len
= pe_data_len
;
840 * Connection Add / Update.
841 * Common for version 0 and 1 reception of backup sync_conns.
845 static void ip_vs_proc_conn(struct netns_ipvs
*ipvs
, struct ip_vs_conn_param
*param
,
846 unsigned int flags
, unsigned int state
,
847 unsigned int protocol
, unsigned int type
,
848 const union nf_inet_addr
*daddr
, __be16 dport
,
849 unsigned long timeout
, __u32 fwmark
,
850 struct ip_vs_sync_conn_options
*opt
)
852 struct ip_vs_dest
*dest
;
853 struct ip_vs_conn
*cp
;
855 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
856 cp
= ip_vs_conn_in_get(param
);
857 if (cp
&& ((cp
->dport
!= dport
) ||
858 !ip_vs_addr_equal(cp
->daf
, &cp
->daddr
, daddr
))) {
859 if (!(flags
& IP_VS_CONN_F_INACTIVE
)) {
860 ip_vs_conn_expire_now(cp
);
861 __ip_vs_conn_put(cp
);
864 /* This is the expiration message for the
865 * connection that was already replaced, so we
868 __ip_vs_conn_put(cp
);
869 kfree(param
->pe_data
);
874 cp
= ip_vs_ct_in_get(param
);
879 kfree(param
->pe_data
);
882 spin_lock_bh(&cp
->lock
);
883 if ((cp
->flags
^ flags
) & IP_VS_CONN_F_INACTIVE
&&
884 !(flags
& IP_VS_CONN_F_TEMPLATE
) && dest
) {
885 if (flags
& IP_VS_CONN_F_INACTIVE
) {
886 atomic_dec(&dest
->activeconns
);
887 atomic_inc(&dest
->inactconns
);
889 atomic_inc(&dest
->activeconns
);
890 atomic_dec(&dest
->inactconns
);
893 flags
&= IP_VS_CONN_F_BACKUP_UPD_MASK
;
894 flags
|= cp
->flags
& ~IP_VS_CONN_F_BACKUP_UPD_MASK
;
896 spin_unlock_bh(&cp
->lock
);
898 ip_vs_try_bind_dest(cp
);
901 * Find the appropriate destination for the connection.
902 * If it is not found the connection will remain unbound
906 /* This function is only invoked by the synchronization
907 * code. We do not currently support heterogeneous pools
908 * with synchronization, so we can make the assumption that
909 * the svc_af is the same as the dest_af
911 dest
= ip_vs_find_dest(ipvs
, type
, type
, daddr
, dport
,
912 param
->vaddr
, param
->vport
, protocol
,
915 cp
= ip_vs_conn_new(param
, type
, daddr
, dport
, flags
, dest
,
919 kfree(param
->pe_data
);
920 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
923 if (!(flags
& IP_VS_CONN_F_TEMPLATE
))
924 kfree(param
->pe_data
);
928 cp
->in_seq
= opt
->in_seq
;
929 cp
->out_seq
= opt
->out_seq
;
931 atomic_set(&cp
->in_pkts
, sysctl_sync_threshold(ipvs
));
933 cp
->old_state
= cp
->state
;
935 * For Ver 0 messages style
936 * - Not possible to recover the right timeout for templates
937 * - can not find the right fwmark
938 * virtual service. If needed, we can do it for
939 * non-fwmark persistent services.
940 * Ver 1 messages style.
944 if (timeout
> MAX_SCHEDULE_TIMEOUT
/ HZ
)
945 timeout
= MAX_SCHEDULE_TIMEOUT
/ HZ
;
946 cp
->timeout
= timeout
*HZ
;
948 struct ip_vs_proto_data
*pd
;
950 pd
= ip_vs_proto_data_get(ipvs
, protocol
);
951 if (!(flags
& IP_VS_CONN_F_TEMPLATE
) && pd
&& pd
->timeout_table
)
952 cp
->timeout
= pd
->timeout_table
[state
];
954 cp
->timeout
= (3*60*HZ
);
960 * Process received multicast message for Version 0
962 static void ip_vs_process_message_v0(struct netns_ipvs
*ipvs
, const char *buffer
,
965 struct ip_vs_sync_mesg_v0
*m
= (struct ip_vs_sync_mesg_v0
*)buffer
;
966 struct ip_vs_sync_conn_v0
*s
;
967 struct ip_vs_sync_conn_options
*opt
;
968 struct ip_vs_protocol
*pp
;
969 struct ip_vs_conn_param param
;
973 p
= (char *)buffer
+ sizeof(struct ip_vs_sync_mesg_v0
);
974 for (i
=0; i
<m
->nr_conns
; i
++) {
975 unsigned int flags
, state
;
977 if (p
+ SIMPLE_CONN_SIZE
> buffer
+buflen
) {
978 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
981 s
= (struct ip_vs_sync_conn_v0
*) p
;
982 flags
= ntohs(s
->flags
) | IP_VS_CONN_F_SYNC
;
983 flags
&= ~IP_VS_CONN_F_HASHED
;
984 if (flags
& IP_VS_CONN_F_SEQ_MASK
) {
985 opt
= (struct ip_vs_sync_conn_options
*)&s
[1];
987 if (p
> buffer
+buflen
) {
988 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
993 p
+= SIMPLE_CONN_SIZE
;
996 state
= ntohs(s
->state
);
997 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
998 pp
= ip_vs_proto_get(s
->protocol
);
1000 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
1004 if (state
>= pp
->num_states
) {
1005 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
1010 /* protocol in templates is not used for state/timeout */
1012 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
1018 ip_vs_conn_fill_param(ipvs
, AF_INET
, s
->protocol
,
1019 (const union nf_inet_addr
*)&s
->caddr
,
1021 (const union nf_inet_addr
*)&s
->vaddr
,
1024 /* Send timeout as Zero */
1025 ip_vs_proc_conn(ipvs
, ¶m
, flags
, state
, s
->protocol
, AF_INET
,
1026 (union nf_inet_addr
*)&s
->daddr
, s
->dport
,
1034 static inline int ip_vs_proc_seqopt(__u8
*p
, unsigned int plen
,
1036 struct ip_vs_sync_conn_options
*opt
)
1038 struct ip_vs_sync_conn_options
*topt
;
1040 topt
= (struct ip_vs_sync_conn_options
*)p
;
1042 if (plen
!= sizeof(struct ip_vs_sync_conn_options
)) {
1043 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
1046 if (*opt_flags
& IPVS_OPT_F_SEQ_DATA
) {
1047 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
1050 ntoh_seq(&topt
->in_seq
, &opt
->in_seq
);
1051 ntoh_seq(&topt
->out_seq
, &opt
->out_seq
);
1052 *opt_flags
|= IPVS_OPT_F_SEQ_DATA
;
1056 static int ip_vs_proc_str(__u8
*p
, unsigned int plen
, unsigned int *data_len
,
1057 __u8
**data
, unsigned int maxlen
,
1058 __u32
*opt_flags
, __u32 flag
)
1060 if (plen
> maxlen
) {
1061 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen
);
1064 if (*opt_flags
& flag
) {
1065 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag
);
1074 * Process a Version 1 sync. connection
1076 static inline int ip_vs_proc_sync_conn(struct netns_ipvs
*ipvs
, __u8
*p
, __u8
*msg_end
)
1078 struct ip_vs_sync_conn_options opt
;
1079 union ip_vs_sync_conn
*s
;
1080 struct ip_vs_protocol
*pp
;
1081 struct ip_vs_conn_param param
;
1083 unsigned int af
, state
, pe_data_len
=0, pe_name_len
=0;
1084 __u8
*pe_data
=NULL
, *pe_name
=NULL
;
1088 s
= (union ip_vs_sync_conn
*) p
;
1090 if (s
->v6
.type
& STYPE_F_INET6
) {
1091 #ifdef CONFIG_IP_VS_IPV6
1093 p
+= sizeof(struct ip_vs_sync_v6
);
1095 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
1099 } else if (!s
->v4
.type
) {
1101 p
+= sizeof(struct ip_vs_sync_v4
);
1108 /* Process optional params check Type & Len. */
1109 while (p
< msg_end
) {
1118 if (!plen
|| ((p
+ plen
) > msg_end
))
1120 /* Handle seq option p = param data */
1121 switch (ptype
& ~IPVS_OPT_F_PARAM
) {
1122 case IPVS_OPT_SEQ_DATA
:
1123 if (ip_vs_proc_seqopt(p
, plen
, &opt_flags
, &opt
))
1127 case IPVS_OPT_PE_DATA
:
1128 if (ip_vs_proc_str(p
, plen
, &pe_data_len
, &pe_data
,
1129 IP_VS_PEDATA_MAXLEN
, &opt_flags
,
1130 IPVS_OPT_F_PE_DATA
))
1134 case IPVS_OPT_PE_NAME
:
1135 if (ip_vs_proc_str(p
, plen
,&pe_name_len
, &pe_name
,
1136 IP_VS_PENAME_MAXLEN
, &opt_flags
,
1137 IPVS_OPT_F_PE_NAME
))
1142 /* Param data mandatory ? */
1143 if (!(ptype
& IPVS_OPT_F_PARAM
)) {
1144 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1145 ptype
& ~IPVS_OPT_F_PARAM
);
1150 p
+= plen
; /* Next option */
1153 /* Get flags and Mask off unsupported */
1154 flags
= ntohl(s
->v4
.flags
) & IP_VS_CONN_F_BACKUP_MASK
;
1155 flags
|= IP_VS_CONN_F_SYNC
;
1156 state
= ntohs(s
->v4
.state
);
1158 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
1159 pp
= ip_vs_proto_get(s
->v4
.protocol
);
1161 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1166 if (state
>= pp
->num_states
) {
1167 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1173 /* protocol in templates is not used for state/timeout */
1175 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
1180 if (ip_vs_conn_fill_param_sync(ipvs
, af
, s
, ¶m
, pe_data
,
1181 pe_data_len
, pe_name
, pe_name_len
)) {
1185 /* If only IPv4, just silent skip IPv6 */
1187 ip_vs_proc_conn(ipvs
, ¶m
, flags
, state
, s
->v4
.protocol
, af
,
1188 (union nf_inet_addr
*)&s
->v4
.daddr
, s
->v4
.dport
,
1189 ntohl(s
->v4
.timeout
), ntohl(s
->v4
.fwmark
),
1190 (opt_flags
& IPVS_OPT_F_SEQ_DATA
? &opt
: NULL
)
1192 #ifdef CONFIG_IP_VS_IPV6
1194 ip_vs_proc_conn(ipvs
, ¶m
, flags
, state
, s
->v6
.protocol
, af
,
1195 (union nf_inet_addr
*)&s
->v6
.daddr
, s
->v6
.dport
,
1196 ntohl(s
->v6
.timeout
), ntohl(s
->v6
.fwmark
),
1197 (opt_flags
& IPVS_OPT_F_SEQ_DATA
? &opt
: NULL
)
1200 ip_vs_pe_put(param
.pe
);
1204 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc
);
1209 * Process received multicast message and create the corresponding
1210 * ip_vs_conn entries.
1211 * Handles Version 0 & 1
1213 static void ip_vs_process_message(struct netns_ipvs
*ipvs
, __u8
*buffer
,
1214 const size_t buflen
)
1216 struct ip_vs_sync_mesg
*m2
= (struct ip_vs_sync_mesg
*)buffer
;
1220 if (buflen
< sizeof(struct ip_vs_sync_mesg_v0
)) {
1221 IP_VS_DBG(2, "BACKUP, message header too short\n");
1225 if (buflen
!= ntohs(m2
->size
)) {
1226 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1229 /* SyncID sanity check */
1230 if (ipvs
->bcfg
.syncid
!= 0 && m2
->syncid
!= ipvs
->bcfg
.syncid
) {
1231 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2
->syncid
);
1234 /* Handle version 1 message */
1235 if ((m2
->version
== SYNC_PROTO_VER
) && (m2
->reserved
== 0)
1236 && (m2
->spare
== 0)) {
1238 msg_end
= buffer
+ sizeof(struct ip_vs_sync_mesg
);
1239 nr_conns
= m2
->nr_conns
;
1241 for (i
=0; i
<nr_conns
; i
++) {
1242 union ip_vs_sync_conn
*s
;
1247 if (p
+ sizeof(s
->v4
) > buffer
+buflen
) {
1248 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1251 s
= (union ip_vs_sync_conn
*)p
;
1252 size
= ntohs(s
->v4
.ver_size
) & SVER_MASK
;
1254 /* Basic sanity checks */
1255 if (msg_end
> buffer
+buflen
) {
1256 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1259 if (ntohs(s
->v4
.ver_size
) >> SVER_SHIFT
) {
1260 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1261 ntohs(s
->v4
.ver_size
) >> SVER_SHIFT
);
1264 /* Process a single sync_conn */
1265 retc
= ip_vs_proc_sync_conn(ipvs
, p
, msg_end
);
1267 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1271 /* Make sure we have 32 bit alignment */
1272 msg_end
= p
+ ((size
+ 3) & ~3);
1275 /* Old type of message */
1276 ip_vs_process_message_v0(ipvs
, buffer
, buflen
);
1283 * Setup sndbuf (mode=1) or rcvbuf (mode=0)
/*
 * set_sock_size - set the send or receive buffer size on a kernel socket
 * by writing the sock fields directly instead of calling setsockopt.
 * @sk:   socket to adjust
 * @mode: per the comment above, 1 selects sndbuf and 0 selects rcvbuf
 * @val:  requested size
 *
 * @val is clamped with a lower bound of (SOCK_MIN_SNDBUF + 1) / 2 or
 * (SOCK_MIN_RCVBUF + 1) / 2, stored doubled in sk_sndbuf or sk_rcvbuf, and
 * the matching SOCK_SNDBUF_LOCK or SOCK_RCVBUF_LOCK bit is set in
 * sk_userlocks.
 *
 * NOTE(review): the test on @mode, the upper clamp bounds and any socket
 * locking are on lines not present in this extract - confirm.
 */
1285 static void set_sock_size(struct sock
*sk
, int mode
, int val
)
1287 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
1288 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
/* Send buffer case. */
1291 val
= clamp_t(int, val
, (SOCK_MIN_SNDBUF
+ 1) / 2,
1293 sk
->sk_sndbuf
= val
* 2;
1294 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
/* Receive buffer case. */
1296 val
= clamp_t(int, val
, (SOCK_MIN_RCVBUF
+ 1) / 2,
1298 sk
->sk_rcvbuf
= val
* 2;
1299 sk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
1305 * Setup loopback of outgoing multicasts on a sending socket
/*
 * set_mcast_loop - enable or disable loopback of outgoing multicasts.
 * @sk:   sending socket
 * @loop: any non-zero value enables loopback; the stored value is 0 or 1
 *
 * Writes inet->mc_loop and, when CONFIG_IP_VS_IPV6 is set and the socket
 * family is AF_INET6, also the mc_loop field of the ipv6_pinfo.
 */
1307 static void set_mcast_loop(struct sock
*sk
, u_char loop
)
1309 struct inet_sock
*inet
= inet_sk(sk
);
1311 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
1313 inet
->mc_loop
= loop
? 1 : 0;
1314 #ifdef CONFIG_IP_VS_IPV6
1315 if (sk
->sk_family
== AF_INET6
) {
1316 struct ipv6_pinfo
*np
= inet6_sk(sk
);
1318 /* IPV6_MULTICAST_LOOP */
1319 np
->mc_loop
= loop
? 1 : 0;
1326 * Specify TTL for outgoing multicasts on a sending socket
/*
 * set_mcast_ttl - set the TTL / hop limit for outgoing multicasts.
 * @sk:  sending socket
 * @ttl: value to apply
 *
 * For AF_INET6 sockets (CONFIG_IP_VS_IPV6 only) @ttl is stored in the
 * mcast_hops field of the ipv6_pinfo.
 *
 * NOTE(review): the statement that applies @ttl to the inet_sock is not
 * present in this extract; only the IPv6 assignment is visible.
 */
1328 static void set_mcast_ttl(struct sock
*sk
, u_char ttl
)
1330 struct inet_sock
*inet
= inet_sk(sk
);
1332 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
1335 #ifdef CONFIG_IP_VS_IPV6
1336 if (sk
->sk_family
== AF_INET6
) {
1337 struct ipv6_pinfo
*np
= inet6_sk(sk
);
1339 /* IPV6_MULTICAST_HOPS */
1340 np
->mcast_hops
= ttl
;
1346 /* Control fragmentation of messages */
/*
 * set_mcast_pmtudisc - set the path MTU discovery mode of a socket.
 * @sk:  sending socket
 * @val: discovery mode; the visible caller passes IP_PMTUDISC_DONT
 *
 * Stores @val in inet->pmtudisc.
 *
 * NOTE(review): for AF_INET6 only the ipv6_pinfo lookup is visible here;
 * the IPv6 assignment itself is on a line missing from this extract.
 */
1347 static void set_mcast_pmtudisc(struct sock
*sk
, int val
)
1349 struct inet_sock
*inet
= inet_sk(sk
);
1351 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
1353 inet
->pmtudisc
= val
;
1354 #ifdef CONFIG_IP_VS_IPV6
1355 if (sk
->sk_family
== AF_INET6
) {
1356 struct ipv6_pinfo
*np
= inet6_sk(sk
);
1358 /* IPV6_MTU_DISCOVER */
1366 * Specify default interface for outgoing multicasts
/*
 * set_mcast_if - select the outgoing multicast interface of a socket.
 * @sk:  sending socket
 * @dev: device whose ifindex becomes the multicast interface
 *
 * Stores dev->ifindex in inet->mc_index and, for AF_INET6 sockets when
 * CONFIG_IP_VS_IPV6 is set, in the mcast_oif field of the ipv6_pinfo.
 *
 * A socket already bound to a different device (sk_bound_dev_if set and
 * not equal to dev->ifindex) is detected first.
 * NOTE(review): the statement taken in that case and the return values
 * are not present in this extract - presumably an error return; confirm.
 */
1368 static int set_mcast_if(struct sock
*sk
, struct net_device
*dev
)
1370 struct inet_sock
*inet
= inet_sk(sk
);
1372 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1376 inet
->mc_index
= dev
->ifindex
;
1377 /* inet->mc_addr = 0; */
1378 #ifdef CONFIG_IP_VS_IPV6
1379 if (sk
->sk_family
== AF_INET6
) {
1380 struct ipv6_pinfo
*np
= inet6_sk(sk
);
1382 /* IPV6_MULTICAST_IF */
1383 np
->mcast_oif
= dev
->ifindex
;
1393 * Join a multicast group.
1394 * The group is specified by a class D multicast address (224.0.0.0/4)
1395 * in the in_addr structure passed in as a parameter.
/*
 * join_mcast_group - join an IPv4 multicast group on a given device.
 * @sk:   receiving socket
 * @addr: group address, copied into imr_multiaddr
 * @dev:  device whose ifindex is used as imr_ifindex
 *
 * Builds a zeroed struct ip_mreqn, fills in the group and interface index
 * and passes it to ip_mc_join_group().
 *
 * NOTE(review): the return type line, the declaration of ret, the action
 * taken when the socket is bound to a different device, and any locking
 * around the join are not present in this extract - confirm.
 */
1398 join_mcast_group(struct sock
*sk
, struct in_addr
*addr
, struct net_device
*dev
)
1400 struct ip_mreqn mreq
;
1403 memset(&mreq
, 0, sizeof(mreq
));
1404 memcpy(&mreq
.imr_multiaddr
, addr
, sizeof(struct in_addr
));
/* Detect a socket that is already bound to some other device. */
1406 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1409 mreq
.imr_ifindex
= dev
->ifindex
;
1412 ret
= ip_mc_join_group(sk
, &mreq
);
1418 #ifdef CONFIG_IP_VS_IPV6
/*
 * join_mcast_group6 - join an IPv6 multicast group on a given device.
 * @sk:   receiving socket
 * @addr: IPv6 group address
 * @dev:  device whose ifindex is passed to the join call
 *
 * Only built when CONFIG_IP_VS_IPV6 is set.  Calls ipv6_sock_mc_join()
 * with dev->ifindex and @addr.
 *
 * NOTE(review): the declaration of ret, the action taken when the socket
 * is bound to a different device, the return statement and the closing
 * of the ifdef are not present in this extract - confirm.
 */
1419 static int join_mcast_group6(struct sock
*sk
, struct in6_addr
*addr
,
1420 struct net_device
*dev
)
/* Detect a socket that is already bound to some other device. */
1424 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1428 ret
= ipv6_sock_mc_join(sk
, dev
->ifindex
, addr
);
/*
 * bind_mcastif_addr - bind a socket to an IPv4 address of the multicast
 * interface.
 * @sock: socket to bind
 * @dev:  multicast device; the address comes from
 *        inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE)
 *
 * Returns whatever the bind operation of the socket returns.
 *
 * NOTE(review): the declaration of addr, the condition guarding the
 * pr_err below, the arguments of the debug print and the sin_port
 * assignment are not present in this extract - confirm.
 */
1435 static int bind_mcastif_addr(struct socket
*sock
, struct net_device
*dev
)
1438 struct sockaddr_in sin
;
1440 addr
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
1442 pr_err("You probably need to specify IP address on "
1443 "multicast interface.\n");
1445 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1448 /* Now bind the socket with the address of multicast interface */
1449 sin
.sin_family
= AF_INET
;
1450 sin
.sin_addr
.s_addr
= addr
;
1453 return sock
->ops
->bind(sock
, (struct sockaddr
*)&sin
, sizeof(sin
));
/*
 * get_mcast_sockaddr - build the multicast socket address for one sync
 * thread.
 * @sa:    output address union; the in6 or in member is filled
 * @salen: output; set to the size of the member that was filled
 * @c:     daemon configuration supplying mcast_af, mcast_port, mcast_group
 * @id:    thread index, added to c->mcast_port so each thread gets its own
 *         port
 *
 * When c->mcast_af is AF_INET6 the in6 member is filled.  The in member is
 * filled by the second group of statements.
 * NOTE(review): the line joining the two cases is not present in this
 * extract - presumably an else branch; confirm.
 */
1456 static void get_mcast_sockaddr(union ipvs_sockaddr
*sa
, int *salen
,
1457 struct ipvs_sync_daemon_cfg
*c
, int id
)
1459 if (AF_INET6
== c
->mcast_af
) {
1460 sa
->in6
= (struct sockaddr_in6
) {
1461 .sin6_family
= AF_INET6
,
1462 .sin6_port
= htons(c
->mcast_port
+ id
),
1464 sa
->in6
.sin6_addr
= c
->mcast_group
.in6
;
1465 *salen
= sizeof(sa
->in6
);
1467 sa
->in
= (struct sockaddr_in
) {
1468 .sin_family
= AF_INET
,
1469 .sin_port
= htons(c
->mcast_port
+ id
),
1471 sa
->in
.sin_addr
= c
->mcast_group
.in
;
1472 *salen
= sizeof(sa
->in
);
1477 * Set up sending multicast socket over UDP
/*
 * make_send_sock - create and configure the UDP socket a master sync
 * thread sends on.
 * @ipvs:     per-netns IPVS state; mcfg supplies family, TTL and group
 * @id:       thread index, forwarded to get_mcast_sockaddr()
 * @dev:      multicast device
 * @sock_ret: output socket pointer
 *
 * Visible steps: sock_create_kern() for SOCK_DGRAM/IPPROTO_UDP in
 * mcfg.mcast_af, set_mcast_if(), loopback off, TTL from mcfg.mcast_ttl,
 * PMTU discovery IP_PMTUDISC_DONT, send buffer size from
 * sysctl_sync_sock_size(), bind_mcastif_addr() for AF_INET, then connect
 * to the address built by get_mcast_sockaddr().
 *
 * NOTE(review): the declarations of result and salen, the error tests
 * that guard each pr_err, the store through @sock_ret, the error cleanup
 * and the return statements are not present in this extract - confirm.
 */
1479 static int make_send_sock(struct netns_ipvs
*ipvs
, int id
,
1480 struct net_device
*dev
, struct socket
**sock_ret
)
1482 /* multicast addr */
1483 union ipvs_sockaddr mcast_addr
;
1484 struct socket
*sock
;
1487 /* First create a socket */
1488 result
= sock_create_kern(ipvs
->net
, ipvs
->mcfg
.mcast_af
, SOCK_DGRAM
,
1489 IPPROTO_UDP
, &sock
);
1491 pr_err("Error during creation of socket; terminating\n");
1495 result
= set_mcast_if(sock
->sk
, dev
);
1497 pr_err("Error setting outbound mcast interface\n");
/* Do not loop our own sync messages back to this host. */
1501 set_mcast_loop(sock
->sk
, 0);
1502 set_mcast_ttl(sock
->sk
, ipvs
->mcfg
.mcast_ttl
);
1503 /* Allow fragmentation if MTU changes */
1504 set_mcast_pmtudisc(sock
->sk
, IP_PMTUDISC_DONT
);
/* result is reused here to carry the configured socket buffer size. */
1505 result
= sysctl_sync_sock_size(ipvs
);
1507 set_sock_size(sock
->sk
, 1, result
);
/* Binding to an interface address is only done for IPv4 here. */
1509 if (AF_INET
== ipvs
->mcfg
.mcast_af
)
1510 result
= bind_mcastif_addr(sock
, dev
);
1514 pr_err("Error binding address of the mcast interface\n");
1518 get_mcast_sockaddr(&mcast_addr
, &salen
, &ipvs
->mcfg
, id
);
1519 result
= sock
->ops
->connect(sock
, (struct sockaddr
*) &mcast_addr
,
1522 pr_err("Error connecting to the multicast addr\n");
1534 * Set up receiving multicast socket over UDP
/*
 * make_receive_sock - create and configure the UDP socket a backup sync
 * thread receives on.
 * @ipvs:     per-netns IPVS state; bcfg supplies family and group
 * @id:       thread index, forwarded to get_mcast_sockaddr()
 * @dev:      multicast device the socket is bound to
 * @sock_ret: output socket pointer
 *
 * Visible steps: sock_create_kern() for SOCK_DGRAM/IPPROTO_UDP in
 * bcfg.mcast_af, sk_reuse set to SK_CAN_REUSE, receive buffer size from
 * sysctl_sync_sock_size(), sk_bound_dev_if set to dev->ifindex, bind to
 * the multicast address, then join the group with join_mcast_group6() or
 * join_mcast_group() depending on the family.
 *
 * NOTE(review): the declarations of result and salen, the error tests
 * that guard each pr_err, the trailing arguments of the join calls, the
 * store through @sock_ret, the error cleanup and the return statements
 * are not present in this extract - confirm.
 */
1536 static int make_receive_sock(struct netns_ipvs
*ipvs
, int id
,
1537 struct net_device
*dev
, struct socket
**sock_ret
)
1539 /* multicast addr */
1540 union ipvs_sockaddr mcast_addr
;
1541 struct socket
*sock
;
1544 /* First create a socket */
1545 result
= sock_create_kern(ipvs
->net
, ipvs
->bcfg
.mcast_af
, SOCK_DGRAM
,
1546 IPPROTO_UDP
, &sock
);
1548 pr_err("Error during creation of socket; terminating\n");
1552 /* it is equivalent to the REUSEADDR option in user-space */
1553 sock
->sk
->sk_reuse
= SK_CAN_REUSE
;
/* result is reused here to carry the configured socket buffer size. */
1554 result
= sysctl_sync_sock_size(ipvs
);
1556 set_sock_size(sock
->sk
, 0, result
);
1558 get_mcast_sockaddr(&mcast_addr
, &salen
, &ipvs
->bcfg
, id
);
/* The device binding is set before the address bind below. */
1559 sock
->sk
->sk_bound_dev_if
= dev
->ifindex
;
1560 result
= sock
->ops
->bind(sock
, (struct sockaddr
*)&mcast_addr
, salen
);
1562 pr_err("Error binding to the multicast addr\n");
1566 /* join the multicast group */
1567 #ifdef CONFIG_IP_VS_IPV6
1568 if (ipvs
->bcfg
.mcast_af
== AF_INET6
)
1569 result
= join_mcast_group6(sock
->sk
, &mcast_addr
.in6
.sin6_addr
,
1573 result
= join_mcast_group(sock
->sk
, &mcast_addr
.in
.sin_addr
,
1576 pr_err("Error joining to the multicast group\n");
/*
 * ip_vs_send_async - send one buffer on a socket without blocking.
 * @sock:   socket to send on
 * @buffer: data to send
 * @length: number of bytes in @buffer
 *
 * Uses kernel_sendmsg() with a single kvec and the flags
 * MSG_DONTWAIT | MSG_NOSIGNAL; len receives the result of that call.
 *
 * NOTE(review): the return type line, the declarations of iov and len,
 * and the return statement are not present in this extract - confirm.
 */
1588 ip_vs_send_async(struct socket
*sock
, const char *buffer
, const size_t length
)
1590 struct msghdr msg
= {.msg_flags
= MSG_DONTWAIT
|MSG_NOSIGNAL
};
/* The const qualifier is cast away only to fit the iov_base field. */
1595 iov
.iov_base
= (void *)buffer
;
1596 iov
.iov_len
= length
;
1598 len
= kernel_sendmsg(sock
, &msg
, &iov
, 1, (size_t)(length
));
/*
 * ip_vs_send_sync_msg - send one complete sync message.
 * @sock: socket to send on
 * @msg:  message to send; its size field is in network byte order and
 *        gives the number of bytes sent
 *
 * Calls ip_vs_send_async().  A non-negative result and -EAGAIN are handled
 * separately from other errors, which are logged with pr_err.
 *
 * NOTE(review): the return type line, the declarations of msize and ret,
 * and the return statements are not present in this extract - confirm.
 */
1605 ip_vs_send_sync_msg(struct socket
*sock
, struct ip_vs_sync_mesg
*msg
)
1610 msize
= ntohs(msg
->size
);
1612 ret
= ip_vs_send_async(sock
, (char *)msg
, msize
);
/* Success and would-block are not logged as errors. */
1613 if (ret
>= 0 || ret
== -EAGAIN
)
1615 pr_err("ip_vs_send_async error %d\n", ret
);
/*
 * ip_vs_receive - receive one datagram from a socket without blocking.
 * @sock:   socket to read from
 * @buffer: destination buffer
 * @buflen: capacity of @buffer in bytes
 *
 * Uses kernel_recvmsg() with a single kvec and MSG_DONTWAIT; len receives
 * the result of that call.
 *
 * NOTE(review): the return type line, the declarations of iov and len,
 * and the return statement are not present in this extract - confirm.
 */
1620 ip_vs_receive(struct socket
*sock
, char *buffer
, const size_t buflen
)
1622 struct msghdr msg
= {NULL
,};
1628 /* Receive a packet */
1629 iov
.iov_base
= buffer
;
1630 iov
.iov_len
= (size_t)buflen
;
1632 len
= kernel_recvmsg(sock
, &msg
, &iov
, 1, buflen
, MSG_DONTWAIT
);
1641 /* Wakeup the master thread for sending */
/*
 * master_wakeup_work_handler - delayed-work callback that wakes a master
 * sync thread.
 * @work: the work member embedded in master_wakeup_work of a
 *        struct ipvs_master_sync_state
 *
 * Under ipvs->sync_lock (bottom halves disabled): when the queue is not
 * empty and sync_queue_delay is still below IPVS_SYNC_WAKEUP_RATE, the
 * delay is set to IPVS_SYNC_WAKEUP_RATE and the task of the matching
 * master thread is woken.
 */
1642 static void master_wakeup_work_handler(struct work_struct
*work
)
1644 struct ipvs_master_sync_state
*ms
=
1645 container_of(work
, struct ipvs_master_sync_state
,
1646 master_wakeup_work
.work
);
1647 struct netns_ipvs
*ipvs
= ms
->ipvs
;
1649 spin_lock_bh(&ipvs
->sync_lock
);
1650 if (ms
->sync_queue_len
&&
1651 ms
->sync_queue_delay
< IPVS_SYNC_WAKEUP_RATE
) {
/* The thread index is the offset of ms inside the ipvs->ms array. */
1652 int id
= (int)(ms
- ipvs
->ms
);
1654 ms
->sync_queue_delay
= IPVS_SYNC_WAKEUP_RATE
;
1655 wake_up_process(ipvs
->master_tinfo
[id
].task
);
1657 spin_unlock_bh(&ipvs
->sync_lock
);
1660 /* Get next buffer to send */
/*
 * next_sync_buff - pick the next sync buffer for a master thread to send.
 * @ipvs: per-netns IPVS state
 * @ms:   state of the master thread asking for a buffer
 *
 * Tries sb_dequeue() first; the final statement returns the result of
 * get_curr_sync_buff() with IPVS_SYNC_FLUSH_TIME as its last argument.
 *
 * NOTE(review): the test on the dequeued buffer and its early return are
 * not present in this extract - presumably a non-NULL dequeue result is
 * returned directly; confirm.
 */
1661 static inline struct ip_vs_sync_buff
*
1662 next_sync_buff(struct netns_ipvs
*ipvs
, struct ipvs_master_sync_state
*ms
)
1664 struct ip_vs_sync_buff
*sb
;
1666 sb
= sb_dequeue(ipvs
, ms
);
1669 /* Do not delay entries in buffer for more than 2 seconds */
1670 return get_curr_sync_buff(ipvs
, ms
, IPVS_SYNC_FLUSH_TIME
);
/*
 * sync_thread_master - kthread body of one master sync thread.
 * @data: struct ip_vs_sync_thread_data for this thread (ipvs, id, sock)
 *
 * Visible behaviour: logs the start, fetches buffers with
 * next_sync_buff(), sleeps with schedule_timeout(IPVS_SYNC_CHECK_PERIOD),
 * and sends each buffer with ip_vs_send_sync_msg(), waiting for the socket
 * to become writeable or for a stop request while the send keeps failing.
 * Buffers are released with ip_vs_sync_buff_release().  At the end the
 * remaining queue and the current buffer are drained and released.
 *
 * NOTE(review): the enclosing loop, the conditions choosing between
 * sleeping and sending, the jump taken on a stop request and the return
 * statement are not present in this extract - confirm.
 */
1673 static int sync_thread_master(void *data
)
1675 struct ip_vs_sync_thread_data
*tinfo
= data
;
1676 struct netns_ipvs
*ipvs
= tinfo
->ipvs
;
/* Each thread owns the slot of ipvs->ms selected by its id. */
1677 struct ipvs_master_sync_state
*ms
= &ipvs
->ms
[tinfo
->id
];
1678 struct sock
*sk
= tinfo
->sock
->sk
;
1679 struct ip_vs_sync_buff
*sb
;
1681 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
1682 "syncid = %d, id = %d\n",
1683 ipvs
->mcfg
.mcast_ifn
, ipvs
->mcfg
.syncid
, tinfo
->id
);
1686 sb
= next_sync_buff(ipvs
, ms
);
1687 if (unlikely(kthread_should_stop()))
1690 schedule_timeout(IPVS_SYNC_CHECK_PERIOD
);
/* A negative send result is retried once the socket is writeable again. */
1693 while (ip_vs_send_sync_msg(tinfo
->sock
, sb
->mesg
) < 0) {
1694 /* (Ab)use interruptible sleep to avoid increasing
1697 __wait_event_interruptible(*sk_sleep(sk
),
1698 sock_writeable(sk
) ||
1699 kthread_should_stop());
1700 if (unlikely(kthread_should_stop()))
1703 ip_vs_sync_buff_release(sb
);
1707 __set_current_state(TASK_RUNNING
);
1709 ip_vs_sync_buff_release(sb
);
1711 /* clean up the sync_buff queue */
1712 while ((sb
= sb_dequeue(ipvs
, ms
)))
1713 ip_vs_sync_buff_release(sb
);
1714 __set_current_state(TASK_RUNNING
);
1716 /* clean up the current sync_buff */
1717 sb
= get_curr_sync_buff(ipvs
, ms
, 0);
1719 ip_vs_sync_buff_release(sb
);
/*
 * sync_thread_backup - kthread body of one backup sync thread.
 * @data: struct ip_vs_sync_thread_data for this thread (ipvs, id, sock,
 *        buf)
 *
 * Until kthread_should_stop() is true: sleeps until the receive queue of
 * the socket is non-empty or a stop is requested, then drains the queue.
 * Each datagram is read into tinfo->buf with ip_vs_receive(), limited to
 * bcfg.sync_maxlen bytes, and passed to ip_vs_process_message().
 *
 * NOTE(review): the declaration of len, the test that guards the
 * receive error print, any bottom-half protection around message
 * processing and the return statement are not present in this extract -
 * confirm.
 */
1725 static int sync_thread_backup(void *data
)
1727 struct ip_vs_sync_thread_data
*tinfo
= data
;
1728 struct netns_ipvs
*ipvs
= tinfo
->ipvs
;
1731 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1732 "syncid = %d, id = %d\n",
1733 ipvs
->bcfg
.mcast_ifn
, ipvs
->bcfg
.syncid
, tinfo
->id
);
1735 while (!kthread_should_stop()) {
1736 wait_event_interruptible(*sk_sleep(tinfo
->sock
->sk
),
1737 !skb_queue_empty(&tinfo
->sock
->sk
->sk_receive_queue
)
1738 || kthread_should_stop());
1740 /* do we have data now? */
1741 while (!skb_queue_empty(&(tinfo
->sock
->sk
->sk_receive_queue
))) {
1742 len
= ip_vs_receive(tinfo
->sock
, tinfo
->buf
,
1743 ipvs
->bcfg
.sync_maxlen
);
1746 pr_err("receiving message error\n");
1750 ip_vs_process_message(ipvs
, tinfo
->buf
, len
);
/*
 * start_sync_thread - start the master or backup sync daemon threads for
 * one network namespace.
 * @ipvs: per-netns IPVS state
 * @c:    daemon configuration; defaults are filled in here for the
 *        multicast family/group, port and sync_maxlen
 *
 * Visible steps:
 *  - takes ipvs->sync_mutex, using mutex_trylock first;
 *  - when no daemon is running yet (sync_state is 0) the thread count is
 *    sysctl_sync_ports() clamped to 1..IPVS_SYNC_PORTS_MAX and
 *    threads_mask is set to count - 1; the other visible assignment
 *    derives count from the existing threads_mask;
 *  - resolves the device named c->mcast_ifn with __dev_get_by_name();
 *  - derives sync_maxlen from an mtu value minus the IP + UDP header size;
 *  - for the master allocates and initialises ipvs->ms, one entry per
 *    thread;
 *  - allocates the ti array, and per thread allocates a receive buffer
 *    (backup only), creates the socket and starts the kthread;
 *  - on success publishes ti as master_tinfo or backup_tinfo, ors state
 *    into sync_state under sync_buff_lock, drops the mutex and bumps the
 *    module use count;
 *  - the trailing part stops already started threads and releases their
 *    sockets.
 *
 * NOTE(review): the rest of the parameter list (state is used below but
 * its declaration is not visible), several local declarations, most
 * error tests, labels, jumps, rtnl handling and the return statements
 * are not present in this extract - confirm against the full file.
 */
1758 int start_sync_thread(struct netns_ipvs
*ipvs
, struct ipvs_sync_daemon_cfg
*c
,
1761 struct ip_vs_sync_thread_data
*ti
= NULL
, *tinfo
;
1762 struct task_struct
*task
;
1763 struct net_device
*dev
;
1765 int (*threadfn
)(void *data
);
1766 int id
= 0, count
, hlen
;
1767 int result
= -ENOMEM
;
1770 IP_VS_DBG(7, "%s(): pid %d\n", __func__
, task_pid_nr(current
));
1771 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n",
1772 sizeof(struct ip_vs_sync_conn_v0
));
1774 /* Do not hold one mutex and then to block on another */
1777 if (mutex_trylock(&ipvs
->sync_mutex
))
1780 mutex_lock(&ipvs
->sync_mutex
);
1783 mutex_unlock(&ipvs
->sync_mutex
);
/* First daemon in this netns decides how many threads are used. */
1786 if (!ipvs
->sync_state
) {
1787 count
= clamp(sysctl_sync_ports(ipvs
), 1, IPVS_SYNC_PORTS_MAX
);
1788 ipvs
->threads_mask
= count
- 1;
1790 count
= ipvs
->threads_mask
+ 1;
/* Fill in defaults for an unspecified multicast family and group. */
1792 if (c
->mcast_af
== AF_UNSPEC
) {
1793 c
->mcast_af
= AF_INET
;
1794 c
->mcast_group
.ip
= cpu_to_be32(IP_VS_SYNC_GROUP
);
1797 c
->mcast_port
= IP_VS_SYNC_PORT
;
1801 dev
= __dev_get_by_name(ipvs
->net
, c
->mcast_ifn
);
1803 pr_err("Unknown mcast interface: %s\n", c
->mcast_ifn
);
/* hlen is the network plus UDP header size for the chosen family. */
1807 hlen
= (AF_INET6
== c
->mcast_af
) ?
1808 sizeof(struct ipv6hdr
) + sizeof(struct udphdr
) :
1809 sizeof(struct iphdr
) + sizeof(struct udphdr
);
/* Backup uses the device mtu clamped to 1500..65535; master uses 1500. */
1810 mtu
= (state
== IP_VS_STATE_BACKUP
) ?
1811 clamp(dev
->mtu
, 1500U, 65535U) : 1500U;
1812 min_mtu
= (state
== IP_VS_STATE_BACKUP
) ? 1024 : 1;
1815 c
->sync_maxlen
= clamp_t(unsigned int,
1816 c
->sync_maxlen
, min_mtu
,
1819 c
->sync_maxlen
= mtu
- hlen
;
/* Pick the thread name format and thread function for the role. */
1821 if (state
== IP_VS_STATE_MASTER
) {
1827 name
= "ipvs-m:%d:%d";
1828 threadfn
= sync_thread_master
;
1829 } else if (state
== IP_VS_STATE_BACKUP
) {
1831 if (ipvs
->backup_tinfo
)
1835 name
= "ipvs-b:%d:%d";
1836 threadfn
= sync_thread_backup
;
1842 if (state
== IP_VS_STATE_MASTER
) {
1843 struct ipvs_master_sync_state
*ms
;
/* One zeroed master state entry per thread. */
1846 ipvs
->ms
= kcalloc(count
, sizeof(ipvs
->ms
[0]), GFP_KERNEL
);
1850 for (id
= 0; id
< count
; id
++, ms
++) {
1851 INIT_LIST_HEAD(&ms
->sync_queue
);
1852 ms
->sync_queue_len
= 0;
1853 ms
->sync_queue_delay
= 0;
1854 INIT_DELAYED_WORK(&ms
->master_wakeup_work
,
1855 master_wakeup_work_handler
);
1860 ti
= kcalloc(count
, sizeof(struct ip_vs_sync_thread_data
),
1865 for (id
= 0; id
< count
; id
++) {
/* Only backup threads need a receive buffer of sync_maxlen bytes. */
1868 if (state
== IP_VS_STATE_BACKUP
) {
1870 tinfo
->buf
= kmalloc(ipvs
->bcfg
.sync_maxlen
,
1876 if (state
== IP_VS_STATE_MASTER
)
1877 result
= make_send_sock(ipvs
, id
, dev
, &tinfo
->sock
);
1879 result
= make_receive_sock(ipvs
, id
, dev
, &tinfo
->sock
);
/* The thread name is built from ipvs->gen and the thread id. */
1883 task
= kthread_run(threadfn
, tinfo
, name
, ipvs
->gen
, id
);
1885 result
= PTR_ERR(task
);
1891 /* mark as active */
1893 if (state
== IP_VS_STATE_MASTER
)
1894 ipvs
->master_tinfo
= ti
;
1896 ipvs
->backup_tinfo
= ti
;
1897 spin_lock_bh(&ipvs
->sync_buff_lock
);
1898 ipvs
->sync_state
|= state
;
1899 spin_unlock_bh(&ipvs
->sync_buff_lock
);
1901 mutex_unlock(&ipvs
->sync_mutex
);
1904 /* increase the module use count */
1905 ip_vs_use_count_inc();
1910 /* We do not need RTNL lock anymore, release it here so that
1911 * sock_release below can use rtnl_lock to leave the mcast group.
1914 id
= min(id
, count
- 1);
1916 for (tinfo
= ti
+ id
; tinfo
>= ti
; tinfo
--) {
1918 kthread_stop(tinfo
->task
);
1921 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
)) {
1925 mutex_unlock(&ipvs
->sync_mutex
);
1927 /* No more mutexes, release socks */
1929 for (tinfo
= ti
+ id
; tinfo
>= ti
; tinfo
--) {
1931 sock_release(tinfo
->sock
);
1939 mutex_unlock(&ipvs
->sync_mutex
);
/*
 * stop_sync_thread - stop the master or backup sync daemon threads of one
 * network namespace.
 * @ipvs:  per-netns IPVS state
 * @state: IP_VS_STATE_MASTER or IP_VS_STATE_BACKUP
 *
 * Visible steps, all under ipvs->sync_mutex:
 *  - master: clears IP_VS_STATE_MASTER from sync_state while holding
 *    sync_buff_lock and sync_lock, then for every thread id from
 *    threads_mask down to 0 cancels the delayed wakeup work and stops the
 *    kthread, and finally clears master_tinfo;
 *  - backup: clears IP_VS_STATE_BACKUP, stops every kthread and clears
 *    backup_tinfo.
 * After the mutex is dropped the sockets of all threads are released and
 * the module use count is decremented.
 *
 * NOTE(review): the local declarations, the checks that detect a daemon
 * that is not running, the per-thread tinfo lookup, the freeing of the
 * tinfo array and the return statements are not present in this extract;
 * a caller in this file treats -ESRCH as not an error - confirm the rest
 * against the full file.
 */
1945 int stop_sync_thread(struct netns_ipvs
*ipvs
, int state
)
1947 struct ip_vs_sync_thread_data
*ti
, *tinfo
;
1951 IP_VS_DBG(7, "%s(): pid %d\n", __func__
, task_pid_nr(current
));
1953 mutex_lock(&ipvs
->sync_mutex
);
1954 if (state
== IP_VS_STATE_MASTER
) {
1958 ti
= ipvs
->master_tinfo
;
1961 * The lock synchronizes with sb_queue_tail(), so that we don't
1962 * add sync buffers to the queue, when we are already in
1963 * progress of stopping the master sync daemon.
1966 spin_lock_bh(&ipvs
->sync_buff_lock
);
1967 spin_lock(&ipvs
->sync_lock
);
1968 ipvs
->sync_state
&= ~IP_VS_STATE_MASTER
;
1969 spin_unlock(&ipvs
->sync_lock
);
1970 spin_unlock_bh(&ipvs
->sync_buff_lock
);
/* Threads are stopped from the highest id down to 0. */
1973 for (id
= ipvs
->threads_mask
; id
>= 0; id
--) {
1974 struct ipvs_master_sync_state
*ms
= &ipvs
->ms
[id
];
1978 pr_info("stopping master sync thread %d ...\n",
1979 task_pid_nr(tinfo
->task
));
/* The wakeup work is cancelled before its target thread is stopped. */
1980 cancel_delayed_work_sync(&ms
->master_wakeup_work
);
1981 ret
= kthread_stop(tinfo
->task
);
1987 ipvs
->master_tinfo
= NULL
;
1988 } else if (state
== IP_VS_STATE_BACKUP
) {
1990 if (!ipvs
->backup_tinfo
)
1992 ti
= ipvs
->backup_tinfo
;
1994 ipvs
->sync_state
&= ~IP_VS_STATE_BACKUP
;
1996 for (id
= ipvs
->threads_mask
; id
>= 0; id
--) {
2000 pr_info("stopping backup sync thread %d ...\n",
2001 task_pid_nr(tinfo
->task
));
2002 ret
= kthread_stop(tinfo
->task
);
2006 ipvs
->backup_tinfo
= NULL
;
/* threads_mask is read while the mutex is still held. */
2010 id
= ipvs
->threads_mask
;
2011 mutex_unlock(&ipvs
->sync_mutex
);
2013 /* No more mutexes, release socks */
2014 for (tinfo
= ti
+ id
; tinfo
>= ti
; tinfo
--) {
2016 sock_release(tinfo
->sock
);
2021 /* decrease the module use count */
2022 ip_vs_use_count_dec();
2026 mutex_unlock(&ipvs
->sync_mutex
);
2031 * Initialize data struct for each netns
/*
 * ip_vs_sync_net_init - initialise the sync locks of one network
 * namespace.
 * @ipvs: per-netns IPVS state
 *
 * Initialises sync_mutex (with the lock class key __ipvs_sync_key) and the
 * sync_lock and sync_buff_lock spinlocks.
 *
 * NOTE(review): the return statement is not present in this extract.
 */
2033 int __net_init
ip_vs_sync_net_init(struct netns_ipvs
*ipvs
)
2035 __mutex_init(&ipvs
->sync_mutex
, "ipvs->sync_mutex", &__ipvs_sync_key
);
2036 spin_lock_init(&ipvs
->sync_lock
);
2037 spin_lock_init(&ipvs
->sync_buff_lock
);
/*
 * ip_vs_sync_net_cleanup - stop both sync daemons of one network
 * namespace.
 * @ipvs: per-netns IPVS state
 *
 * Calls stop_sync_thread() for the master and then for the backup.  A
 * non-zero result other than -ESRCH is logged with pr_err; -ESRCH is
 * tolerated silently.
 *
 * NOTE(review): the declaration of retc is not present in this extract.
 */
2041 void ip_vs_sync_net_cleanup(struct netns_ipvs
*ipvs
)
2045 retc
= stop_sync_thread(ipvs
, IP_VS_STATE_MASTER
);
2046 if (retc
&& retc
!= -ESRCH
)
2047 pr_err("Failed to stop Master Daemon\n");
2049 retc
= stop_sync_thread(ipvs
, IP_VS_STATE_BACKUP
);
2050 if (retc
&& retc
!= -ESRCH
)
2051 pr_err("Failed to stop Backup Daemon\n");