2 * IPVS An implementation of the IP virtual server support for the
3 * LINUX operating system. IPVS is now implemented as a module
4 * over the NetFilter framework. IPVS can be used to build a
5 * high-performance and highly available server based on a
8 * Version 1, is capable of handling both version 0 and 1 messages.
9 * Version 0 is the plain old format.
10 * Note Version 0 receivers will just drop Ver 1 messages.
11 * Version 1 is capable of handling IPv6, Persistence data,
12 * time-outs, and firewall marks.
13 * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
14 * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
16 * Definitions Message: is a complete datagram
17 * Sync_conn: is a part of a Message
18 * Param Data is an option to a Sync_conn.
20 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
22 * ip_vs_sync: sync connection info from master load balancer to backups
26 * Alexandre Cassen : Added master & backup support at a time.
27 * Alexandre Cassen : Added SyncID support for incoming sync
29 * Justin Ossevoort : Fix endian problem on sync message size.
30 * Hans Schillstrom : Added Version 1: i.e. IPv6,
31 * Persistence support, fwmark and time-out.
34 #define KMSG_COMPONENT "IPVS"
35 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
37 #include <linux/module.h>
38 #include <linux/slab.h>
39 #include <linux/inetdevice.h>
40 #include <linux/net.h>
41 #include <linux/completion.h>
42 #include <linux/delay.h>
43 #include <linux/skbuff.h>
45 #include <linux/igmp.h> /* for ip_mc_join_group */
46 #include <linux/udp.h>
47 #include <linux/err.h>
48 #include <linux/kthread.h>
49 #include <linux/wait.h>
50 #include <linux/kernel.h>
52 #include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
57 #include <net/ip_vs.h>
59 #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
60 #define IP_VS_SYNC_PORT 8848 /* multicast port */
62 #define SYNC_PROTO_VER 1 /* Protocol version in header */
65 * IPVS sync connection entry
66 * Version 0, i.e. original version.
68 struct ip_vs_sync_conn_v0
{
71 /* Protocol, addresses and port numbers */
72 __u8 protocol
; /* Which protocol (TCP/UDP) */
76 __be32 caddr
; /* client address */
77 __be32 vaddr
; /* virtual address */
78 __be32 daddr
; /* destination address */
80 /* Flags and state transition */
81 __be16 flags
; /* status flags */
82 __be16 state
; /* state info */
84 /* The sequence options start here */
87 struct ip_vs_sync_conn_options
{
88 struct ip_vs_seq in_seq
; /* incoming seq. struct */
89 struct ip_vs_seq out_seq
; /* outgoing seq. struct */
93 Sync Connection format (sync_conn)
96 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
97 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
98 | Type | Protocol | Ver. | Size |
99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
103 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
108 | timeout (in sec.) |
109 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
111 | IP-Addresses (v4 or v6) |
113 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
115 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
116 | Param. Type | Param. Length | Param. data |
117 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
119 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
120 | | Param Type | Param. Length |
121 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
123 | Last Param data should be padded for 32 bit alignment |
124 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
128 * Type 0, IPv4 sync connection format
130 struct ip_vs_sync_v4
{
132 __u8 protocol
; /* Which protocol (TCP/UDP) */
133 __be16 ver_size
; /* Version msb 4 bits */
134 /* Flags and state transition */
135 __be32 flags
; /* status flags */
136 __be16 state
; /* state info */
137 /* Protocol, addresses and port numbers */
141 __be32 fwmark
; /* Firewall mark from skb */
142 __be32 timeout
; /* cp timeout */
143 __be32 caddr
; /* client address */
144 __be32 vaddr
; /* virtual address */
145 __be32 daddr
; /* destination address */
146 /* The sequence options start here */
147 /* PE data padded to 32bit alignment after seq. options */
150 * Type 2 messages IPv6
152 struct ip_vs_sync_v6
{
154 __u8 protocol
; /* Which protocol (TCP/UDP) */
155 __be16 ver_size
; /* Version msb 4 bits */
156 /* Flags and state transition */
157 __be32 flags
; /* status flags */
158 __be16 state
; /* state info */
159 /* Protocol, addresses and port numbers */
163 __be32 fwmark
; /* Firewall mark from skb */
164 __be32 timeout
; /* cp timeout */
165 struct in6_addr caddr
; /* client address */
166 struct in6_addr vaddr
; /* virtual address */
167 struct in6_addr daddr
; /* destination address */
168 /* The sequence options start here */
169 /* PE data padded to 32bit alignment after seq. options */
172 union ip_vs_sync_conn
{
173 struct ip_vs_sync_v4 v4
;
174 struct ip_vs_sync_v6 v6
;
177 /* Bits in Type field in above */
178 #define STYPE_INET6 0
179 #define STYPE_F_INET6 (1 << STYPE_INET6)
181 #define SVER_SHIFT 12 /* Shift to get version */
182 #define SVER_MASK 0x0fff /* Mask to strip version */
184 #define IPVS_OPT_SEQ_DATA 1
185 #define IPVS_OPT_PE_DATA 2
186 #define IPVS_OPT_PE_NAME 3
187 #define IPVS_OPT_PARAM 7
189 #define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
190 #define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
191 #define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
192 #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
194 struct ip_vs_sync_thread_data
{
200 /* Version 0 definition of packet sizes */
201 #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
202 #define FULL_CONN_SIZE \
203 (sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
207 The master multicasts messages (Datagrams) to the backup load balancers
208 in the following format.
211 Note, first byte should be Zero, so ver 0 receivers will drop the packet.
214 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
215 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
216 | 0 | SyncID | Size |
217 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
218 | Count Conns | Version | Reserved, set to Zero |
219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
221 | IPVS Sync Connection (1) |
222 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
226 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
228 | IPVS Sync Connection (n) |
229 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
233 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
234 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
235 | Count Conns | SyncID | Size |
236 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
237 | IPVS Sync Connection (1) |
240 #define SYNC_MESG_HEADER_LEN 4
241 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
243 /* Version 0 header */
244 struct ip_vs_sync_mesg_v0
{
249 /* ip_vs_sync_conn entries start here */
252 /* Version 1 header */
253 struct ip_vs_sync_mesg
{
254 __u8 reserved
; /* must be zero */
258 __s8 version
; /* SYNC_PROTO_VER */
260 /* ip_vs_sync_conn entries start here */
263 struct ip_vs_sync_buff
{
264 struct list_head list
;
265 unsigned long firstuse
;
267 /* pointers for the message data */
268 struct ip_vs_sync_mesg
*mesg
;
274 static struct sockaddr_in mcast_addr
= {
275 .sin_family
= AF_INET
,
276 .sin_port
= cpu_to_be16(IP_VS_SYNC_PORT
),
277 .sin_addr
.s_addr
= cpu_to_be32(IP_VS_SYNC_GROUP
),
281 * Copy of struct ip_vs_seq
282 * From unaligned network order to aligned host order
284 static void ntoh_seq(struct ip_vs_seq
*no
, struct ip_vs_seq
*ho
)
286 ho
->init_seq
= get_unaligned_be32(&no
->init_seq
);
287 ho
->delta
= get_unaligned_be32(&no
->delta
);
288 ho
->previous_delta
= get_unaligned_be32(&no
->previous_delta
);
292 * Copy of struct ip_vs_seq
293 * From Aligned host order to unaligned network order
295 static void hton_seq(struct ip_vs_seq
*ho
, struct ip_vs_seq
*no
)
297 put_unaligned_be32(ho
->init_seq
, &no
->init_seq
);
298 put_unaligned_be32(ho
->delta
, &no
->delta
);
299 put_unaligned_be32(ho
->previous_delta
, &no
->previous_delta
);
302 static inline struct ip_vs_sync_buff
*sb_dequeue(struct netns_ipvs
*ipvs
)
304 struct ip_vs_sync_buff
*sb
;
306 spin_lock_bh(&ipvs
->sync_lock
);
307 if (list_empty(&ipvs
->sync_queue
)) {
310 sb
= list_entry(ipvs
->sync_queue
.next
,
311 struct ip_vs_sync_buff
,
315 spin_unlock_bh(&ipvs
->sync_lock
);
321 * Create a new sync buffer for Version 1 proto.
323 static inline struct ip_vs_sync_buff
*
324 ip_vs_sync_buff_create(struct netns_ipvs
*ipvs
)
326 struct ip_vs_sync_buff
*sb
;
328 if (!(sb
=kmalloc(sizeof(struct ip_vs_sync_buff
), GFP_ATOMIC
)))
331 sb
->mesg
= kmalloc(ipvs
->send_mesg_maxlen
, GFP_ATOMIC
);
336 sb
->mesg
->reserved
= 0; /* old nr_conns i.e. must be zeo now */
337 sb
->mesg
->version
= SYNC_PROTO_VER
;
338 sb
->mesg
->syncid
= ipvs
->master_syncid
;
339 sb
->mesg
->size
= sizeof(struct ip_vs_sync_mesg
);
340 sb
->mesg
->nr_conns
= 0;
342 sb
->head
= (unsigned char *)sb
->mesg
+ sizeof(struct ip_vs_sync_mesg
);
343 sb
->end
= (unsigned char *)sb
->mesg
+ ipvs
->send_mesg_maxlen
;
345 sb
->firstuse
= jiffies
;
349 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff
*sb
)
355 static inline void sb_queue_tail(struct netns_ipvs
*ipvs
)
357 struct ip_vs_sync_buff
*sb
= ipvs
->sync_buff
;
359 spin_lock(&ipvs
->sync_lock
);
360 if (ipvs
->sync_state
& IP_VS_STATE_MASTER
)
361 list_add_tail(&sb
->list
, &ipvs
->sync_queue
);
363 ip_vs_sync_buff_release(sb
);
364 spin_unlock(&ipvs
->sync_lock
);
368 * Get the current sync buffer if it has been created for more
369 * than the specified time or the specified time is zero.
371 static inline struct ip_vs_sync_buff
*
372 get_curr_sync_buff(struct netns_ipvs
*ipvs
, unsigned long time
)
374 struct ip_vs_sync_buff
*sb
;
376 spin_lock_bh(&ipvs
->sync_buff_lock
);
377 if (ipvs
->sync_buff
&&
378 time_after_eq(jiffies
- ipvs
->sync_buff
->firstuse
, time
)) {
379 sb
= ipvs
->sync_buff
;
380 ipvs
->sync_buff
= NULL
;
383 spin_unlock_bh(&ipvs
->sync_buff_lock
);
388 * Switch mode from sending version 0 or 1
389 * - must handle sync_buf
391 void ip_vs_sync_switch_mode(struct net
*net
, int mode
)
393 struct netns_ipvs
*ipvs
= net_ipvs(net
);
395 if (!(ipvs
->sync_state
& IP_VS_STATE_MASTER
))
397 if (mode
== sysctl_sync_ver(ipvs
) || !ipvs
->sync_buff
)
400 spin_lock_bh(&ipvs
->sync_buff_lock
);
401 /* Buffer empty ? then let buf_create do the job */
402 if (ipvs
->sync_buff
->mesg
->size
<= sizeof(struct ip_vs_sync_mesg
)) {
403 kfree(ipvs
->sync_buff
);
404 ipvs
->sync_buff
= NULL
;
406 spin_lock_bh(&ipvs
->sync_lock
);
407 if (ipvs
->sync_state
& IP_VS_STATE_MASTER
)
408 list_add_tail(&ipvs
->sync_buff
->list
,
411 ip_vs_sync_buff_release(ipvs
->sync_buff
);
412 spin_unlock_bh(&ipvs
->sync_lock
);
414 spin_unlock_bh(&ipvs
->sync_buff_lock
);
418 * Create a new sync buffer for Version 0 proto.
420 static inline struct ip_vs_sync_buff
*
421 ip_vs_sync_buff_create_v0(struct netns_ipvs
*ipvs
)
423 struct ip_vs_sync_buff
*sb
;
424 struct ip_vs_sync_mesg_v0
*mesg
;
426 if (!(sb
=kmalloc(sizeof(struct ip_vs_sync_buff
), GFP_ATOMIC
)))
429 sb
->mesg
= kmalloc(ipvs
->send_mesg_maxlen
, GFP_ATOMIC
);
434 mesg
= (struct ip_vs_sync_mesg_v0
*)sb
->mesg
;
436 mesg
->syncid
= ipvs
->master_syncid
;
437 mesg
->size
= sizeof(struct ip_vs_sync_mesg_v0
);
438 sb
->head
= (unsigned char *)mesg
+ sizeof(struct ip_vs_sync_mesg_v0
);
439 sb
->end
= (unsigned char *)mesg
+ ipvs
->send_mesg_maxlen
;
440 sb
->firstuse
= jiffies
;
445 * Version 0 , could be switched in by sys_ctl.
446 * Add an ip_vs_conn information into the current sync_buff.
448 void ip_vs_sync_conn_v0(struct net
*net
, struct ip_vs_conn
*cp
)
450 struct netns_ipvs
*ipvs
= net_ipvs(net
);
451 struct ip_vs_sync_mesg_v0
*m
;
452 struct ip_vs_sync_conn_v0
*s
;
455 if (unlikely(cp
->af
!= AF_INET
))
457 /* Do not sync ONE PACKET */
458 if (cp
->flags
& IP_VS_CONN_F_ONE_PACKET
)
461 spin_lock(&ipvs
->sync_buff_lock
);
462 if (!ipvs
->sync_buff
) {
464 ip_vs_sync_buff_create_v0(ipvs
);
465 if (!ipvs
->sync_buff
) {
466 spin_unlock(&ipvs
->sync_buff_lock
);
467 pr_err("ip_vs_sync_buff_create failed.\n");
472 len
= (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) ? FULL_CONN_SIZE
:
474 m
= (struct ip_vs_sync_mesg_v0
*)ipvs
->sync_buff
->mesg
;
475 s
= (struct ip_vs_sync_conn_v0
*)ipvs
->sync_buff
->head
;
479 s
->protocol
= cp
->protocol
;
480 s
->cport
= cp
->cport
;
481 s
->vport
= cp
->vport
;
482 s
->dport
= cp
->dport
;
483 s
->caddr
= cp
->caddr
.ip
;
484 s
->vaddr
= cp
->vaddr
.ip
;
485 s
->daddr
= cp
->daddr
.ip
;
486 s
->flags
= htons(cp
->flags
& ~IP_VS_CONN_F_HASHED
);
487 s
->state
= htons(cp
->state
);
488 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) {
489 struct ip_vs_sync_conn_options
*opt
=
490 (struct ip_vs_sync_conn_options
*)&s
[1];
491 memcpy(opt
, &cp
->in_seq
, sizeof(*opt
));
496 ipvs
->sync_buff
->head
+= len
;
498 /* check if there is a space for next one */
499 if (ipvs
->sync_buff
->head
+ FULL_CONN_SIZE
> ipvs
->sync_buff
->end
) {
501 ipvs
->sync_buff
= NULL
;
503 spin_unlock(&ipvs
->sync_buff_lock
);
505 /* synchronize its controller if it has */
507 ip_vs_sync_conn(net
, cp
->control
);
511 * Add an ip_vs_conn information into the current sync_buff.
512 * Called by ip_vs_in.
513 * Sending Version 1 messages
515 void ip_vs_sync_conn(struct net
*net
, struct ip_vs_conn
*cp
)
517 struct netns_ipvs
*ipvs
= net_ipvs(net
);
518 struct ip_vs_sync_mesg
*m
;
519 union ip_vs_sync_conn
*s
;
521 unsigned int len
, pe_name_len
, pad
;
523 /* Handle old version of the protocol */
524 if (sysctl_sync_ver(ipvs
) == 0) {
525 ip_vs_sync_conn_v0(net
, cp
);
528 /* Do not sync ONE PACKET */
529 if (cp
->flags
& IP_VS_CONN_F_ONE_PACKET
)
534 if (cp
->pe_data_len
) {
535 if (!cp
->pe_data
|| !cp
->dest
) {
536 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
539 pe_name_len
= strnlen(cp
->pe
->name
, IP_VS_PENAME_MAXLEN
);
542 spin_lock(&ipvs
->sync_buff_lock
);
544 #ifdef CONFIG_IP_VS_IPV6
545 if (cp
->af
== AF_INET6
)
546 len
= sizeof(struct ip_vs_sync_v6
);
549 len
= sizeof(struct ip_vs_sync_v4
);
551 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
)
552 len
+= sizeof(struct ip_vs_sync_conn_options
) + 2;
555 len
+= cp
->pe_data_len
+ 2; /* + Param hdr field */
557 len
+= pe_name_len
+ 2;
559 /* check if there is a space for this one */
561 if (ipvs
->sync_buff
) {
562 pad
= (4 - (size_t)ipvs
->sync_buff
->head
) & 3;
563 if (ipvs
->sync_buff
->head
+ len
+ pad
> ipvs
->sync_buff
->end
) {
565 ipvs
->sync_buff
= NULL
;
570 if (!ipvs
->sync_buff
) {
571 ipvs
->sync_buff
= ip_vs_sync_buff_create(ipvs
);
572 if (!ipvs
->sync_buff
) {
573 spin_unlock(&ipvs
->sync_buff_lock
);
574 pr_err("ip_vs_sync_buff_create failed.\n");
579 m
= ipvs
->sync_buff
->mesg
;
580 p
= ipvs
->sync_buff
->head
;
581 ipvs
->sync_buff
->head
+= pad
+ len
;
582 m
->size
+= pad
+ len
;
583 /* Add ev. padding from prev. sync_conn */
587 s
= (union ip_vs_sync_conn
*)p
;
589 /* Set message type & copy members */
590 s
->v4
.type
= (cp
->af
== AF_INET6
? STYPE_F_INET6
: 0);
591 s
->v4
.ver_size
= htons(len
& SVER_MASK
); /* Version 0 */
592 s
->v4
.flags
= htonl(cp
->flags
& ~IP_VS_CONN_F_HASHED
);
593 s
->v4
.state
= htons(cp
->state
);
594 s
->v4
.protocol
= cp
->protocol
;
595 s
->v4
.cport
= cp
->cport
;
596 s
->v4
.vport
= cp
->vport
;
597 s
->v4
.dport
= cp
->dport
;
598 s
->v4
.fwmark
= htonl(cp
->fwmark
);
599 s
->v4
.timeout
= htonl(cp
->timeout
/ HZ
);
602 #ifdef CONFIG_IP_VS_IPV6
603 if (cp
->af
== AF_INET6
) {
604 p
+= sizeof(struct ip_vs_sync_v6
);
605 ipv6_addr_copy(&s
->v6
.caddr
, &cp
->caddr
.in6
);
606 ipv6_addr_copy(&s
->v6
.vaddr
, &cp
->vaddr
.in6
);
607 ipv6_addr_copy(&s
->v6
.daddr
, &cp
->daddr
.in6
);
611 p
+= sizeof(struct ip_vs_sync_v4
); /* options ptr */
612 s
->v4
.caddr
= cp
->caddr
.ip
;
613 s
->v4
.vaddr
= cp
->vaddr
.ip
;
614 s
->v4
.daddr
= cp
->daddr
.ip
;
616 if (cp
->flags
& IP_VS_CONN_F_SEQ_MASK
) {
617 *(p
++) = IPVS_OPT_SEQ_DATA
;
618 *(p
++) = sizeof(struct ip_vs_sync_conn_options
);
619 hton_seq((struct ip_vs_seq
*)p
, &cp
->in_seq
);
620 p
+= sizeof(struct ip_vs_seq
);
621 hton_seq((struct ip_vs_seq
*)p
, &cp
->out_seq
);
622 p
+= sizeof(struct ip_vs_seq
);
625 if (cp
->pe_data_len
&& cp
->pe_data
) {
626 *(p
++) = IPVS_OPT_PE_DATA
;
627 *(p
++) = cp
->pe_data_len
;
628 memcpy(p
, cp
->pe_data
, cp
->pe_data_len
);
629 p
+= cp
->pe_data_len
;
632 *(p
++) = IPVS_OPT_PE_NAME
;
633 *(p
++) = pe_name_len
;
634 memcpy(p
, cp
->pe
->name
, pe_name_len
);
639 spin_unlock(&ipvs
->sync_buff_lock
);
642 /* synchronize its controller if it has */
647 * Reduce sync rate for templates
648 * i.e only increment in_pkts for Templates.
650 if (cp
->flags
& IP_VS_CONN_F_TEMPLATE
) {
651 int pkts
= atomic_add_return(1, &cp
->in_pkts
);
653 if (pkts
% sysctl_sync_period(ipvs
) != 1)
660 * fill_param used by version 1
663 ip_vs_conn_fill_param_sync(struct net
*net
, int af
, union ip_vs_sync_conn
*sc
,
664 struct ip_vs_conn_param
*p
,
665 __u8
*pe_data
, unsigned int pe_data_len
,
666 __u8
*pe_name
, unsigned int pe_name_len
)
668 #ifdef CONFIG_IP_VS_IPV6
670 ip_vs_conn_fill_param(net
, af
, sc
->v6
.protocol
,
671 (const union nf_inet_addr
*)&sc
->v6
.caddr
,
673 (const union nf_inet_addr
*)&sc
->v6
.vaddr
,
677 ip_vs_conn_fill_param(net
, af
, sc
->v4
.protocol
,
678 (const union nf_inet_addr
*)&sc
->v4
.caddr
,
680 (const union nf_inet_addr
*)&sc
->v4
.vaddr
,
685 char buff
[IP_VS_PENAME_MAXLEN
+1];
687 memcpy(buff
, pe_name
, pe_name_len
);
689 p
->pe
= __ip_vs_pe_getbyname(buff
);
691 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
696 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
700 p
->pe_data
= kmemdup(pe_data
, pe_data_len
, GFP_ATOMIC
);
703 module_put(p
->pe
->module
);
706 p
->pe_data_len
= pe_data_len
;
712 * Connection Add / Update.
713 * Common for version 0 and 1 reception of backup sync_conns.
717 static void ip_vs_proc_conn(struct net
*net
, struct ip_vs_conn_param
*param
,
718 unsigned int flags
, unsigned int state
,
719 unsigned int protocol
, unsigned int type
,
720 const union nf_inet_addr
*daddr
, __be16 dport
,
721 unsigned long timeout
, __u32 fwmark
,
722 struct ip_vs_sync_conn_options
*opt
)
724 struct ip_vs_dest
*dest
;
725 struct ip_vs_conn
*cp
;
726 struct netns_ipvs
*ipvs
= net_ipvs(net
);
728 if (!(flags
& IP_VS_CONN_F_TEMPLATE
))
729 cp
= ip_vs_conn_in_get(param
);
731 cp
= ip_vs_ct_in_get(param
);
733 if (cp
&& param
->pe_data
) /* Free pe_data */
734 kfree(param
->pe_data
);
737 * Find the appropriate destination for the connection.
738 * If it is not found the connection will remain unbound
741 dest
= ip_vs_find_dest(net
, type
, daddr
, dport
, param
->vaddr
,
742 param
->vport
, protocol
, fwmark
);
744 /* Set the approprite ativity flag */
745 if (protocol
== IPPROTO_TCP
) {
746 if (state
!= IP_VS_TCP_S_ESTABLISHED
)
747 flags
|= IP_VS_CONN_F_INACTIVE
;
749 flags
&= ~IP_VS_CONN_F_INACTIVE
;
750 } else if (protocol
== IPPROTO_SCTP
) {
751 if (state
!= IP_VS_SCTP_S_ESTABLISHED
)
752 flags
|= IP_VS_CONN_F_INACTIVE
;
754 flags
&= ~IP_VS_CONN_F_INACTIVE
;
756 cp
= ip_vs_conn_new(param
, daddr
, dport
, flags
, dest
, fwmark
);
758 atomic_dec(&dest
->refcnt
);
761 kfree(param
->pe_data
);
762 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
765 } else if (!cp
->dest
) {
766 dest
= ip_vs_try_bind_dest(cp
);
768 atomic_dec(&dest
->refcnt
);
769 } else if ((cp
->dest
) && (cp
->protocol
== IPPROTO_TCP
) &&
770 (cp
->state
!= state
)) {
771 /* update active/inactive flag for the connection */
773 if (!(cp
->flags
& IP_VS_CONN_F_INACTIVE
) &&
774 (state
!= IP_VS_TCP_S_ESTABLISHED
)) {
775 atomic_dec(&dest
->activeconns
);
776 atomic_inc(&dest
->inactconns
);
777 cp
->flags
|= IP_VS_CONN_F_INACTIVE
;
778 } else if ((cp
->flags
& IP_VS_CONN_F_INACTIVE
) &&
779 (state
== IP_VS_TCP_S_ESTABLISHED
)) {
780 atomic_inc(&dest
->activeconns
);
781 atomic_dec(&dest
->inactconns
);
782 cp
->flags
&= ~IP_VS_CONN_F_INACTIVE
;
784 } else if ((cp
->dest
) && (cp
->protocol
== IPPROTO_SCTP
) &&
785 (cp
->state
!= state
)) {
787 if (!(cp
->flags
& IP_VS_CONN_F_INACTIVE
) &&
788 (state
!= IP_VS_SCTP_S_ESTABLISHED
)) {
789 atomic_dec(&dest
->activeconns
);
790 atomic_inc(&dest
->inactconns
);
791 cp
->flags
&= ~IP_VS_CONN_F_INACTIVE
;
796 memcpy(&cp
->in_seq
, opt
, sizeof(*opt
));
797 atomic_set(&cp
->in_pkts
, sysctl_sync_threshold(ipvs
));
799 cp
->old_state
= cp
->state
;
801 * For Ver 0 messages style
802 * - Not possible to recover the right timeout for templates
803 * - can not find the right fwmark
804 * virtual service. If needed, we can do it for
805 * non-fwmark persistent services.
806 * Ver 1 messages style.
810 if (timeout
> MAX_SCHEDULE_TIMEOUT
/ HZ
)
811 timeout
= MAX_SCHEDULE_TIMEOUT
/ HZ
;
812 cp
->timeout
= timeout
*HZ
;
814 struct ip_vs_proto_data
*pd
;
816 pd
= ip_vs_proto_data_get(net
, protocol
);
817 if (!(flags
& IP_VS_CONN_F_TEMPLATE
) && pd
&& pd
->timeout_table
)
818 cp
->timeout
= pd
->timeout_table
[state
];
820 cp
->timeout
= (3*60*HZ
);
826 * Process received multicast message for Version 0
828 static void ip_vs_process_message_v0(struct net
*net
, const char *buffer
,
831 struct ip_vs_sync_mesg_v0
*m
= (struct ip_vs_sync_mesg_v0
*)buffer
;
832 struct ip_vs_sync_conn_v0
*s
;
833 struct ip_vs_sync_conn_options
*opt
;
834 struct ip_vs_protocol
*pp
;
835 struct ip_vs_conn_param param
;
839 p
= (char *)buffer
+ sizeof(struct ip_vs_sync_mesg_v0
);
840 for (i
=0; i
<m
->nr_conns
; i
++) {
841 unsigned flags
, state
;
843 if (p
+ SIMPLE_CONN_SIZE
> buffer
+buflen
) {
844 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
847 s
= (struct ip_vs_sync_conn_v0
*) p
;
848 flags
= ntohs(s
->flags
) | IP_VS_CONN_F_SYNC
;
849 flags
&= ~IP_VS_CONN_F_HASHED
;
850 if (flags
& IP_VS_CONN_F_SEQ_MASK
) {
851 opt
= (struct ip_vs_sync_conn_options
*)&s
[1];
853 if (p
> buffer
+buflen
) {
854 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
859 p
+= SIMPLE_CONN_SIZE
;
862 state
= ntohs(s
->state
);
863 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
864 pp
= ip_vs_proto_get(s
->protocol
);
866 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
870 if (state
>= pp
->num_states
) {
871 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
876 /* protocol in templates is not used for state/timeout */
878 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
884 ip_vs_conn_fill_param(net
, AF_INET
, s
->protocol
,
885 (const union nf_inet_addr
*)&s
->caddr
,
887 (const union nf_inet_addr
*)&s
->vaddr
,
890 /* Send timeout as Zero */
891 ip_vs_proc_conn(net
, ¶m
, flags
, state
, s
->protocol
, AF_INET
,
892 (union nf_inet_addr
*)&s
->daddr
, s
->dport
,
900 static inline int ip_vs_proc_seqopt(__u8
*p
, unsigned int plen
,
902 struct ip_vs_sync_conn_options
*opt
)
904 struct ip_vs_sync_conn_options
*topt
;
906 topt
= (struct ip_vs_sync_conn_options
*)p
;
908 if (plen
!= sizeof(struct ip_vs_sync_conn_options
)) {
909 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
912 if (*opt_flags
& IPVS_OPT_F_SEQ_DATA
) {
913 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
916 ntoh_seq(&topt
->in_seq
, &opt
->in_seq
);
917 ntoh_seq(&topt
->out_seq
, &opt
->out_seq
);
918 *opt_flags
|= IPVS_OPT_F_SEQ_DATA
;
922 static int ip_vs_proc_str(__u8
*p
, unsigned int plen
, unsigned int *data_len
,
923 __u8
**data
, unsigned int maxlen
,
924 __u32
*opt_flags
, __u32 flag
)
927 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen
);
930 if (*opt_flags
& flag
) {
931 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag
);
940 * Process a Version 1 sync. connection
942 static inline int ip_vs_proc_sync_conn(struct net
*net
, __u8
*p
, __u8
*msg_end
)
944 struct ip_vs_sync_conn_options opt
;
945 union ip_vs_sync_conn
*s
;
946 struct ip_vs_protocol
*pp
;
947 struct ip_vs_conn_param param
;
949 unsigned int af
, state
, pe_data_len
=0, pe_name_len
=0;
950 __u8
*pe_data
=NULL
, *pe_name
=NULL
;
954 s
= (union ip_vs_sync_conn
*) p
;
956 if (s
->v6
.type
& STYPE_F_INET6
) {
957 #ifdef CONFIG_IP_VS_IPV6
959 p
+= sizeof(struct ip_vs_sync_v6
);
961 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
965 } else if (!s
->v4
.type
) {
967 p
+= sizeof(struct ip_vs_sync_v4
);
974 /* Process optional params check Type & Len. */
975 while (p
< msg_end
) {
984 if (!plen
|| ((p
+ plen
) > msg_end
))
986 /* Handle seq option p = param data */
987 switch (ptype
& ~IPVS_OPT_F_PARAM
) {
988 case IPVS_OPT_SEQ_DATA
:
989 if (ip_vs_proc_seqopt(p
, plen
, &opt_flags
, &opt
))
993 case IPVS_OPT_PE_DATA
:
994 if (ip_vs_proc_str(p
, plen
, &pe_data_len
, &pe_data
,
995 IP_VS_PEDATA_MAXLEN
, &opt_flags
,
1000 case IPVS_OPT_PE_NAME
:
1001 if (ip_vs_proc_str(p
, plen
,&pe_name_len
, &pe_name
,
1002 IP_VS_PENAME_MAXLEN
, &opt_flags
,
1003 IPVS_OPT_F_PE_NAME
))
1008 /* Param data mandatory ? */
1009 if (!(ptype
& IPVS_OPT_F_PARAM
)) {
1010 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1011 ptype
& ~IPVS_OPT_F_PARAM
);
1016 p
+= plen
; /* Next option */
1019 /* Get flags and Mask off unsupported */
1020 flags
= ntohl(s
->v4
.flags
) & IP_VS_CONN_F_BACKUP_MASK
;
1021 flags
|= IP_VS_CONN_F_SYNC
;
1022 state
= ntohs(s
->v4
.state
);
1024 if (!(flags
& IP_VS_CONN_F_TEMPLATE
)) {
1025 pp
= ip_vs_proto_get(s
->v4
.protocol
);
1027 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
1032 if (state
>= pp
->num_states
) {
1033 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
1039 /* protocol in templates is not used for state/timeout */
1041 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
1046 if (ip_vs_conn_fill_param_sync(net
, af
, s
, ¶m
, pe_data
,
1047 pe_data_len
, pe_name
, pe_name_len
)) {
1051 /* If only IPv4, just silent skip IPv6 */
1053 ip_vs_proc_conn(net
, ¶m
, flags
, state
, s
->v4
.protocol
, af
,
1054 (union nf_inet_addr
*)&s
->v4
.daddr
, s
->v4
.dport
,
1055 ntohl(s
->v4
.timeout
), ntohl(s
->v4
.fwmark
),
1056 (opt_flags
& IPVS_OPT_F_SEQ_DATA
? &opt
: NULL
)
1058 #ifdef CONFIG_IP_VS_IPV6
1060 ip_vs_proc_conn(net
, ¶m
, flags
, state
, s
->v6
.protocol
, af
,
1061 (union nf_inet_addr
*)&s
->v6
.daddr
, s
->v6
.dport
,
1062 ntohl(s
->v6
.timeout
), ntohl(s
->v6
.fwmark
),
1063 (opt_flags
& IPVS_OPT_F_SEQ_DATA
? &opt
: NULL
)
1069 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc
);
1074 * Process received multicast message and create the corresponding
1075 * ip_vs_conn entries.
1076 * Handles Version 0 & 1
1078 static void ip_vs_process_message(struct net
*net
, __u8
*buffer
,
1079 const size_t buflen
)
1081 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1082 struct ip_vs_sync_mesg
*m2
= (struct ip_vs_sync_mesg
*)buffer
;
1086 if (buflen
< sizeof(struct ip_vs_sync_mesg_v0
)) {
1087 IP_VS_DBG(2, "BACKUP, message header too short\n");
1090 /* Convert size back to host byte order */
1091 m2
->size
= ntohs(m2
->size
);
1093 if (buflen
!= m2
->size
) {
1094 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1097 /* SyncID sanity check */
1098 if (ipvs
->backup_syncid
!= 0 && m2
->syncid
!= ipvs
->backup_syncid
) {
1099 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2
->syncid
);
1102 /* Handle version 1 message */
1103 if ((m2
->version
== SYNC_PROTO_VER
) && (m2
->reserved
== 0)
1104 && (m2
->spare
== 0)) {
1106 msg_end
= buffer
+ sizeof(struct ip_vs_sync_mesg
);
1107 nr_conns
= m2
->nr_conns
;
1109 for (i
=0; i
<nr_conns
; i
++) {
1110 union ip_vs_sync_conn
*s
;
1115 if (p
+ sizeof(s
->v4
) > buffer
+buflen
) {
1116 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
1119 s
= (union ip_vs_sync_conn
*)p
;
1120 size
= ntohs(s
->v4
.ver_size
) & SVER_MASK
;
1122 /* Basic sanity checks */
1123 if (msg_end
> buffer
+buflen
) {
1124 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
1127 if (ntohs(s
->v4
.ver_size
) >> SVER_SHIFT
) {
1128 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
1129 ntohs(s
->v4
.ver_size
) >> SVER_SHIFT
);
1132 /* Process a single sync_conn */
1133 retc
= ip_vs_proc_sync_conn(net
, p
, msg_end
);
1135 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
1139 /* Make sure we have 32 bit alignment */
1140 msg_end
= p
+ ((size
+ 3) & ~3);
1143 /* Old type of message */
1144 ip_vs_process_message_v0(net
, buffer
, buflen
);
1151 * Setup loopback of outgoing multicasts on a sending socket
1153 static void set_mcast_loop(struct sock
*sk
, u_char loop
)
1155 struct inet_sock
*inet
= inet_sk(sk
);
1157 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
1159 inet
->mc_loop
= loop
? 1 : 0;
1164 * Specify TTL for outgoing multicasts on a sending socket
1166 static void set_mcast_ttl(struct sock
*sk
, u_char ttl
)
1168 struct inet_sock
*inet
= inet_sk(sk
);
1170 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
1177 * Specify default interface for outgoing multicasts
1179 static int set_mcast_if(struct sock
*sk
, char *ifname
)
1181 struct net_device
*dev
;
1182 struct inet_sock
*inet
= inet_sk(sk
);
1183 struct net
*net
= sock_net(sk
);
1185 dev
= __dev_get_by_name(net
, ifname
);
1189 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1193 inet
->mc_index
= dev
->ifindex
;
1194 /* inet->mc_addr = 0; */
1202 * Set the maximum length of sync message according to the
1203 * specified interface's MTU.
1205 static int set_sync_mesg_maxlen(struct net
*net
, int sync_state
)
1207 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1208 struct net_device
*dev
;
1211 if (sync_state
== IP_VS_STATE_MASTER
) {
1212 dev
= __dev_get_by_name(net
, ipvs
->master_mcast_ifn
);
1216 num
= (dev
->mtu
- sizeof(struct iphdr
) -
1217 sizeof(struct udphdr
) -
1218 SYNC_MESG_HEADER_LEN
- 20) / SIMPLE_CONN_SIZE
;
1219 ipvs
->send_mesg_maxlen
= SYNC_MESG_HEADER_LEN
+
1220 SIMPLE_CONN_SIZE
* min(num
, MAX_CONNS_PER_SYNCBUFF
);
1221 IP_VS_DBG(7, "setting the maximum length of sync sending "
1222 "message %d.\n", ipvs
->send_mesg_maxlen
);
1223 } else if (sync_state
== IP_VS_STATE_BACKUP
) {
1224 dev
= __dev_get_by_name(net
, ipvs
->backup_mcast_ifn
);
1228 ipvs
->recv_mesg_maxlen
= dev
->mtu
-
1229 sizeof(struct iphdr
) - sizeof(struct udphdr
);
1230 IP_VS_DBG(7, "setting the maximum length of sync receiving "
1231 "message %d.\n", ipvs
->recv_mesg_maxlen
);
1239 * Join a multicast group.
1240 * the group is specified by a class D multicast address 224.0.0.0/4
1241 * in the in_addr structure passed in as a parameter.
1244 join_mcast_group(struct sock
*sk
, struct in_addr
*addr
, char *ifname
)
1246 struct net
*net
= sock_net(sk
);
1247 struct ip_mreqn mreq
;
1248 struct net_device
*dev
;
1251 memset(&mreq
, 0, sizeof(mreq
));
1252 memcpy(&mreq
.imr_multiaddr
, addr
, sizeof(struct in_addr
));
1254 dev
= __dev_get_by_name(net
, ifname
);
1257 if (sk
->sk_bound_dev_if
&& dev
->ifindex
!= sk
->sk_bound_dev_if
)
1260 mreq
.imr_ifindex
= dev
->ifindex
;
1263 ret
= ip_mc_join_group(sk
, &mreq
);
1270 static int bind_mcastif_addr(struct socket
*sock
, char *ifname
)
1272 struct net
*net
= sock_net(sock
->sk
);
1273 struct net_device
*dev
;
1275 struct sockaddr_in sin
;
1277 dev
= __dev_get_by_name(net
, ifname
);
1281 addr
= inet_select_addr(dev
, 0, RT_SCOPE_UNIVERSE
);
1283 pr_err("You probably need to specify IP address on "
1284 "multicast interface.\n");
1286 IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
1289 /* Now bind the socket with the address of multicast interface */
1290 sin
.sin_family
= AF_INET
;
1291 sin
.sin_addr
.s_addr
= addr
;
1294 return sock
->ops
->bind(sock
, (struct sockaddr
*)&sin
, sizeof(sin
));
1298 * Set up sending multicast socket over UDP
1300 static struct socket
*make_send_sock(struct net
*net
)
1302 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1303 struct socket
*sock
;
1306 /* First create a socket */
1307 result
= __sock_create(net
, PF_INET
, SOCK_DGRAM
, IPPROTO_UDP
, &sock
, 1);
1309 pr_err("Error during creation of socket; terminating\n");
1310 return ERR_PTR(result
);
1313 result
= set_mcast_if(sock
->sk
, ipvs
->master_mcast_ifn
);
1315 pr_err("Error setting outbound mcast interface\n");
1319 set_mcast_loop(sock
->sk
, 0);
1320 set_mcast_ttl(sock
->sk
, 1);
1322 result
= bind_mcastif_addr(sock
, ipvs
->master_mcast_ifn
);
1324 pr_err("Error binding address of the mcast interface\n");
1328 result
= sock
->ops
->connect(sock
, (struct sockaddr
*) &mcast_addr
,
1329 sizeof(struct sockaddr
), 0);
1331 pr_err("Error connecting to the multicast addr\n");
1339 return ERR_PTR(result
);
1344 * Set up receiving multicast socket over UDP
1346 static struct socket
*make_receive_sock(struct net
*net
)
1348 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1349 struct socket
*sock
;
1352 /* First create a socket */
1353 result
= __sock_create(net
, PF_INET
, SOCK_DGRAM
, IPPROTO_UDP
, &sock
, 1);
1355 pr_err("Error during creation of socket; terminating\n");
1356 return ERR_PTR(result
);
1359 /* it is equivalent to the REUSEADDR option in user-space */
1360 sock
->sk
->sk_reuse
= 1;
1362 result
= sock
->ops
->bind(sock
, (struct sockaddr
*) &mcast_addr
,
1363 sizeof(struct sockaddr
));
1365 pr_err("Error binding to the multicast addr\n");
1369 /* join the multicast group */
1370 result
= join_mcast_group(sock
->sk
,
1371 (struct in_addr
*) &mcast_addr
.sin_addr
,
1372 ipvs
->backup_mcast_ifn
);
1374 pr_err("Error joining to the multicast group\n");
1382 return ERR_PTR(result
);
1387 ip_vs_send_async(struct socket
*sock
, const char *buffer
, const size_t length
)
1389 struct msghdr msg
= {.msg_flags
= MSG_DONTWAIT
|MSG_NOSIGNAL
};
1394 iov
.iov_base
= (void *)buffer
;
1395 iov
.iov_len
= length
;
1397 len
= kernel_sendmsg(sock
, &msg
, &iov
, 1, (size_t)(length
));
1404 ip_vs_send_sync_msg(struct socket
*sock
, struct ip_vs_sync_mesg
*msg
)
1410 /* Put size in network byte order */
1411 msg
->size
= htons(msg
->size
);
1413 if (ip_vs_send_async(sock
, (char *)msg
, msize
) != msize
)
1414 pr_err("ip_vs_send_async error\n");
1418 ip_vs_receive(struct socket
*sock
, char *buffer
, const size_t buflen
)
1420 struct msghdr msg
= {NULL
,};
1426 /* Receive a packet */
1427 iov
.iov_base
= buffer
;
1428 iov
.iov_len
= (size_t)buflen
;
1430 len
= kernel_recvmsg(sock
, &msg
, &iov
, 1, buflen
, 0);
1440 static int sync_thread_master(void *data
)
1442 struct ip_vs_sync_thread_data
*tinfo
= data
;
1443 struct netns_ipvs
*ipvs
= net_ipvs(tinfo
->net
);
1444 struct ip_vs_sync_buff
*sb
;
1446 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
1448 ipvs
->master_mcast_ifn
, ipvs
->master_syncid
);
1450 while (!kthread_should_stop()) {
1451 while ((sb
= sb_dequeue(ipvs
))) {
1452 ip_vs_send_sync_msg(tinfo
->sock
, sb
->mesg
);
1453 ip_vs_sync_buff_release(sb
);
1456 /* check if entries stay in ipvs->sync_buff for 2 seconds */
1457 sb
= get_curr_sync_buff(ipvs
, 2 * HZ
);
1459 ip_vs_send_sync_msg(tinfo
->sock
, sb
->mesg
);
1460 ip_vs_sync_buff_release(sb
);
1463 schedule_timeout_interruptible(HZ
);
1466 /* clean up the sync_buff queue */
1467 while ((sb
= sb_dequeue(ipvs
)))
1468 ip_vs_sync_buff_release(sb
);
1470 /* clean up the current sync_buff */
1471 sb
= get_curr_sync_buff(ipvs
, 0);
1473 ip_vs_sync_buff_release(sb
);
1475 /* release the sending multicast socket */
1476 sock_release(tinfo
->sock
);
1483 static int sync_thread_backup(void *data
)
1485 struct ip_vs_sync_thread_data
*tinfo
= data
;
1486 struct netns_ipvs
*ipvs
= net_ipvs(tinfo
->net
);
1489 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
1491 ipvs
->backup_mcast_ifn
, ipvs
->backup_syncid
);
1493 while (!kthread_should_stop()) {
1494 wait_event_interruptible(*sk_sleep(tinfo
->sock
->sk
),
1495 !skb_queue_empty(&tinfo
->sock
->sk
->sk_receive_queue
)
1496 || kthread_should_stop());
1498 /* do we have data now? */
1499 while (!skb_queue_empty(&(tinfo
->sock
->sk
->sk_receive_queue
))) {
1500 len
= ip_vs_receive(tinfo
->sock
, tinfo
->buf
,
1501 ipvs
->recv_mesg_maxlen
);
1503 pr_err("receiving message error\n");
1507 /* disable bottom half, because it accesses the data
1508 shared by softirq while getting/creating conns */
1510 ip_vs_process_message(tinfo
->net
, tinfo
->buf
, len
);
1515 /* release the sending multicast socket */
1516 sock_release(tinfo
->sock
);
1524 int start_sync_thread(struct net
*net
, int state
, char *mcast_ifn
, __u8 syncid
)
1526 struct ip_vs_sync_thread_data
*tinfo
;
1527 struct task_struct
**realtask
, *task
;
1528 struct socket
*sock
;
1529 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1530 char *name
, *buf
= NULL
;
1531 int (*threadfn
)(void *data
);
1532 int result
= -ENOMEM
;
1534 IP_VS_DBG(7, "%s(): pid %d\n", __func__
, task_pid_nr(current
));
1535 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
1536 sizeof(struct ip_vs_sync_conn_v0
));
1538 if (state
== IP_VS_STATE_MASTER
) {
1539 if (ipvs
->master_thread
)
1542 strlcpy(ipvs
->master_mcast_ifn
, mcast_ifn
,
1543 sizeof(ipvs
->master_mcast_ifn
));
1544 ipvs
->master_syncid
= syncid
;
1545 realtask
= &ipvs
->master_thread
;
1546 name
= "ipvs_master:%d";
1547 threadfn
= sync_thread_master
;
1548 sock
= make_send_sock(net
);
1549 } else if (state
== IP_VS_STATE_BACKUP
) {
1550 if (ipvs
->backup_thread
)
1553 strlcpy(ipvs
->backup_mcast_ifn
, mcast_ifn
,
1554 sizeof(ipvs
->backup_mcast_ifn
));
1555 ipvs
->backup_syncid
= syncid
;
1556 realtask
= &ipvs
->backup_thread
;
1557 name
= "ipvs_backup:%d";
1558 threadfn
= sync_thread_backup
;
1559 sock
= make_receive_sock(net
);
1565 result
= PTR_ERR(sock
);
1569 set_sync_mesg_maxlen(net
, state
);
1570 if (state
== IP_VS_STATE_BACKUP
) {
1571 buf
= kmalloc(ipvs
->recv_mesg_maxlen
, GFP_KERNEL
);
1576 tinfo
= kmalloc(sizeof(*tinfo
), GFP_KERNEL
);
1584 task
= kthread_run(threadfn
, tinfo
, name
, ipvs
->gen
);
1586 result
= PTR_ERR(task
);
1590 /* mark as active */
1592 ipvs
->sync_state
|= state
;
1594 /* increase the module use count */
1595 ip_vs_use_count_inc();
1610 int stop_sync_thread(struct net
*net
, int state
)
1612 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1614 IP_VS_DBG(7, "%s(): pid %d\n", __func__
, task_pid_nr(current
));
1616 if (state
== IP_VS_STATE_MASTER
) {
1617 if (!ipvs
->master_thread
)
1620 pr_info("stopping master sync thread %d ...\n",
1621 task_pid_nr(ipvs
->master_thread
));
1624 * The lock synchronizes with sb_queue_tail(), so that we don't
1625 * add sync buffers to the queue, when we are already in
1626 * progress of stopping the master sync daemon.
1629 spin_lock_bh(&ipvs
->sync_lock
);
1630 ipvs
->sync_state
&= ~IP_VS_STATE_MASTER
;
1631 spin_unlock_bh(&ipvs
->sync_lock
);
1632 kthread_stop(ipvs
->master_thread
);
1633 ipvs
->master_thread
= NULL
;
1634 } else if (state
== IP_VS_STATE_BACKUP
) {
1635 if (!ipvs
->backup_thread
)
1638 pr_info("stopping backup sync thread %d ...\n",
1639 task_pid_nr(ipvs
->backup_thread
));
1641 ipvs
->sync_state
&= ~IP_VS_STATE_BACKUP
;
1642 kthread_stop(ipvs
->backup_thread
);
1643 ipvs
->backup_thread
= NULL
;
1648 /* decrease the module use count */
1649 ip_vs_use_count_dec();
1655 * Initialize data struct for each netns
1657 static int __net_init
__ip_vs_sync_init(struct net
*net
)
1659 struct netns_ipvs
*ipvs
= net_ipvs(net
);
1661 INIT_LIST_HEAD(&ipvs
->sync_queue
);
1662 spin_lock_init(&ipvs
->sync_lock
);
1663 spin_lock_init(&ipvs
->sync_buff_lock
);
1665 ipvs
->sync_mcast_addr
.sin_family
= AF_INET
;
1666 ipvs
->sync_mcast_addr
.sin_port
= cpu_to_be16(IP_VS_SYNC_PORT
);
1667 ipvs
->sync_mcast_addr
.sin_addr
.s_addr
= cpu_to_be32(IP_VS_SYNC_GROUP
);
1671 static void __ip_vs_sync_cleanup(struct net
*net
)
1673 stop_sync_thread(net
, IP_VS_STATE_MASTER
);
1674 stop_sync_thread(net
, IP_VS_STATE_BACKUP
);
1677 static struct pernet_operations ipvs_sync_ops
= {
1678 .init
= __ip_vs_sync_init
,
1679 .exit
= __ip_vs_sync_cleanup
,
1683 int __init
ip_vs_sync_init(void)
1685 return register_pernet_subsys(&ipvs_sync_ops
);
1688 void ip_vs_sync_cleanup(void)
1690 unregister_pernet_subsys(&ipvs_sync_ops
);