/*
 * IPVS:	Locality-Based Least-Connection scheduling module
 *
 * Authors:	Wensong Zhang <wensong@gnuchina.org>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Changes:
 *	Martin Hamilton		:  fixed the terrible locking bugs
 *				   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *	Wensong Zhang		:  fixed the uninitialized tbl->lock bug
 *	Wensong Zhang		:  added doing full expiration check to
 *				   collect stale entries of 24+ hours when
 *				   no partial expire check in a half hour
 *	Julian Anastasov	:  replaced del_timer call with del_timer_sync
 *				   to avoid the possible race between timer
 *				   handler and del_timer thread in SMP
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *	if cachenode[dest_ip] is null then
 *		n, cachenode[dest_ip] <- {weighted least-conn node};
 *	else
 *		n <- cachenode[dest_ip];
 *		if (n is dead) OR
 *		   (n.conns > n.weight AND
 *		    there is a node m with m.conns < m.weight/2) then
 *			n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *	return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */
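/*
 * A worked example of the decision above (illustrative numbers only):
 * suppose dest_ip 203.0.113.7 is cached on node n with n.conns = 9 and
 * n.weight = 8.  n is alive, but n.conns > n.weight, so the cache entry
 * is replaced only if some node m is less than half loaded, e.g.
 * m.conns = 3 with m.weight = 8 (3 < 8/2).  Then the weighted
 * least-connection node is chosen and cached; otherwise the request
 * sticks to n, preserving locality.
 */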
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>
/*
 * It is for garbage collection of stale IPVS lblc entries,
 * when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

#define DEFAULT_EXPIRATION      (24*60*60*HZ)

/*
 * It is for full expiration check.
 * When there is no partial expiration check (garbage collection)
 * in a half hour, do a full expiration check to collect stale
 * entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
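/*
 * Timing sketch for the constants above: the periodic timer fires every
 * CHECK_EXPIRE_INTERVAL = 60*HZ, i.e. once a minute.  A partial sweep runs
 * only while the table holds more than max_size entries and reclaims entries
 * idle for ENTRY_TIMEOUT = 6 minutes.  After COUNT_FOR_FULL_EXPIRATION = 30
 * timer ticks without a partial sweep (about half an hour), a full sweep
 * reclaims entries idle longer than the expiration limit (24 hours by
 * default).
 */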
/*
 * for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
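/*
 * With the default of 10 bits this gives IP_VS_LBLC_TAB_SIZE = 1 << 10 =
 * 1024 buckets and IP_VS_LBLC_TAB_MASK = 0x3ff, so a hash value is reduced
 * to a bucket index with a single AND instead of a modulo.
 */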
/*
 * IPVS lblc entry represents an association between destination
 * IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct list_head	list;
	int			af;		/* address family */
	union nf_inet_addr	addr;		/* destination IP address */
	struct ip_vs_dest	*dest;		/* real server (cache) */
	unsigned long		lastuse;	/* last used time */
};
/*
 * IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct list_head	bucket[IP_VS_LBLC_TAB_SIZE];	/* hash bucket */
	atomic_t		entries;	/* number of entries */
	int			max_size;	/* maximum size of entries */
	struct timer_list	periodic_timer;	/* collect stale entries */
	int			rover;		/* rover for expire check */
	int			counter;	/* counter for no expire */
};
#ifdef CONFIG_SYSCTL
/*
 * IPVS LBLC sysctl table
 */

static ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
#endif
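/*
 * Illustrative usage (assuming the usual IPVS sysctl mount point; the
 * exact path is determined by net_vs_ctl_path):
 *
 *	# echo 86400 > /proc/sys/net/ipv4/vs/lblc_expiration
 *
 * proc_dointvec_jiffies converts between seconds and jiffies, so the line
 * above sets the expiration back to its 24-hour default.
 */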
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
	list_del(&en->list);
	/*
	 * We don't kfree dest because it is referred either by its service
	 * or the trash dest list.
	 */
	atomic_dec(&en->dest->refcnt);
	kfree(en);
}
/*
 * Returns hash value for IPVS LBLC entry
 */
static inline unsigned
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
}
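/*
 * The multiplier 2654435761 is (approximately) 2^32 divided by the golden
 * ratio, the classic Knuth multiplicative-hash constant: the multiply
 * scrambles the folded address across the 32-bit word and the mask keeps
 * the low IP_VS_LBLC_TAB_BITS bits as the bucket index.
 */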
/*
 * Hash an entry in the ip_vs_lblc_table.
 */
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr);

	list_add(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}
/*
 * Get ip_vs_lblc_entry associated with supplied parameters. Called under
 * read lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
	       const union nf_inet_addr *addr)
{
	unsigned hash = ip_vs_lblc_hashkey(af, addr);
	struct ip_vs_lblc_entry *en;

	list_for_each_entry(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}
/*
 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
 * address to a server. Called under write lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
	       struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = ip_vs_lblc_get(dest->af, tbl, daddr);
	if (!en) {
		en = kmalloc(sizeof(*en), GFP_ATOMIC);
		if (!en) {
			pr_err("%s(): no memory\n", __func__);
			return NULL;
		}

		en->af = dest->af;
		ip_vs_addr_copy(dest->af, &en->addr, daddr);
		en->lastuse = jiffies;

		atomic_inc(&dest->refcnt);
		en->dest = dest;

		ip_vs_lblc_hash(tbl, en);
	} else if (en->dest != dest) {
		/* retarget the cache entry: drop the old server's reference
		 * and take one on the new server */
		atomic_dec(&en->dest->refcnt);
		atomic_inc(&dest->refcnt);
		en->dest = dest;
	}

	return en;
}
/*
 * Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
{
	struct ip_vs_lblc_entry *en, *nxt;
	int i;

	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
	}
}
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
	struct netns_ipvs *ipvs = net_ipvs(svc->net);
	return ipvs->sysctl_lblc_expiration;
#else
	return DEFAULT_EXPIRATION;
#endif
}
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en, *nxt;
	unsigned long now = jiffies;
	int i, j;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse +
					sysctl_lblc_expiration(svc)))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
		}
		write_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}
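/*
 * Note that the sweep above takes svc->sched_lock per bucket rather than
 * once for the whole table, so scheduling on other buckets can interleave
 * with a long full-expiration pass.
 */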
/*
 * Periodical timer handler for IPVS lblc table
 * It is used to collect stale entries when the number of entries
 * exceeds the maximum size of the table.
 *
 * Fixme: we probably need more complicated algorithm to collect
 *        entries that have not been used for a long time even
 *        if the number of entries doesn't exceed the maximum size
 *        of the table.
 * The full expiration check is for this purpose now.
 */
static void ip_vs_lblc_check_expire(unsigned long data)
{
	struct ip_vs_service *svc = (struct ip_vs_service *) data;
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en, *nxt;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		write_lock(&svc->sched_lock);
		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_free(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		write_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

out:
	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
}
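/*
 * Worked example for the partial sweep above (illustrative numbers): with
 * max_size = 16384 and 20000 entries, goal = (20000 - 16384) * 4/3 = 4821,
 * i.e. the sweep tries to reclaim a third more than the overshoot so the
 * table does not immediately refill past the limit; goal is capped at
 * max_size/2 = 8192 to bound the work done in a single timer tick.
 */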
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 * Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
	if (tbl == NULL) {
		pr_err("%s(): no memory\n", __func__);
		return -ENOMEM;
	}
	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 * Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_LIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;

	/*
	 * Hook periodic timer for garbage collection
	 */
	setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
		    (unsigned long)svc);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}
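/*
 * Sizing note: with the default 10-bit table this makes max_size =
 * 1024 * 16 = 16384 cached destinations per service before the partial
 * sweeps in ip_vs_lblc_check_expire() start trimming the table.
 */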
static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(tbl);

	/* release the table itself */
	kfree(tbl);
	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
		  sizeof(*tbl));

	return 0;
}
static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We use the following formula to estimate the load:
	 *		(dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *		h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
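	/*
	 * Worked example of the cross-multiplied comparison (illustrative
	 * numbers): least has overhead loh = 10 and weight 2 (load 5);
	 * dest has overhead doh = 12 and weight 3 (load 4).  Then
	 * loh * dest->weight = 30 > doh * least->weight = 24, so dest
	 * replaces least -- exactly what 10/2 > 12/3 would decide with
	 * floating point.
	 */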
	list_for_each_entry(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 * Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if (loh * atomic_read(&dest->weight) >
		    doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      atomic_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}
/*
 * If this destination server is overloaded and there is a less loaded
 * server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
/*
 * Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_iphdr iph;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	read_lock(&svc->sched_lock);
	en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the trash,
		 * we must ignore it, as it may be removed from under our feet,
		 * if someone drops our reference count. Our caller only makes
		 * sure that destinations, that are not in the trash, are not
		 * moved to the trash, while we are scheduling. But anyone can
		 * free up entries from the trash at any time.
		 */
		if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
			dest = en->dest;
	}
	read_unlock(&svc->sched_lock);

	/* If the destination has a weight and is not overloaded, use it */
	if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
		goto out;

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc);
	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	write_lock(&svc->sched_lock);
	ip_vs_lblc_new(tbl, &iph.daddr, dest);
	write_unlock(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph.daddr),
		      IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));

	return dest;
}
/*
 * IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler =
{
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};
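/*
 * Once the module is loaded, the scheduler is selected by name; a typical
 * (illustrative) setup with ipvsadm looks like:
 *
 *	# ipvsadm -A -t 192.0.2.1:80 -s lblc
 *	# ipvsadm -a -t 192.0.2.1:80 -r 10.0.0.2:80 -m -w 2
 *
 * Subsequent connections to the same destination IP are then mapped to the
 * same real server via the cache table above.
 */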
/*
 * per netns init.
 */
#ifdef CONFIG_SYSCTL
static int __net_init __ip_vs_lblc_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!net_eq(net, &init_net)) {
		ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
					       sizeof(vs_vars_table),
					       GFP_KERNEL);
		if (ipvs->lblc_ctl_table == NULL)
			return -ENOMEM;
	} else
		ipvs->lblc_ctl_table = vs_vars_table;
	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;

	ipvs->lblc_ctl_header =
		register_net_sysctl_table(net, net_vs_ctl_path,
					  ipvs->lblc_ctl_table);
	if (!ipvs->lblc_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblc_ctl_table);
		return -ENOMEM;
	}

	return 0;
}

static void __net_exit __ip_vs_lblc_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	unregister_net_sysctl_table(ipvs->lblc_ctl_header);

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblc_ctl_table);
}

#else
static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
static void __net_exit __ip_vs_lblc_exit(struct net *net) { }

#endif
static struct pernet_operations ip_vs_lblc_ops = {
	.init = __ip_vs_lblc_init,
	.exit = __ip_vs_lblc_exit,
};
static int __init ip_vs_lblc_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblc_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblc_ops);
	return ret;
}

static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	unregister_pernet_subsys(&ip_vs_lblc_ops);
}

module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");