]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blame - net/netfilter/ipvs/ip_vs_lblcr.c
Fix common misspellings
[mirror_ubuntu-bionic-kernel.git] / net / netfilter / ipvs / ip_vs_lblcr.c
CommitLineData
1da177e4
LT
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
1da177e4
LT
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Julian Anastasov : Added the missing (dest->weight>0)
13 * condition in the ip_vs_dest_set_max.
14 *
15 */
16
/*
 * The lblc/r algorithm is as follows (pseudo code):
 *
 *       if serverSet[dest_ip] is null then
 *               n, serverSet[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- {least-conn (alive) node in serverSet[dest_ip]};
 *               if (n is null) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                   n <- {weighted least-conn node};
 *                   add n to serverSet[dest_ip];
 *               if |serverSet[dest_ip]| > 1 AND
 *                   now - serverSet[dest_ip].lastMod > T then
 *                   m <- {most conn node in serverSet[dest_ip]};
 *                   remove m from serverSet[dest_ip];
 *       if serverSet[dest_ip] changed then
 *               serverSet[dest_ip].lastMod <- now;
 *
 *       return n;
 *
 */
39
9aada7ac
HE
40#define KMSG_COMPONENT "IPVS"
41#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
42
14c85021 43#include <linux/ip.h>
1da177e4
LT
44#include <linux/module.h>
45#include <linux/kernel.h>
14c85021 46#include <linux/skbuff.h>
d7fe0f24 47#include <linux/jiffies.h>
51f0bc78 48#include <linux/list.h>
5a0e3ad6 49#include <linux/slab.h>
1da177e4
LT
50
51/* for sysctl */
52#include <linux/fs.h>
53#include <linux/sysctl.h>
457c4cbc 54#include <net/net_namespace.h>
1da177e4
LT
55
56#include <net/ip_vs.h>
57
58
59/*
60 * It is for garbage collection of stale IPVS lblcr entries,
61 * when the table is full.
62 */
63#define CHECK_EXPIRE_INTERVAL (60*HZ)
64#define ENTRY_TIMEOUT (6*60*HZ)
65
b27d777e
SH
66#define DEFAULT_EXPIRATION (24*60*60*HZ)
67
1da177e4
LT
68/*
69 * It is for full expiration check.
70 * When there is no partial expiration check (garbage collection)
71 * in a half hour, do a full expiration check to collect stale
72 * entries that haven't been touched for a day.
73 */
74#define COUNT_FOR_FULL_EXPIRATION 30
1da177e4
LT
75
76/*
77 * for IPVS lblcr entry hash table
78 */
79#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
80#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
81#endif
82#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
83#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
84#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
85
86
87/*
88 * IPVS destination set structure and operations
89 */
51f0bc78
SH
90struct ip_vs_dest_set_elem {
91 struct list_head list; /* list link */
1da177e4
LT
92 struct ip_vs_dest *dest; /* destination server */
93};
94
95struct ip_vs_dest_set {
96 atomic_t size; /* set size */
97 unsigned long lastmod; /* last modified time */
51f0bc78 98 struct list_head list; /* destination list */
1da177e4
LT
99 rwlock_t lock; /* lock for this list */
100};
101
102
51f0bc78 103static struct ip_vs_dest_set_elem *
1da177e4
LT
104ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
105{
51f0bc78 106 struct ip_vs_dest_set_elem *e;
1da177e4 107
51f0bc78 108 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
109 if (e->dest == dest)
110 /* already existed */
111 return NULL;
112 }
113
f728bafb 114 e = kmalloc(sizeof(*e), GFP_ATOMIC);
1da177e4 115 if (e == NULL) {
1e3e238e 116 pr_err("%s(): no memory\n", __func__);
1da177e4
LT
117 return NULL;
118 }
119
120 atomic_inc(&dest->refcnt);
121 e->dest = dest;
122
51f0bc78 123 list_add(&e->list, &set->list);
1da177e4 124 atomic_inc(&set->size);
1da177e4
LT
125
126 set->lastmod = jiffies;
127 return e;
128}
129
130static void
131ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
132{
51f0bc78 133 struct ip_vs_dest_set_elem *e;
1da177e4 134
51f0bc78 135 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
136 if (e->dest == dest) {
137 /* HIT */
1da177e4
LT
138 atomic_dec(&set->size);
139 set->lastmod = jiffies;
140 atomic_dec(&e->dest->refcnt);
51f0bc78 141 list_del(&e->list);
1da177e4
LT
142 kfree(e);
143 break;
144 }
1da177e4 145 }
1da177e4
LT
146}
147
148static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
149{
51f0bc78 150 struct ip_vs_dest_set_elem *e, *ep;
1da177e4
LT
151
152 write_lock(&set->lock);
51f0bc78 153 list_for_each_entry_safe(e, ep, &set->list, list) {
1da177e4 154 /*
25985edc 155 * We don't kfree dest because it is referred either
1da177e4
LT
156 * by its service or by the trash dest list.
157 */
158 atomic_dec(&e->dest->refcnt);
51f0bc78 159 list_del(&e->list);
1da177e4
LT
160 kfree(e);
161 }
162 write_unlock(&set->lock);
163}
164
165/* get weighted least-connection node in the destination set */
166static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
167{
51f0bc78 168 register struct ip_vs_dest_set_elem *e;
1da177e4
LT
169 struct ip_vs_dest *dest, *least;
170 int loh, doh;
171
172 if (set == NULL)
173 return NULL;
174
1da177e4 175 /* select the first destination server, whose weight > 0 */
51f0bc78 176 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
177 least = e->dest;
178 if (least->flags & IP_VS_DEST_F_OVERLOAD)
179 continue;
180
181 if ((atomic_read(&least->weight) > 0)
182 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
b552f7e3 183 loh = ip_vs_dest_conn_overhead(least);
1da177e4
LT
184 goto nextstage;
185 }
186 }
1da177e4
LT
187 return NULL;
188
189 /* find the destination with the weighted least load */
190 nextstage:
51f0bc78 191 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
192 dest = e->dest;
193 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
194 continue;
195
b552f7e3 196 doh = ip_vs_dest_conn_overhead(dest);
1da177e4
LT
197 if ((loh * atomic_read(&dest->weight) >
198 doh * atomic_read(&least->weight))
199 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
200 least = dest;
201 loh = doh;
202 }
203 }
1da177e4 204
1e3e238e 205 IP_VS_DBG_BUF(6, "%s(): server %s:%d "
44548375 206 "activeconns %d refcnt %d weight %d overhead %d\n",
1e3e238e 207 __func__,
44548375
JV
208 IP_VS_DBG_ADDR(least->af, &least->addr),
209 ntohs(least->port),
210 atomic_read(&least->activeconns),
211 atomic_read(&least->refcnt),
212 atomic_read(&least->weight), loh);
1da177e4
LT
213 return least;
214}
215
216
217/* get weighted most-connection node in the destination set */
218static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
219{
51f0bc78 220 register struct ip_vs_dest_set_elem *e;
1da177e4
LT
221 struct ip_vs_dest *dest, *most;
222 int moh, doh;
223
224 if (set == NULL)
225 return NULL;
226
1da177e4 227 /* select the first destination server, whose weight > 0 */
51f0bc78 228 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
229 most = e->dest;
230 if (atomic_read(&most->weight) > 0) {
b552f7e3 231 moh = ip_vs_dest_conn_overhead(most);
1da177e4
LT
232 goto nextstage;
233 }
234 }
1da177e4
LT
235 return NULL;
236
237 /* find the destination with the weighted most load */
238 nextstage:
51f0bc78 239 list_for_each_entry(e, &set->list, list) {
1da177e4 240 dest = e->dest;
b552f7e3 241 doh = ip_vs_dest_conn_overhead(dest);
1da177e4
LT
242 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
243 if ((moh * atomic_read(&dest->weight) <
244 doh * atomic_read(&most->weight))
245 && (atomic_read(&dest->weight) > 0)) {
246 most = dest;
247 moh = doh;
248 }
249 }
1da177e4 250
1e3e238e 251 IP_VS_DBG_BUF(6, "%s(): server %s:%d "
44548375 252 "activeconns %d refcnt %d weight %d overhead %d\n",
1e3e238e 253 __func__,
44548375
JV
254 IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
255 atomic_read(&most->activeconns),
256 atomic_read(&most->refcnt),
257 atomic_read(&most->weight), moh);
1da177e4
LT
258 return most;
259}
260
261
262/*
263 * IPVS lblcr entry represents an association between destination
264 * IP address and its destination server set
265 */
266struct ip_vs_lblcr_entry {
267 struct list_head list;
44548375
JV
268 int af; /* address family */
269 union nf_inet_addr addr; /* destination IP address */
1da177e4
LT
270 struct ip_vs_dest_set set; /* destination server set */
271 unsigned long lastuse; /* last used time */
272};
273
274
275/*
276 * IPVS lblcr hash table
277 */
278struct ip_vs_lblcr_table {
1da177e4
LT
279 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
280 atomic_t entries; /* number of entries */
281 int max_size; /* maximum size of entries */
282 struct timer_list periodic_timer; /* collect stale entries */
283 int rover; /* rover for expire check */
284 int counter; /* counter for no expire */
285};
286
287
fb1de432 288#ifdef CONFIG_SYSCTL
1da177e4
LT
289/*
290 * IPVS LBLCR sysctl table
291 */
292
293static ctl_table vs_vars_table[] = {
294 {
1da177e4 295 .procname = "lblcr_expiration",
d0a1eef9 296 .data = NULL,
1da177e4 297 .maxlen = sizeof(int),
e905a9ed 298 .mode = 0644,
6d9f239a 299 .proc_handler = proc_dointvec_jiffies,
1da177e4 300 },
f8572d8f 301 { }
1da177e4 302};
fb1de432 303#endif
1da177e4 304
1da177e4
LT
305static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
306{
307 list_del(&en->list);
308 ip_vs_dest_set_eraseall(&en->set);
309 kfree(en);
310}
311
312
313/*
314 * Returns hash value for IPVS LBLCR entry
315 */
44548375
JV
316static inline unsigned
317ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
1da177e4 318{
44548375
JV
319 __be32 addr_fold = addr->ip;
320
321#ifdef CONFIG_IP_VS_IPV6
322 if (af == AF_INET6)
323 addr_fold = addr->ip6[0]^addr->ip6[1]^
324 addr->ip6[2]^addr->ip6[3];
325#endif
326 return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
1da177e4
LT
327}
328
329
330/*
331 * Hash an entry in the ip_vs_lblcr_table.
332 * returns bool success.
333 */
f728bafb 334static void
1da177e4
LT
335ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
336{
44548375 337 unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
1da177e4 338
1da177e4
LT
339 list_add(&en->list, &tbl->bucket[hash]);
340 atomic_inc(&tbl->entries);
1da177e4
LT
341}
342
343
1da177e4 344/*
f728bafb
SW
345 * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
346 * read lock.
1da177e4
LT
347 */
348static inline struct ip_vs_lblcr_entry *
44548375
JV
349ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
350 const union nf_inet_addr *addr)
1da177e4 351{
44548375 352 unsigned hash = ip_vs_lblcr_hashkey(af, addr);
1da177e4
LT
353 struct ip_vs_lblcr_entry *en;
354
f728bafb 355 list_for_each_entry(en, &tbl->bucket[hash], list)
44548375 356 if (ip_vs_addr_equal(af, &en->addr, addr))
f728bafb
SW
357 return en;
358
359 return NULL;
360}
1da177e4 361
1da177e4 362
f728bafb
SW
363/*
364 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
365 * IP address to a server. Called under write lock.
366 */
367static inline struct ip_vs_lblcr_entry *
44548375 368ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
f728bafb
SW
369 struct ip_vs_dest *dest)
370{
371 struct ip_vs_lblcr_entry *en;
372
44548375 373 en = ip_vs_lblcr_get(dest->af, tbl, daddr);
f728bafb
SW
374 if (!en) {
375 en = kmalloc(sizeof(*en), GFP_ATOMIC);
376 if (!en) {
1e3e238e 377 pr_err("%s(): no memory\n", __func__);
f728bafb 378 return NULL;
1da177e4 379 }
f728bafb 380
44548375
JV
381 en->af = dest->af;
382 ip_vs_addr_copy(dest->af, &en->addr, daddr);
f728bafb
SW
383 en->lastuse = jiffies;
384
421f91d2 385 /* initialize its dest set */
f728bafb 386 atomic_set(&(en->set.size), 0);
51f0bc78 387 INIT_LIST_HEAD(&en->set.list);
f728bafb
SW
388 rwlock_init(&en->set.lock);
389
390 ip_vs_lblcr_hash(tbl, en);
1da177e4
LT
391 }
392
f728bafb
SW
393 write_lock(&en->set.lock);
394 ip_vs_dest_set_insert(&en->set, dest);
395 write_unlock(&en->set.lock);
1da177e4 396
f728bafb 397 return en;
1da177e4
LT
398}
399
400
401/*
402 * Flush all the entries of the specified table.
403 */
404static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
405{
406 int i;
407 struct ip_vs_lblcr_entry *en, *nxt;
408
f728bafb 409 /* No locking required, only called during cleanup. */
1da177e4 410 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
1da177e4
LT
411 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
412 ip_vs_lblcr_free(en);
1da177e4 413 }
1da177e4
LT
414 }
415}
416
b27d777e
SH
417static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
418{
419#ifdef CONFIG_SYSCTL
420 struct netns_ipvs *ipvs = net_ipvs(svc->net);
421 return ipvs->sysctl_lblcr_expiration;
422#else
423 return DEFAULT_EXPIRATION;
424#endif
425}
1da177e4 426
f728bafb 427static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
1da177e4 428{
f728bafb 429 struct ip_vs_lblcr_table *tbl = svc->sched_data;
1da177e4
LT
430 unsigned long now = jiffies;
431 int i, j;
432 struct ip_vs_lblcr_entry *en, *nxt;
433
434 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
435 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
436
f728bafb 437 write_lock(&svc->sched_lock);
1da177e4 438 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
b27d777e
SH
439 if (time_after(en->lastuse +
440 sysctl_lblcr_expiration(svc), now))
1da177e4
LT
441 continue;
442
443 ip_vs_lblcr_free(en);
444 atomic_dec(&tbl->entries);
445 }
f728bafb 446 write_unlock(&svc->sched_lock);
1da177e4
LT
447 }
448 tbl->rover = j;
449}
450
451
452/*
453 * Periodical timer handler for IPVS lblcr table
454 * It is used to collect stale entries when the number of entries
455 * exceeds the maximum size of the table.
456 *
457 * Fixme: we probably need more complicated algorithm to collect
458 * entries that have not been used for a long time even
459 * if the number of entries doesn't exceed the maximum size
460 * of the table.
461 * The full expiration check is for this purpose now.
462 */
463static void ip_vs_lblcr_check_expire(unsigned long data)
464{
f728bafb
SW
465 struct ip_vs_service *svc = (struct ip_vs_service *) data;
466 struct ip_vs_lblcr_table *tbl = svc->sched_data;
1da177e4
LT
467 unsigned long now = jiffies;
468 int goal;
469 int i, j;
470 struct ip_vs_lblcr_entry *en, *nxt;
471
1da177e4
LT
472 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
473 /* do full expiration check */
f728bafb 474 ip_vs_lblcr_full_check(svc);
1da177e4
LT
475 tbl->counter = 1;
476 goto out;
477 }
478
479 if (atomic_read(&tbl->entries) <= tbl->max_size) {
480 tbl->counter++;
481 goto out;
482 }
483
484 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
485 if (goal > tbl->max_size/2)
486 goal = tbl->max_size/2;
487
488 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
489 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
490
f728bafb 491 write_lock(&svc->sched_lock);
1da177e4
LT
492 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
493 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
494 continue;
495
496 ip_vs_lblcr_free(en);
497 atomic_dec(&tbl->entries);
498 goal--;
499 }
f728bafb 500 write_unlock(&svc->sched_lock);
1da177e4
LT
501 if (goal <= 0)
502 break;
503 }
504 tbl->rover = j;
505
506 out:
507 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
508}
509
1da177e4
LT
510static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
511{
512 int i;
513 struct ip_vs_lblcr_table *tbl;
514
515 /*
516 * Allocate the ip_vs_lblcr_table for this service
517 */
f728bafb 518 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
1da177e4 519 if (tbl == NULL) {
1e3e238e 520 pr_err("%s(): no memory\n", __func__);
1da177e4
LT
521 return -ENOMEM;
522 }
523 svc->sched_data = tbl;
524 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
f728bafb 525 "current service\n", sizeof(*tbl));
1da177e4
LT
526
527 /*
528 * Initialize the hash buckets
529 */
530 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
531 INIT_LIST_HEAD(&tbl->bucket[i]);
532 }
1da177e4
LT
533 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
534 tbl->rover = 0;
535 tbl->counter = 1;
536
537 /*
538 * Hook periodic timer for garbage collection
539 */
b24b8a24 540 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
f728bafb
SW
541 (unsigned long)svc);
542 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
1da177e4 543
1da177e4
LT
544 return 0;
545}
546
547
548static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
549{
550 struct ip_vs_lblcr_table *tbl = svc->sched_data;
551
552 /* remove periodic timer */
553 del_timer_sync(&tbl->periodic_timer);
554
555 /* got to clean up table entries here */
556 ip_vs_lblcr_flush(tbl);
557
558 /* release the table itself */
f728bafb 559 kfree(tbl);
1da177e4 560 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
f728bafb 561 sizeof(*tbl));
1da177e4
LT
562
563 return 0;
564}
565
566
1da177e4 567static inline struct ip_vs_dest *
44548375 568__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
1da177e4
LT
569{
570 struct ip_vs_dest *dest, *least;
571 int loh, doh;
572
573 /*
b552f7e3 574 * We use the following formula to estimate the load:
1da177e4
LT
575 * (dest overhead) / dest->weight
576 *
577 * Remember -- no floats in kernel mode!!!
578 * The comparison of h1*w2 > h2*w1 is equivalent to that of
579 * h1/w1 > h2/w2
580 * if every weight is larger than zero.
581 *
582 * The server with weight=0 is quiesced and will not receive any
583 * new connection.
584 */
585 list_for_each_entry(dest, &svc->destinations, n_list) {
586 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
587 continue;
588
589 if (atomic_read(&dest->weight) > 0) {
590 least = dest;
b552f7e3 591 loh = ip_vs_dest_conn_overhead(least);
1da177e4
LT
592 goto nextstage;
593 }
594 }
595 return NULL;
596
597 /*
598 * Find the destination with the least load.
599 */
600 nextstage:
601 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
602 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
603 continue;
604
b552f7e3 605 doh = ip_vs_dest_conn_overhead(dest);
1da177e4
LT
606 if (loh * atomic_read(&dest->weight) >
607 doh * atomic_read(&least->weight)) {
608 least = dest;
609 loh = doh;
610 }
611 }
612
44548375
JV
613 IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
614 "activeconns %d refcnt %d weight %d overhead %d\n",
615 IP_VS_DBG_ADDR(least->af, &least->addr),
616 ntohs(least->port),
617 atomic_read(&least->activeconns),
618 atomic_read(&least->refcnt),
619 atomic_read(&least->weight), loh);
1da177e4
LT
620
621 return least;
622}
623
624
625/*
626 * If this destination server is overloaded and there is a less loaded
627 * server, then return true.
628 */
629static inline int
630is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
631{
632 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
633 struct ip_vs_dest *d;
634
635 list_for_each_entry(d, &svc->destinations, n_list) {
636 if (atomic_read(&d->activeconns)*2
637 < atomic_read(&d->weight)) {
638 return 1;
639 }
640 }
641 }
642 return 0;
643}
644
645
646/*
647 * Locality-Based (weighted) Least-Connection scheduling
648 */
649static struct ip_vs_dest *
650ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
651{
f728bafb 652 struct ip_vs_lblcr_table *tbl = svc->sched_data;
44548375 653 struct ip_vs_iphdr iph;
f728bafb
SW
654 struct ip_vs_dest *dest = NULL;
655 struct ip_vs_lblcr_entry *en;
1da177e4 656
44548375
JV
657 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
658
1e3e238e 659 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
1da177e4 660
f728bafb
SW
661 /* First look in our cache */
662 read_lock(&svc->sched_lock);
44548375 663 en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
f728bafb
SW
664 if (en) {
665 /* We only hold a read lock, but this is atomic */
666 en->lastuse = jiffies;
667
668 /* Get the least loaded destination */
669 read_lock(&en->set.lock);
1da177e4 670 dest = ip_vs_dest_set_min(&en->set);
f728bafb
SW
671 read_unlock(&en->set.lock);
672
673 /* More than one destination + enough time passed by, cleanup */
1da177e4 674 if (atomic_read(&en->set.size) > 1 &&
f728bafb 675 time_after(jiffies, en->set.lastmod +
b27d777e 676 sysctl_lblcr_expiration(svc))) {
1da177e4 677 struct ip_vs_dest *m;
f728bafb
SW
678
679 write_lock(&en->set.lock);
1da177e4
LT
680 m = ip_vs_dest_set_max(&en->set);
681 if (m)
682 ip_vs_dest_set_erase(&en->set, m);
f728bafb
SW
683 write_unlock(&en->set.lock);
684 }
685
686 /* If the destination is not overloaded, use it */
687 if (dest && !is_overloaded(dest, svc)) {
688 read_unlock(&svc->sched_lock);
689 goto out;
690 }
691
692 /* The cache entry is invalid, time to schedule */
44548375 693 dest = __ip_vs_lblcr_schedule(svc);
f728bafb 694 if (!dest) {
41ac51ee 695 ip_vs_scheduler_err(svc, "no destination available");
f728bafb
SW
696 read_unlock(&svc->sched_lock);
697 return NULL;
1da177e4 698 }
f728bafb
SW
699
700 /* Update our cache entry */
701 write_lock(&en->set.lock);
702 ip_vs_dest_set_insert(&en->set, dest);
703 write_unlock(&en->set.lock);
704 }
705 read_unlock(&svc->sched_lock);
706
707 if (dest)
708 goto out;
709
710 /* No cache entry, time to schedule */
44548375 711 dest = __ip_vs_lblcr_schedule(svc);
f728bafb
SW
712 if (!dest) {
713 IP_VS_DBG(1, "no destination available\n");
714 return NULL;
1da177e4 715 }
1da177e4 716
f728bafb
SW
717 /* If we fail to create a cache entry, we'll just use the valid dest */
718 write_lock(&svc->sched_lock);
44548375 719 ip_vs_lblcr_new(tbl, &iph.daddr, dest);
f728bafb
SW
720 write_unlock(&svc->sched_lock);
721
722out:
44548375
JV
723 IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
724 IP_VS_DBG_ADDR(svc->af, &iph.daddr),
725 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
1da177e4
LT
726
727 return dest;
728}
729
730
731/*
732 * IPVS LBLCR Scheduler structure
733 */
734static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
735{
736 .name = "lblcr",
737 .refcnt = ATOMIC_INIT(0),
738 .module = THIS_MODULE,
d149ccc9 739 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
1da177e4
LT
740 .init_service = ip_vs_lblcr_init_svc,
741 .done_service = ip_vs_lblcr_done_svc,
1da177e4
LT
742 .schedule = ip_vs_lblcr_schedule,
743};
744
61b1ab45
HS
745/*
746 * per netns init.
747 */
fb1de432 748#ifdef CONFIG_SYSCTL
61b1ab45
HS
749static int __net_init __ip_vs_lblcr_init(struct net *net)
750{
d0a1eef9
HS
751 struct netns_ipvs *ipvs = net_ipvs(net);
752
753 if (!net_eq(net, &init_net)) {
754 ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
755 sizeof(vs_vars_table),
756 GFP_KERNEL);
757 if (ipvs->lblcr_ctl_table == NULL)
0443929f 758 return -ENOMEM;
d0a1eef9
HS
759 } else
760 ipvs->lblcr_ctl_table = vs_vars_table;
b27d777e 761 ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
d0a1eef9
HS
762 ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
763
764 ipvs->lblcr_ctl_header =
765 register_net_sysctl_table(net, net_vs_ctl_path,
766 ipvs->lblcr_ctl_table);
0443929f
SH
767 if (!ipvs->lblcr_ctl_header) {
768 if (!net_eq(net, &init_net))
769 kfree(ipvs->lblcr_ctl_table);
770 return -ENOMEM;
771 }
61b1ab45
HS
772
773 return 0;
774}
775
776static void __net_exit __ip_vs_lblcr_exit(struct net *net)
777{
d0a1eef9
HS
778 struct netns_ipvs *ipvs = net_ipvs(net);
779
780 unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
61b1ab45 781
d0a1eef9
HS
782 if (!net_eq(net, &init_net))
783 kfree(ipvs->lblcr_ctl_table);
61b1ab45
HS
784}
785
fb1de432
SH
786#else
787
788static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
789static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
790
791#endif
792
61b1ab45
HS
793static struct pernet_operations ip_vs_lblcr_ops = {
794 .init = __ip_vs_lblcr_init,
795 .exit = __ip_vs_lblcr_exit,
796};
1da177e4
LT
797
798static int __init ip_vs_lblcr_init(void)
799{
a014bc8f
PE
800 int ret;
801
61b1ab45
HS
802 ret = register_pernet_subsys(&ip_vs_lblcr_ops);
803 if (ret)
804 return ret;
805
a014bc8f
PE
806 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
807 if (ret)
61b1ab45 808 unregister_pernet_subsys(&ip_vs_lblcr_ops);
a014bc8f 809 return ret;
1da177e4
LT
810}
811
1da177e4
LT
812static void __exit ip_vs_lblcr_cleanup(void)
813{
1da177e4 814 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
61b1ab45 815 unregister_pernet_subsys(&ip_vs_lblcr_ops);
1da177e4
LT
816}
817
818
819module_init(ip_vs_lblcr_init);
820module_exit(ip_vs_lblcr_cleanup);
821MODULE_LICENSE("GPL");