]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - net/netfilter/ipvs/ip_vs_lblcr.c
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit...
[mirror_ubuntu-artful-kernel.git] / net / netfilter / ipvs / ip_vs_lblcr.c
CommitLineData
1da177e4
LT
1/*
2 * IPVS: Locality-Based Least-Connection with Replication scheduler
3 *
1da177e4
LT
4 * Authors: Wensong Zhang <wensong@gnuchina.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Changes:
12 * Julian Anastasov : Added the missing (dest->weight>0)
13 * condition in the ip_vs_dest_set_max.
14 *
15 */
16
17/*
18 * The lblc/r algorithm is as follows (pseudo code):
19 *
20 * if serverSet[dest_ip] is null then
21 * n, serverSet[dest_ip] <- {weighted least-conn node};
22 * else
23 * n <- {least-conn (alive) node in serverSet[dest_ip]};
24 * if (n is null) OR
25 * (n.conns>n.weight AND
26 * there is a node m with m.conns<m.weight/2) then
27 * n <- {weighted least-conn node};
28 * add n to serverSet[dest_ip];
29 * if |serverSet[dest_ip]| > 1 AND
30 * now - serverSet[dest_ip].lastMod > T then
31 * m <- {most conn node in serverSet[dest_ip]};
32 * remove m from serverSet[dest_ip];
33 * if serverSet[dest_ip] changed then
34 * serverSet[dest_ip].lastMod <- now;
35 *
36 * return n;
37 *
38 */
39
9aada7ac
HE
40#define KMSG_COMPONENT "IPVS"
41#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
42
14c85021 43#include <linux/ip.h>
1da177e4
LT
44#include <linux/module.h>
45#include <linux/kernel.h>
14c85021 46#include <linux/skbuff.h>
d7fe0f24 47#include <linux/jiffies.h>
51f0bc78 48#include <linux/list.h>
5a0e3ad6 49#include <linux/slab.h>
1da177e4
LT
50
51/* for sysctl */
52#include <linux/fs.h>
53#include <linux/sysctl.h>
457c4cbc 54#include <net/net_namespace.h>
1da177e4
LT
55
56#include <net/ip_vs.h>
57
58
59/*
60 * It is for garbage collection of stale IPVS lblcr entries,
61 * when the table is full.
62 */
63#define CHECK_EXPIRE_INTERVAL (60*HZ)
64#define ENTRY_TIMEOUT (6*60*HZ)
65
66/*
67 * It is for full expiration check.
68 * When there is no partial expiration check (garbage collection)
69 * in a half hour, do a full expiration check to collect stale
70 * entries that haven't been touched for a day.
71 */
72#define COUNT_FOR_FULL_EXPIRATION 30
73static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
74
75
/*
 * Geometry of the LBLCR entry hash table.
 *
 * The number of buckets is a power of two so that a hash value can be
 * reduced to a bucket index with a simple mask.  The bit count may be
 * overridden at build time through CONFIG_IP_VS_LBLCR_TAB_BITS.
 */
#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
#define CONFIG_IP_VS_LBLCR_TAB_BITS	10
#endif
#define IP_VS_LBLCR_TAB_BITS	CONFIG_IP_VS_LBLCR_TAB_BITS
#define IP_VS_LBLCR_TAB_SIZE	(1 << IP_VS_LBLCR_TAB_BITS)
#define IP_VS_LBLCR_TAB_MASK	(IP_VS_LBLCR_TAB_SIZE - 1)
85
86
87/*
88 * IPVS destination set structure and operations
89 */
51f0bc78
SH
90struct ip_vs_dest_set_elem {
91 struct list_head list; /* list link */
1da177e4
LT
92 struct ip_vs_dest *dest; /* destination server */
93};
94
95struct ip_vs_dest_set {
96 atomic_t size; /* set size */
97 unsigned long lastmod; /* last modified time */
51f0bc78 98 struct list_head list; /* destination list */
1da177e4
LT
99 rwlock_t lock; /* lock for this list */
100};
101
102
51f0bc78 103static struct ip_vs_dest_set_elem *
1da177e4
LT
104ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
105{
51f0bc78 106 struct ip_vs_dest_set_elem *e;
1da177e4 107
51f0bc78 108 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
109 if (e->dest == dest)
110 /* already existed */
111 return NULL;
112 }
113
f728bafb 114 e = kmalloc(sizeof(*e), GFP_ATOMIC);
1da177e4 115 if (e == NULL) {
1e3e238e 116 pr_err("%s(): no memory\n", __func__);
1da177e4
LT
117 return NULL;
118 }
119
120 atomic_inc(&dest->refcnt);
121 e->dest = dest;
122
51f0bc78 123 list_add(&e->list, &set->list);
1da177e4 124 atomic_inc(&set->size);
1da177e4
LT
125
126 set->lastmod = jiffies;
127 return e;
128}
129
130static void
131ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
132{
51f0bc78 133 struct ip_vs_dest_set_elem *e;
1da177e4 134
51f0bc78 135 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
136 if (e->dest == dest) {
137 /* HIT */
1da177e4
LT
138 atomic_dec(&set->size);
139 set->lastmod = jiffies;
140 atomic_dec(&e->dest->refcnt);
51f0bc78 141 list_del(&e->list);
1da177e4
LT
142 kfree(e);
143 break;
144 }
1da177e4 145 }
1da177e4
LT
146}
147
148static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
149{
51f0bc78 150 struct ip_vs_dest_set_elem *e, *ep;
1da177e4
LT
151
152 write_lock(&set->lock);
51f0bc78 153 list_for_each_entry_safe(e, ep, &set->list, list) {
1da177e4
LT
154 /*
155 * We don't kfree dest because it is refered either
156 * by its service or by the trash dest list.
157 */
158 atomic_dec(&e->dest->refcnt);
51f0bc78 159 list_del(&e->list);
1da177e4
LT
160 kfree(e);
161 }
162 write_unlock(&set->lock);
163}
164
165/* get weighted least-connection node in the destination set */
166static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
167{
51f0bc78 168 register struct ip_vs_dest_set_elem *e;
1da177e4
LT
169 struct ip_vs_dest *dest, *least;
170 int loh, doh;
171
172 if (set == NULL)
173 return NULL;
174
1da177e4 175 /* select the first destination server, whose weight > 0 */
51f0bc78 176 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
177 least = e->dest;
178 if (least->flags & IP_VS_DEST_F_OVERLOAD)
179 continue;
180
181 if ((atomic_read(&least->weight) > 0)
182 && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
183 loh = atomic_read(&least->activeconns) * 50
184 + atomic_read(&least->inactconns);
185 goto nextstage;
186 }
187 }
1da177e4
LT
188 return NULL;
189
190 /* find the destination with the weighted least load */
191 nextstage:
51f0bc78 192 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
193 dest = e->dest;
194 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
195 continue;
196
197 doh = atomic_read(&dest->activeconns) * 50
198 + atomic_read(&dest->inactconns);
199 if ((loh * atomic_read(&dest->weight) >
200 doh * atomic_read(&least->weight))
201 && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
202 least = dest;
203 loh = doh;
204 }
205 }
1da177e4 206
1e3e238e 207 IP_VS_DBG_BUF(6, "%s(): server %s:%d "
44548375 208 "activeconns %d refcnt %d weight %d overhead %d\n",
1e3e238e 209 __func__,
44548375
JV
210 IP_VS_DBG_ADDR(least->af, &least->addr),
211 ntohs(least->port),
212 atomic_read(&least->activeconns),
213 atomic_read(&least->refcnt),
214 atomic_read(&least->weight), loh);
1da177e4
LT
215 return least;
216}
217
218
219/* get weighted most-connection node in the destination set */
220static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
221{
51f0bc78 222 register struct ip_vs_dest_set_elem *e;
1da177e4
LT
223 struct ip_vs_dest *dest, *most;
224 int moh, doh;
225
226 if (set == NULL)
227 return NULL;
228
1da177e4 229 /* select the first destination server, whose weight > 0 */
51f0bc78 230 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
231 most = e->dest;
232 if (atomic_read(&most->weight) > 0) {
233 moh = atomic_read(&most->activeconns) * 50
234 + atomic_read(&most->inactconns);
235 goto nextstage;
236 }
237 }
1da177e4
LT
238 return NULL;
239
240 /* find the destination with the weighted most load */
241 nextstage:
51f0bc78 242 list_for_each_entry(e, &set->list, list) {
1da177e4
LT
243 dest = e->dest;
244 doh = atomic_read(&dest->activeconns) * 50
245 + atomic_read(&dest->inactconns);
246 /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
247 if ((moh * atomic_read(&dest->weight) <
248 doh * atomic_read(&most->weight))
249 && (atomic_read(&dest->weight) > 0)) {
250 most = dest;
251 moh = doh;
252 }
253 }
1da177e4 254
1e3e238e 255 IP_VS_DBG_BUF(6, "%s(): server %s:%d "
44548375 256 "activeconns %d refcnt %d weight %d overhead %d\n",
1e3e238e 257 __func__,
44548375
JV
258 IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
259 atomic_read(&most->activeconns),
260 atomic_read(&most->refcnt),
261 atomic_read(&most->weight), moh);
1da177e4
LT
262 return most;
263}
264
265
266/*
267 * IPVS lblcr entry represents an association between destination
268 * IP address and its destination server set
269 */
270struct ip_vs_lblcr_entry {
271 struct list_head list;
44548375
JV
272 int af; /* address family */
273 union nf_inet_addr addr; /* destination IP address */
1da177e4
LT
274 struct ip_vs_dest_set set; /* destination server set */
275 unsigned long lastuse; /* last used time */
276};
277
278
279/*
280 * IPVS lblcr hash table
281 */
282struct ip_vs_lblcr_table {
1da177e4
LT
283 struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
284 atomic_t entries; /* number of entries */
285 int max_size; /* maximum size of entries */
286 struct timer_list periodic_timer; /* collect stale entries */
287 int rover; /* rover for expire check */
288 int counter; /* counter for no expire */
289};
290
291
292/*
293 * IPVS LBLCR sysctl table
294 */
295
296static ctl_table vs_vars_table[] = {
297 {
1da177e4
LT
298 .procname = "lblcr_expiration",
299 .data = &sysctl_ip_vs_lblcr_expiration,
300 .maxlen = sizeof(int),
e905a9ed 301 .mode = 0644,
6d9f239a 302 .proc_handler = proc_dointvec_jiffies,
1da177e4 303 },
f8572d8f 304 { }
1da177e4
LT
305};
306
1da177e4
LT
307static struct ctl_table_header * sysctl_header;
308
1da177e4
LT
309static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
310{
311 list_del(&en->list);
312 ip_vs_dest_set_eraseall(&en->set);
313 kfree(en);
314}
315
316
317/*
318 * Returns hash value for IPVS LBLCR entry
319 */
44548375
JV
320static inline unsigned
321ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
1da177e4 322{
44548375
JV
323 __be32 addr_fold = addr->ip;
324
325#ifdef CONFIG_IP_VS_IPV6
326 if (af == AF_INET6)
327 addr_fold = addr->ip6[0]^addr->ip6[1]^
328 addr->ip6[2]^addr->ip6[3];
329#endif
330 return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
1da177e4
LT
331}
332
333
334/*
335 * Hash an entry in the ip_vs_lblcr_table.
336 * returns bool success.
337 */
f728bafb 338static void
1da177e4
LT
339ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
340{
44548375 341 unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
1da177e4 342
1da177e4
LT
343 list_add(&en->list, &tbl->bucket[hash]);
344 atomic_inc(&tbl->entries);
1da177e4
LT
345}
346
347
1da177e4 348/*
f728bafb
SW
349 * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
350 * read lock.
1da177e4
LT
351 */
352static inline struct ip_vs_lblcr_entry *
44548375
JV
353ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
354 const union nf_inet_addr *addr)
1da177e4 355{
44548375 356 unsigned hash = ip_vs_lblcr_hashkey(af, addr);
1da177e4
LT
357 struct ip_vs_lblcr_entry *en;
358
f728bafb 359 list_for_each_entry(en, &tbl->bucket[hash], list)
44548375 360 if (ip_vs_addr_equal(af, &en->addr, addr))
f728bafb
SW
361 return en;
362
363 return NULL;
364}
1da177e4 365
1da177e4 366
f728bafb
SW
367/*
368 * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
369 * IP address to a server. Called under write lock.
370 */
371static inline struct ip_vs_lblcr_entry *
44548375 372ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
f728bafb
SW
373 struct ip_vs_dest *dest)
374{
375 struct ip_vs_lblcr_entry *en;
376
44548375 377 en = ip_vs_lblcr_get(dest->af, tbl, daddr);
f728bafb
SW
378 if (!en) {
379 en = kmalloc(sizeof(*en), GFP_ATOMIC);
380 if (!en) {
1e3e238e 381 pr_err("%s(): no memory\n", __func__);
f728bafb 382 return NULL;
1da177e4 383 }
f728bafb 384
44548375
JV
385 en->af = dest->af;
386 ip_vs_addr_copy(dest->af, &en->addr, daddr);
f728bafb
SW
387 en->lastuse = jiffies;
388
389 /* initilize its dest set */
390 atomic_set(&(en->set.size), 0);
51f0bc78 391 INIT_LIST_HEAD(&en->set.list);
f728bafb
SW
392 rwlock_init(&en->set.lock);
393
394 ip_vs_lblcr_hash(tbl, en);
1da177e4
LT
395 }
396
f728bafb
SW
397 write_lock(&en->set.lock);
398 ip_vs_dest_set_insert(&en->set, dest);
399 write_unlock(&en->set.lock);
1da177e4 400
f728bafb 401 return en;
1da177e4
LT
402}
403
404
405/*
406 * Flush all the entries of the specified table.
407 */
408static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
409{
410 int i;
411 struct ip_vs_lblcr_entry *en, *nxt;
412
f728bafb 413 /* No locking required, only called during cleanup. */
1da177e4 414 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
1da177e4
LT
415 list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
416 ip_vs_lblcr_free(en);
1da177e4 417 }
1da177e4
LT
418 }
419}
420
421
f728bafb 422static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
1da177e4 423{
f728bafb 424 struct ip_vs_lblcr_table *tbl = svc->sched_data;
1da177e4
LT
425 unsigned long now = jiffies;
426 int i, j;
427 struct ip_vs_lblcr_entry *en, *nxt;
428
429 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
430 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
431
f728bafb 432 write_lock(&svc->sched_lock);
1da177e4
LT
433 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
434 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
435 now))
436 continue;
437
438 ip_vs_lblcr_free(en);
439 atomic_dec(&tbl->entries);
440 }
f728bafb 441 write_unlock(&svc->sched_lock);
1da177e4
LT
442 }
443 tbl->rover = j;
444}
445
446
447/*
448 * Periodical timer handler for IPVS lblcr table
449 * It is used to collect stale entries when the number of entries
450 * exceeds the maximum size of the table.
451 *
452 * Fixme: we probably need more complicated algorithm to collect
453 * entries that have not been used for a long time even
454 * if the number of entries doesn't exceed the maximum size
455 * of the table.
456 * The full expiration check is for this purpose now.
457 */
458static void ip_vs_lblcr_check_expire(unsigned long data)
459{
f728bafb
SW
460 struct ip_vs_service *svc = (struct ip_vs_service *) data;
461 struct ip_vs_lblcr_table *tbl = svc->sched_data;
1da177e4
LT
462 unsigned long now = jiffies;
463 int goal;
464 int i, j;
465 struct ip_vs_lblcr_entry *en, *nxt;
466
1da177e4
LT
467 if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
468 /* do full expiration check */
f728bafb 469 ip_vs_lblcr_full_check(svc);
1da177e4
LT
470 tbl->counter = 1;
471 goto out;
472 }
473
474 if (atomic_read(&tbl->entries) <= tbl->max_size) {
475 tbl->counter++;
476 goto out;
477 }
478
479 goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
480 if (goal > tbl->max_size/2)
481 goal = tbl->max_size/2;
482
483 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
484 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
485
f728bafb 486 write_lock(&svc->sched_lock);
1da177e4
LT
487 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
488 if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
489 continue;
490
491 ip_vs_lblcr_free(en);
492 atomic_dec(&tbl->entries);
493 goal--;
494 }
f728bafb 495 write_unlock(&svc->sched_lock);
1da177e4
LT
496 if (goal <= 0)
497 break;
498 }
499 tbl->rover = j;
500
501 out:
502 mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
503}
504
1da177e4
LT
505static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
506{
507 int i;
508 struct ip_vs_lblcr_table *tbl;
509
510 /*
511 * Allocate the ip_vs_lblcr_table for this service
512 */
f728bafb 513 tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
1da177e4 514 if (tbl == NULL) {
1e3e238e 515 pr_err("%s(): no memory\n", __func__);
1da177e4
LT
516 return -ENOMEM;
517 }
518 svc->sched_data = tbl;
519 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
f728bafb 520 "current service\n", sizeof(*tbl));
1da177e4
LT
521
522 /*
523 * Initialize the hash buckets
524 */
525 for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
526 INIT_LIST_HEAD(&tbl->bucket[i]);
527 }
1da177e4
LT
528 tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
529 tbl->rover = 0;
530 tbl->counter = 1;
531
532 /*
533 * Hook periodic timer for garbage collection
534 */
b24b8a24 535 setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
f728bafb
SW
536 (unsigned long)svc);
537 mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
1da177e4 538
1da177e4
LT
539 return 0;
540}
541
542
543static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
544{
545 struct ip_vs_lblcr_table *tbl = svc->sched_data;
546
547 /* remove periodic timer */
548 del_timer_sync(&tbl->periodic_timer);
549
550 /* got to clean up table entries here */
551 ip_vs_lblcr_flush(tbl);
552
553 /* release the table itself */
f728bafb 554 kfree(tbl);
1da177e4 555 IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
f728bafb 556 sizeof(*tbl));
1da177e4
LT
557
558 return 0;
559}
560
561
1da177e4 562static inline struct ip_vs_dest *
44548375 563__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
1da177e4
LT
564{
565 struct ip_vs_dest *dest, *least;
566 int loh, doh;
567
568 /*
569 * We think the overhead of processing active connections is fifty
570 * times higher than that of inactive connections in average. (This
571 * fifty times might not be accurate, we will change it later.) We
572 * use the following formula to estimate the overhead:
573 * dest->activeconns*50 + dest->inactconns
574 * and the load:
575 * (dest overhead) / dest->weight
576 *
577 * Remember -- no floats in kernel mode!!!
578 * The comparison of h1*w2 > h2*w1 is equivalent to that of
579 * h1/w1 > h2/w2
580 * if every weight is larger than zero.
581 *
582 * The server with weight=0 is quiesced and will not receive any
583 * new connection.
584 */
585 list_for_each_entry(dest, &svc->destinations, n_list) {
586 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
587 continue;
588
589 if (atomic_read(&dest->weight) > 0) {
590 least = dest;
591 loh = atomic_read(&least->activeconns) * 50
592 + atomic_read(&least->inactconns);
593 goto nextstage;
594 }
595 }
596 return NULL;
597
598 /*
599 * Find the destination with the least load.
600 */
601 nextstage:
602 list_for_each_entry_continue(dest, &svc->destinations, n_list) {
603 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
604 continue;
605
606 doh = atomic_read(&dest->activeconns) * 50
607 + atomic_read(&dest->inactconns);
608 if (loh * atomic_read(&dest->weight) >
609 doh * atomic_read(&least->weight)) {
610 least = dest;
611 loh = doh;
612 }
613 }
614
44548375
JV
615 IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
616 "activeconns %d refcnt %d weight %d overhead %d\n",
617 IP_VS_DBG_ADDR(least->af, &least->addr),
618 ntohs(least->port),
619 atomic_read(&least->activeconns),
620 atomic_read(&least->refcnt),
621 atomic_read(&least->weight), loh);
1da177e4
LT
622
623 return least;
624}
625
626
627/*
628 * If this destination server is overloaded and there is a less loaded
629 * server, then return true.
630 */
631static inline int
632is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
633{
634 if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
635 struct ip_vs_dest *d;
636
637 list_for_each_entry(d, &svc->destinations, n_list) {
638 if (atomic_read(&d->activeconns)*2
639 < atomic_read(&d->weight)) {
640 return 1;
641 }
642 }
643 }
644 return 0;
645}
646
647
648/*
649 * Locality-Based (weighted) Least-Connection scheduling
650 */
651static struct ip_vs_dest *
652ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
653{
f728bafb 654 struct ip_vs_lblcr_table *tbl = svc->sched_data;
44548375 655 struct ip_vs_iphdr iph;
f728bafb
SW
656 struct ip_vs_dest *dest = NULL;
657 struct ip_vs_lblcr_entry *en;
1da177e4 658
44548375
JV
659 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
660
1e3e238e 661 IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
1da177e4 662
f728bafb
SW
663 /* First look in our cache */
664 read_lock(&svc->sched_lock);
44548375 665 en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
f728bafb
SW
666 if (en) {
667 /* We only hold a read lock, but this is atomic */
668 en->lastuse = jiffies;
669
670 /* Get the least loaded destination */
671 read_lock(&en->set.lock);
1da177e4 672 dest = ip_vs_dest_set_min(&en->set);
f728bafb
SW
673 read_unlock(&en->set.lock);
674
675 /* More than one destination + enough time passed by, cleanup */
1da177e4 676 if (atomic_read(&en->set.size) > 1 &&
f728bafb
SW
677 time_after(jiffies, en->set.lastmod +
678 sysctl_ip_vs_lblcr_expiration)) {
1da177e4 679 struct ip_vs_dest *m;
f728bafb
SW
680
681 write_lock(&en->set.lock);
1da177e4
LT
682 m = ip_vs_dest_set_max(&en->set);
683 if (m)
684 ip_vs_dest_set_erase(&en->set, m);
f728bafb
SW
685 write_unlock(&en->set.lock);
686 }
687
688 /* If the destination is not overloaded, use it */
689 if (dest && !is_overloaded(dest, svc)) {
690 read_unlock(&svc->sched_lock);
691 goto out;
692 }
693
694 /* The cache entry is invalid, time to schedule */
44548375 695 dest = __ip_vs_lblcr_schedule(svc);
f728bafb 696 if (!dest) {
68888d10 697 IP_VS_ERR_RL("LBLCR: no destination available\n");
f728bafb
SW
698 read_unlock(&svc->sched_lock);
699 return NULL;
1da177e4 700 }
f728bafb
SW
701
702 /* Update our cache entry */
703 write_lock(&en->set.lock);
704 ip_vs_dest_set_insert(&en->set, dest);
705 write_unlock(&en->set.lock);
706 }
707 read_unlock(&svc->sched_lock);
708
709 if (dest)
710 goto out;
711
712 /* No cache entry, time to schedule */
44548375 713 dest = __ip_vs_lblcr_schedule(svc);
f728bafb
SW
714 if (!dest) {
715 IP_VS_DBG(1, "no destination available\n");
716 return NULL;
1da177e4 717 }
1da177e4 718
f728bafb
SW
719 /* If we fail to create a cache entry, we'll just use the valid dest */
720 write_lock(&svc->sched_lock);
44548375 721 ip_vs_lblcr_new(tbl, &iph.daddr, dest);
f728bafb
SW
722 write_unlock(&svc->sched_lock);
723
724out:
44548375
JV
725 IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
726 IP_VS_DBG_ADDR(svc->af, &iph.daddr),
727 IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port));
1da177e4
LT
728
729 return dest;
730}
731
732
733/*
734 * IPVS LBLCR Scheduler structure
735 */
736static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
737{
738 .name = "lblcr",
739 .refcnt = ATOMIC_INIT(0),
740 .module = THIS_MODULE,
d149ccc9 741 .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
1da177e4
LT
742 .init_service = ip_vs_lblcr_init_svc,
743 .done_service = ip_vs_lblcr_done_svc,
1da177e4
LT
744 .schedule = ip_vs_lblcr_schedule,
745};
746
747
748static int __init ip_vs_lblcr_init(void)
749{
a014bc8f
PE
750 int ret;
751
90754f8e 752 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table);
a014bc8f
PE
753 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
754 if (ret)
755 unregister_sysctl_table(sysctl_header);
756 return ret;
1da177e4
LT
757}
758
759
760static void __exit ip_vs_lblcr_cleanup(void)
761{
1da177e4
LT
762 unregister_sysctl_table(sysctl_header);
763 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
764}
765
766
767module_init(ip_vs_lblcr_init);
768module_exit(ip_vs_lblcr_cleanup);
769MODULE_LICENSE("GPL");