]>
Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * IPVS: Weighted Round-Robin Scheduling module | |
3 | * | |
1da177e4 LT |
4 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> |
5 | * | |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License | |
8 | * as published by the Free Software Foundation; either version | |
9 | * 2 of the License, or (at your option) any later version. | |
10 | * | |
11 | * Changes: | |
12 | * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest | |
13 | * Wensong Zhang : changed some comestics things for debugging | |
14 | * Wensong Zhang : changed for the d-linked destination list | |
15 | * Wensong Zhang : added the ip_vs_wrr_update_svc | |
16 | * Julian Anastasov : fixed the bug of returning destination | |
17 | * with weight 0 when all weights are zero | |
18 | * | |
19 | */ | |
20 | ||
9aada7ac HE |
21 | #define KMSG_COMPONENT "IPVS" |
22 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt | |
23 | ||
1da177e4 LT |
24 | #include <linux/module.h> |
25 | #include <linux/kernel.h> | |
5a0e3ad6 | 26 | #include <linux/slab.h> |
9c1ca6e6 | 27 | #include <linux/net.h> |
ae24e578 | 28 | #include <linux/gcd.h> |
1da177e4 LT |
29 | |
30 | #include <net/ip_vs.h> | |
31 | ||
08cb2d03 JA |
32 | /* The WRR algorithm depends on some caclulations: |
33 | * - mw: maximum weight | |
34 | * - di: weight step, greatest common divisor from all weights | |
35 | * - cw: current required weight | |
36 | * As result, all weights are in the [di..mw] range with a step=di. | |
37 | * | |
38 | * First, we start with cw = mw and select dests with weight >= cw. | |
39 | * Then cw is reduced with di and all dests are checked again. | |
40 | * Last pass should be with cw = di. We have mw/di passes in total: | |
41 | * | |
42 | * pass 1: cw = max weight | |
43 | * pass 2: cw = max weight - di | |
44 | * pass 3: cw = max weight - 2 * di | |
45 | * ... | |
46 | * last pass: cw = di | |
47 | * | |
48 | * Weights are supposed to be >= di but we run in parallel with | |
49 | * weight changes, it is possible some dest weight to be reduced | |
50 | * below di, bad if it is the only available dest. | |
51 | * | |
52 | * So, we modify how mw is calculated, now it is reduced with (di - 1), | |
53 | * so that last cw is 1 to catch such dests with weight below di: | |
54 | * pass 1: cw = max weight - (di - 1) | |
55 | * pass 2: cw = max weight - di - (di - 1) | |
56 | * pass 3: cw = max weight - 2 * di - (di - 1) | |
57 | * ... | |
58 | * last pass: cw = 1 | |
59 | * | |
60 | */ | |
61 | ||
1da177e4 LT |
62 | /* |
63 | * current destination pointer for weighted round-robin scheduling | |
64 | */ | |
65 | struct ip_vs_wrr_mark { | |
08cb2d03 | 66 | struct ip_vs_dest *cl; /* current dest or head */ |
1da177e4 LT |
67 | int cw; /* current weight */ |
68 | int mw; /* maximum weight */ | |
69 | int di; /* decreasing interval */ | |
08cb2d03 | 70 | struct rcu_head rcu_head; |
1da177e4 LT |
71 | }; |
72 | ||
73 | ||
1da177e4 LT |
74 | static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) |
75 | { | |
76 | struct ip_vs_dest *dest; | |
77 | int weight; | |
78 | int g = 0; | |
79 | ||
80 | list_for_each_entry(dest, &svc->destinations, n_list) { | |
81 | weight = atomic_read(&dest->weight); | |
82 | if (weight > 0) { | |
83 | if (g > 0) | |
84 | g = gcd(weight, g); | |
85 | else | |
86 | g = weight; | |
87 | } | |
88 | } | |
89 | return g ? g : 1; | |
90 | } | |
91 | ||
92 | ||
93 | /* | |
94 | * Get the maximum weight of the service destinations. | |
95 | */ | |
96 | static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) | |
97 | { | |
98 | struct ip_vs_dest *dest; | |
1e66dafc | 99 | int new_weight, weight = 0; |
1da177e4 LT |
100 | |
101 | list_for_each_entry(dest, &svc->destinations, n_list) { | |
1e66dafc SH |
102 | new_weight = atomic_read(&dest->weight); |
103 | if (new_weight > weight) | |
104 | weight = new_weight; | |
1da177e4 LT |
105 | } |
106 | ||
107 | return weight; | |
108 | } | |
109 | ||
110 | ||
111 | static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) | |
112 | { | |
113 | struct ip_vs_wrr_mark *mark; | |
114 | ||
115 | /* | |
116 | * Allocate the mark variable for WRR scheduling | |
117 | */ | |
4f2a94dc | 118 | mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL); |
0a9ee813 | 119 | if (mark == NULL) |
1da177e4 | 120 | return -ENOMEM; |
0a9ee813 | 121 | |
08cb2d03 | 122 | mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); |
1da177e4 | 123 | mark->di = ip_vs_wrr_gcd_weight(svc); |
08cb2d03 JA |
124 | mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); |
125 | mark->cw = mark->mw; | |
1da177e4 LT |
126 | svc->sched_data = mark; |
127 | ||
128 | return 0; | |
129 | } | |
130 | ||
131 | ||
ed3ffc4e | 132 | static void ip_vs_wrr_done_svc(struct ip_vs_service *svc) |
1da177e4 | 133 | { |
08cb2d03 JA |
134 | struct ip_vs_wrr_mark *mark = svc->sched_data; |
135 | ||
1da177e4 LT |
136 | /* |
137 | * Release the mark variable | |
138 | */ | |
08cb2d03 | 139 | kfree_rcu(mark, rcu_head); |
1da177e4 LT |
140 | } |
141 | ||
142 | ||
08cb2d03 JA |
143 | static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc, |
144 | struct ip_vs_dest *dest) | |
1da177e4 LT |
145 | { |
146 | struct ip_vs_wrr_mark *mark = svc->sched_data; | |
147 | ||
08cb2d03 JA |
148 | write_lock_bh(&svc->sched_lock); |
149 | mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list); | |
1da177e4 | 150 | mark->di = ip_vs_wrr_gcd_weight(svc); |
08cb2d03 JA |
151 | mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1); |
152 | if (mark->cw > mark->mw || !mark->cw) | |
153 | mark->cw = mark->mw; | |
154 | else if (mark->di > 1) | |
155 | mark->cw = (mark->cw / mark->di) * mark->di + 1; | |
156 | write_unlock_bh(&svc->sched_lock); | |
1da177e4 LT |
157 | return 0; |
158 | } | |
159 | ||
160 | ||
161 | /* | |
162 | * Weighted Round-Robin Scheduling | |
163 | */ | |
164 | static struct ip_vs_dest * | |
165 | ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) | |
166 | { | |
08cb2d03 | 167 | struct ip_vs_dest *dest, *last, *stop = NULL; |
1da177e4 | 168 | struct ip_vs_wrr_mark *mark = svc->sched_data; |
08cb2d03 | 169 | bool last_pass = false, restarted = false; |
1da177e4 | 170 | |
1e3e238e | 171 | IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); |
1da177e4 | 172 | |
1da177e4 | 173 | write_lock(&svc->sched_lock); |
08cb2d03 JA |
174 | dest = mark->cl; |
175 | /* No available dests? */ | |
176 | if (mark->mw == 0) | |
177 | goto err_noavail; | |
178 | last = dest; | |
179 | /* Stop only after all dests were checked for weight >= 1 (last pass) */ | |
1da177e4 | 180 | while (1) { |
08cb2d03 JA |
181 | list_for_each_entry_continue_rcu(dest, |
182 | &svc->destinations, | |
183 | n_list) { | |
1da177e4 | 184 | if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && |
08cb2d03 JA |
185 | atomic_read(&dest->weight) >= mark->cw) |
186 | goto found; | |
187 | if (dest == stop) | |
188 | goto err_over; | |
1da177e4 | 189 | } |
08cb2d03 JA |
190 | mark->cw -= mark->di; |
191 | if (mark->cw <= 0) { | |
192 | mark->cw = mark->mw; | |
193 | /* Stop if we tried last pass from first dest: | |
194 | * 1. last_pass: we started checks when cw > di but | |
195 | * then all dests were checked for w >= 1 | |
196 | * 2. last was head: the first and only traversal | |
197 | * was for weight >= 1, for all dests. | |
198 | */ | |
199 | if (last_pass || | |
200 | &last->n_list == &svc->destinations) | |
201 | goto err_over; | |
202 | restarted = true; | |
203 | } | |
204 | last_pass = mark->cw <= mark->di; | |
205 | if (last_pass && restarted && | |
206 | &last->n_list != &svc->destinations) { | |
207 | /* First traversal was for w >= 1 but only | |
208 | * for dests after 'last', now do the same | |
209 | * for all dests up to 'last'. | |
210 | */ | |
211 | stop = last; | |
1da177e4 LT |
212 | } |
213 | } | |
214 | ||
08cb2d03 | 215 | found: |
b14198f6 JV |
216 | IP_VS_DBG_BUF(6, "WRR: server %s:%u " |
217 | "activeconns %d refcnt %d weight %d\n", | |
218 | IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), | |
219 | atomic_read(&dest->activeconns), | |
220 | atomic_read(&dest->refcnt), | |
221 | atomic_read(&dest->weight)); | |
08cb2d03 | 222 | mark->cl = dest; |
1da177e4 LT |
223 | |
224 | out: | |
225 | write_unlock(&svc->sched_lock); | |
226 | return dest; | |
08cb2d03 JA |
227 | |
228 | err_noavail: | |
229 | mark->cl = dest; | |
230 | dest = NULL; | |
231 | ip_vs_scheduler_err(svc, "no destination available"); | |
232 | goto out; | |
233 | ||
234 | err_over: | |
235 | mark->cl = dest; | |
236 | dest = NULL; | |
237 | ip_vs_scheduler_err(svc, "no destination available: " | |
238 | "all destinations are overloaded"); | |
239 | goto out; | |
1da177e4 LT |
240 | } |
241 | ||
242 | ||
243 | static struct ip_vs_scheduler ip_vs_wrr_scheduler = { | |
244 | .name = "wrr", | |
245 | .refcnt = ATOMIC_INIT(0), | |
246 | .module = THIS_MODULE, | |
d149ccc9 | 247 | .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), |
1da177e4 LT |
248 | .init_service = ip_vs_wrr_init_svc, |
249 | .done_service = ip_vs_wrr_done_svc, | |
08cb2d03 JA |
250 | .add_dest = ip_vs_wrr_dest_changed, |
251 | .del_dest = ip_vs_wrr_dest_changed, | |
252 | .upd_dest = ip_vs_wrr_dest_changed, | |
1da177e4 LT |
253 | .schedule = ip_vs_wrr_schedule, |
254 | }; | |
255 | ||
256 | static int __init ip_vs_wrr_init(void) | |
257 | { | |
1da177e4 LT |
258 | return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; |
259 | } | |
260 | ||
261 | static void __exit ip_vs_wrr_cleanup(void) | |
262 | { | |
263 | unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); | |
264 | } | |
265 | ||
266 | module_init(ip_vs_wrr_init); | |
267 | module_exit(ip_vs_wrr_cleanup); | |
268 | MODULE_LICENSE("GPL"); |