/*
 * QEMU throttling infrastructure
 *
 * Copyright (C) Nodalink, SARL. 2013
 *
 * Author:
 *   Benoît Canet <benoit.canet@irqsave.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 or
 * (at your option) version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/throttle.h"
#include "qemu/timer.h"
#include "block/aio.h"

/* This function makes a bucket leak
 *
 * @bkt:      the bucket to leak
 * @delta_ns: the time elapsed since the last leak, in ns
 */
void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
{
    double leak;

    /* compute how much to leak */
    leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;

    /* make the bucket leak */
    bkt->level = MAX(bkt->level - leak, 0);
}
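
/* For illustration (numbers are hypothetical): with avg == 100 units/s and
 * delta_ns == 500 ms (0.5 s), the bucket leaks 100 * 0.5 == 50 units, so its
 * level drops by 50, clamped at zero.
 */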

/* Calculate the time delta since the last leak and make each bucket leak
 * proportionally to it
 *
 * @now: the current timestamp in ns
 */
static void throttle_do_leak(ThrottleState *ts, int64_t now)
{
    /* compute the time elapsed since the last leak */
    int64_t delta_ns = now - ts->previous_leak;
    int i;

    ts->previous_leak = now;

    if (delta_ns <= 0) {
        return;
    }

    /* make each bucket leak */
    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
    }
}

/* do the real job of computing the time to wait
 *
 * @limit: the throttling limit
 * @extra: the number of extra units to delay
 * @ret:   the time to wait in ns
 */
static int64_t throttle_do_compute_wait(double limit, double extra)
{
    double wait = extra * NANOSECONDS_PER_SECOND;
    wait /= limit;
    return wait;
}

/* This function computes the wait time in ns that a leaky bucket should trigger
 *
 * @bkt: the leaky bucket we operate on
 * @ret: the resulting wait time in ns or 0 if the operation can go through
 */
int64_t throttle_compute_wait(LeakyBucket *bkt)
{
    double extra; /* the number of extra units blocking the io */

    if (!bkt->avg) {
        return 0;
    }

    extra = bkt->level - bkt->max;

    if (extra <= 0) {
        return 0;
    }

    return throttle_do_compute_wait(bkt->avg, extra);
}
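
/* For illustration (numbers are hypothetical): with level == 150, max == 100
 * and avg == 100 units/s, extra == 50 units, so the wait is
 * 50 / 100 s == 0.5 s == 500,000,000 ns.
 */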

/* This function computes the time that must be waited before letting an I/O
 * through: a read is limited by the total and read buckets, a write by the
 * total and write buckets
 *
 * @is_write: true if the current IO is a write, false if it's a read
 * @ret:      time to wait in ns
 */
static int64_t throttle_compute_wait_for(ThrottleState *ts,
                                         bool is_write)
{
    BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
                                   THROTTLE_OPS_TOTAL,
                                   THROTTLE_BPS_READ,
                                   THROTTLE_OPS_READ},
                                  {THROTTLE_BPS_TOTAL,
                                   THROTTLE_OPS_TOTAL,
                                   THROTTLE_BPS_WRITE,
                                   THROTTLE_OPS_WRITE}, };
    int64_t wait, max_wait = 0;
    int i;

    for (i = 0; i < 4; i++) {
        BucketType index = to_check[is_write][i];
        wait = throttle_compute_wait(&ts->cfg.buckets[index]);
        if (wait > max_wait) {
            max_wait = wait;
        }
    }

    return max_wait;
}

/* compute the timer for this type of operation
 *
 * @is_write:       the type of operation
 * @now:            the current clock timestamp
 * @next_timestamp: the timestamp at which the timer should fire
 * @ret:            true if a timer must be set
 */
bool throttle_compute_timer(ThrottleState *ts,
                            bool is_write,
                            int64_t now,
                            int64_t *next_timestamp)
{
    int64_t wait;

    /* leak proportionally to the time elapsed */
    throttle_do_leak(ts, now);

    /* compute the wait time if any */
    wait = throttle_compute_wait_for(ts, is_write);

    /* if the code must wait compute when the next timer should fire */
    if (wait) {
        *next_timestamp = now + wait;
        return true;
    }

    /* else no need to wait at all */
    *next_timestamp = now;
    return false;
}

/* Add timers to event loop */
void throttle_attach_aio_context(ThrottleState *ts, AioContext *new_context)
{
    ts->timers[0] = aio_timer_new(new_context, ts->clock_type, SCALE_NS,
                                  ts->read_timer_cb, ts->timer_opaque);
    ts->timers[1] = aio_timer_new(new_context, ts->clock_type, SCALE_NS,
                                  ts->write_timer_cb, ts->timer_opaque);
}

/* To be called first on the ThrottleState */
void throttle_init(ThrottleState *ts,
                   AioContext *aio_context,
                   QEMUClockType clock_type,
                   QEMUTimerCB *read_timer_cb,
                   QEMUTimerCB *write_timer_cb,
                   void *timer_opaque)
{
    memset(ts, 0, sizeof(ThrottleState));

    ts->clock_type = clock_type;
    ts->read_timer_cb = read_timer_cb;
    ts->write_timer_cb = write_timer_cb;
    ts->timer_opaque = timer_opaque;
    throttle_attach_aio_context(ts, aio_context);
}
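
/* Illustrative call sequence (a sketch only; the AioContext, clock type,
 * timer callbacks and request-handling policy below are placeholders chosen
 * by the caller, not something defined in this file):
 *
 *     ThrottleState ts;
 *     ThrottleConfig cfg = { 0 };
 *
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1000000;   // ~1 MB/s
 *     throttle_init(&ts, my_aio_context, QEMU_CLOCK_VIRTUAL,
 *                   my_read_timer_cb, my_write_timer_cb, my_opaque);
 *     throttle_config(&ts, &cfg);
 *
 *     // before submitting each request:
 *     if (throttle_schedule_timer(&ts, is_write)) {
 *         // throttled: queue the request; the timer callback resumes it
 *     } else {
 *         throttle_account(&ts, is_write, size);
 *         // submit the request now
 *     }
 *
 *     throttle_destroy(&ts);
 */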

/* destroy a timer */
static void throttle_timer_destroy(QEMUTimer **timer)
{
    assert(*timer != NULL);

    timer_del(*timer);
    timer_free(*timer);
    *timer = NULL;
}

/* Remove timers from event loop */
void throttle_detach_aio_context(ThrottleState *ts)
{
    int i;

    for (i = 0; i < 2; i++) {
        throttle_timer_destroy(&ts->timers[i]);
    }
}

/* To be called last on the ThrottleState */
void throttle_destroy(ThrottleState *ts)
{
    throttle_detach_aio_context(ts);
}

/* is any throttling timer configured */
bool throttle_have_timer(ThrottleState *ts)
{
    if (ts->timers[0]) {
        return true;
    }

    return false;
}

/* Does any throttling need to be done?
 *
 * @cfg: the throttling configuration to inspect
 * @ret: true if throttling must be done else false
 */
bool throttle_enabled(ThrottleConfig *cfg)
{
    int i;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].avg > 0) {
            return true;
        }
    }

    return false;
}

/* return true if any two throttling parameters conflict
 *
 * @cfg: the throttling configuration to inspect
 * @ret: true if any conflict is detected else false
 */
bool throttle_conflicting(ThrottleConfig *cfg)
{
    bool bps_flag, ops_flag;
    bool bps_max_flag, ops_max_flag;

    bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
               (cfg->buckets[THROTTLE_BPS_READ].avg ||
                cfg->buckets[THROTTLE_BPS_WRITE].avg);

    ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
               (cfg->buckets[THROTTLE_OPS_READ].avg ||
                cfg->buckets[THROTTLE_OPS_WRITE].avg);

    bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
                   (cfg->buckets[THROTTLE_BPS_READ].max ||
                    cfg->buckets[THROTTLE_BPS_WRITE].max);

    ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
                   (cfg->buckets[THROTTLE_OPS_READ].max ||
                    cfg->buckets[THROTTLE_OPS_WRITE].max);

    return bps_flag || ops_flag || bps_max_flag || ops_max_flag;
}
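
/* For illustration: setting buckets[THROTTLE_BPS_TOTAL].avg together with
 * buckets[THROTTLE_BPS_READ].avg (or THROTTLE_BPS_WRITE) is a conflict,
 * because the total bucket already constrains both directions; read and
 * write limits of the same kind can be combined freely.
 */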

/* check if a throttling configuration is valid
 * @cfg: the throttling configuration to inspect
 * @ret: true if valid else false
 */
bool throttle_is_valid(ThrottleConfig *cfg)
{
    bool invalid = false;
    int i;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].avg < 0) {
            invalid = true;
        }
    }

    for (i = 0; i < BUCKETS_COUNT; i++) {
        if (cfg->buckets[i].max < 0) {
            invalid = true;
        }
    }

    return !invalid;
}

/* fix bucket parameters */
static void throttle_fix_bucket(LeakyBucket *bkt)
{
    double min;

    /* zero bucket level */
    bkt->level = 0;

    /* The following is done to cope with the Linux CFQ block scheduler,
     * which groups reads and writes into 100 ms batches in the guest.
     * When there are two processes, one doing reads and the other doing
     * writes, CFQ produces a pattern looking like the following:
     * WWWWWWWWWWWRRRRRRRRRRRRRRWWWWWWWWWWWWWwRRRRRRRRRRRRRRRRR
     * Allowing a burst of 100 ms worth of the average rate helps smooth
     * the throttling.
     */
    min = bkt->avg / 10;
    if (bkt->avg && !bkt->max) {
        bkt->max = min;
    }
}
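
/* For illustration (numbers are hypothetical): with avg == 1000 units/s and
 * no explicit max, the default burst becomes max == 1000 / 10 == 100 units,
 * i.e. 100 ms worth of the average rate.
 */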

/* take care of canceling a timer */
static void throttle_cancel_timer(QEMUTimer *timer)
{
    assert(timer != NULL);

    timer_del(timer);
}

/* Used to configure the throttle
 *
 * @ts:  the throttle state we are working on
 * @cfg: the config to set
 */
void throttle_config(ThrottleState *ts, ThrottleConfig *cfg)
{
    int i;

    ts->cfg = *cfg;

    for (i = 0; i < BUCKETS_COUNT; i++) {
        throttle_fix_bucket(&ts->cfg.buckets[i]);
    }

    ts->previous_leak = qemu_clock_get_ns(ts->clock_type);

    for (i = 0; i < 2; i++) {
        throttle_cancel_timer(ts->timers[i]);
    }
}

/* used to get config
 *
 * @ts:  the throttle state we are working on
 * @cfg: the config to write
 */
void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
{
    *cfg = ts->cfg;
}

/* Schedule the read or write timer if needed
 *
 * NOTE: this function is not unit tested due to its use of timer_mod
 *
 * @is_write: the type of operation (read/write)
 * @ret:      true if the timer has been scheduled else false
 */
bool throttle_schedule_timer(ThrottleState *ts, bool is_write)
{
    int64_t now = qemu_clock_get_ns(ts->clock_type);
    int64_t next_timestamp;
    bool must_wait;

    must_wait = throttle_compute_timer(ts,
                                       is_write,
                                       now,
                                       &next_timestamp);

    /* request not throttled */
    if (!must_wait) {
        return false;
    }

    /* request throttled and timer pending -> do nothing */
    if (timer_pending(ts->timers[is_write])) {
        return true;
    }

    /* request throttled and timer not pending -> arm timer */
    timer_mod(ts->timers[is_write], next_timestamp);
    return true;
}

/* do the accounting for this operation
 *
 * @is_write: the type of operation (read/write)
 * @size:     the size of the operation
 */
void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
{
    double units = 1.0;

    /* if cfg.op_size is defined and smaller than size we compute unit count */
    if (ts->cfg.op_size && size > ts->cfg.op_size) {
        units = (double) size / ts->cfg.op_size;
    }

    ts->cfg.buckets[THROTTLE_BPS_TOTAL].level += size;
    ts->cfg.buckets[THROTTLE_OPS_TOTAL].level += units;

    if (is_write) {
        ts->cfg.buckets[THROTTLE_BPS_WRITE].level += size;
        ts->cfg.buckets[THROTTLE_OPS_WRITE].level += units;
    } else {
        ts->cfg.buckets[THROTTLE_BPS_READ].level += size;
        ts->cfg.buckets[THROTTLE_OPS_READ].level += units;
    }
}
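
/* For illustration (numbers are hypothetical): with cfg.op_size == 4096 and a
 * 65536-byte request, the request is accounted as 65536 / 4096 == 16
 * operations in the OPS buckets and as 65536 bytes in the BPS buckets.
 */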