]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - drivers/md/dm-stats.c
x86/speculation: Add a common function for MD_CLEAR mitigation update
[mirror_ubuntu-jammy-kernel.git] / drivers / md / dm-stats.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
fd2ed4d2
MP
2#include <linux/errno.h>
3#include <linux/numa.h>
4#include <linux/slab.h>
5#include <linux/rculist.h>
6#include <linux/threads.h>
7#include <linux/preempt.h>
8#include <linux/irqflags.h>
9#include <linux/vmalloc.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/device-mapper.h>
13
4cc96131 14#include "dm-core.h"
fd2ed4d2
MP
15#include "dm-stats.h"
16
17#define DM_MSG_PREFIX "stats"
18
19static int dm_stat_need_rcu_barrier;
20
21/*
22 * Using 64-bit values to avoid overflow (which is a
23 * problem that block/genhd.c's IO accounting has).
24 */
/*
 * Counters kept per CPU for a single area of a statistics region.
 * Two-element arrays are indexed by READ / WRITE.
 */
struct dm_stat_percpu {
	unsigned long long sectors[2];		/* sectors accounted on completion */
	unsigned long long ios[2];		/* completed I/Os */
	unsigned long long merges[2];		/* I/Os flagged as merged */
	unsigned long long ticks[2];		/* summed I/O durations */
	unsigned long long io_ticks[2];		/* time with I/O of that direction in flight */
	unsigned long long io_ticks_total;	/* time with any I/O in flight */
	unsigned long long time_in_queue;	/* in-flight count multiplied by elapsed time */
	unsigned long long *histogram;		/* latency buckets, n_histogram_entries + 1 slots */
};
35
36struct dm_stat_shared {
37 atomic_t in_flight[2];
c96aec34 38 unsigned long long stamp;
fd2ed4d2
MP
39 struct dm_stat_percpu tmp;
40};
41
42struct dm_stat {
43 struct list_head list_entry;
44 int id;
c96aec34 45 unsigned stat_flags;
fd2ed4d2
MP
46 size_t n_entries;
47 sector_t start;
48 sector_t end;
49 sector_t step;
dfcfac3e
MP
50 unsigned n_histogram_entries;
51 unsigned long long *histogram_boundaries;
fd2ed4d2
MP
52 const char *program_id;
53 const char *aux_data;
54 struct rcu_head rcu_head;
55 size_t shared_alloc_size;
56 size_t percpu_alloc_size;
dfcfac3e 57 size_t histogram_alloc_size;
fd2ed4d2 58 struct dm_stat_percpu *stat_percpu[NR_CPUS];
b18ae8dd 59 struct dm_stat_shared stat_shared[];
fd2ed4d2
MP
60};
61
c96aec34
MP
62#define STAT_PRECISE_TIMESTAMPS 1
63
fd2ed4d2
MP
64struct dm_stats_last_position {
65 sector_t last_sector;
66 unsigned last_rw;
67};
68
69/*
70 * A typo on the command line could possibly make the kernel run out of memory
71 * and crash. To prevent the crash we account all used memory. We fail if we
72 * exhaust 1/4 of all memory or 1/2 of vmalloc space.
73 */
74#define DM_STATS_MEMORY_FACTOR 4
75#define DM_STATS_VMALLOC_FACTOR 2
76
77static DEFINE_SPINLOCK(shared_memory_lock);
78
79static unsigned long shared_memory_amount;
80
81static bool __check_shared_memory(size_t alloc_size)
82{
83 size_t a;
84
85 a = shared_memory_amount + alloc_size;
86 if (a < shared_memory_amount)
87 return false;
ca79b0c2 88 if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR)
fd2ed4d2
MP
89 return false;
90#ifdef CONFIG_MMU
91 if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
92 return false;
93#endif
94 return true;
95}
96
97static bool check_shared_memory(size_t alloc_size)
98{
99 bool ret;
100
101 spin_lock_irq(&shared_memory_lock);
102
103 ret = __check_shared_memory(alloc_size);
104
105 spin_unlock_irq(&shared_memory_lock);
106
107 return ret;
108}
109
110static bool claim_shared_memory(size_t alloc_size)
111{
112 spin_lock_irq(&shared_memory_lock);
113
114 if (!__check_shared_memory(alloc_size)) {
115 spin_unlock_irq(&shared_memory_lock);
116 return false;
117 }
118
119 shared_memory_amount += alloc_size;
120
121 spin_unlock_irq(&shared_memory_lock);
122
123 return true;
124}
125
126static void free_shared_memory(size_t alloc_size)
127{
128 unsigned long flags;
129
130 spin_lock_irqsave(&shared_memory_lock, flags);
131
132 if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
133 spin_unlock_irqrestore(&shared_memory_lock, flags);
134 DMCRIT("Memory usage accounting bug.");
135 return;
136 }
137
138 shared_memory_amount -= alloc_size;
139
140 spin_unlock_irqrestore(&shared_memory_lock, flags);
141}
142
143static void *dm_kvzalloc(size_t alloc_size, int node)
144{
145 void *p;
146
147 if (!claim_shared_memory(alloc_size))
148 return NULL;
149
a7c3e901 150 p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node);
fd2ed4d2
MP
151 if (p)
152 return p;
153
154 free_shared_memory(alloc_size);
155
156 return NULL;
157}
158
159static void dm_kvfree(void *ptr, size_t alloc_size)
160{
161 if (!ptr)
162 return;
163
164 free_shared_memory(alloc_size);
165
0f24b79b 166 kvfree(ptr);
fd2ed4d2
MP
167}
168
169static void dm_stat_free(struct rcu_head *head)
170{
171 int cpu;
172 struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
173
60858318 174 kfree(s->histogram_boundaries);
fd2ed4d2
MP
175 kfree(s->program_id);
176 kfree(s->aux_data);
dfcfac3e
MP
177 for_each_possible_cpu(cpu) {
178 dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
fd2ed4d2 179 dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
dfcfac3e
MP
180 }
181 dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
fd2ed4d2
MP
182 dm_kvfree(s, s->shared_alloc_size);
183}
184
185static int dm_stat_in_flight(struct dm_stat_shared *shared)
186{
187 return atomic_read(&shared->in_flight[READ]) +
188 atomic_read(&shared->in_flight[WRITE]);
189}
190
191void dm_stats_init(struct dm_stats *stats)
192{
193 int cpu;
194 struct dm_stats_last_position *last;
195
196 mutex_init(&stats->mutex);
197 INIT_LIST_HEAD(&stats->list);
9571cf30 198 stats->precise_timestamps = false;
fd2ed4d2
MP
199 stats->last = alloc_percpu(struct dm_stats_last_position);
200 for_each_possible_cpu(cpu) {
201 last = per_cpu_ptr(stats->last, cpu);
202 last->last_sector = (sector_t)ULLONG_MAX;
203 last->last_rw = UINT_MAX;
204 }
205}
206
207void dm_stats_cleanup(struct dm_stats *stats)
208{
209 size_t ni;
210 struct dm_stat *s;
211 struct dm_stat_shared *shared;
212
213 while (!list_empty(&stats->list)) {
214 s = container_of(stats->list.next, struct dm_stat, list_entry);
215 list_del(&s->list_entry);
216 for (ni = 0; ni < s->n_entries; ni++) {
217 shared = &s->stat_shared[ni];
218 if (WARN_ON(dm_stat_in_flight(shared))) {
219 DMCRIT("leaked in-flight counter at index %lu "
220 "(start %llu, end %llu, step %llu): reads %d, writes %d",
221 (unsigned long)ni,
222 (unsigned long long)s->start,
223 (unsigned long long)s->end,
224 (unsigned long long)s->step,
225 atomic_read(&shared->in_flight[READ]),
226 atomic_read(&shared->in_flight[WRITE]));
227 }
228 }
229 dm_stat_free(&s->rcu_head);
230 }
231 free_percpu(stats->last);
d5ffebdd 232 mutex_destroy(&stats->mutex);
fd2ed4d2
MP
233}
234
9571cf30
MS
235static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats)
236{
237 struct list_head *l;
238 struct dm_stat *tmp_s;
239 bool precise_timestamps = false;
240
241 list_for_each(l, &stats->list) {
242 tmp_s = container_of(l, struct dm_stat, list_entry);
243 if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) {
244 precise_timestamps = true;
245 break;
246 }
247 }
248 stats->precise_timestamps = precise_timestamps;
249}
250
fd2ed4d2 251static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
c96aec34 252 sector_t step, unsigned stat_flags,
dfcfac3e
MP
253 unsigned n_histogram_entries,
254 unsigned long long *histogram_boundaries,
c96aec34 255 const char *program_id, const char *aux_data,
fd2ed4d2
MP
256 void (*suspend_callback)(struct mapped_device *),
257 void (*resume_callback)(struct mapped_device *),
258 struct mapped_device *md)
259{
260 struct list_head *l;
261 struct dm_stat *s, *tmp_s;
262 sector_t n_entries;
263 size_t ni;
264 size_t shared_alloc_size;
265 size_t percpu_alloc_size;
dfcfac3e 266 size_t histogram_alloc_size;
fd2ed4d2
MP
267 struct dm_stat_percpu *p;
268 int cpu;
269 int ret_id;
270 int r;
271
272 if (end < start || !step)
273 return -EINVAL;
274
275 n_entries = end - start;
276 if (dm_sector_div64(n_entries, step))
277 n_entries++;
278
279 if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
280 return -EOVERFLOW;
281
fb16c799 282 shared_alloc_size = struct_size(s, stat_shared, n_entries);
fd2ed4d2
MP
283 if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
284 return -EOVERFLOW;
285
286 percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
287 if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
288 return -EOVERFLOW;
289
dfcfac3e
MP
290 histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
291 if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
292 return -EOVERFLOW;
293
294 if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
295 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
fd2ed4d2
MP
296 return -ENOMEM;
297
298 s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
299 if (!s)
300 return -ENOMEM;
301
c96aec34 302 s->stat_flags = stat_flags;
fd2ed4d2
MP
303 s->n_entries = n_entries;
304 s->start = start;
305 s->end = end;
306 s->step = step;
307 s->shared_alloc_size = shared_alloc_size;
308 s->percpu_alloc_size = percpu_alloc_size;
dfcfac3e
MP
309 s->histogram_alloc_size = histogram_alloc_size;
310
311 s->n_histogram_entries = n_histogram_entries;
312 s->histogram_boundaries = kmemdup(histogram_boundaries,
313 s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
314 if (!s->histogram_boundaries) {
315 r = -ENOMEM;
316 goto out;
317 }
fd2ed4d2
MP
318
319 s->program_id = kstrdup(program_id, GFP_KERNEL);
320 if (!s->program_id) {
321 r = -ENOMEM;
322 goto out;
323 }
324 s->aux_data = kstrdup(aux_data, GFP_KERNEL);
325 if (!s->aux_data) {
326 r = -ENOMEM;
327 goto out;
328 }
329
330 for (ni = 0; ni < n_entries; ni++) {
331 atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
332 atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
333 }
334
dfcfac3e
MP
335 if (s->n_histogram_entries) {
336 unsigned long long *hi;
337 hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
338 if (!hi) {
339 r = -ENOMEM;
340 goto out;
341 }
342 for (ni = 0; ni < n_entries; ni++) {
343 s->stat_shared[ni].tmp.histogram = hi;
344 hi += s->n_histogram_entries + 1;
345 }
346 }
347
fd2ed4d2
MP
348 for_each_possible_cpu(cpu) {
349 p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
350 if (!p) {
351 r = -ENOMEM;
352 goto out;
353 }
354 s->stat_percpu[cpu] = p;
dfcfac3e
MP
355 if (s->n_histogram_entries) {
356 unsigned long long *hi;
357 hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
358 if (!hi) {
359 r = -ENOMEM;
360 goto out;
361 }
362 for (ni = 0; ni < n_entries; ni++) {
363 p[ni].histogram = hi;
364 hi += s->n_histogram_entries + 1;
365 }
366 }
fd2ed4d2
MP
367 }
368
369 /*
370 * Suspend/resume to make sure there is no i/o in flight,
371 * so that newly created statistics will be exact.
372 *
373 * (note: we couldn't suspend earlier because we must not
374 * allocate memory while suspended)
375 */
376 suspend_callback(md);
377
378 mutex_lock(&stats->mutex);
379 s->id = 0;
380 list_for_each(l, &stats->list) {
381 tmp_s = container_of(l, struct dm_stat, list_entry);
382 if (WARN_ON(tmp_s->id < s->id)) {
383 r = -EINVAL;
384 goto out_unlock_resume;
385 }
386 if (tmp_s->id > s->id)
387 break;
388 if (unlikely(s->id == INT_MAX)) {
389 r = -ENFILE;
390 goto out_unlock_resume;
391 }
392 s->id++;
393 }
394 ret_id = s->id;
395 list_add_tail_rcu(&s->list_entry, l);
9571cf30
MS
396
397 dm_stats_recalc_precise_timestamps(stats);
398
fd2ed4d2
MP
399 mutex_unlock(&stats->mutex);
400
401 resume_callback(md);
402
403 return ret_id;
404
405out_unlock_resume:
406 mutex_unlock(&stats->mutex);
407 resume_callback(md);
408out:
409 dm_stat_free(&s->rcu_head);
410 return r;
411}
412
413static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
414{
415 struct dm_stat *s;
416
417 list_for_each_entry(s, &stats->list, list_entry) {
418 if (s->id > id)
419 break;
420 if (s->id == id)
421 return s;
422 }
423
424 return NULL;
425}
426
427static int dm_stats_delete(struct dm_stats *stats, int id)
428{
429 struct dm_stat *s;
430 int cpu;
431
432 mutex_lock(&stats->mutex);
433
434 s = __dm_stats_find(stats, id);
435 if (!s) {
436 mutex_unlock(&stats->mutex);
437 return -ENOENT;
438 }
439
440 list_del_rcu(&s->list_entry);
9571cf30
MS
441
442 dm_stats_recalc_precise_timestamps(stats);
443
fd2ed4d2
MP
444 mutex_unlock(&stats->mutex);
445
446 /*
447 * vfree can't be called from RCU callback
448 */
449 for_each_possible_cpu(cpu)
dfcfac3e
MP
450 if (is_vmalloc_addr(s->stat_percpu) ||
451 is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
fd2ed4d2 452 goto do_sync_free;
dfcfac3e
MP
453 if (is_vmalloc_addr(s) ||
454 is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
fd2ed4d2
MP
455do_sync_free:
456 synchronize_rcu_expedited();
457 dm_stat_free(&s->rcu_head);
458 } else {
6aa7de05 459 WRITE_ONCE(dm_stat_need_rcu_barrier, 1);
fd2ed4d2
MP
460 call_rcu(&s->rcu_head, dm_stat_free);
461 }
462 return 0;
463}
464
465static int dm_stats_list(struct dm_stats *stats, const char *program,
466 char *result, unsigned maxlen)
467{
468 struct dm_stat *s;
469 sector_t len;
470 unsigned sz = 0;
471
472 /*
473 * Output format:
474 * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
475 */
476
477 mutex_lock(&stats->mutex);
478 list_for_each_entry(s, &stats->list, list_entry) {
479 if (!program || !strcmp(program, s->program_id)) {
480 len = s->end - s->start;
bd49784f 481 DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
fd2ed4d2
MP
482 (unsigned long long)s->start,
483 (unsigned long long)len,
484 (unsigned long long)s->step,
485 s->program_id,
486 s->aux_data);
bd49784f
MP
487 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
488 DMEMIT(" precise_timestamps");
489 if (s->n_histogram_entries) {
490 unsigned i;
491 DMEMIT(" histogram:");
492 for (i = 0; i < s->n_histogram_entries; i++) {
493 if (i)
494 DMEMIT(",");
495 DMEMIT("%llu", s->histogram_boundaries[i]);
496 }
497 }
498 DMEMIT("\n");
fd2ed4d2
MP
499 }
500 }
501 mutex_unlock(&stats->mutex);
502
503 return 1;
504}
505
c96aec34
MP
506static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
507 struct dm_stat_percpu *p)
fd2ed4d2
MP
508{
509 /*
510 * This is racy, but so is part_round_stats_single.
511 */
c96aec34
MP
512 unsigned long long now, difference;
513 unsigned in_flight_read, in_flight_write;
514
515 if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
516 now = jiffies;
517 else
518 now = ktime_to_ns(ktime_get());
fd2ed4d2 519
c96aec34 520 difference = now - shared->stamp;
fd2ed4d2
MP
521 if (!difference)
522 return;
c96aec34 523
fd2ed4d2
MP
524 in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
525 in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
526 if (in_flight_read)
527 p->io_ticks[READ] += difference;
528 if (in_flight_write)
529 p->io_ticks[WRITE] += difference;
530 if (in_flight_read + in_flight_write) {
531 p->io_ticks_total += difference;
532 p->time_in_queue += (in_flight_read + in_flight_write) * difference;
533 }
534 shared->stamp = now;
535}
536
537static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
528ec5ab 538 int idx, sector_t len,
c96aec34
MP
539 struct dm_stats_aux *stats_aux, bool end,
540 unsigned long duration_jiffies)
fd2ed4d2 541{
fd2ed4d2
MP
542 struct dm_stat_shared *shared = &s->stat_shared[entry];
543 struct dm_stat_percpu *p;
544
545 /*
bbf3f8cb 546 * For strict correctness we should use local_irq_save/restore
fd2ed4d2
MP
547 * instead of preempt_disable/enable.
548 *
bbf3f8cb
MP
549 * preempt_disable/enable is racy if the driver finishes bios
550 * from non-interrupt context as well as from interrupt context
551 * or from more different interrupts.
fd2ed4d2 552 *
bbf3f8cb
MP
553 * On 64-bit architectures the race only results in not counting some
554 * events, so it is acceptable. On 32-bit architectures the race could
555 * cause the counter going off by 2^32, so we need to do proper locking
556 * there.
fd2ed4d2
MP
557 *
558 * part_stat_lock()/part_stat_unlock() have this race too.
559 */
bbf3f8cb
MP
560#if BITS_PER_LONG == 32
561 unsigned long flags;
562 local_irq_save(flags);
563#else
fd2ed4d2 564 preempt_disable();
bbf3f8cb 565#endif
fd2ed4d2
MP
566 p = &s->stat_percpu[smp_processor_id()][entry];
567
568 if (!end) {
c96aec34 569 dm_stat_round(s, shared, p);
fd2ed4d2
MP
570 atomic_inc(&shared->in_flight[idx]);
571 } else {
dfcfac3e 572 unsigned long long duration;
c96aec34 573 dm_stat_round(s, shared, p);
fd2ed4d2
MP
574 atomic_dec(&shared->in_flight[idx]);
575 p->sectors[idx] += len;
576 p->ios[idx] += 1;
c96aec34 577 p->merges[idx] += stats_aux->merged;
dfcfac3e 578 if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
c96aec34 579 p->ticks[idx] += duration_jiffies;
dfcfac3e
MP
580 duration = jiffies_to_msecs(duration_jiffies);
581 } else {
c96aec34 582 p->ticks[idx] += stats_aux->duration_ns;
dfcfac3e
MP
583 duration = stats_aux->duration_ns;
584 }
585 if (s->n_histogram_entries) {
586 unsigned lo = 0, hi = s->n_histogram_entries + 1;
587 while (lo + 1 < hi) {
588 unsigned mid = (lo + hi) / 2;
589 if (s->histogram_boundaries[mid - 1] > duration) {
590 hi = mid;
591 } else {
592 lo = mid;
593 }
594
595 }
596 p->histogram[lo]++;
597 }
fd2ed4d2
MP
598 }
599
bbf3f8cb
MP
600#if BITS_PER_LONG == 32
601 local_irq_restore(flags);
602#else
fd2ed4d2 603 preempt_enable();
bbf3f8cb 604#endif
fd2ed4d2
MP
605}
606
528ec5ab 607static void __dm_stat_bio(struct dm_stat *s, int bi_rw,
fd2ed4d2 608 sector_t bi_sector, sector_t end_sector,
c96aec34 609 bool end, unsigned long duration_jiffies,
fd2ed4d2
MP
610 struct dm_stats_aux *stats_aux)
611{
612 sector_t rel_sector, offset, todo, fragment_len;
613 size_t entry;
614
615 if (end_sector <= s->start || bi_sector >= s->end)
616 return;
617 if (unlikely(bi_sector < s->start)) {
618 rel_sector = 0;
619 todo = end_sector - s->start;
620 } else {
621 rel_sector = bi_sector - s->start;
622 todo = end_sector - bi_sector;
623 }
624 if (unlikely(end_sector > s->end))
625 todo -= (end_sector - s->end);
626
627 offset = dm_sector_div64(rel_sector, s->step);
628 entry = rel_sector;
629 do {
630 if (WARN_ON_ONCE(entry >= s->n_entries)) {
631 DMCRIT("Invalid area access in region id %d", s->id);
632 return;
633 }
634 fragment_len = todo;
635 if (fragment_len > s->step - offset)
636 fragment_len = s->step - offset;
637 dm_stat_for_entry(s, entry, bi_rw, fragment_len,
c96aec34 638 stats_aux, end, duration_jiffies);
fd2ed4d2
MP
639 todo -= fragment_len;
640 entry++;
641 offset = 0;
642 } while (unlikely(todo != 0));
643}
644
645void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
646 sector_t bi_sector, unsigned bi_sectors, bool end,
bba2e6c0 647 unsigned long start_time,
c96aec34 648 struct dm_stats_aux *stats_aux)
fd2ed4d2
MP
649{
650 struct dm_stat *s;
651 sector_t end_sector;
652 struct dm_stats_last_position *last;
c96aec34 653 bool got_precise_time;
bba2e6c0 654 unsigned long duration_jiffies = 0;
fd2ed4d2
MP
655
656 if (unlikely(!bi_sectors))
657 return;
658
659 end_sector = bi_sector + bi_sectors;
660
661 if (!end) {
662 /*
663 * A race condition can at worst result in the merged flag being
664 * misrepresented, so we don't have to disable preemption here.
665 */
1f125e76 666 last = raw_cpu_ptr(stats->last);
fd2ed4d2 667 stats_aux->merged =
6aa7de05 668 (bi_sector == (READ_ONCE(last->last_sector) &&
528ec5ab 669 ((bi_rw == WRITE) ==
6aa7de05 670 (READ_ONCE(last->last_rw) == WRITE))
fd2ed4d2 671 ));
6aa7de05
MR
672 WRITE_ONCE(last->last_sector, end_sector);
673 WRITE_ONCE(last->last_rw, bi_rw);
bba2e6c0
MS
674 } else
675 duration_jiffies = jiffies - start_time;
fd2ed4d2
MP
676
677 rcu_read_lock();
678
c96aec34
MP
679 got_precise_time = false;
680 list_for_each_entry_rcu(s, &stats->list, list_entry) {
681 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
9571cf30
MS
682 /* start (!end) duration_ns is set by DM core's alloc_io() */
683 if (end)
c96aec34
MP
684 stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
685 got_precise_time = true;
686 }
687 __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
688 }
fd2ed4d2
MP
689
690 rcu_read_unlock();
691}
692
693static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
694 struct dm_stat *s, size_t x)
695{
696 int cpu;
697 struct dm_stat_percpu *p;
698
699 local_irq_disable();
700 p = &s->stat_percpu[smp_processor_id()][x];
c96aec34 701 dm_stat_round(s, shared, p);
fd2ed4d2
MP
702 local_irq_enable();
703
dfcfac3e
MP
704 shared->tmp.sectors[READ] = 0;
705 shared->tmp.sectors[WRITE] = 0;
706 shared->tmp.ios[READ] = 0;
707 shared->tmp.ios[WRITE] = 0;
708 shared->tmp.merges[READ] = 0;
709 shared->tmp.merges[WRITE] = 0;
710 shared->tmp.ticks[READ] = 0;
711 shared->tmp.ticks[WRITE] = 0;
712 shared->tmp.io_ticks[READ] = 0;
713 shared->tmp.io_ticks[WRITE] = 0;
714 shared->tmp.io_ticks_total = 0;
715 shared->tmp.time_in_queue = 0;
716
717 if (s->n_histogram_entries)
718 memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
719
fd2ed4d2
MP
720 for_each_possible_cpu(cpu) {
721 p = &s->stat_percpu[cpu][x];
6aa7de05
MR
722 shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]);
723 shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]);
724 shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]);
725 shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]);
726 shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]);
727 shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]);
728 shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]);
729 shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]);
730 shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]);
731 shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]);
732 shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total);
733 shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue);
dfcfac3e
MP
734 if (s->n_histogram_entries) {
735 unsigned i;
736 for (i = 0; i < s->n_histogram_entries + 1; i++)
6aa7de05 737 shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]);
dfcfac3e 738 }
fd2ed4d2
MP
739 }
740}
741
742static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
743 bool init_tmp_percpu_totals)
744{
745 size_t x;
746 struct dm_stat_shared *shared;
747 struct dm_stat_percpu *p;
748
749 for (x = idx_start; x < idx_end; x++) {
750 shared = &s->stat_shared[x];
751 if (init_tmp_percpu_totals)
752 __dm_stat_init_temporary_percpu_totals(shared, s, x);
753 local_irq_disable();
754 p = &s->stat_percpu[smp_processor_id()][x];
755 p->sectors[READ] -= shared->tmp.sectors[READ];
756 p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
757 p->ios[READ] -= shared->tmp.ios[READ];
758 p->ios[WRITE] -= shared->tmp.ios[WRITE];
759 p->merges[READ] -= shared->tmp.merges[READ];
760 p->merges[WRITE] -= shared->tmp.merges[WRITE];
761 p->ticks[READ] -= shared->tmp.ticks[READ];
762 p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
763 p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
764 p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
765 p->io_ticks_total -= shared->tmp.io_ticks_total;
766 p->time_in_queue -= shared->tmp.time_in_queue;
767 local_irq_enable();
dfcfac3e
MP
768 if (s->n_histogram_entries) {
769 unsigned i;
770 for (i = 0; i < s->n_histogram_entries + 1; i++) {
771 local_irq_disable();
772 p = &s->stat_percpu[smp_processor_id()][x];
773 p->histogram[i] -= shared->tmp.histogram[i];
774 local_irq_enable();
775 }
776 }
fd2ed4d2
MP
777 }
778}
779
780static int dm_stats_clear(struct dm_stats *stats, int id)
781{
782 struct dm_stat *s;
783
784 mutex_lock(&stats->mutex);
785
786 s = __dm_stats_find(stats, id);
787 if (!s) {
788 mutex_unlock(&stats->mutex);
789 return -ENOENT;
790 }
791
792 __dm_stat_clear(s, 0, s->n_entries, true);
793
794 mutex_unlock(&stats->mutex);
795
796 return 1;
797}
798
799/*
800 * This is like jiffies_to_msec, but works for 64-bit values.
801 */
c96aec34 802static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
fd2ed4d2 803{
c96aec34 804 unsigned long long result;
fd2ed4d2
MP
805 unsigned mult;
806
c96aec34
MP
807 if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
808 return j;
809
810 result = 0;
fd2ed4d2
MP
811 if (j)
812 result = jiffies_to_msecs(j & 0x3fffff);
813 if (j >= 1 << 22) {
814 mult = jiffies_to_msecs(1 << 22);
815 result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
816 }
817 if (j >= 1ULL << 44)
818 result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
819
820 return result;
821}
822
823static int dm_stats_print(struct dm_stats *stats, int id,
824 size_t idx_start, size_t idx_len,
825 bool clear, char *result, unsigned maxlen)
826{
827 unsigned sz = 0;
828 struct dm_stat *s;
829 size_t x;
830 sector_t start, end, step;
831 size_t idx_end;
832 struct dm_stat_shared *shared;
833
834 /*
835 * Output format:
836 * <start_sector>+<length> counters
837 */
838
839 mutex_lock(&stats->mutex);
840
841 s = __dm_stats_find(stats, id);
842 if (!s) {
843 mutex_unlock(&stats->mutex);
844 return -ENOENT;
845 }
846
847 idx_end = idx_start + idx_len;
848 if (idx_end < idx_start ||
849 idx_end > s->n_entries)
850 idx_end = s->n_entries;
851
852 if (idx_start > idx_end)
853 idx_start = idx_end;
854
855 step = s->step;
856 start = s->start + (step * idx_start);
857
858 for (x = idx_start; x < idx_end; x++, start = end) {
859 shared = &s->stat_shared[x];
860 end = start + step;
861 if (unlikely(end > s->end))
862 end = s->end;
863
864 __dm_stat_init_temporary_percpu_totals(shared, s, x);
865
dfcfac3e 866 DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
fd2ed4d2
MP
867 (unsigned long long)start,
868 (unsigned long long)step,
869 shared->tmp.ios[READ],
870 shared->tmp.merges[READ],
871 shared->tmp.sectors[READ],
c96aec34 872 dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
fd2ed4d2
MP
873 shared->tmp.ios[WRITE],
874 shared->tmp.merges[WRITE],
875 shared->tmp.sectors[WRITE],
c96aec34 876 dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
fd2ed4d2 877 dm_stat_in_flight(shared),
c96aec34
MP
878 dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
879 dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
880 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
881 dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
dfcfac3e
MP
882 if (s->n_histogram_entries) {
883 unsigned i;
884 for (i = 0; i < s->n_histogram_entries + 1; i++) {
885 DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
886 }
887 }
888 DMEMIT("\n");
fd2ed4d2
MP
889
890 if (unlikely(sz + 1 >= maxlen))
891 goto buffer_overflow;
892 }
893
894 if (clear)
895 __dm_stat_clear(s, idx_start, idx_end, false);
896
897buffer_overflow:
898 mutex_unlock(&stats->mutex);
899
900 return 1;
901}
902
903static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
904{
905 struct dm_stat *s;
906 const char *new_aux_data;
907
908 mutex_lock(&stats->mutex);
909
910 s = __dm_stats_find(stats, id);
911 if (!s) {
912 mutex_unlock(&stats->mutex);
913 return -ENOENT;
914 }
915
916 new_aux_data = kstrdup(aux_data, GFP_KERNEL);
917 if (!new_aux_data) {
918 mutex_unlock(&stats->mutex);
919 return -ENOMEM;
920 }
921
922 kfree(s->aux_data);
923 s->aux_data = new_aux_data;
924
925 mutex_unlock(&stats->mutex);
926
927 return 0;
928}
929
dfcfac3e
MP
930static int parse_histogram(const char *h, unsigned *n_histogram_entries,
931 unsigned long long **histogram_boundaries)
932{
933 const char *q;
934 unsigned n;
935 unsigned long long last;
936
937 *n_histogram_entries = 1;
938 for (q = h; *q; q++)
939 if (*q == ',')
940 (*n_histogram_entries)++;
941
6da2ec56
KC
942 *histogram_boundaries = kmalloc_array(*n_histogram_entries,
943 sizeof(unsigned long long),
944 GFP_KERNEL);
dfcfac3e
MP
945 if (!*histogram_boundaries)
946 return -ENOMEM;
947
948 n = 0;
949 last = 0;
950 while (1) {
951 unsigned long long hi;
952 int s;
953 char ch;
954 s = sscanf(h, "%llu%c", &hi, &ch);
955 if (!s || (s == 2 && ch != ','))
956 return -EINVAL;
957 if (hi <= last)
958 return -EINVAL;
959 last = hi;
960 (*histogram_boundaries)[n] = hi;
961 if (s == 1)
962 return 0;
963 h = strchr(h, ',') + 1;
964 n++;
965 }
966}
967
fd2ed4d2
MP
968static int message_stats_create(struct mapped_device *md,
969 unsigned argc, char **argv,
970 char *result, unsigned maxlen)
971{
dfcfac3e 972 int r;
fd2ed4d2
MP
973 int id;
974 char dummy;
975 unsigned long long start, end, len, step;
976 unsigned divisor;
977 const char *program_id, *aux_data;
c96aec34
MP
978 unsigned stat_flags = 0;
979
dfcfac3e
MP
980 unsigned n_histogram_entries = 0;
981 unsigned long long *histogram_boundaries = NULL;
982
c96aec34
MP
983 struct dm_arg_set as, as_backup;
984 const char *a;
985 unsigned feature_args;
fd2ed4d2
MP
986
987 /*
988 * Input format:
c96aec34 989 * <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
fd2ed4d2
MP
990 */
991
c96aec34 992 if (argc < 3)
dfcfac3e 993 goto ret_einval;
fd2ed4d2 994
c96aec34
MP
995 as.argc = argc;
996 as.argv = argv;
997 dm_consume_args(&as, 1);
998
999 a = dm_shift_arg(&as);
1000 if (!strcmp(a, "-")) {
fd2ed4d2
MP
1001 start = 0;
1002 len = dm_get_size(md);
1003 if (!len)
1004 len = 1;
c96aec34 1005 } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
fd2ed4d2 1006 start != (sector_t)start || len != (sector_t)len)
dfcfac3e 1007 goto ret_einval;
fd2ed4d2
MP
1008
1009 end = start + len;
1010 if (start >= end)
dfcfac3e 1011 goto ret_einval;
fd2ed4d2 1012
c96aec34
MP
1013 a = dm_shift_arg(&as);
1014 if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
dd4c1b7d
MP
1015 if (!divisor)
1016 return -EINVAL;
fd2ed4d2
MP
1017 step = end - start;
1018 if (do_div(step, divisor))
1019 step++;
1020 if (!step)
1021 step = 1;
c96aec34 1022 } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
fd2ed4d2 1023 step != (sector_t)step || !step)
dfcfac3e 1024 goto ret_einval;
fd2ed4d2 1025
c96aec34
MP
1026 as_backup = as;
1027 a = dm_shift_arg(&as);
1028 if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
1029 while (feature_args--) {
1030 a = dm_shift_arg(&as);
1031 if (!a)
dfcfac3e 1032 goto ret_einval;
c96aec34
MP
1033 if (!strcasecmp(a, "precise_timestamps"))
1034 stat_flags |= STAT_PRECISE_TIMESTAMPS;
dfcfac3e
MP
1035 else if (!strncasecmp(a, "histogram:", 10)) {
1036 if (n_histogram_entries)
1037 goto ret_einval;
1038 if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
1039 goto ret;
1040 } else
1041 goto ret_einval;
c96aec34
MP
1042 }
1043 } else {
1044 as = as_backup;
1045 }
1046
fd2ed4d2
MP
1047 program_id = "-";
1048 aux_data = "-";
1049
c96aec34
MP
1050 a = dm_shift_arg(&as);
1051 if (a)
1052 program_id = a;
1053
1054 a = dm_shift_arg(&as);
1055 if (a)
1056 aux_data = a;
fd2ed4d2 1057
c96aec34 1058 if (as.argc)
dfcfac3e 1059 goto ret_einval;
fd2ed4d2
MP
1060
1061 /*
1062 * If a buffer overflow happens after we created the region,
1063 * it's too late (the userspace would retry with a larger
1064 * buffer, but the region id that caused the overflow is already
1065 * leaked). So we must detect buffer overflow in advance.
1066 */
1067 snprintf(result, maxlen, "%d", INT_MAX);
dfcfac3e
MP
1068 if (dm_message_test_buffer_overflow(result, maxlen)) {
1069 r = 1;
1070 goto ret;
1071 }
fd2ed4d2 1072
dfcfac3e
MP
1073 id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
1074 n_histogram_entries, histogram_boundaries, program_id, aux_data,
ffcc3936 1075 dm_internal_suspend_fast, dm_internal_resume_fast, md);
dfcfac3e
MP
1076 if (id < 0) {
1077 r = id;
1078 goto ret;
1079 }
fd2ed4d2
MP
1080
1081 snprintf(result, maxlen, "%d", id);
1082
dfcfac3e
MP
1083 r = 1;
1084 goto ret;
1085
1086ret_einval:
1087 r = -EINVAL;
1088ret:
1089 kfree(histogram_boundaries);
1090 return r;
fd2ed4d2
MP
1091}
1092
1093static int message_stats_delete(struct mapped_device *md,
1094 unsigned argc, char **argv)
1095{
1096 int id;
1097 char dummy;
1098
1099 if (argc != 2)
1100 return -EINVAL;
1101
1102 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1103 return -EINVAL;
1104
1105 return dm_stats_delete(dm_get_stats(md), id);
1106}
1107
1108static int message_stats_clear(struct mapped_device *md,
1109 unsigned argc, char **argv)
1110{
1111 int id;
1112 char dummy;
1113
1114 if (argc != 2)
1115 return -EINVAL;
1116
1117 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1118 return -EINVAL;
1119
1120 return dm_stats_clear(dm_get_stats(md), id);
1121}
1122
1123static int message_stats_list(struct mapped_device *md,
1124 unsigned argc, char **argv,
1125 char *result, unsigned maxlen)
1126{
1127 int r;
1128 const char *program = NULL;
1129
1130 if (argc < 1 || argc > 2)
1131 return -EINVAL;
1132
1133 if (argc > 1) {
1134 program = kstrdup(argv[1], GFP_KERNEL);
1135 if (!program)
1136 return -ENOMEM;
1137 }
1138
1139 r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
1140
1141 kfree(program);
1142
1143 return r;
1144}
1145
1146static int message_stats_print(struct mapped_device *md,
1147 unsigned argc, char **argv, bool clear,
1148 char *result, unsigned maxlen)
1149{
1150 int id;
1151 char dummy;
1152 unsigned long idx_start = 0, idx_len = ULONG_MAX;
1153
1154 if (argc != 2 && argc != 4)
1155 return -EINVAL;
1156
1157 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1158 return -EINVAL;
1159
1160 if (argc > 3) {
1161 if (strcmp(argv[2], "-") &&
1162 sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
1163 return -EINVAL;
1164 if (strcmp(argv[3], "-") &&
1165 sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
1166 return -EINVAL;
1167 }
1168
1169 return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
1170 result, maxlen);
1171}
1172
1173static int message_stats_set_aux(struct mapped_device *md,
1174 unsigned argc, char **argv)
1175{
1176 int id;
1177 char dummy;
1178
1179 if (argc != 3)
1180 return -EINVAL;
1181
1182 if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1183 return -EINVAL;
1184
1185 return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
1186}
1187
1188int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
1189 char *result, unsigned maxlen)
1190{
1191 int r;
1192
fd2ed4d2
MP
1193 /* All messages here must start with '@' */
1194 if (!strcasecmp(argv[0], "@stats_create"))
1195 r = message_stats_create(md, argc, argv, result, maxlen);
1196 else if (!strcasecmp(argv[0], "@stats_delete"))
1197 r = message_stats_delete(md, argc, argv);
1198 else if (!strcasecmp(argv[0], "@stats_clear"))
1199 r = message_stats_clear(md, argc, argv);
1200 else if (!strcasecmp(argv[0], "@stats_list"))
1201 r = message_stats_list(md, argc, argv, result, maxlen);
1202 else if (!strcasecmp(argv[0], "@stats_print"))
1203 r = message_stats_print(md, argc, argv, false, result, maxlen);
1204 else if (!strcasecmp(argv[0], "@stats_print_clear"))
1205 r = message_stats_print(md, argc, argv, true, result, maxlen);
1206 else if (!strcasecmp(argv[0], "@stats_set_aux"))
1207 r = message_stats_set_aux(md, argc, argv);
1208 else
1209 return 2; /* this wasn't a stats message */
1210
1211 if (r == -EINVAL)
1212 DMWARN("Invalid parameters for message %s", argv[0]);
1213
1214 return r;
1215}
1216
1217int __init dm_statistics_init(void)
1218{
76f5bee5 1219 shared_memory_amount = 0;
fd2ed4d2
MP
1220 dm_stat_need_rcu_barrier = 0;
1221 return 0;
1222}
1223
1224void dm_statistics_exit(void)
1225{
1226 if (dm_stat_need_rcu_barrier)
1227 rcu_barrier();
1228 if (WARN_ON(shared_memory_amount))
1229 DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
1230}
1231
1232module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
1233MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");