1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright (C) 2020 Hisilicon Limited.
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 #include <linux/debugfs.h>
9 #include <linux/delay.h>
10 #include <linux/device.h>
11 #include <linux/dma-mapping.h>
12 #include <linux/kernel.h>
13 #include <linux/kthread.h>
14 #include <linux/math64.h>
15 #include <linux/module.h>
16 #include <linux/pci.h>
17 #include <linux/platform_device.h>
18 #include <linux/slab.h>
19 #include <linux/timekeeping.h>
/* ioctl command understood by the "dma_map_benchmark" debugfs file */
#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)

/* upper bounds accepted for map_benchmark.threads and map_benchmark.seconds */
#define DMA_MAP_MAX_THREADS	1024
#define DMA_MAP_MAX_SECONDS	300

/* values accepted for map_benchmark.dma_dir */
#define DMA_MAP_BIDIRECTIONAL	0
#define DMA_MAP_TO_DEVICE	1
#define DMA_MAP_FROM_DEVICE	2
/*
 * Parameter/result block copied from and back to user space by the
 * DMA_MAP_BENCHMARK ioctl. The first four members are results filled in
 * by the kernel; the remaining ones are inputs supplied by the caller.
 * Member order and sizes are part of the ioctl ABI and must not change.
 */
struct map_benchmark {
	__u64 avg_map_100ns; /* average map latency in 100ns */
	__u64 map_stddev; /* standard deviation of map latency */
	__u64 avg_unmap_100ns; /* average unmap latency in 100ns */
	__u64 unmap_stddev; /* standard deviation of unmap latency */
	__u32 threads; /* how many threads will do map/unmap in parallel */
	__u32 seconds; /* how long the test will last */
	__s32 node; /* which numa node this benchmark will run on */
	__u32 dma_bits; /* DMA addressing capability */
	__u32 dma_dir; /* DMA data direction */
	__u8 expansion[84]; /* For future use */
};
42 struct map_benchmark_data
{
43 struct map_benchmark bparam
;
45 struct dentry
*debugfs
;
46 enum dma_data_direction dir
;
47 atomic64_t sum_map_100ns
;
48 atomic64_t sum_unmap_100ns
;
49 atomic64_t sum_sq_map
;
50 atomic64_t sum_sq_unmap
;
54 static int map_benchmark_thread(void *data
)
58 struct map_benchmark_data
*map
= data
;
61 buf
= (void *)__get_free_page(GFP_KERNEL
);
65 while (!kthread_should_stop()) {
66 u64 map_100ns
, unmap_100ns
, map_sq
, unmap_sq
;
67 ktime_t map_stime
, map_etime
, unmap_stime
, unmap_etime
;
68 ktime_t map_delta
, unmap_delta
;
71 * for a non-coherent device, if we don't stain them in the
72 * cache, this will give an underestimate of the real-world
73 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
74 * 66 means evertything goes well! 66 is lucky.
76 if (map
->dir
!= DMA_FROM_DEVICE
)
77 memset(buf
, 0x66, PAGE_SIZE
);
79 map_stime
= ktime_get();
80 dma_addr
= dma_map_single(map
->dev
, buf
, PAGE_SIZE
, map
->dir
);
81 if (unlikely(dma_mapping_error(map
->dev
, dma_addr
))) {
82 pr_err("dma_map_single failed on %s\n",
87 map_etime
= ktime_get();
88 map_delta
= ktime_sub(map_etime
, map_stime
);
90 unmap_stime
= ktime_get();
91 dma_unmap_single(map
->dev
, dma_addr
, PAGE_SIZE
, map
->dir
);
92 unmap_etime
= ktime_get();
93 unmap_delta
= ktime_sub(unmap_etime
, unmap_stime
);
95 /* calculate sum and sum of squares */
97 map_100ns
= div64_ul(map_delta
, 100);
98 unmap_100ns
= div64_ul(unmap_delta
, 100);
99 map_sq
= map_100ns
* map_100ns
;
100 unmap_sq
= unmap_100ns
* unmap_100ns
;
102 atomic64_add(map_100ns
, &map
->sum_map_100ns
);
103 atomic64_add(unmap_100ns
, &map
->sum_unmap_100ns
);
104 atomic64_add(map_sq
, &map
->sum_sq_map
);
105 atomic64_add(unmap_sq
, &map
->sum_sq_unmap
);
106 atomic64_inc(&map
->loops
);
110 free_page((unsigned long)buf
);
114 static int do_map_benchmark(struct map_benchmark_data
*map
)
116 struct task_struct
**tsk
;
117 int threads
= map
->bparam
.threads
;
118 int node
= map
->bparam
.node
;
119 const cpumask_t
*cpu_mask
= cpumask_of_node(node
);
124 tsk
= kmalloc_array(threads
, sizeof(*tsk
), GFP_KERNEL
);
128 get_device(map
->dev
);
130 for (i
= 0; i
< threads
; i
++) {
131 tsk
[i
] = kthread_create_on_node(map_benchmark_thread
, map
,
132 map
->bparam
.node
, "dma-map-benchmark/%d", i
);
133 if (IS_ERR(tsk
[i
])) {
134 pr_err("create dma_map thread failed\n");
135 ret
= PTR_ERR(tsk
[i
]);
139 if (node
!= NUMA_NO_NODE
)
140 kthread_bind_mask(tsk
[i
], cpu_mask
);
143 /* clear the old value in the previous benchmark */
144 atomic64_set(&map
->sum_map_100ns
, 0);
145 atomic64_set(&map
->sum_unmap_100ns
, 0);
146 atomic64_set(&map
->sum_sq_map
, 0);
147 atomic64_set(&map
->sum_sq_unmap
, 0);
148 atomic64_set(&map
->loops
, 0);
150 for (i
= 0; i
< threads
; i
++) {
151 get_task_struct(tsk
[i
]);
152 wake_up_process(tsk
[i
]);
155 msleep_interruptible(map
->bparam
.seconds
* 1000);
157 /* wait for the completion of benchmark threads */
158 for (i
= 0; i
< threads
; i
++) {
159 ret
= kthread_stop(tsk
[i
]);
164 loops
= atomic64_read(&map
->loops
);
165 if (likely(loops
> 0)) {
166 u64 map_variance
, unmap_variance
;
167 u64 sum_map
= atomic64_read(&map
->sum_map_100ns
);
168 u64 sum_unmap
= atomic64_read(&map
->sum_unmap_100ns
);
169 u64 sum_sq_map
= atomic64_read(&map
->sum_sq_map
);
170 u64 sum_sq_unmap
= atomic64_read(&map
->sum_sq_unmap
);
172 /* average latency */
173 map
->bparam
.avg_map_100ns
= div64_u64(sum_map
, loops
);
174 map
->bparam
.avg_unmap_100ns
= div64_u64(sum_unmap
, loops
);
176 /* standard deviation of latency */
177 map_variance
= div64_u64(sum_sq_map
, loops
) -
178 map
->bparam
.avg_map_100ns
*
179 map
->bparam
.avg_map_100ns
;
180 unmap_variance
= div64_u64(sum_sq_unmap
, loops
) -
181 map
->bparam
.avg_unmap_100ns
*
182 map
->bparam
.avg_unmap_100ns
;
183 map
->bparam
.map_stddev
= int_sqrt64(map_variance
);
184 map
->bparam
.unmap_stddev
= int_sqrt64(unmap_variance
);
188 for (i
= 0; i
< threads
; i
++)
189 put_task_struct(tsk
[i
]);
190 put_device(map
->dev
);
195 static long map_benchmark_ioctl(struct file
*file
, unsigned int cmd
,
198 struct map_benchmark_data
*map
= file
->private_data
;
199 void __user
*argp
= (void __user
*)arg
;
204 if (copy_from_user(&map
->bparam
, argp
, sizeof(map
->bparam
)))
208 case DMA_MAP_BENCHMARK
:
209 if (map
->bparam
.threads
== 0 ||
210 map
->bparam
.threads
> DMA_MAP_MAX_THREADS
) {
211 pr_err("invalid thread number\n");
215 if (map
->bparam
.seconds
== 0 ||
216 map
->bparam
.seconds
> DMA_MAP_MAX_SECONDS
) {
217 pr_err("invalid duration seconds\n");
221 if (map
->bparam
.node
!= NUMA_NO_NODE
&&
222 !node_possible(map
->bparam
.node
)) {
223 pr_err("invalid numa node\n");
227 switch (map
->bparam
.dma_dir
) {
228 case DMA_MAP_BIDIRECTIONAL
:
229 map
->dir
= DMA_BIDIRECTIONAL
;
231 case DMA_MAP_FROM_DEVICE
:
232 map
->dir
= DMA_FROM_DEVICE
;
234 case DMA_MAP_TO_DEVICE
:
235 map
->dir
= DMA_TO_DEVICE
;
238 pr_err("invalid DMA direction\n");
242 old_dma_mask
= dma_get_mask(map
->dev
);
244 ret
= dma_set_mask(map
->dev
,
245 DMA_BIT_MASK(map
->bparam
.dma_bits
));
247 pr_err("failed to set dma_mask on device %s\n",
252 ret
= do_map_benchmark(map
);
255 * restore the original dma_mask as many devices' dma_mask are
256 * set by architectures, acpi, busses. When we bind them back
257 * to their original drivers, those drivers shouldn't see
258 * dma_mask changed by benchmark
260 dma_set_mask(map
->dev
, old_dma_mask
);
266 if (copy_to_user(argp
, &map
->bparam
, sizeof(map
->bparam
)))
272 static const struct file_operations map_benchmark_fops
= {
274 .unlocked_ioctl
= map_benchmark_ioctl
,
277 static void map_benchmark_remove_debugfs(void *data
)
279 struct map_benchmark_data
*map
= (struct map_benchmark_data
*)data
;
281 debugfs_remove(map
->debugfs
);
284 static int __map_benchmark_probe(struct device
*dev
)
286 struct dentry
*entry
;
287 struct map_benchmark_data
*map
;
290 map
= devm_kzalloc(dev
, sizeof(*map
), GFP_KERNEL
);
295 ret
= devm_add_action(dev
, map_benchmark_remove_debugfs
, map
);
297 pr_err("Can't add debugfs remove action\n");
302 * we only permit a device bound with this driver, 2nd probe
305 entry
= debugfs_create_file("dma_map_benchmark", 0600, NULL
, map
,
306 &map_benchmark_fops
);
308 return PTR_ERR(entry
);
309 map
->debugfs
= entry
;
314 static int map_benchmark_platform_probe(struct platform_device
*pdev
)
316 return __map_benchmark_probe(&pdev
->dev
);
319 static struct platform_driver map_benchmark_platform_driver
= {
321 .name
= "dma_map_benchmark",
323 .probe
= map_benchmark_platform_probe
,
327 map_benchmark_pci_probe(struct pci_dev
*pdev
, const struct pci_device_id
*id
)
329 return __map_benchmark_probe(&pdev
->dev
);
332 static struct pci_driver map_benchmark_pci_driver
= {
333 .name
= "dma_map_benchmark",
334 .probe
= map_benchmark_pci_probe
,
337 static int __init
map_benchmark_init(void)
341 ret
= pci_register_driver(&map_benchmark_pci_driver
);
345 ret
= platform_driver_register(&map_benchmark_platform_driver
);
347 pci_unregister_driver(&map_benchmark_pci_driver
);
354 static void __exit
map_benchmark_cleanup(void)
356 platform_driver_unregister(&map_benchmark_platform_driver
);
357 pci_unregister_driver(&map_benchmark_pci_driver
);
360 module_init(map_benchmark_init
);
361 module_exit(map_benchmark_cleanup
);
363 MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
364 MODULE_DESCRIPTION("dma_map benchmark driver");
365 MODULE_LICENSE("GPL");