/*
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * PCIe DMA Perf Linux driver
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/printk.h>
#include <linux/nodemask.h>

#define DRIVER_NAME "dma_perf"
#define DRIVER_DESCRIPTION "PCIe DMA Performance Measurement Tool"

#define DRIVER_LICENSE "Dual BSD/GPL"
#define DRIVER_VERSION "1.0"
#define DRIVER_AUTHOR "Dave Jiang <dave.jiang@intel.com>"

#define MAX_THREADS 32
#define MAX_TEST_SIZE (1024 * 1024) /* 1M */
#define DMA_CHANNELS_PER_NODE 8

MODULE_LICENSE(DRIVER_LICENSE);
MODULE_VERSION(DRIVER_VERSION);
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_AUTHOR("Changpeng Liu <changpeng.liu@intel.com>");
MODULE_DESCRIPTION(DRIVER_DESCRIPTION);

static struct dentry *perf_debugfs_dir;
static struct perf_ctx *g_perf = NULL;

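/*
 * Test parameters, exposed read/write through debugfs by
 * perf_debugfs_setup(): seg_order as "transfer_size_order", run_order as
 * "total_size_order" and queue_depth as "queue_depth". seg_order and
 * run_order are log2 sizes, so the defaults give 4 KiB per transfer and
 * 4 GiB copied per thread; queue_depth caps the number of outstanding
 * DMA descriptors per thread at 256.
 */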
static unsigned int seg_order = 12; /* 4K */
static unsigned int queue_depth = 256;
static unsigned int run_order = 32; /* 4G */

struct perf_mw {
        size_t buf_size;
        void *virt_addr;
};

struct perf_ctx;

struct pthr_ctx {
        struct dentry *debugfs_thr_dir;
        struct dentry *debugfs_copied;
        struct dentry *debugfs_elapsed_time;
        struct device *dev;
        int node;
        wait_queue_head_t wq;
        struct perf_mw mw;
        struct task_struct *thread;
        struct perf_ctx *perf;
        atomic_t dma_sync;
        struct dma_chan *dma_chan;
        int dma_up;
        int dma_down;
        int dma_prep_err;
        u64 copied;
        u64 elapsed_time;
};

struct perf_ctx {
        spinlock_t db_lock;
        struct dentry *debugfs_node_dir;
        struct dentry *debugfs_run;
        struct dentry *debugfs_threads;
        struct dentry *debugfs_queue_depth;
        struct dentry *debugfs_transfer_size_order;
        struct dentry *debugfs_total_size_order;
        struct dentry *debugfs_status;
        u8 numa_nodes;
        u8 perf_threads;
        bool run;
        struct pthr_ctx pthr_ctx[MAX_THREADS];
        atomic_t tsync;
};

static void perf_free_mw(struct pthr_ctx *pctx);
static int perf_set_mw(struct pthr_ctx *pctx, size_t size);

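/*
 * Submission/completion path: perf_copy() maps one source and one
 * destination range, prepares a memcpy descriptor on the thread's DMA
 * channel and submits it. perf_copy_callback() runs on completion,
 * drops the in-flight counter (dma_sync) and wakes the submitting
 * thread, which throttles itself in perf_move_data() once dma_sync
 * reaches queue_depth.
 */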
static void perf_copy_callback(void *data)
{
        struct pthr_ctx *pctx = data;

        atomic_dec(&pctx->dma_sync);
        pctx->dma_down++;

        wake_up(&pctx->wq);
}

static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
                         char *src, size_t size)
{
        struct dma_async_tx_descriptor *txd;
        struct dma_chan *chan = pctx->dma_chan;
        struct dma_device *device;
        struct dmaengine_unmap_data *unmap;
        dma_cookie_t cookie;
        size_t src_off, dst_off;
        int retries = 0;

        if (!chan) {
                printk("DMA engine does not exist\n");
                return -EINVAL;
        }

        device = chan->device;
        src_off = (size_t)src & ~PAGE_MASK;
        dst_off = (size_t)dst & ~PAGE_MASK;

        if (!is_dma_copy_aligned(device, src_off, dst_off, size))
                return -ENODEV;

        unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
        if (!unmap)
                return -ENOMEM;

        unmap->len = size;
        unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
                                      src_off, size, DMA_TO_DEVICE);
        if (dma_mapping_error(device->dev, unmap->addr[0]))
                goto err_get_unmap;

        unmap->to_cnt = 1;

        unmap->addr[1] = dma_map_page(device->dev, virt_to_page(dst),
                                      dst_off, size, DMA_FROM_DEVICE);
        if (dma_mapping_error(device->dev, unmap->addr[1]))
                goto err_get_unmap;
        unmap->from_cnt = 1;

dma_prep_retry:
        txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
                                             unmap->addr[0],
                                             size, DMA_PREP_INTERRUPT);
        if (!txd) {
                if (retries++ > 20) {
                        pctx->dma_prep_err++;
                        goto err_get_unmap;
                } else {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(50);
                        goto dma_prep_retry;
                }
        }

        txd->callback = perf_copy_callback;
        txd->callback_param = pctx;
        dma_set_unmap(txd, unmap);

        cookie = dmaengine_submit(txd);
        if (dma_submit_error(cookie))
                goto err_set_unmap;

        atomic_inc(&pctx->dma_sync);

        pctx->dma_up++;
        dma_async_issue_pending(chan);

        return size;

err_set_unmap:
        dmaengine_unmap_put(unmap);
err_get_unmap:
        dmaengine_unmap_put(unmap);
        return 0;
}

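/*
 * Main measurement loop: submit total/buf_size copies of buf_size bytes
 * each, cycling the destination pointer through the memory window so it
 * never runs past win_size. Once everything is submitted, wait for the
 * outstanding descriptors to drain, then report bytes copied, elapsed
 * microseconds and the resulting throughput (bytes / usec = MB/s).
 */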
static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
                          u64 buf_size, u64 win_size, u64 total)
{
        int chunks, total_chunks, i;
        int copied_chunks = 0;
        u64 result;
        char *tmp = dst;
        u64 perf, diff_us;
        ktime_t kstart, kstop, kdiff;

        chunks = win_size / buf_size;
        total_chunks = total / buf_size;

        printk("%s: chunks: %d total_chunks: %d\n", current->comm, chunks, total_chunks);

        kstart = ktime_get();

        for (i = 0; i < total_chunks; i++) {

                wait_event_interruptible(pctx->wq, atomic_read(&pctx->dma_sync) < queue_depth);

                result = perf_copy(pctx, tmp, src, buf_size);
                pctx->copied += result;
                copied_chunks++;
                if (copied_chunks == chunks) {
                        tmp = dst;
                        copied_chunks = 0;
                } else
                        tmp += buf_size;
        }

        printk("%s: All DMA descriptors submitted\n", current->comm);

        /* FIXME: need a timeout here eventually */
        while (atomic_read(&pctx->dma_sync) != 0)
                msleep(1);

        pr_info("%s: dma_up: %d dma_down: %d dma_prep_err: %d\n",
                current->comm, pctx->dma_up, pctx->dma_down,
                pctx->dma_prep_err);

        kstop = ktime_get();
        kdiff = ktime_sub(kstop, kstart);
        diff_us = ktime_to_us(kdiff);

        pr_info("%s: copied %Lu bytes\n", current->comm, pctx->copied);

        pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);

        perf = pctx->copied / diff_us;

        pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);

        pctx->elapsed_time = diff_us;

        return 0;
}

static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
{
        return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
}

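/*
 * Per-thread worker: acquire a DMA_MEMCPY channel on this thread's NUMA
 * node (perf_dma_filter_fn matches on dev_to_node()), allocate node-local
 * source and destination buffers, wait at the tsync barrier until every
 * sibling thread is ready, then run the copy loop.
 */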
static int dma_perf_thread(void *data)
{
        struct pthr_ctx *pctx = data;
        struct perf_ctx *perf = pctx->perf;
        struct perf_mw *mw = &pctx->mw;
        char *dst;
        u64 win_size, buf_size, total;
        void *src;
        int rc, node;
        struct dma_chan *dma_chan = NULL;

        pr_info("kthread %s starting...\n", current->comm);

        node = pctx->node;

        if (!pctx->dma_chan) {
                dma_cap_mask_t dma_mask;

                dma_cap_zero(dma_mask);
                dma_cap_set(DMA_MEMCPY, dma_mask);
                dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
                                               (void *)(unsigned long)node);
                if (!dma_chan) {
                        pr_warn("%s: cannot acquire DMA channel, quitting\n",
                                current->comm);
                        return -ENODEV;
                }
                pctx->dma_chan = dma_chan;
                pctx->dev = dma_chan->device->dev;
        }

        src = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
        if (!src) {
                rc = -ENOMEM;
                goto err;
        }

        rc = perf_set_mw(pctx, MAX_TEST_SIZE);
        if (rc < 0) {
                pr_err("%s: set mw failed\n", current->comm);
                rc = -ENXIO;
                goto err;
        }

        win_size = mw->buf_size;
        buf_size = 1ULL << seg_order;
        total = 1ULL << run_order;

        if (buf_size > MAX_TEST_SIZE)
                buf_size = MAX_TEST_SIZE;

        dst = (char *)mw->virt_addr;

        atomic_inc(&perf->tsync);
        while (atomic_read(&perf->tsync) != perf->perf_threads)
                schedule();

        rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);

        atomic_dec(&perf->tsync);

        if (rc < 0) {
                pr_err("%s: failed\n", current->comm);
                rc = -ENXIO;
                goto err;
        }

        /* all submitted copies have completed; the source buffer is no longer needed */
        kfree(src);

        return 0;

err:
        if (src)
                kfree(src);

        if (dma_chan) {
                dma_release_channel(dma_chan);
                pctx->dma_chan = NULL;
        }

        return rc;
}

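/*
 * The "memory window" is simply a node-local destination buffer of up to
 * MAX_TEST_SIZE bytes: perf_set_mw() allocates it (dropping any buffer
 * left over from an earlier run) and perf_free_mw() releases it when the
 * module is removed.
 */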
static void perf_free_mw(struct pthr_ctx *pctx)
{
        struct perf_mw *mw = &pctx->mw;

        if (!mw->virt_addr)
                return;

        kfree(mw->virt_addr);
        mw->buf_size = 0;
        mw->virt_addr = NULL;
}

static int perf_set_mw(struct pthr_ctx *pctx, size_t size)
{
        struct perf_mw *mw = &pctx->mw;

        if (!size)
                return -EINVAL;

        /* release any window left over from a previous run before reallocating */
        perf_free_mw(pctx);

        mw->buf_size = size;
        mw->virt_addr = kmalloc_node(size, GFP_KERNEL, pctx->node);
        if (!mw->virt_addr) {
                mw->buf_size = 0;
                return -EINVAL;
        }

        return 0;
}

static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
                                size_t count, loff_t *offp)
{
        struct perf_ctx *perf = filp->private_data;
        char *buf;
        ssize_t ret, out_offset;

        if (!perf)
                return 0;

        buf = kmalloc(64, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        out_offset = snprintf(buf, 64, "%d\n", perf->run);
        ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
        kfree(buf);

        return ret;
}

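/*
 * Writing anything to the "run" file toggles the test: if threads are
 * still active it stops them, otherwise it clamps the tunables
 * (seg_order <= 20, run_order >= seg_order, at most MAX_THREADS threads)
 * and spawns one kthread per requested thread, placed on NUMA node
 * i / DMA_CHANNELS_PER_NODE.
 */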
static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
                                 size_t count, loff_t *offp)
{
        struct perf_ctx *perf = filp->private_data;
        int node, i;

        if (perf->perf_threads == 0)
                return 0;

        if (atomic_read(&perf->tsync) == 0)
                perf->run = false;

        if (perf->run == true) {
                /* lets stop the threads */
                perf->run = false;
                for (i = 0; i < MAX_THREADS; i++) {
                        if (perf->pthr_ctx[i].thread) {
                                kthread_stop(perf->pthr_ctx[i].thread);
                                perf->pthr_ctx[i].thread = NULL;
                        } else
                                break;
                }
        } else {
                perf->run = true;

                if (perf->perf_threads > MAX_THREADS) {
                        perf->perf_threads = MAX_THREADS;
                        pr_info("Reset total threads to: %u\n", MAX_THREADS);
                }

                /* no greater than 1M */
                if (seg_order > 20) {
                        seg_order = 20;
                        pr_info("Fix seg_order to %u\n", seg_order);
                }

                if (run_order < seg_order) {
                        run_order = seg_order;
                        pr_info("Fix run_order to %u\n", run_order);
                }

                /* launch kernel thread */
                for (i = 0; i < perf->perf_threads; i++) {
                        struct pthr_ctx *pctx;

                        pctx = &perf->pthr_ctx[i];
                        atomic_set(&pctx->dma_sync, 0);
                        pctx->perf = perf;
                        pctx->elapsed_time = 0;
                        pctx->copied = 0;

                        init_waitqueue_head(&pctx->wq);

                        /* NUMA socket node */
                        pctx->node = i / DMA_CHANNELS_PER_NODE;
                        node = pctx->node;

                        pctx->thread =
                                kthread_create_on_node(dma_perf_thread,
                                                       (void *)pctx,
                                                       node, "dma_perf %d", i);
                        if (pctx->thread)
                                wake_up_process(pctx->thread);
                        else {
                                /* creation failed: stop the threads already started */
                                perf->run = false;
                                for (i = 0; i < MAX_THREADS; i++) {
                                        if (perf->pthr_ctx[i].thread) {
                                                kthread_stop(perf->pthr_ctx[i].thread);
                                                perf->pthr_ctx[i].thread = NULL;
                                        } else
                                                break;
                                }
                        }

                        if (perf->run == false)
                                return -ENXIO;
                }

        }

        return count;
}

static const struct file_operations dma_perf_debugfs_run = {
        .owner = THIS_MODULE,
        .open = simple_open,
        .read = debugfs_run_read,
        .write = debugfs_run_write,
};

static ssize_t debugfs_status_read(struct file *filp, char __user *ubuf,
                                   size_t count, loff_t *offp)
{
        struct perf_ctx *perf = filp->private_data;
        char *buf;
        ssize_t ret, out_offset;

        if (!perf)
                return 0;

        buf = kmalloc(64, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        out_offset = snprintf(buf, 64, "%s\n", atomic_read(&perf->tsync) ? "running" : "idle");
        ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
        kfree(buf);

        return ret;
}

static const struct file_operations dma_perf_debugfs_status = {
        .owner = THIS_MODULE,
        .open = simple_open,
        .read = debugfs_status_read,
};

static int perf_debugfs_setup(struct perf_ctx *perf)
{
        int i;
        char temp_name[64];

        if (!perf_debugfs_dir)
                return -ENODEV;

        perf->debugfs_node_dir = debugfs_create_dir("dmaperf",
                                                    perf_debugfs_dir);
        if (!perf->debugfs_node_dir)
                return -ENODEV;

        perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
                                                perf->debugfs_node_dir, perf,
                                                &dma_perf_debugfs_run);
        if (!perf->debugfs_run)
                return -ENODEV;

        perf->debugfs_status = debugfs_create_file("status", S_IRUSR,
                                                   perf->debugfs_node_dir, perf,
                                                   &dma_perf_debugfs_status);
        if (!perf->debugfs_status)
                return -ENODEV;

        perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
                                                  perf->debugfs_node_dir,
                                                  &perf->perf_threads);
        if (!perf->debugfs_threads)
                return -ENODEV;

        perf->debugfs_queue_depth = debugfs_create_u32("queue_depth", S_IRUSR | S_IWUSR,
                                                       perf->debugfs_node_dir,
                                                       &queue_depth);
        if (!perf->debugfs_queue_depth)
                return -ENODEV;

        perf->debugfs_transfer_size_order = debugfs_create_u32("transfer_size_order", S_IRUSR | S_IWUSR,
                                                               perf->debugfs_node_dir,
                                                               &seg_order);
        if (!perf->debugfs_transfer_size_order)
                return -ENODEV;

        perf->debugfs_total_size_order = debugfs_create_u32("total_size_order", S_IRUSR | S_IWUSR,
                                                            perf->debugfs_node_dir,
                                                            &run_order);
        if (!perf->debugfs_total_size_order)
                return -ENODEV;

        for (i = 0; i < MAX_THREADS; i++) {
                struct pthr_ctx *pctx = &perf->pthr_ctx[i];

                sprintf(temp_name, "thread_%d", i);

                pctx->debugfs_thr_dir = debugfs_create_dir(temp_name, perf->debugfs_node_dir);
                if (!pctx->debugfs_thr_dir)
                        return -ENODEV;

                pctx->debugfs_copied = debugfs_create_u64("copied", S_IRUSR,
                                                          pctx->debugfs_thr_dir,
                                                          &pctx->copied);
                if (!pctx->debugfs_copied)
                        return -ENODEV;

                pctx->debugfs_elapsed_time = debugfs_create_u64("elapsed_time", S_IRUSR,
                                                                pctx->debugfs_thr_dir,
                                                                &pctx->elapsed_time);
                if (!pctx->debugfs_elapsed_time)
                        return -ENODEV;
        }

        return 0;
}

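/*
 * Typical usage from userspace, assuming debugfs is mounted at
 * /sys/kernel/debug and the module builds as "dmaperf" (so the files
 * created above land under /sys/kernel/debug/dmaperf/dmaperf/); adjust
 * the paths to match the actual KBUILD_MODNAME and mount point:
 *
 *   echo 4  > /sys/kernel/debug/dmaperf/dmaperf/threads
 *   echo 20 > /sys/kernel/debug/dmaperf/dmaperf/transfer_size_order
 *   echo 30 > /sys/kernel/debug/dmaperf/dmaperf/total_size_order
 *   echo 1  > /sys/kernel/debug/dmaperf/dmaperf/run
 *   cat /sys/kernel/debug/dmaperf/dmaperf/thread_0/copied
 *   cat /sys/kernel/debug/dmaperf/dmaperf/thread_0/elapsed_time
 */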
static int perf_probe(void)
{
        struct perf_ctx *perf;
        int rc = 0;

        perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, 0);
        if (!perf) {
                rc = -ENOMEM;
                goto err_perf;
        }

        perf->numa_nodes = num_online_nodes();
        perf->perf_threads = 1;
        atomic_set(&perf->tsync, 0);
        perf->run = false;
        spin_lock_init(&perf->db_lock);

        if (debugfs_initialized() && !perf_debugfs_dir) {
                perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
                if (!perf_debugfs_dir) {
                        rc = -ENODEV;
                        goto err_ctx;
                }

                rc = perf_debugfs_setup(perf);
                if (rc)
                        goto err_ctx;
        }

        g_perf = perf;
        return 0;

err_ctx:
        kfree(perf);
err_perf:
        return rc;
}

static void perf_remove(void)
{
        int i;
        struct perf_ctx *perf = g_perf;

        if (perf_debugfs_dir) {
                debugfs_remove_recursive(perf_debugfs_dir);
                perf_debugfs_dir = NULL;
        }

        for (i = 0; i < MAX_THREADS; i++) {
                struct pthr_ctx *pctx = &perf->pthr_ctx[i];

                if (pctx->dma_chan)
                        dma_release_channel(pctx->dma_chan);
                perf_free_mw(pctx);
        }

        kfree(perf);
}

static int __init perf_init_module(void)
{
        printk("DMA Performance Test Init\n");
        return perf_probe();
}
module_init(perf_init_module);

static void __exit perf_exit_module(void)
{
        printk("DMA Performance Test Exit\n");
        perf_remove();
}
module_exit(perf_exit_module);