/*
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copy
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * PCIe NTB Perf Linux driver
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/ntb.h>
#include <linux/mutex.h>
#define DRIVER_NAME		"ntb_perf"
#define DRIVER_DESCRIPTION	"PCIe NTB Performance Measurement Tool"

#define DRIVER_LICENSE		"Dual BSD/GPL"
#define DRIVER_VERSION		"1.0"
#define DRIVER_AUTHOR		"Dave Jiang <dave.jiang@intel.com>"

/* Delay (seconds-scale, in ms) before re-polling the peer after link down */
#define PERF_LINK_DOWN_TIMEOUT	10
/* Protocol version exchanged through scratchpads; peers must match */
#define PERF_VERSION		0xffff0001
#define MAX_THREADS		32
#define MAX_TEST_SIZE		SZ_1M
/* Number of pre-allocated source buffers rotated per thread; must be a
 * power of two (src_idx is masked with MAX_SRCS - 1). */
#define MAX_SRCS		32
#define DMA_OUT_RESOURCE_TO	msecs_to_jiffies(50)
#define DMA_RETRIES		20
#define SZ_4G			(1ULL << 32)
#define MAX_SEG_ORDER		20 /* no larger than 1M for kmalloc buffer */
MODULE_LICENSE(DRIVER_LICENSE);
MODULE_VERSION(DRIVER_VERSION);
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
85 static struct dentry
*perf_debugfs_dir
;
87 static unsigned long max_mw_size
;
88 module_param(max_mw_size
, ulong
, 0644);
89 MODULE_PARM_DESC(max_mw_size
, "Limit size of large memory windows");
91 static unsigned int seg_order
= 19; /* 512K */
92 module_param(seg_order
, uint
, 0644);
93 MODULE_PARM_DESC(seg_order
, "size order [2^n] of buffer segment for testing");
95 static unsigned int run_order
= 32; /* 4G */
96 module_param(run_order
, uint
, 0644);
97 MODULE_PARM_DESC(run_order
, "size order [2^n] of total data to transfer");
99 static bool use_dma
; /* default to 0 */
100 module_param(use_dma
, bool, 0644);
101 MODULE_PARM_DESC(use_dma
, "Using DMA engine to measure performance");
104 phys_addr_t phys_addr
;
105 resource_size_t phys_size
;
106 resource_size_t xlat_align
;
107 resource_size_t xlat_align_size
;
118 struct task_struct
*thread
;
119 struct perf_ctx
*perf
;
121 struct dma_chan
*dma_chan
;
124 void *srcs
[MAX_SRCS
];
125 wait_queue_head_t
*wq
;
136 struct delayed_work link_work
;
137 wait_queue_head_t link_wq
;
138 struct dentry
*debugfs_node_dir
;
139 struct dentry
*debugfs_run
;
140 struct dentry
*debugfs_threads
;
142 /* mutex ensures only one set of threads run at once */
143 struct mutex run_mutex
;
144 struct pthr_ctx pthr_ctx
[MAX_THREADS
];
156 static void perf_link_event(void *ctx
)
158 struct perf_ctx
*perf
= ctx
;
160 if (ntb_link_is_up(perf
->ntb
, NULL
, NULL
) == 1) {
161 schedule_delayed_work(&perf
->link_work
, 2*HZ
);
163 dev_dbg(&perf
->ntb
->pdev
->dev
, "link down\n");
165 if (!perf
->link_is_up
)
166 cancel_delayed_work_sync(&perf
->link_work
);
168 perf
->link_is_up
= false;
172 static void perf_db_event(void *ctx
, int vec
)
174 struct perf_ctx
*perf
= ctx
;
175 u64 db_bits
, db_mask
;
177 db_mask
= ntb_db_vector_mask(perf
->ntb
, vec
);
178 db_bits
= ntb_db_read(perf
->ntb
);
180 dev_dbg(&perf
->ntb
->dev
, "doorbell vec %d mask %#llx bits %#llx\n",
181 vec
, db_mask
, db_bits
);
184 static const struct ntb_ctx_ops perf_ops
= {
185 .link_event
= perf_link_event
,
186 .db_event
= perf_db_event
,
189 static void perf_copy_callback(void *data
)
191 struct pthr_ctx
*pctx
= data
;
193 atomic_dec(&pctx
->dma_sync
);
196 static ssize_t
perf_copy(struct pthr_ctx
*pctx
, char __iomem
*dst
,
197 char *src
, size_t size
)
199 struct perf_ctx
*perf
= pctx
->perf
;
200 struct dma_async_tx_descriptor
*txd
;
201 struct dma_chan
*chan
= pctx
->dma_chan
;
202 struct dma_device
*device
;
203 struct dmaengine_unmap_data
*unmap
;
205 size_t src_off
, dst_off
;
206 struct perf_mw
*mw
= &perf
->mw
;
208 void __iomem
*dst_vaddr
;
213 memcpy_toio(dst
, src
, size
);
218 dev_err(&perf
->ntb
->dev
, "DMA engine does not exist\n");
222 device
= chan
->device
;
223 src_off
= (uintptr_t)src
& ~PAGE_MASK
;
224 dst_off
= (uintptr_t __force
)dst
& ~PAGE_MASK
;
226 if (!is_dma_copy_aligned(device
, src_off
, dst_off
, size
))
231 dst_phys
= mw
->phys_addr
+ (dst_vaddr
- vbase
);
233 unmap
= dmaengine_get_unmap_data(device
->dev
, 1, GFP_NOWAIT
);
238 unmap
->addr
[0] = dma_map_page(device
->dev
, virt_to_page(src
),
239 src_off
, size
, DMA_TO_DEVICE
);
240 if (dma_mapping_error(device
->dev
, unmap
->addr
[0]))
246 txd
= device
->device_prep_dma_memcpy(chan
, dst_phys
,
248 size
, DMA_PREP_INTERRUPT
);
250 set_current_state(TASK_INTERRUPTIBLE
);
251 schedule_timeout(DMA_OUT_RESOURCE_TO
);
253 } while (!txd
&& (++retries
< DMA_RETRIES
));
256 pctx
->dma_prep_err
++;
260 txd
->callback
= perf_copy_callback
;
261 txd
->callback_param
= pctx
;
262 dma_set_unmap(txd
, unmap
);
264 cookie
= dmaengine_submit(txd
);
265 if (dma_submit_error(cookie
))
268 dmaengine_unmap_put(unmap
);
270 atomic_inc(&pctx
->dma_sync
);
271 dma_async_issue_pending(chan
);
276 dmaengine_unmap_put(unmap
);
278 dmaengine_unmap_put(unmap
);
282 static int perf_move_data(struct pthr_ctx
*pctx
, char __iomem
*dst
, char *src
,
283 u64 buf_size
, u64 win_size
, u64 total
)
285 int chunks
, total_chunks
, i
;
286 int copied_chunks
= 0;
287 u64 copied
= 0, result
;
288 char __iomem
*tmp
= dst
;
290 ktime_t kstart
, kstop
, kdiff
;
291 unsigned long last_sleep
= jiffies
;
293 chunks
= div64_u64(win_size
, buf_size
);
294 total_chunks
= div64_u64(total
, buf_size
);
295 kstart
= ktime_get();
297 for (i
= 0; i
< total_chunks
; i
++) {
298 result
= perf_copy(pctx
, tmp
, src
, buf_size
);
301 if (copied_chunks
== chunks
) {
307 /* Probably should schedule every 5s to prevent soft hang. */
308 if (unlikely((jiffies
- last_sleep
) > 5 * HZ
)) {
309 last_sleep
= jiffies
;
310 set_current_state(TASK_INTERRUPTIBLE
);
314 if (unlikely(kthread_should_stop()))
319 pr_debug("%s: All DMA descriptors submitted\n", current
->comm
);
320 while (atomic_read(&pctx
->dma_sync
) != 0) {
321 if (kthread_should_stop())
328 kdiff
= ktime_sub(kstop
, kstart
);
329 diff_us
= ktime_to_us(kdiff
);
331 pr_debug("%s: copied %llu bytes\n", current
->comm
, copied
);
333 pr_debug("%s: lasted %llu usecs\n", current
->comm
, diff_us
);
335 perf
= div64_u64(copied
, diff_us
);
337 pr_debug("%s: MBytes/s: %llu\n", current
->comm
, perf
);
339 pctx
->copied
= copied
;
340 pctx
->diff_us
= diff_us
;
345 static bool perf_dma_filter_fn(struct dma_chan
*chan
, void *node
)
347 return dev_to_node(&chan
->dev
->device
) == (int)(unsigned long)node
;
350 static int ntb_perf_thread(void *data
)
352 struct pthr_ctx
*pctx
= data
;
353 struct perf_ctx
*perf
= pctx
->perf
;
354 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
355 struct perf_mw
*mw
= &perf
->mw
;
357 u64 win_size
, buf_size
, total
;
360 struct dma_chan
*dma_chan
= NULL
;
362 pr_debug("kthread %s starting...\n", current
->comm
);
364 node
= dev_to_node(&pdev
->dev
);
366 if (use_dma
&& !pctx
->dma_chan
) {
367 dma_cap_mask_t dma_mask
;
369 dma_cap_zero(dma_mask
);
370 dma_cap_set(DMA_MEMCPY
, dma_mask
);
371 dma_chan
= dma_request_channel(dma_mask
, perf_dma_filter_fn
,
372 (void *)(unsigned long)node
);
374 pr_warn("%s: cannot acquire DMA channel, quitting\n",
378 pctx
->dma_chan
= dma_chan
;
381 for (i
= 0; i
< MAX_SRCS
; i
++) {
382 pctx
->srcs
[i
] = kmalloc_node(MAX_TEST_SIZE
, GFP_KERNEL
, node
);
383 if (!pctx
->srcs
[i
]) {
389 win_size
= mw
->phys_size
;
390 buf_size
= 1ULL << seg_order
;
391 total
= 1ULL << run_order
;
393 if (buf_size
> MAX_TEST_SIZE
)
394 buf_size
= MAX_TEST_SIZE
;
396 dst
= (char __iomem
*)mw
->vbase
;
398 atomic_inc(&perf
->tsync
);
399 while (atomic_read(&perf
->tsync
) != perf
->perf_threads
)
402 src
= pctx
->srcs
[pctx
->src_idx
];
403 pctx
->src_idx
= (pctx
->src_idx
+ 1) & (MAX_SRCS
- 1);
405 rc
= perf_move_data(pctx
, dst
, src
, buf_size
, win_size
, total
);
407 atomic_dec(&perf
->tsync
);
410 pr_err("%s: failed\n", current
->comm
);
415 for (i
= 0; i
< MAX_SRCS
; i
++) {
416 kfree(pctx
->srcs
[i
]);
417 pctx
->srcs
[i
] = NULL
;
420 atomic_inc(&perf
->tdone
);
426 for (i
= 0; i
< MAX_SRCS
; i
++) {
427 kfree(pctx
->srcs
[i
]);
428 pctx
->srcs
[i
] = NULL
;
432 dma_release_channel(dma_chan
);
433 pctx
->dma_chan
= NULL
;
437 /* Wait until we are told to stop */
439 set_current_state(TASK_INTERRUPTIBLE
);
440 if (kthread_should_stop())
444 __set_current_state(TASK_RUNNING
);
449 static void perf_free_mw(struct perf_ctx
*perf
)
451 struct perf_mw
*mw
= &perf
->mw
;
452 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
457 ntb_mw_clear_trans(perf
->ntb
, 0);
458 dma_free_coherent(&pdev
->dev
, mw
->buf_size
,
459 mw
->virt_addr
, mw
->dma_addr
);
462 mw
->virt_addr
= NULL
;
465 static int perf_set_mw(struct perf_ctx
*perf
, resource_size_t size
)
467 struct perf_mw
*mw
= &perf
->mw
;
468 size_t xlat_size
, buf_size
;
474 xlat_size
= round_up(size
, mw
->xlat_align_size
);
475 buf_size
= round_up(size
, mw
->xlat_align
);
477 if (mw
->xlat_size
== xlat_size
)
483 mw
->xlat_size
= xlat_size
;
484 mw
->buf_size
= buf_size
;
486 mw
->virt_addr
= dma_alloc_coherent(&perf
->ntb
->pdev
->dev
, buf_size
,
487 &mw
->dma_addr
, GFP_KERNEL
);
488 if (!mw
->virt_addr
) {
493 rc
= ntb_mw_set_trans(perf
->ntb
, 0, mw
->dma_addr
, mw
->xlat_size
);
495 dev_err(&perf
->ntb
->dev
, "Unable to set mw0 translation\n");
503 static void perf_link_work(struct work_struct
*work
)
505 struct perf_ctx
*perf
=
506 container_of(work
, struct perf_ctx
, link_work
.work
);
507 struct ntb_dev
*ndev
= perf
->ntb
;
508 struct pci_dev
*pdev
= ndev
->pdev
;
513 dev_dbg(&perf
->ntb
->pdev
->dev
, "%s called\n", __func__
);
515 size
= perf
->mw
.phys_size
;
517 if (max_mw_size
&& size
> max_mw_size
)
520 ntb_peer_spad_write(ndev
, MW_SZ_HIGH
, upper_32_bits(size
));
521 ntb_peer_spad_write(ndev
, MW_SZ_LOW
, lower_32_bits(size
));
522 ntb_peer_spad_write(ndev
, VERSION
, PERF_VERSION
);
524 /* now read what peer wrote */
525 val
= ntb_spad_read(ndev
, VERSION
);
526 if (val
!= PERF_VERSION
) {
527 dev_dbg(&pdev
->dev
, "Remote version = %#x\n", val
);
531 val
= ntb_spad_read(ndev
, MW_SZ_HIGH
);
532 size
= (u64
)val
<< 32;
534 val
= ntb_spad_read(ndev
, MW_SZ_LOW
);
537 dev_dbg(&pdev
->dev
, "Remote MW size = %#llx\n", size
);
539 rc
= perf_set_mw(perf
, size
);
543 perf
->link_is_up
= true;
544 wake_up(&perf
->link_wq
);
552 if (ntb_link_is_up(ndev
, NULL
, NULL
) == 1)
553 schedule_delayed_work(&perf
->link_work
,
554 msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT
));
557 static int perf_setup_mw(struct ntb_dev
*ntb
, struct perf_ctx
*perf
)
564 rc
= ntb_mw_get_range(ntb
, 0, &mw
->phys_addr
, &mw
->phys_size
,
565 &mw
->xlat_align
, &mw
->xlat_align_size
);
569 perf
->mw
.vbase
= ioremap_wc(mw
->phys_addr
, mw
->phys_size
);
576 static ssize_t
debugfs_run_read(struct file
*filp
, char __user
*ubuf
,
577 size_t count
, loff_t
*offp
)
579 struct perf_ctx
*perf
= filp
->private_data
;
581 ssize_t ret
, out_off
= 0;
582 struct pthr_ctx
*pctx
;
589 buf
= kmalloc(1024, GFP_KERNEL
);
593 if (mutex_is_locked(&perf
->run_mutex
)) {
594 out_off
= scnprintf(buf
, 64, "running\n");
598 for (i
= 0; i
< MAX_THREADS
; i
++) {
599 pctx
= &perf
->pthr_ctx
[i
];
601 if (pctx
->status
== -ENODATA
)
605 out_off
+= scnprintf(buf
+ out_off
, 1024 - out_off
,
611 rate
= div64_u64(pctx
->copied
, pctx
->diff_us
);
612 out_off
+= scnprintf(buf
+ out_off
, 1024 - out_off
,
613 "%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
614 i
, pctx
->copied
, pctx
->diff_us
, rate
);
618 ret
= simple_read_from_buffer(ubuf
, count
, offp
, buf
, out_off
);
624 static void threads_cleanup(struct perf_ctx
*perf
)
626 struct pthr_ctx
*pctx
;
629 for (i
= 0; i
< MAX_THREADS
; i
++) {
630 pctx
= &perf
->pthr_ctx
[i
];
632 pctx
->status
= kthread_stop(pctx
->thread
);
638 static void perf_clear_thread_status(struct perf_ctx
*perf
)
642 for (i
= 0; i
< MAX_THREADS
; i
++)
643 perf
->pthr_ctx
[i
].status
= -ENODATA
;
646 static ssize_t
debugfs_run_write(struct file
*filp
, const char __user
*ubuf
,
647 size_t count
, loff_t
*offp
)
649 struct perf_ctx
*perf
= filp
->private_data
;
651 DECLARE_WAIT_QUEUE_HEAD(wq
);
653 if (wait_event_interruptible(perf
->link_wq
, perf
->link_is_up
))
656 if (perf
->perf_threads
== 0)
659 if (!mutex_trylock(&perf
->run_mutex
))
662 perf_clear_thread_status(perf
);
664 if (perf
->perf_threads
> MAX_THREADS
) {
665 perf
->perf_threads
= MAX_THREADS
;
666 pr_info("Reset total threads to: %u\n", MAX_THREADS
);
669 /* no greater than 1M */
670 if (seg_order
> MAX_SEG_ORDER
) {
671 seg_order
= MAX_SEG_ORDER
;
672 pr_info("Fix seg_order to %u\n", seg_order
);
675 if (run_order
< seg_order
) {
676 run_order
= seg_order
;
677 pr_info("Fix run_order to %u\n", run_order
);
680 node
= dev_to_node(&perf
->ntb
->pdev
->dev
);
681 atomic_set(&perf
->tdone
, 0);
683 /* launch kernel thread */
684 for (i
= 0; i
< perf
->perf_threads
; i
++) {
685 struct pthr_ctx
*pctx
;
687 pctx
= &perf
->pthr_ctx
[i
];
688 atomic_set(&pctx
->dma_sync
, 0);
692 kthread_create_on_node(ntb_perf_thread
,
694 node
, "ntb_perf %d", i
);
695 if (IS_ERR(pctx
->thread
)) {
699 wake_up_process(pctx
->thread
);
703 wait_event_interruptible(wq
,
704 atomic_read(&perf
->tdone
) == perf
->perf_threads
);
706 threads_cleanup(perf
);
707 mutex_unlock(&perf
->run_mutex
);
711 threads_cleanup(perf
);
712 mutex_unlock(&perf
->run_mutex
);
716 static const struct file_operations ntb_perf_debugfs_run
= {
717 .owner
= THIS_MODULE
,
719 .read
= debugfs_run_read
,
720 .write
= debugfs_run_write
,
723 static int perf_debugfs_setup(struct perf_ctx
*perf
)
725 struct pci_dev
*pdev
= perf
->ntb
->pdev
;
727 if (!debugfs_initialized())
730 if (!perf_debugfs_dir
) {
731 perf_debugfs_dir
= debugfs_create_dir(KBUILD_MODNAME
, NULL
);
732 if (!perf_debugfs_dir
)
736 perf
->debugfs_node_dir
= debugfs_create_dir(pci_name(pdev
),
738 if (!perf
->debugfs_node_dir
)
741 perf
->debugfs_run
= debugfs_create_file("run", S_IRUSR
| S_IWUSR
,
742 perf
->debugfs_node_dir
, perf
,
743 &ntb_perf_debugfs_run
);
744 if (!perf
->debugfs_run
)
747 perf
->debugfs_threads
= debugfs_create_u8("threads", S_IRUSR
| S_IWUSR
,
748 perf
->debugfs_node_dir
,
749 &perf
->perf_threads
);
750 if (!perf
->debugfs_threads
)
756 static int perf_probe(struct ntb_client
*client
, struct ntb_dev
*ntb
)
758 struct pci_dev
*pdev
= ntb
->pdev
;
759 struct perf_ctx
*perf
;
763 if (ntb_spad_count(ntb
) < MAX_SPAD
) {
764 dev_err(&ntb
->dev
, "Not enough scratch pad registers for %s",
769 node
= dev_to_node(&pdev
->dev
);
771 perf
= kzalloc_node(sizeof(*perf
), GFP_KERNEL
, node
);
778 perf
->perf_threads
= 1;
779 atomic_set(&perf
->tsync
, 0);
780 mutex_init(&perf
->run_mutex
);
781 spin_lock_init(&perf
->db_lock
);
782 perf_setup_mw(ntb
, perf
);
783 init_waitqueue_head(&perf
->link_wq
);
784 INIT_DELAYED_WORK(&perf
->link_work
, perf_link_work
);
786 rc
= ntb_set_ctx(ntb
, perf
, &perf_ops
);
790 perf
->link_is_up
= false;
791 ntb_link_enable(ntb
, NTB_SPEED_AUTO
, NTB_WIDTH_AUTO
);
794 rc
= perf_debugfs_setup(perf
);
798 perf_clear_thread_status(perf
);
803 cancel_delayed_work_sync(&perf
->link_work
);
809 static void perf_remove(struct ntb_client
*client
, struct ntb_dev
*ntb
)
811 struct perf_ctx
*perf
= ntb
->ctx
;
814 dev_dbg(&perf
->ntb
->dev
, "%s called\n", __func__
);
816 mutex_lock(&perf
->run_mutex
);
818 cancel_delayed_work_sync(&perf
->link_work
);
821 ntb_link_disable(ntb
);
823 debugfs_remove_recursive(perf_debugfs_dir
);
824 perf_debugfs_dir
= NULL
;
827 for (i
= 0; i
< MAX_THREADS
; i
++) {
828 struct pthr_ctx
*pctx
= &perf
->pthr_ctx
[i
];
831 dma_release_channel(pctx
->dma_chan
);
838 static struct ntb_client perf_client
= {
841 .remove
= perf_remove
,
844 module_ntb_client(perf_client
);