/*
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * PCIe DMA Perf Linux driver
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/dmaengine.h>
#include <linux/delay.h>
#include <linux/printk.h>
#include <linux/nodemask.h>

#define DRIVER_NAME "dma_perf"
#define DRIVER_DESCRIPTION "PCIe DMA Performance Measurement Tool"

#define DRIVER_LICENSE "Dual BSD/GPL"
#define DRIVER_VERSION "1.0"
#define DRIVER_AUTHOR "Dave Jiang <dave.jiang@intel.com>"

#define MAX_THREADS 32
#define MAX_TEST_SIZE (1024 * 1024) /* 1M */
#define DMA_CHANNELS_PER_NODE 8

MODULE_LICENSE(DRIVER_LICENSE);
MODULE_VERSION(DRIVER_VERSION);
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_AUTHOR("Changpeng Liu <changpeng.liu@intel.com>");
MODULE_DESCRIPTION(DRIVER_DESCRIPTION);

static struct dentry *perf_debugfs_dir;
static struct perf_ctx *g_perf = NULL;

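/*
 * Test parameters, exposed read/write through debugfs by
 * perf_debugfs_setup(): seg_order as "transfer_size_order", run_order as
 * "total_size_order" and queue_depth as "queue_depth". seg_order and
 * run_order are log2 sizes, so the defaults give 4 KiB per transfer and
 * 4 GiB copied per thread; queue_depth caps the number of outstanding
 * DMA descriptors per thread at 256.
 */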
static unsigned int seg_order = 12; /* 4K */
static unsigned int queue_depth = 256;
static unsigned int run_order = 32; /* 4G */

struct perf_mw {
        size_t buf_size;
        void *virt_addr;
};

struct perf_ctx;

struct pthr_ctx {
        struct dentry *debugfs_thr_dir;
        struct dentry *debugfs_copied;
        struct dentry *debugfs_elapsed_time;
        struct device *dev;
        int node;
        wait_queue_head_t wq;
        struct perf_mw mw;
        struct task_struct *thread;
        struct perf_ctx *perf;
        atomic_t dma_sync;
        struct dma_chan *dma_chan;
        int dma_up;
        int dma_down;
        int dma_prep_err;
        u64 copied;
        u64 elapsed_time;
};

struct perf_ctx {
        spinlock_t db_lock;
        struct dentry *debugfs_node_dir;
        struct dentry *debugfs_run;
        struct dentry *debugfs_threads;
        struct dentry *debugfs_queue_depth;
        struct dentry *debugfs_transfer_size_order;
        struct dentry *debugfs_total_size_order;
        struct dentry *debugfs_status;
        u8 numa_nodes;
        u8 perf_threads;
        bool run;
        struct pthr_ctx pthr_ctx[MAX_THREADS];
        atomic_t tsync;
};

static void perf_free_mw(struct pthr_ctx *pctx);
static int perf_set_mw(struct pthr_ctx *pctx, size_t size);

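/*
 * Submission/completion path: perf_copy() maps one source and one
 * destination range, prepares a memcpy descriptor on the thread's DMA
 * channel and submits it. perf_copy_callback() runs on completion,
 * drops the in-flight counter (dma_sync) and wakes the submitting
 * thread, which throttles itself in perf_move_data() once dma_sync
 * reaches queue_depth.
 */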
static void perf_copy_callback(void *data)
{
        struct pthr_ctx *pctx = data;

        atomic_dec(&pctx->dma_sync);
        pctx->dma_down++;

        wake_up(&pctx->wq);
}

static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst,
                         char *src, size_t size)
{
        struct dma_async_tx_descriptor *txd;
        struct dma_chan *chan = pctx->dma_chan;
        struct dma_device *device;
        struct dmaengine_unmap_data *unmap;
        dma_cookie_t cookie;
        size_t src_off, dst_off;
        int retries = 0;

        if (!chan) {
                printk("DMA engine does not exist\n");
                return -EINVAL;
        }

        device = chan->device;
        src_off = (size_t)src & ~PAGE_MASK;
        dst_off = (size_t)dst & ~PAGE_MASK;

        if (!is_dma_copy_aligned(device, src_off, dst_off, size))
                return -ENODEV;

        unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
        if (!unmap)
                return -ENOMEM;

        unmap->len = size;
        unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src),
                                      src_off, size, DMA_TO_DEVICE);
        if (dma_mapping_error(device->dev, unmap->addr[0]))
                goto err_get_unmap;

        unmap->to_cnt = 1;

        unmap->addr[1] = dma_map_page(device->dev, virt_to_page(dst),
                                      dst_off, size, DMA_FROM_DEVICE);
        if (dma_mapping_error(device->dev, unmap->addr[1]))
                goto err_get_unmap;
        unmap->from_cnt = 1;

dma_prep_retry:
        txd = device->device_prep_dma_memcpy(chan, unmap->addr[1],
                                             unmap->addr[0],
                                             size, DMA_PREP_INTERRUPT);
        if (!txd) {
                if (retries++ > 20) {
                        pctx->dma_prep_err++;
                        goto err_get_unmap;
                } else {
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(50);
                        goto dma_prep_retry;
                }
        }

        txd->callback = perf_copy_callback;
        txd->callback_param = pctx;
        dma_set_unmap(txd, unmap);

        cookie = dmaengine_submit(txd);
        if (dma_submit_error(cookie))
                goto err_set_unmap;

        atomic_inc(&pctx->dma_sync);

        pctx->dma_up++;
        dma_async_issue_pending(chan);

        return size;

err_set_unmap:
        dmaengine_unmap_put(unmap);
err_get_unmap:
        dmaengine_unmap_put(unmap);
        return 0;
}

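/*
 * Main measurement loop: submit total/buf_size copies of buf_size bytes
 * each, cycling the destination pointer through the memory window so it
 * never runs past win_size. Once everything is submitted, wait for the
 * outstanding descriptors to drain, then report bytes copied, elapsed
 * microseconds and the resulting throughput (bytes / usec = MB/s).
 */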
static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src,
                          u64 buf_size, u64 win_size, u64 total)
{
        int chunks, total_chunks, i;
        int copied_chunks = 0;
        u64 result;
        char *tmp = dst;
        u64 perf, diff_us;
        ktime_t kstart, kstop, kdiff;

        chunks = win_size / buf_size;
        total_chunks = total / buf_size;

        printk("%s: chunks: %d total_chunks: %d\n", current->comm, chunks, total_chunks);

        kstart = ktime_get();

        for (i = 0; i < total_chunks; i++) {

                wait_event_interruptible(pctx->wq, atomic_read(&pctx->dma_sync) < queue_depth);

                result = perf_copy(pctx, tmp, src, buf_size);
                pctx->copied += result;
                copied_chunks++;
                if (copied_chunks == chunks) {
                        tmp = dst;
                        copied_chunks = 0;
                } else
                        tmp += buf_size;
        }

        printk("%s: All DMA descriptors submitted\n", current->comm);

        /* FIXME: need a timeout here eventually */
        while (atomic_read(&pctx->dma_sync) != 0)
                msleep(1);

        pr_info("%s: dma_up: %d dma_down: %d dma_prep_err: %d\n",
                current->comm, pctx->dma_up, pctx->dma_down,
                pctx->dma_prep_err);

        kstop = ktime_get();
        kdiff = ktime_sub(kstop, kstart);
        diff_us = ktime_to_us(kdiff);

        pr_info("%s: copied %Lu bytes\n", current->comm, pctx->copied);

        pr_info("%s: lasted %Lu usecs\n", current->comm, diff_us);

        perf = pctx->copied / diff_us;

        pr_info("%s: MBytes/s: %Lu\n", current->comm, perf);

        pctx->elapsed_time = diff_us;

        return 0;
}

static bool perf_dma_filter_fn(struct dma_chan *chan, void *node)
{
        return dev_to_node(&chan->dev->device) == (int)(unsigned long)node;
}

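/*
 * Per-thread worker: acquire a DMA_MEMCPY channel on this thread's NUMA
 * node (perf_dma_filter_fn matches on dev_to_node()), allocate node-local
 * source and destination buffers, wait at the tsync barrier until every
 * sibling thread is ready, then run the copy loop.
 */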
static int dma_perf_thread(void *data)
{
        struct pthr_ctx *pctx = data;
        struct perf_ctx *perf = pctx->perf;
        struct perf_mw *mw = &pctx->mw;
        char *dst;
        u64 win_size, buf_size, total;
        void *src;
        int rc, node;
        struct dma_chan *dma_chan = NULL;

        pr_info("kthread %s starting...\n", current->comm);

        node = pctx->node;

        if (!pctx->dma_chan) {
                dma_cap_mask_t dma_mask;

                dma_cap_zero(dma_mask);
                dma_cap_set(DMA_MEMCPY, dma_mask);
                dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn,
                                               (void *)(unsigned long)node);
                if (!dma_chan) {
                        pr_warn("%s: cannot acquire DMA channel, quitting\n",
                                current->comm);
                        return -ENODEV;
                }
                pctx->dma_chan = dma_chan;
                pctx->dev = dma_chan->device->dev;
        }

        src = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node);
        if (!src) {
                rc = -ENOMEM;
                goto err;
        }

        rc = perf_set_mw(pctx, MAX_TEST_SIZE);
        if (rc < 0) {
                pr_err("%s: set mw failed\n", current->comm);
                rc = -ENXIO;
                goto err;
        }

        win_size = mw->buf_size;
        buf_size = 1ULL << seg_order;
        total = 1ULL << run_order;

        if (buf_size > MAX_TEST_SIZE)
                buf_size = MAX_TEST_SIZE;

        dst = (char *)mw->virt_addr;

        atomic_inc(&perf->tsync);
        while (atomic_read(&perf->tsync) != perf->perf_threads)
                schedule();

        rc = perf_move_data(pctx, dst, src, buf_size, win_size, total);

        atomic_dec(&perf->tsync);

        if (rc < 0) {
                pr_err("%s: failed\n", current->comm);
                rc = -ENXIO;
                goto err;
        }

        /* all submitted copies have completed; the source buffer is no longer needed */
        kfree(src);

        return 0;

err:
        if (src)
                kfree(src);

        if (dma_chan) {
                dma_release_channel(dma_chan);
                pctx->dma_chan = NULL;
        }

        return rc;
}

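/*
 * The "memory window" is simply a node-local destination buffer of up to
 * MAX_TEST_SIZE bytes: perf_set_mw() allocates it (dropping any buffer
 * left over from an earlier run) and perf_free_mw() releases it when the
 * module is removed.
 */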
static void perf_free_mw(struct pthr_ctx *pctx)
{
        struct perf_mw *mw = &pctx->mw;

        if (!mw->virt_addr)
                return;

        kfree(mw->virt_addr);
        mw->buf_size = 0;
        mw->virt_addr = NULL;
}

static int perf_set_mw(struct pthr_ctx *pctx, size_t size)
{
        struct perf_mw *mw = &pctx->mw;

        if (!size)
                return -EINVAL;

        /* release any window left over from a previous run before reallocating */
        perf_free_mw(pctx);

        mw->buf_size = size;
        mw->virt_addr = kmalloc_node(size, GFP_KERNEL, pctx->node);
        if (!mw->virt_addr) {
                mw->buf_size = 0;
                return -EINVAL;
        }

        return 0;
}

static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf,
                                size_t count, loff_t *offp)
{
        struct perf_ctx *perf = filp->private_data;
        char *buf;
        ssize_t ret, out_offset;

        if (!perf)
                return 0;

        buf = kmalloc(64, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        out_offset = snprintf(buf, 64, "%d\n", perf->run);
        ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
        kfree(buf);

        return ret;
}

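/*
 * Writing anything to the "run" file toggles the test: if threads are
 * still active it stops them, otherwise it clamps the tunables
 * (seg_order <= 20, run_order >= seg_order, at most MAX_THREADS threads)
 * and spawns one kthread per requested thread, placed on NUMA node
 * i / DMA_CHANNELS_PER_NODE.
 */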
static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf,
                                 size_t count, loff_t *offp)
{
        struct perf_ctx *perf = filp->private_data;
        int node, i;

        if (perf->perf_threads == 0)
                return 0;

        if (atomic_read(&perf->tsync) == 0)
                perf->run = false;

        if (perf->run == true) {
                /* lets stop the threads */
                perf->run = false;
                for (i = 0; i < MAX_THREADS; i++) {
                        if (perf->pthr_ctx[i].thread) {
                                kthread_stop(perf->pthr_ctx[i].thread);
                                perf->pthr_ctx[i].thread = NULL;
                        } else
                                break;
                }
        } else {
                perf->run = true;

                if (perf->perf_threads > MAX_THREADS) {
                        perf->perf_threads = MAX_THREADS;
                        pr_info("Reset total threads to: %u\n", MAX_THREADS);
                }

                /* no greater than 1M */
                if (seg_order > 20) {
                        seg_order = 20;
                        pr_info("Fix seg_order to %u\n", seg_order);
                }

                if (run_order < seg_order) {
                        run_order = seg_order;
                        pr_info("Fix run_order to %u\n", run_order);
                }

                /* launch kernel thread */
                for (i = 0; i < perf->perf_threads; i++) {
                        struct pthr_ctx *pctx;

                        pctx = &perf->pthr_ctx[i];
                        atomic_set(&pctx->dma_sync, 0);
                        pctx->perf = perf;
                        pctx->elapsed_time = 0;
                        pctx->copied = 0;

                        init_waitqueue_head(&pctx->wq);

                        /* NUMA socket node */
                        pctx->node = i / DMA_CHANNELS_PER_NODE;
                        node = pctx->node;

                        pctx->thread =
                                kthread_create_on_node(dma_perf_thread,
                                                       (void *)pctx,
                                                       node, "dma_perf %d", i);
                        if (pctx->thread)
                                wake_up_process(pctx->thread);
                        else {
                                /* creation failed: stop the threads already started */
                                perf->run = false;
                                for (i = 0; i < MAX_THREADS; i++) {
                                        if (perf->pthr_ctx[i].thread) {
                                                kthread_stop(perf->pthr_ctx[i].thread);
                                                perf->pthr_ctx[i].thread = NULL;
                                        } else
                                                break;
                                }
                        }

                        if (perf->run == false)
                                return -ENXIO;
                }

        }

        return count;
}

static const struct file_operations dma_perf_debugfs_run = {
        .owner = THIS_MODULE,
        .open = simple_open,
        .read = debugfs_run_read,
        .write = debugfs_run_write,
};

static ssize_t debugfs_status_read(struct file *filp, char __user *ubuf,
                                   size_t count, loff_t *offp)
{
        struct perf_ctx *perf = filp->private_data;
        char *buf;
        ssize_t ret, out_offset;

        if (!perf)
                return 0;

        buf = kmalloc(64, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        out_offset = snprintf(buf, 64, "%s\n", atomic_read(&perf->tsync) ? "running" : "idle");
        ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset);
        kfree(buf);

        return ret;
}

static const struct file_operations dma_perf_debugfs_status = {
        .owner = THIS_MODULE,
        .open = simple_open,
        .read = debugfs_status_read,
};

static int perf_debugfs_setup(struct perf_ctx *perf)
{
        int i;
        char temp_name[64];

        if (!perf_debugfs_dir)
                return -ENODEV;

        perf->debugfs_node_dir = debugfs_create_dir("dmaperf",
                                                    perf_debugfs_dir);
        if (!perf->debugfs_node_dir)
                return -ENODEV;

        perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR,
                                                perf->debugfs_node_dir, perf,
                                                &dma_perf_debugfs_run);
        if (!perf->debugfs_run)
                return -ENODEV;

        perf->debugfs_status = debugfs_create_file("status", S_IRUSR,
                                                   perf->debugfs_node_dir, perf,
                                                   &dma_perf_debugfs_status);
        if (!perf->debugfs_status)
                return -ENODEV;

        perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR,
                                                  perf->debugfs_node_dir,
                                                  &perf->perf_threads);
        if (!perf->debugfs_threads)
                return -ENODEV;

        perf->debugfs_queue_depth = debugfs_create_u32("queue_depth", S_IRUSR | S_IWUSR,
                                                       perf->debugfs_node_dir,
                                                       &queue_depth);
        if (!perf->debugfs_queue_depth)
                return -ENODEV;

        perf->debugfs_transfer_size_order = debugfs_create_u32("transfer_size_order", S_IRUSR | S_IWUSR,
                                                               perf->debugfs_node_dir,
                                                               &seg_order);
        if (!perf->debugfs_transfer_size_order)
                return -ENODEV;

        perf->debugfs_total_size_order = debugfs_create_u32("total_size_order", S_IRUSR | S_IWUSR,
                                                            perf->debugfs_node_dir,
                                                            &run_order);
        if (!perf->debugfs_total_size_order)
                return -ENODEV;

        for (i = 0; i < MAX_THREADS; i++) {
                struct pthr_ctx *pctx = &perf->pthr_ctx[i];

                sprintf(temp_name, "thread_%d", i);

                pctx->debugfs_thr_dir = debugfs_create_dir(temp_name, perf->debugfs_node_dir);
                if (!pctx->debugfs_thr_dir)
                        return -ENODEV;

                pctx->debugfs_copied = debugfs_create_u64("copied", S_IRUSR,
                                                          pctx->debugfs_thr_dir,
                                                          &pctx->copied);
                if (!pctx->debugfs_copied)
                        return -ENODEV;

                pctx->debugfs_elapsed_time = debugfs_create_u64("elapsed_time", S_IRUSR,
                                                                pctx->debugfs_thr_dir,
                                                                &pctx->elapsed_time);
                if (!pctx->debugfs_elapsed_time)
                        return -ENODEV;
        }

        return 0;
}

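/*
 * Typical usage from userspace, assuming debugfs is mounted at
 * /sys/kernel/debug and the module builds as "dmaperf" (so the files
 * created above land under /sys/kernel/debug/dmaperf/dmaperf/); adjust
 * the paths to match the actual KBUILD_MODNAME and mount point:
 *
 *   echo 4  > /sys/kernel/debug/dmaperf/dmaperf/threads
 *   echo 20 > /sys/kernel/debug/dmaperf/dmaperf/transfer_size_order
 *   echo 30 > /sys/kernel/debug/dmaperf/dmaperf/total_size_order
 *   echo 1  > /sys/kernel/debug/dmaperf/dmaperf/run
 *   cat /sys/kernel/debug/dmaperf/dmaperf/thread_0/copied
 *   cat /sys/kernel/debug/dmaperf/dmaperf/thread_0/elapsed_time
 */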
static int perf_probe(void)
{
        struct perf_ctx *perf;
        int rc = 0;

        perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, 0);
        if (!perf) {
                rc = -ENOMEM;
                goto err_perf;
        }

        perf->numa_nodes = num_online_nodes();
        perf->perf_threads = 1;
        atomic_set(&perf->tsync, 0);
        perf->run = false;
        spin_lock_init(&perf->db_lock);

        if (debugfs_initialized() && !perf_debugfs_dir) {
                perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
                if (!perf_debugfs_dir) {
                        rc = -ENODEV;
                        goto err_ctx;
                }

                rc = perf_debugfs_setup(perf);
                if (rc)
                        goto err_ctx;
        }

        g_perf = perf;
        return 0;

err_ctx:
        kfree(perf);
err_perf:
        return rc;
}

static void perf_remove(void)
{
        int i;
        struct perf_ctx *perf = g_perf;

        if (perf_debugfs_dir) {
                debugfs_remove_recursive(perf_debugfs_dir);
                perf_debugfs_dir = NULL;
        }

        for (i = 0; i < MAX_THREADS; i++) {
                struct pthr_ctx *pctx = &perf->pthr_ctx[i];

                if (pctx->dma_chan)
                        dma_release_channel(pctx->dma_chan);
                perf_free_mw(pctx);
        }

        kfree(perf);
}

static int __init perf_init_module(void)
{
        printk("DMA Performance Test Init\n");
        return perf_probe();
}
module_init(perf_init_module);

static void __exit perf_exit_module(void)
{
        printk("DMA Performance Test Exit\n");
        perf_remove();
}
module_exit(perf_exit_module);