/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "mlx5_ib.h"
/*
 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
 * work on kernel module memory.
 */
void *xlt_emergency_page;
static DEFINE_MUTEX(xlt_emergency_page_mutex);
enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048
static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
		MLX5_SET(mkc, mkc, relaxed_ordering_write,
			 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
		MLX5_SET(mkc, mkc, relaxed_ordering_read,
			 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}
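/*
 * Illustrative sketch (not part of the driver): callers typically pair this
 * helper with further MLX5_SET() calls on the same mkey context, e.g.
 *
 *	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 *	set_mkc_access_pd_addr_fields(mkc, access_flags, iova, pd);
 *	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
 *	MLX5_SET64(mkc, mkc, len, umem->length);
 *
 * mirroring what reg_create() does further down in this file.
 */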
static void
assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
		    u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	mkey->key = key;
}
static int
mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
		    u32 *in, int inlen)
{
	assign_mkey_variant(dev, mkey, in);
	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
}
static int
mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
		       struct mlx5_core_mkey *mkey,
		       struct mlx5_async_ctx *async_ctx,
		       u32 *in, int inlen, u32 *out, int outlen,
		       struct mlx5_async_work *context)
{
	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, mkey, in);
	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
				create_mkey_callback, context);
}
static int mr_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
{
	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
}
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
}
static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5_ib_mr *mr =
		container_of(context, struct mlx5_ib_mr, cb_work);
	struct mlx5_cache_ent *ent = mr->cache_ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
		kfree(mr);
		spin_lock_irqsave(&ent->lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.key |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mr->out, mkey_index));
	init_waitqueue_head(&mr->mmkey.wait);

	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->lock, flags);
	list_add_tail(&mr->list, &ent->head);
	ent->available_mrs++;
	ent->total_mrs++;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	ent->pending--;
	spin_unlock_irqrestore(&ent->lock, flags);
}
static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
{
	struct mlx5_ib_mr *mr;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return NULL;
	mr->cache_ent = ent;

	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);

	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
	MLX5_SET(mkc, mkc, log_page_size, ent->page);
	return mr;
}
/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err = 0;
	int i;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	for (i = 0; i < num; i++) {
		mr = alloc_cache_mr(ent, mkc);
		if (!mr) {
			err = -ENOMEM;
			break;
		}
		spin_lock_irq(&ent->lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			spin_unlock_irq(&ent->lock);
			kfree(mr);
			break;
		}
		ent->pending++;
		spin_unlock_irq(&ent->lock);
		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
					     &ent->dev->async_ctx, in, inlen,
					     mr->out, sizeof(mr->out),
					     &mr->cb_work);
		if (err) {
			spin_lock_irq(&ent->lock);
			ent->pending--;
			spin_unlock_irq(&ent->lock);
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			kfree(mr);
			break;
		}
	}

	kfree(in);
	return err;
}
/* Synchronously create an MR in the cache */
static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return ERR_PTR(-ENOMEM);
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	mr = alloc_cache_mr(ent, mkc);
	if (!mr) {
		err = -ENOMEM;
		goto free_in;
	}

	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
	if (err)
		goto free_mr;

	mr->mmkey.type = MLX5_MKEY_MR;
	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
	spin_lock_irq(&ent->lock);
	ent->total_mrs++;
	spin_unlock_irq(&ent->lock);
	kfree(in);
	return mr;
free_mr:
	kfree(mr);
free_in:
	kfree(in);
	return ERR_PTR(err);
}
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_mr *mr;

	lockdep_assert_held(&ent->lock);
	if (list_empty(&ent->head))
		return;
	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
	list_del(&mr->list);
	ent->available_mrs--;
	ent->total_mrs--;
	spin_unlock_irq(&ent->lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
	kfree(mr);
	spin_lock_irq(&ent->lock);
}
static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
{
	int err;

	lockdep_assert_held(&ent->lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->available_mrs + ent->pending)
			return 0;
		if (target > ent->available_mrs + ent->pending) {
			u32 todo = target - (ent->available_mrs + ent->pending);

			spin_unlock_irq(&ent->lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else {
				return 0;
			}
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}
static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for
	 * available_mrs.
	 */
	spin_lock_irq(&ent->lock);
	if (target < ent->total_mrs - ent->available_mrs) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - (ent->total_mrs - ent->available_mrs);
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->lock);
	return err;
}
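/*
 * Worked example (illustrative): with total_mrs = 10 and available_mrs = 4,
 * 6 MRs are in use and cannot be freed, so a user writing "8" to the "size"
 * file yields target = 8 - (10 - 4) = 2 available MRs, which is only accepted
 * if it falls within [limit, 2 * limit].
 */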
static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}
static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};
static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to the high water mark
	 * implied by the limit.
	 */
	spin_lock_irq(&ent->lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->lock);
	if (err)
		return err;
	return count;
}
static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}
static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};
static bool someone_adding(struct mlx5_mr_cache *cache)
{
	unsigned int i;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		struct mlx5_cache_ent *ent = &cache->ent[i];
		bool ret;

		spin_lock_irq(&ent->lock);
		ret = ent->available_mrs < ent->limit;
		spin_unlock_irq(&ent->lock);
		if (ret)
			return true;
	}
	return false;
}
/*
 * Check if the bucket is outside the high/low water marks and schedule an
 * async update. The cache refill has hysteresis: once the low water mark is
 * hit it is refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
		return;
	if (ent->available_mrs < ent->limit) {
		ent->fill_to_high_water = true;
		queue_work(ent->dev->cache.wq, &ent->work);
	} else if (ent->fill_to_high_water &&
		   ent->available_mrs + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		queue_work(ent->dev->cache.wq, &ent->work);
	} else if (ent->available_mrs == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->available_mrs > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			queue_work(ent->dev->cache.wq, &ent->work);
	}
}
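/*
 * Worked example (illustrative): with limit = 8, a bucket dropping to
 * available_mrs = 7 starts a refill that keeps going until available_mrs +
 * pending reaches 2 * limit = 16.  Between 8 and 16 nothing new is queued
 * unless a fill is already in progress, and above 16 the delayed work shrinks
 * the bucket back down.
 */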
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mr_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->available_mrs + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if pending is positive, so we
			 * will be rescheduled from reg_mr_callback(). The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(dev,
					     "command failed order %d, err %d\n",
					     ent->order, err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->available_mrs > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to the next cycle, in
		 * order to free CPU resources for other tasks.
		 */
		spin_unlock_irq(&ent->lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->lock);
		if (ent->disabled)
			goto out;
		if (need_delay)
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->lock);
}
static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	__cache_work_func(ent);
}
static void cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, work);
	__cache_work_func(ent);
}
/* Allocate a special entry from the cache */
struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       unsigned int entry, int access_flags)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;

	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
		    entry >= ARRAY_SIZE(cache->ent)))
		return ERR_PTR(-EINVAL);

	/* Matches access in alloc_cache_mr() */
	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
		return ERR_PTR(-EOPNOTSUPP);

	ent = &cache->ent[entry];
	spin_lock_irq(&ent->lock);
	if (list_empty(&ent->head)) {
		spin_unlock_irq(&ent->lock);
		mr = create_cache_mr(ent);
		if (IS_ERR(mr))
			return mr;
	} else {
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_del(&mr->list);
		ent->available_mrs--;
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->lock);
	}
	mr->access_flags = access_flags;
	return mr;
}
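/*
 * Usage sketch (illustrative, not a call site in this file): the ODP code is
 * the expected user of the special entries, along the lines of
 *
 *	mr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY,
 *				 IB_ACCESS_ON_DEMAND);
 *	if (IS_ERR(mr))
 *		return ERR_CAST(mr);
 *
 * where MLX5_IMR_KSM_CACHE_ENTRY is assumed to be one of the entries above
 * MR_CACHE_LAST_STD_ENTRY declared in mlx5_ib.h.
 */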
/* Return an MR that is already available in the cache */
static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
{
	struct mlx5_ib_dev *dev = req_ent->dev;
	struct mlx5_ib_mr *mr = NULL;
	struct mlx5_cache_ent *ent = req_ent;

	/* Try larger MR pools from the cache to satisfy the allocation */
	for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
			    ent - dev->cache.ent);

		spin_lock_irq(&ent->lock);
		if (!list_empty(&ent->head)) {
			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
					      list);
			list_del(&mr->list);
			ent->available_mrs--;
			queue_adjust_cache_locked(ent);
			spin_unlock_irq(&ent->lock);
			return mr;
		}
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->lock);
	}
	req_ent->miss++;
	return NULL;
}
static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	struct mlx5_cache_ent *ent = mr->cache_ent;

	spin_lock_irq(&ent->lock);
	list_add_tail(&mr->list, &ent->head);
	ent->available_mrs++;
	queue_adjust_cache_locked(ent);
	spin_unlock_irq(&ent->lock);
}
static void clean_keys(struct mlx5_ib_dev *dev, int c)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	struct mlx5_ib_mr *tmp_mr;
	struct mlx5_ib_mr *mr;
	LIST_HEAD(del_list);

	cancel_delayed_work(&ent->dwork);
	while (1) {
		spin_lock_irq(&ent->lock);
		if (list_empty(&ent->head)) {
			spin_unlock_irq(&ent->lock);
			break;
		}
		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
		list_move(&mr->list, &del_list);
		ent->available_mrs--;
		ent->total_mrs--;
		spin_unlock_irq(&ent->lock);
		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
	}

	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
		list_del(&mr->list);
		kfree(mr);
	}
}
static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.root);
	dev->cache.root = NULL;
}
static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	struct dentry *dir;
	int i;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		sprintf(ent->name, "%d", ent->order);
		dir = debugfs_create_dir(ent->name, cache->root);
		debugfs_create_file("size", 0600, dir, ent, &size_fops);
		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
		debugfs_create_u32("miss", 0600, dir, &ent->miss);
	}
}
static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mr_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int i;

	mutex_init(&dev->slow_path_mutex);
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		INIT_LIST_HEAD(&ent->head);
		spin_lock_init(&ent->lock);
		ent->order = i + 2;
		ent->dev = dev;
		ent->limit = 0;

		INIT_WORK(&ent->work, cache_work_func);
		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

		if (i > MR_CACHE_LAST_STD_ENTRY) {
			mlx5_odp_init_mr_cache_entry(ent);
			continue;
		}

		if (ent->order > mr_cache_max_order(dev))
			continue;

		ent->page = PAGE_SHIFT;
		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5_ib_can_load_pas_with_umr(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[i].limit;
		else
			ent->limit = 0;
		spin_lock_irq(&ent->lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->lock);
	}

	mlx5_mr_cache_debugfs_init(dev);

	return 0;
}
int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
{
	unsigned int i;

	if (!dev->cache.wq)
		return 0;

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
		struct mlx5_cache_ent *ent = &dev->cache.ent[i];

		spin_lock_irq(&ent->lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->lock);
		cancel_work_sync(&ent->work);
		cancel_delayed_work_sync(&ent->dwork);
	}

	mlx5_mr_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
		clean_keys(dev, i);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);

	return 0;
}
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}
static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}
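/*
 * Worked example (illustrative): addr = 0x1800, len = 0x3000, page_shift = 12
 * gives offset = 0x800 and npages = ALIGN(0x3800, 0x1000) >> 12 = 4, so the
 * translation table needs (4 + 1) / 2 = 2 octowords, each octoword holding
 * two 8-byte MTT entries.
 */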
static int mr_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MR_CACHE_LAST_STD_ENTRY + 2;
	return MLX5_MAX_UMR_SHIFT;
}
static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct mlx5_ib_umr_context *context =
		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);

	context->status = wc->status;
	complete(&context->done);
}
static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
{
	context->cqe.done = mlx5_ib_umr_done;
	context->status = -1;
	init_completion(&context->done);
}
static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
				  struct mlx5_umr_wr *umrwr)
{
	struct umr_common *umrc = &dev->umrc;
	const struct ib_send_wr *bad;
	int err;
	struct mlx5_ib_umr_context umr_context;

	mlx5_ib_init_umr_context(&umr_context);
	umrwr->wr.wr_cqe = &umr_context.cqe;

	down(&umrc->sem);
	err = ib_post_send(umrc->qp, &umrwr->wr, &bad);
	if (err) {
		mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
	} else {
		wait_for_completion(&umr_context.done);
		if (umr_context.status != IB_WC_SUCCESS) {
			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
				     umr_context.status);
			err = -EFAULT;
		}
	}
	up(&umrc->sem);
	return err;
}
static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
						      unsigned int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return &cache->ent[0];
	order = order - cache->ent[0].order;
	if (order > MR_CACHE_LAST_STD_ENTRY)
		return NULL;
	return &cache->ent[order];
}
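/*
 * Worked example (illustrative): with cache->ent[0].order == 2, a request for
 * order 5 (32 pages) maps to cache->ent[3]; anything smaller than order 2
 * falls back to ent[0], and orders beyond MR_CACHE_LAST_STD_ENTRY return NULL
 * so the caller takes the uncached path.
 */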
static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->access_flags = access_flags;
}
static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR.
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
						     0, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);
	ent = mr_cache_ent_from_order(
		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
	/*
	 * Matches access in alloc_cache_mr(). If the MR can't come from the
	 * cache then synchronously create an uncached one.
	 */
	if (!ent || ent->limit == 0 ||
	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
		mutex_unlock(&dev->slow_path_mutex);
		return mr;
	}

	mr = get_cache_mr(ent);
	if (!mr) {
		mr = create_cache_mr(ent);
		/*
		 * The above already tried to do the same stuff as reg_create(),
		 * no reason to try it again.
		 */
		if (IS_ERR(mr))
			return mr;
	}

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->mmkey.iova = iova;
	mr->mmkey.size = umem->length;
	mr->mmkey.pd = to_mpd(pd)->pdn;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags);

	return mr;
}
#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
			    MLX5_UMR_MTT_ALIGNMENT)
#define MLX5_SPARE_UMR_CHUNK 0x10000
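/*
 * Rough numbers (illustrative, assuming MLX5_MAX_UMR_SHIFT is 16 and
 * MLX5_UMR_MTT_ALIGNMENT is 0x40 as in the mlx5 headers): MLX5_MAX_UMR_CHUNK
 * works out to (1 << 20) - 64 bytes, i.e. just under 1 MiB of translation
 * entries per UMR post, while MLX5_SPARE_UMR_CHUNK is a 64 KiB fallback used
 * when an allocation of that size cannot be satisfied.
 */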
/*
 * Allocate a temporary buffer to hold the per-page information to transfer to
 * HW. For efficiency this should be as large as it can be, but buffer
 * allocation failure is not allowed, so try smaller sizes.
 */
static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
{
	const size_t xlt_chunk_align = MLX5_UMR_MTT_ALIGNMENT / ent_size;
	size_t size;
	void *res = NULL;

	static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0);

	/*
	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that
	 * the allocation can't trigger any kind of reclaim.
	 */
	might_sleep();

	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;

	/*
	 * If the system already has a suitable high order page then just use
	 * that, but don't try hard to create one. This max is about 1M, so a
	 * free x86 huge page will satisfy it.
	 */
	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
		     MLX5_MAX_UMR_CHUNK);
	*nents = size / ent_size;
	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
				       get_order(size));
	if (res)
		return res;

	if (size > MLX5_SPARE_UMR_CHUNK) {
		size = MLX5_SPARE_UMR_CHUNK;
		*nents = size / ent_size;
		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
					       get_order(size));
		if (res)
			return res;
	}

	*nents = PAGE_SIZE / ent_size;
	res = (void *)__get_free_page(gfp_mask);
	if (res)
		return res;

	mutex_lock(&xlt_emergency_page_mutex);
	memset(xlt_emergency_page, 0, PAGE_SIZE);
	return xlt_emergency_page;
}
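/*
 * Allocation ladder recap (illustrative): the function tries, in order, a
 * high-order allocation capped at MLX5_MAX_UMR_CHUNK, then a smaller
 * MLX5_SPARE_UMR_CHUNK allocation, then a single page, and finally the
 * preallocated xlt_emergency_page serialized by xlt_emergency_page_mutex,
 * shrinking *nents at each step so the caller simply issues more UMRs.
 */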
static void mlx5_ib_free_xlt(void *xlt, size_t length)
{
	if (xlt == xlt_emergency_page) {
		mutex_unlock(&xlt_emergency_page_mutex);
		return;
	}

	free_pages((unsigned long)xlt, get_order(length));
}
/*
 * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for
 * submission.
 */
static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr,
				   struct mlx5_umr_wr *wr, struct ib_sge *sg,
				   size_t nents, size_t ent_size,
				   unsigned int flags)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	dma_addr_t dma;
	void *xlt;

	xlt = mlx5_ib_alloc_xlt(&nents, ent_size,
				flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
								 GFP_KERNEL);
	sg->length = nents * ent_size;
	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
		mlx5_ib_free_xlt(xlt, sg->length);
		return NULL;
	}
	sg->addr = dma;
	sg->lkey = dev->umrc.pd->local_dma_lkey;

	memset(wr, 0, sizeof(*wr));
	wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT;
	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
		wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE;
	wr->wr.sg_list = sg;
	wr->wr.num_sge = 1;
	wr->wr.opcode = MLX5_IB_WR_UMR;
	wr->pd = mr->ibmr.pd;
	wr->mkey = mr->mmkey.key;
	wr->length = mr->mmkey.size;
	wr->virt_addr = mr->mmkey.iova;
	wr->access_flags = mr->access_flags;
	wr->page_shift = mr->page_shift;
	wr->xlt_size = sg->length;
	return xlt;
}
static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
				   struct ib_sge *sg)
{
	struct device *ddev = &dev->mdev->pdev->dev;

	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
	mlx5_ib_free_xlt(xlt, sg->length);
}
static unsigned int xlt_wr_final_send_flags(unsigned int flags)
{
	unsigned int res = 0;

	if (flags & MLX5_IB_UPD_XLT_ENABLE)
		res |= MLX5_IB_SEND_UMR_ENABLE_MR |
		       MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS |
		       MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
	if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS)
		res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
	if (flags & MLX5_IB_UPD_XLT_ADDR)
		res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
	return res;
}
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
		       int page_shift, int flags)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	void *xlt;
	struct mlx5_umr_wr wr;
	struct ib_sge sg;
	int err = 0;
	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
			       ? sizeof(struct mlx5_klm)
			       : sizeof(struct mlx5_mtt);
	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
	const int page_mask = page_align - 1;
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t pages_iter;
	size_t size_to_map = 0;
	size_t orig_sg_length;

	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
	    !umr_can_use_indirect_mkey(dev))
		return -EPERM;

	if (WARN_ON(!mr->umem->is_odp))
		return -EINVAL;

	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly
	 */
	if (idx & page_mask) {
		npages += idx & page_mask;
		idx &= ~page_mask;
	}
	pages_to_map = ALIGN(npages, page_align);

	xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags);
	if (!xlt)
		return -ENOMEM;
	pages_iter = sg.length / desc_size;
	orig_sg_length = sg.length;

	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;

		pages_to_map = min_t(size_t, pages_to_map, max_pages);
	}

	wr.page_shift = page_shift;

	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, idx += pages_iter) {
		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
		size_to_map = npages * desc_size;
		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
					DMA_TO_DEVICE);
		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
		dma_sync_single_for_device(ddev, sg.addr, sg.length,
					   DMA_TO_DEVICE);

		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);

		if (pages_mapped + pages_iter >= pages_to_map)
			wr.wr.send_flags |= xlt_wr_final_send_flags(flags);

		wr.offset = idx * desc_size;
		wr.xlt_size = sg.length;

		err = mlx5_ib_post_send_wait(dev, &wr);
	}
	sg.length = orig_sg_length;
	mlx5_ib_unmap_free_xlt(dev, xlt, &sg);
	return err;
}
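/*
 * Worked example (illustrative, assuming MLX5_UMR_MTT_ALIGNMENT is 64): for
 * 8-byte MTT descriptors the alignment unit is 64 / 8 = 8 entries, so a
 * request with idx = 3 and npages = 4 is widened to idx = 0, npages = 7 and
 * then rounded up to pages_to_map = 8 before the XLT buffer is sized.
 */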
/*
 * Send the DMA list to the HW for a normal MR using UMR.
 * A dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
 * flag may be used.
 */
int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
	struct device *ddev = &dev->mdev->pdev->dev;
	struct ib_block_iter biter;
	struct mlx5_mtt *cur_mtt;
	struct mlx5_umr_wr wr;
	size_t orig_sg_length;
	struct mlx5_mtt *mtt;
	size_t final_size;
	struct ib_sge sg;
	int err = 0;

	if (WARN_ON(mr->umem->is_odp))
		return -EINVAL;

	mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg,
				    ib_umem_num_dma_blocks(mr->umem,
							   1 << mr->page_shift),
				    sizeof(*mtt), flags);
	if (!mtt)
		return -ENOMEM;

	orig_sg_length = sg.length;

	cur_mtt = mtt;
	rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter,
			     mr->umem->sgt_append.sgt.nents,
			     BIT(mr->page_shift)) {
		if (cur_mtt == (void *)mtt + sg.length) {
			dma_sync_single_for_device(ddev, sg.addr, sg.length,
						   DMA_TO_DEVICE);
			err = mlx5_ib_post_send_wait(dev, &wr);
			if (err)
				goto err;
			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
						DMA_TO_DEVICE);
			wr.offset += sg.length;
			cur_mtt = mtt;
		}

		cur_mtt->ptag =
			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
				    MLX5_IB_MTT_PRESENT);

		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
			cur_mtt->ptag = 0;

		cur_mtt++;
	}

	final_size = (void *)cur_mtt - (void *)mtt;
	sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT);
	memset(cur_mtt, 0, sg.length - final_size);
	wr.wr.send_flags |= xlt_wr_final_send_flags(flags);
	wr.xlt_size = sg.length;

	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
	err = mlx5_ib_post_send_wait(dev, &wr);

err:
	sg.length = orig_sg_length;
	mlx5_ib_unmap_free_xlt(dev, mtt, &sg);
	return err;
}
/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}
static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc);

	return &mr->ibmr;

err_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}
int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}
static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
	} else {
		unsigned int page_size = mlx5_umem_find_best_pgsz(
			umem, mkc, log_page_size, 0, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go
		 * ahead and configure it again via UMR while enabling it.
		 */
		err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}
*create_user_odp_mr(struct ib_pd
*pd
, u64 start
, u64 length
,
1495 u64 iova
, int access_flags
,
1496 struct ib_udata
*udata
)
1498 struct mlx5_ib_dev
*dev
= to_mdev(pd
->device
);
1499 struct ib_umem_odp
*odp
;
1500 struct mlx5_ib_mr
*mr
;
1503 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING
))
1504 return ERR_PTR(-EOPNOTSUPP
);
1506 err
= mlx5r_odp_create_eq(dev
, &dev
->odp_pf_eq
);
1508 return ERR_PTR(err
);
1509 if (!start
&& length
== U64_MAX
) {
1511 return ERR_PTR(-EINVAL
);
1512 if (!(dev
->odp_caps
.general_caps
& IB_ODP_SUPPORT_IMPLICIT
))
1513 return ERR_PTR(-EINVAL
);
1515 mr
= mlx5_ib_alloc_implicit_mr(to_mpd(pd
), access_flags
);
1517 return ERR_CAST(mr
);
1521 /* ODP requires xlt update via umr to work. */
1522 if (!mlx5_ib_can_load_pas_with_umr(dev
, length
))
1523 return ERR_PTR(-EINVAL
);
1525 odp
= ib_umem_odp_get(&dev
->ib_dev
, start
, length
, access_flags
,
1528 return ERR_CAST(odp
);
1530 mr
= alloc_cacheable_mr(pd
, &odp
->umem
, iova
, access_flags
);
1532 ib_umem_release(&odp
->umem
);
1533 return ERR_CAST(mr
);
1535 xa_init(&mr
->implicit_children
);
1538 err
= mlx5r_store_odp_mkey(dev
, &mr
->mmkey
);
1542 err
= mlx5_ib_init_odp_mr(mr
);
1548 mlx5_ib_dereg_mr(&mr
->ibmr
, NULL
);
1549 return ERR_PTR(err
);
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}
static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt)
		return;

	mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}
static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5_ib_can_load_pas_with_umr(dev, length))
		return ERR_PTR(-EINVAL);

	umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
					 access_flags,
					 &mlx5_ib_dmabuf_attach_ops);
	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}
/**
 * revoke_mr - Fence all DMA on the MR
 * @mr: The MR to fence
 *
 * Upon return the NIC will not be doing any DMA to the pages under the MR,
 * and any DMA in progress will be completed. Failure of this function
 * indicates the HW has failed catastrophically.
 */
static int revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_umr_wr umrwr = {};

	if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
		return 0;

	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
	umrwr.wr.opcode = MLX5_IB_WR_UMR;
	umrwr.pd = mr_to_mdev(mr)->umrc.pd;
	umrwr.mkey = mr->mmkey.key;
	umrwr.ignore_free_state = 1;

	return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr);
}
/*
 * True if the change in access flags can be done via UMR; only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
		return false;
	return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags,
					     target_access_flags);
}
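/*
 * Example (illustrative): toggling IB_ACCESS_REMOTE_WRITE or
 * IB_ACCESS_RELAXED_ORDERING on an existing MR may use the UMR fast path,
 * while adding a flag outside the mask above (e.g. IB_ACCESS_ON_DEMAND)
 * forces a full re-registration; the final decision still rests with
 * mlx5_ib_can_reconfig_with_umr(), which checks the device's UMR
 * capabilities.
 */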
static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			       int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_umr_wr umrwr = {
		.wr = {
			.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
				      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS,
			.opcode = MLX5_IB_WR_UMR,
		},
		.mkey = mr->mmkey.key,
		.pd = pd,
		.access_flags = access_flags,
	};
	int err;

	err = mlx5_ib_post_send_wait(dev, &umrwr);
	if (err)
		return err;

	mr->access_flags = access_flags;
	mr->mmkey.pd = to_mpd(pd)->pdn;
	return 0;
}
static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->cache_ent)
		return false;
	if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length))
		return false;

	*page_size =
		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (1ULL << mr->cache_ent->order) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}
static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		mr->mmkey.pd = to_mpd(pd)->pdn;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.length = new_umem->length;
	mr->mmkey.iova = iova;
	mr->mmkey.size = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5_ib_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}
struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev,
		    "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = umr_rereg_pd_access(mr, new_pd, new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MRs don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->mmkey.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different.
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch.
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}
static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}
int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the mr must hold the refcount; once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	if (mr->cache_ent) {
		if (revoke_mr(mr)) {
			spin_lock_irq(&mr->cache_ent->lock);
			mr->cache_ent->total_mrs--;
			spin_unlock_irq(&mr->cache_ent->lock);
			mr->cache_ent = NULL;
		}
	}
	if (!mr->cache_ent) {
		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
		if (rc)
			return rc;
	}

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (mr->cache_ent) {
		mlx5_mr_cache_free(dev, mr);
	} else {
		mlx5_free_priv_descs(mr);
		kfree(mr);
	}
	return 0;
}
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
}
static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}
*mlx5_ib_alloc_pi_mr(struct ib_pd
*pd
,
2058 u32 max_num_sg
, u32 max_num_meta_sg
,
2059 int desc_size
, int access_mode
)
2061 int inlen
= MLX5_ST_SZ_BYTES(create_mkey_in
);
2062 int ndescs
= ALIGN(max_num_sg
+ max_num_meta_sg
, 4);
2064 struct mlx5_ib_mr
*mr
;
2068 mr
= kzalloc(sizeof(*mr
), GFP_KERNEL
);
2070 return ERR_PTR(-ENOMEM
);
2073 mr
->ibmr
.device
= pd
->device
;
2075 in
= kzalloc(inlen
, GFP_KERNEL
);
2081 if (access_mode
== MLX5_MKC_ACCESS_MODE_MTT
)
2082 page_shift
= PAGE_SHIFT
;
2084 err
= _mlx5_alloc_mkey_descs(pd
, mr
, ndescs
, desc_size
, page_shift
,
2085 access_mode
, in
, inlen
);
2098 return ERR_PTR(err
);
2101 static int mlx5_alloc_mem_reg_descs(struct ib_pd
*pd
, struct mlx5_ib_mr
*mr
,
2102 int ndescs
, u32
*in
, int inlen
)
2104 return _mlx5_alloc_mkey_descs(pd
, mr
, ndescs
, sizeof(struct mlx5_mtt
),
2105 PAGE_SHIFT
, MLX5_MKC_ACCESS_MODE_MTT
, in
,
2109 static int mlx5_alloc_sg_gaps_descs(struct ib_pd
*pd
, struct mlx5_ib_mr
*mr
,
2110 int ndescs
, u32
*in
, int inlen
)
2112 return _mlx5_alloc_mkey_descs(pd
, mr
, ndescs
, sizeof(struct mlx5_klm
),
2113 0, MLX5_MKC_ACCESS_MODE_KLMS
, in
, inlen
);
static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);
	mr->sig = NULL;

	return err;
}
static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}
struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}
*mlx5_ib_alloc_mr_integrity(struct ib_pd
*pd
,
2255 u32 max_num_sg
, u32 max_num_meta_sg
)
2257 return __mlx5_ib_alloc_mr(pd
, IB_MR_TYPE_INTEGRITY
, max_num_sg
,
2261 int mlx5_ib_alloc_mw(struct ib_mw
*ibmw
, struct ib_udata
*udata
)
2263 struct mlx5_ib_dev
*dev
= to_mdev(ibmw
->device
);
2264 int inlen
= MLX5_ST_SZ_BYTES(create_mkey_in
);
2265 struct mlx5_ib_mw
*mw
= to_mmw(ibmw
);
2270 struct mlx5_ib_alloc_mw req
= {};
2273 __u32 response_length
;
2276 err
= ib_copy_from_udata(&req
, udata
, min(udata
->inlen
, sizeof(req
)));
2280 if (req
.comp_mask
|| req
.reserved1
|| req
.reserved2
)
2283 if (udata
->inlen
> sizeof(req
) &&
2284 !ib_is_udata_cleared(udata
, sizeof(req
),
2285 udata
->inlen
- sizeof(req
)))
2288 ndescs
= req
.num_klms
? roundup(req
.num_klms
, 4) : roundup(1, 4);
2290 in
= kzalloc(inlen
, GFP_KERNEL
);
2296 mkc
= MLX5_ADDR_OF(create_mkey_in
, in
, memory_key_mkey_entry
);
2298 MLX5_SET(mkc
, mkc
, free
, 1);
2299 MLX5_SET(mkc
, mkc
, translations_octword_size
, ndescs
);
2300 MLX5_SET(mkc
, mkc
, pd
, to_mpd(ibmw
->pd
)->pdn
);
2301 MLX5_SET(mkc
, mkc
, umr_en
, 1);
2302 MLX5_SET(mkc
, mkc
, lr
, 1);
2303 MLX5_SET(mkc
, mkc
, access_mode_1_0
, MLX5_MKC_ACCESS_MODE_KLMS
);
2304 MLX5_SET(mkc
, mkc
, en_rinval
, !!((ibmw
->type
== IB_MW_TYPE_2
)));
2305 MLX5_SET(mkc
, mkc
, qpn
, 0xffffff);
2307 err
= mlx5_ib_create_mkey(dev
, &mw
->mmkey
, in
, inlen
);
2311 mw
->mmkey
.type
= MLX5_MKEY_MW
;
2312 ibmw
->rkey
= mw
->mmkey
.key
;
2313 mw
->ndescs
= ndescs
;
2315 resp
.response_length
=
2316 min(offsetofend(typeof(resp
), response_length
), udata
->outlen
);
2317 if (resp
.response_length
) {
2318 err
= ib_copy_to_udata(udata
, &resp
, resp
.response_length
);
2323 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING
)) {
2324 err
= mlx5r_store_odp_mkey(dev
, &mw
->mmkey
);
2333 mlx5_core_destroy_mkey(dev
->mdev
, &mw
->mmkey
);
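
/*
 * Tear down a memory window; if the mkey was exposed to the ODP machinery,
 * wait for any page-fault handler still referencing it before destroying it.
 */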
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
}
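
/*
 * Report (and clear) a pending signature error recorded on an integrity MR.
 * Only IB_MR_CHECK_SIG_STATUS is supported.
 */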
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}
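
/*
 * PA fast path for PI mapping: when the data and metadata each fit in a
 * single DMA-mapped SG entry, describe them directly with addresses and
 * lengths and skip descriptor translation entirely.
 */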
static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}
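
/*
 * Translate data (and optionally metadata) SG entries into KLM descriptors
 * that reference the PD's local_dma_lkey, honouring the first-entry offsets
 * and capping at mr->max_descs. Returns the number of descriptors consumed.
 */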
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}
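
/* ib_sg_to_pages() callback: append one page address as an MTT entry. */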
static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}
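
/*
 * Same as mlx5_set_page(), but used for the metadata pass of a PI mapping:
 * entries are appended after the data descriptors already recorded.
 */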
static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}
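
/*
 * Map data + metadata onto the internal MTT pi_mr. The region stays
 * contiguous within one mkey, so any gap between the end of the data and
 * the start of the metadata is registered as well (see comment below).
 */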
static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals to first data page address + size of data pages +
		 * metadata offset at the first metadata page
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we register
		 * also the gaps between the end of the data and the start of
		 * the metadata (the sig MR will verify that the HW will access
		 * to right addresses). This mapping is safe because we use
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
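
/*
 * KLM fallback for PI mapping: used when the SG lists cannot be expressed
 * as page-aligned MTT entries. The resulting region is zero-based, with the
 * metadata placed immediately after the data.
 */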
static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}
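
/*
 * Top-level PI mapping: try the cheapest representation first (plain PA),
 * then the MTT pi_mr, and only fall back to KLMs if neither covers all
 * data and metadata SG entries.
 */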
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fallback to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
	 * descriptors and fallback to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially in high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is zero-based memory region */
	if (pi_mr) {
		ibmr->iova = 0;
		mr->pi_mr = pi_mr;
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	} else {
		mr->pi_mr = NULL;
		ibmr->sig_attrs->meta_length = mr->meta_length;
	}

	return 0;
}
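
/* Standard (non-PI) fast-registration mapping of an SG list onto the MR. */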
int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,