/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"
#define MAX_PREFETCH_LEN (4*1024*1024U)
/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000
#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
static u64 mlx5_imr_ksm_entries;
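/*
 * Returns true if the leaf umem still belongs to the given implicit parent
 * MR and has not been marked for destruction.
 */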
static int check_parent(struct ib_umem_odp *odp,
                        struct mlx5_ib_mr *parent)
{
        struct mlx5_ib_mr *mr = odp->private;

        return mr && mr->parent == parent && !odp->dying;
}
struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
{
        if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp))
                return NULL;

        return to_ib_umem_odp(mr->umem)->per_mm;
}
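/*
 * Advance to the next interval-tree node whose umem is a live leaf of the
 * same implicit parent MR; returns NULL when none follows.
 */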
static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
{
        struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
        struct ib_ucontext_per_mm *per_mm = odp->per_mm;
        struct rb_node *rb;

        down_read(&per_mm->umem_rwsem);
        while (1) {
                rb = rb_next(&odp->interval_tree.rb);
                if (!rb)
                        goto not_found;
                odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
                if (check_parent(odp, parent))
                        goto end;
        }
not_found:
        odp = NULL;
end:
        up_read(&per_mm->umem_rwsem);
        return odp;
}
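/*
 * Find the first live leaf of @parent overlapping [start, start + length)
 * in the per-mm interval tree; returns NULL if the range hits no leaf.
 */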
static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
                                      struct mlx5_ib_mr *parent)
{
        struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent);
        struct ib_umem_odp *odp;
        struct rb_node *rb;

        down_read(&per_mm->umem_rwsem);
        odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length);
        if (!odp)
                goto end;

        while (1) {
                if (check_parent(odp, parent))
                        goto end;
                rb = rb_next(&odp->interval_tree.rb);
                if (!rb)
                        goto not_found;
                odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
                if (ib_umem_start(&odp->umem) > start + length)
                        goto not_found;
        }
not_found:
        odp = NULL;
end:
        up_read(&per_mm->umem_rwsem);
        return odp;
}
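/*
 * Populate @nentries KLM descriptors of an implicit MR's KSM table starting
 * at index @offset. Existing leaves point at their child mkey; holes (and
 * everything under MLX5_IB_UPD_XLT_ZAP) point at the device's null mkey, so
 * a device access there raises a fresh page fault instead of using stale
 * translations.
 */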
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
                           size_t nentries, struct mlx5_ib_mr *mr, int flags)
{
        struct ib_pd *pd = mr->ibmr.pd;
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct ib_umem_odp *odp;
        unsigned long va;
        int i;

        if (flags & MLX5_IB_UPD_XLT_ZAP) {
                for (i = 0; i < nentries; i++, pklm++) {
                        pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
                        pklm->key = cpu_to_be32(dev->null_mkey);
                        pklm->va = 0;
                }
                return;
        }

        odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
                         nentries * MLX5_IMR_MTT_SIZE, mr);

        for (i = 0; i < nentries; i++, pklm++) {
                pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
                va = (offset + i) * MLX5_IMR_MTT_SIZE;
                if (odp && odp->umem.address == va) {
                        struct mlx5_ib_mr *mtt = odp->private;

                        pklm->key = cpu_to_be32(mtt->ibmr.lkey);
                        odp = odp_next(odp);
                } else {
                        pklm->key = cpu_to_be32(dev->null_mkey);
                }
                mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
                            i, va, be32_to_cpu(pklm->key));
        }
}
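/*
 * Deferred destruction of one implicit-MR leaf: detach it from the parent,
 * wait out in-flight page faults under SRCU, release the umem, repoint the
 * parent's KSM entry at the null mkey and recycle the child MR.
 */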
static void mr_leaf_free_action(struct work_struct *work)
{
        struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
        int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT;
        struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;

        mr->parent = NULL;
        synchronize_srcu(&mr->dev->mr_srcu);

        ib_umem_release(&odp->umem);
        if (imr->live)
                mlx5_ib_update_xlt(imr, idx, 1, 0,
                                   MLX5_IB_UPD_XLT_INDIRECT |
                                   MLX5_IB_UPD_XLT_ATOMIC);
        mlx5_mr_cache_free(mr->dev, mr);

        if (atomic_dec_and_test(&imr->num_leaf_free))
                wake_up(&imr->q_leaf_free);
}
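/*
 * MMU-notifier invalidation callback: zap the HW translations covering
 * [start, end) and DMA-unmap the pages so the core MM can reclaim them.
 */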
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
                              unsigned long end)
{
        struct mlx5_ib_mr *mr;
        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
                                    sizeof(struct mlx5_mtt)) - 1;
        u64 idx = 0, blk_start_idx = 0;
        struct ib_umem *umem;
        int in_block = 0;
        u64 addr;

        if (!umem_odp) {
                pr_err("invalidation called on NULL umem or non-ODP umem\n");
                return;
        }
        umem = &umem_odp->umem;

        mr = umem_odp->private;

        if (!mr || !mr->ibmr.pd)
                return;

        start = max_t(u64, ib_umem_start(umem), start);
        end = min_t(u64, ib_umem_end(umem), end);

        /*
         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
         * while we are doing the invalidation, no page fault will attempt to
         * overwrite the same MTTs. Concurrent invalidations might race us,
         * but they will write 0s as well, so no difference in the end result.
         */

        for (addr = start; addr < end; addr += BIT(umem->page_shift)) {
                idx = (addr - ib_umem_start(umem)) >> umem->page_shift;
                /*
                 * Strive to write the MTTs in chunks, but avoid overwriting
                 * non-existing MTTs. The heuristic here can be improved to
                 * estimate the cost of another UMR vs. the cost of bigger
                 * UMR.
                 */
                if (umem_odp->dma_list[idx] &
                    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
                        if (!in_block) {
                                blk_start_idx = idx;
                                in_block = 1;
                        }
                } else {
                        u64 umr_offset = idx & umr_block_mask;

                        if (in_block && umr_offset == 0) {
                                mlx5_ib_update_xlt(mr, blk_start_idx,
                                                   idx - blk_start_idx, 0,
                                                   MLX5_IB_UPD_XLT_ZAP |
                                                   MLX5_IB_UPD_XLT_ATOMIC);
                                in_block = 0;
                        }
                }
        }
        if (in_block)
                mlx5_ib_update_xlt(mr, blk_start_idx,
                                   idx - blk_start_idx + 1, 0,
                                   MLX5_IB_UPD_XLT_ZAP |
                                   MLX5_IB_UPD_XLT_ATOMIC);
        /*
         * We are now sure that the device will not access the
         * memory. We can safely unmap it, and mark it as dirty if
         * needed.
         */

        ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

        if (unlikely(!umem->npages && mr->parent &&
                     !umem_odp->dying)) {
                WRITE_ONCE(umem_odp->dying, 1);
                atomic_inc(&mr->parent->num_leaf_free);
                schedule_work(&umem_odp->work);
        }
}
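/* Translate FW capability bits into the ib_odp_caps the device reports. */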
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
        struct ib_odp_caps *caps = &dev->odp_caps;

        memset(caps, 0, sizeof(*caps));

        if (!MLX5_CAP_GEN(dev->mdev, pg))
                return;

        caps->general_caps = IB_ODP_SUPPORT;

        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
                dev->odp_max_size = U64_MAX;
        else
                dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

        if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
                caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

        if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
            MLX5_CAP_GEN(dev->mdev, null_mkey) &&
            MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
                caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}
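/*
 * Tell HW that the page fault was handled (or, with @error set, that it
 * could not be) so the stalled WQ or RDMA operation can be resumed.
 */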
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
                                      struct mlx5_pagefault *pfault,
                                      int error)
{
        int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
                     pfault->wqe.wq_num : pfault->token;
        int ret = mlx5_core_page_fault_resume(dev->mdev,
                                              pfault->token,
                                              wq_num,
                                              pfault->type,
                                              error);
        if (ret)
                mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n",
                            wq_num);
}
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
                                            struct ib_umem *umem,
                                            bool ksm, int access_flags)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr;
        int err;

        mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
                                            MLX5_IMR_MTT_CACHE_ENTRY);
        if (IS_ERR(mr))
                return mr;

        mr->ibmr.pd = pd;

        mr->dev = dev;
        mr->access_flags = access_flags;
        mr->mmkey.iova = 0;
        mr->umem = umem;

        if (ksm) {
                err = mlx5_ib_update_xlt(mr, 0,
                                         mlx5_imr_ksm_entries,
                                         MLX5_KSM_PAGE_SHIFT,
                                         MLX5_IB_UPD_XLT_INDIRECT |
                                         MLX5_IB_UPD_XLT_ZAP |
                                         MLX5_IB_UPD_XLT_ENABLE);
        } else {
                err = mlx5_ib_update_xlt(mr, 0,
                                         MLX5_IMR_MTT_ENTRIES,
                                         PAGE_SHIFT,
                                         MLX5_IB_UPD_XLT_ZAP |
                                         MLX5_IB_UPD_XLT_ENABLE |
                                         MLX5_IB_UPD_XLT_ATOMIC);
        }

        if (err)
                goto fail;

        mr->ibmr.lkey = mr->mmkey.key;
        mr->ibmr.rkey = mr->mmkey.key;

        mr->live = 1;

        mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
                    mr->mmkey.key, dev->mdev, mr);

        return mr;

fail:
        mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
        mlx5_mr_cache_free(dev, mr);

        return ERR_PTR(err);
}
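/*
 * Make sure every MLX5_IMR_MTT_SIZE slice of [io_virt, io_virt + bcnt) has
 * a leaf umem/MR, allocating missing ones on the fly and batching the
 * parent KSM update for any newly created entries; returns the first leaf
 * covering the range. Takes the parent's umem_mutex.
 */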
static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
                                                u64 io_virt, size_t bcnt)
{
        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
        struct ib_umem_odp *odp, *result = NULL;
        struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
        u64 addr = io_virt & MLX5_IMR_MTT_MASK;
        int nentries = 0, start_idx = 0, ret;
        struct mlx5_ib_mr *mtt;

        mutex_lock(&odp_mr->umem_mutex);
        odp = odp_lookup(addr, 1, mr);

        mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
                    io_virt, bcnt, addr, odp);

next_mr:
        if (likely(odp)) {
                if (nentries)
                        nentries++;
        } else {
                odp = ib_alloc_odp_umem(odp_mr->per_mm, addr,
                                        MLX5_IMR_MTT_SIZE);
                if (IS_ERR(odp)) {
                        mutex_unlock(&odp_mr->umem_mutex);
                        return ERR_CAST(odp);
                }

                mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
                                        mr->access_flags);
                if (IS_ERR(mtt)) {
                        mutex_unlock(&odp_mr->umem_mutex);
                        ib_umem_release(&odp->umem);
                        return ERR_CAST(mtt);
                }

                odp->private = mtt;
                mtt->umem = &odp->umem;
                mtt->mmkey.iova = addr;
                mtt->parent = mr;
                INIT_WORK(&odp->work, mr_leaf_free_action);

                if (!nentries)
                        start_idx = addr >> MLX5_IMR_MTT_SHIFT;
                nentries++;
        }

        /* Return first odp if region not covered by single one */
        if (likely(!result))
                result = odp;

        addr += MLX5_IMR_MTT_SIZE;
        if (unlikely(addr < io_virt + bcnt)) {
                odp = odp_next(odp);
                if (odp && odp->umem.address != addr)
                        odp = NULL;
                goto next_mr;
        }

        if (unlikely(nentries)) {
                ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
                                         MLX5_IB_UPD_XLT_INDIRECT |
                                         MLX5_IB_UPD_XLT_ATOMIC);
                if (ret) {
                        mlx5_ib_err(dev, "Failed to update PAS\n");
                        result = ERR_PTR(ret);
                }
        }

        mutex_unlock(&odp_mr->umem_mutex);
        return result;
}
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
                                             int access_flags)
{
        struct ib_ucontext *ctx = pd->ibpd.uobject->context;
        struct mlx5_ib_mr *imr;
        struct ib_umem *umem;

        umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
        if (IS_ERR(umem))
                return ERR_CAST(umem);

        imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
        if (IS_ERR(imr)) {
                ib_umem_release(umem);
                return ERR_CAST(imr);
        }

        imr->umem = umem;
        init_waitqueue_head(&imr->q_leaf_free);
        atomic_set(&imr->num_leaf_free, 0);

        return imr;
}
static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
                        void *cookie)
{
        struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;
        struct ib_umem *umem = &umem_odp->umem;

        if (mr->parent != imr)
                return 0;

        ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem),
                                    ib_umem_end(umem));

        if (umem_odp->dying)
                return 0;

        WRITE_ONCE(umem_odp->dying, 1);
        atomic_inc(&imr->num_leaf_free);
        schedule_work(&umem_odp->work);

        return 0;
}
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
        struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);

        down_read(&per_mm->umem_rwsem);
        rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
                                      mr_leaf_free, true, imr);
        up_read(&per_mm->umem_rwsem);

        wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}
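/*
 * Resolve a fault on a single (possibly implicit) MR: pin the user pages
 * and write the mkey's translation entries, using the mmu-notifier
 * sequence number to detect a racing invalidation, in which case -EAGAIN
 * is returned.
 */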
static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
                        u64 io_virt, size_t bcnt, u32 *bytes_mapped)
{
        struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
        u64 access_mask = ODP_READ_ALLOWED_BIT;
        int npages = 0, page_shift, np;
        u64 start_idx, page_mask;
        struct ib_umem_odp *odp;
        int current_seq;
        size_t size;
        int ret;

        if (!odp_mr->page_list) {
                odp = implicit_mr_get_data(mr, io_virt, bcnt);

                if (IS_ERR(odp))
                        return PTR_ERR(odp);
                mr = odp->private;
        } else {
                odp = odp_mr;
        }

next_mr:
        size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt);

        page_shift = mr->umem->page_shift;
        page_mask = ~(BIT(page_shift) - 1);
        start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;

        if (mr->umem->writable)
                access_mask |= ODP_WRITE_ALLOWED_BIT;

        current_seq = READ_ONCE(odp->notifiers_seq);
        /*
         * Ensure the sequence number is valid for some time before we call
         * gup.
         */
        smp_rmb();

        ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
                                        access_mask, current_seq);

        if (ret < 0)
                goto out;

        np = ret;

        mutex_lock(&odp->umem_mutex);
        if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
                                        current_seq)) {
                /*
                 * No need to check whether the MTTs really belong to
                 * this MR, since ib_umem_odp_map_dma_pages already
                 * checks this.
                 */
                ret = mlx5_ib_update_xlt(mr, start_idx, np,
                                         page_shift, MLX5_IB_UPD_XLT_ATOMIC);
        } else {
                ret = -EAGAIN;
        }
        mutex_unlock(&odp->umem_mutex);

        if (ret < 0) {
                if (ret != -EAGAIN)
                        mlx5_ib_err(dev, "Failed to update mkey page tables\n");
                goto out;
        }

        if (bytes_mapped) {
                u32 new_mappings = (np << page_shift) -
                        (io_virt - round_down(io_virt, 1 << page_shift));
                *bytes_mapped += min_t(u32, new_mappings, size);
        }

        npages += np << (page_shift - PAGE_SHIFT);
        bcnt -= size;

        if (unlikely(bcnt)) {
                struct ib_umem_odp *next;

                io_virt += size;
                next = odp_next(odp);
                if (unlikely(!next || next->umem.address != io_virt)) {
                        mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
                                    io_virt, next);
                        return -EAGAIN;
                }
                odp = next;
                mr = odp->private;
                goto next_mr;
        }

        return npages;

out:
        if (ret == -EAGAIN) {
                if (mr->parent || !odp->dying) {
                        unsigned long timeout =
                                msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

                        if (!wait_for_completion_timeout(
                                        &odp->notifier_completion,
                                        timeout)) {
                                mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
                                             current_seq, odp->notifiers_seq);
                        }
                } else {
                        /* The MR is being killed, kill the QP as well. */
                        ret = -EFAULT;
                }
        }

        return ret;
}
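/*
 * Explicit stack frame used to walk indirect memory windows iteratively
 * instead of recursing when a fault resolves through nested mkeys.
 */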
struct pf_frame {
        struct pf_frame *next;
        u32 key;
        u64 io_virt;
        size_t bcnt;
        int depth;
};
/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
                                         u32 key, u64 io_virt, size_t bcnt,
                                         u32 *bytes_committed,
                                         u32 *bytes_mapped)
{
        int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
        struct pf_frame *head = NULL, *frame;
        struct mlx5_core_mkey *mmkey;
        struct mlx5_ib_mw *mw;
        struct mlx5_ib_mr *mr;
        struct mlx5_klm *pklm;
        u32 *out = NULL;
        size_t offset;

        srcu_key = srcu_read_lock(&dev->mr_srcu);

        io_virt += *bytes_committed;
        bcnt -= *bytes_committed;

next_mr:
        mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
        if (!mmkey || mmkey->key != key) {
                mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
                ret = -EFAULT;
                goto srcu_unlock;
        }

        switch (mmkey->type) {
        case MLX5_MKEY_MR:
                mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
                if (!mr->live || !mr->ibmr.pd) {
                        mlx5_ib_dbg(dev, "got dead MR\n");
                        ret = -EFAULT;
                        goto srcu_unlock;
                }

                if (!mr->umem->is_odp) {
                        mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
                                    key);
                        if (bytes_mapped)
                                *bytes_mapped += bcnt;
                        ret = 0;
                        goto srcu_unlock;
                }

                ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped);
                if (ret < 0)
                        goto srcu_unlock;

                npages += ret;
                ret = 0;
                break;

        case MLX5_MKEY_MW:
                mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);

                if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
                        mlx5_ib_dbg(dev, "indirection level exceeded\n");
                        ret = -EFAULT;
                        goto srcu_unlock;
                }

                outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
                        sizeof(*pklm) * (mw->ndescs - 2);

                if (outlen > cur_outlen) {
                        kfree(out);
                        out = kzalloc(outlen, GFP_KERNEL);
                        if (!out) {
                                ret = -ENOMEM;
                                goto srcu_unlock;
                        }
                        cur_outlen = outlen;
                }

                pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
                                                       bsf0_klm0_pas_mtt0_1);

                ret = mlx5_core_query_mkey(dev->mdev, &mw->mmkey, out, outlen);
                if (ret)
                        goto srcu_unlock;

                offset = io_virt - MLX5_GET64(query_mkey_out, out,
                                              memory_key_mkey_entry.start_addr);

                for (i = 0; bcnt && i < mw->ndescs; i++, pklm++) {
                        if (offset >= be32_to_cpu(pklm->bcount)) {
                                offset -= be32_to_cpu(pklm->bcount);
                                continue;
                        }

                        frame = kzalloc(sizeof(*frame), GFP_KERNEL);
                        if (!frame) {
                                ret = -ENOMEM;
                                goto srcu_unlock;
                        }

                        frame->key = be32_to_cpu(pklm->key);
                        frame->io_virt = be64_to_cpu(pklm->va) + offset;
                        frame->bcnt = min_t(size_t, bcnt,
                                            be32_to_cpu(pklm->bcount) - offset);
                        frame->depth = depth + 1;
                        frame->next = head;
                        head = frame;

                        bcnt -= frame->bcnt;
                        offset = 0;
                }
                break;

        default:
                mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
                ret = -EFAULT;
                goto srcu_unlock;
        }

        if (head) {
                frame = head;
                head = frame->next;

                key = frame->key;
                io_virt = frame->io_virt;
                bcnt = frame->bcnt;
                depth = frame->depth;
                kfree(frame);

                goto next_mr;
        }

srcu_unlock:
        while (head) {
                frame = head;
                head = frame->next;
                kfree(frame);
        }
        kfree(out);

        srcu_read_unlock(&dev->mr_srcu, srcu_key);
        *bytes_committed = 0;
        return ret ? ret : npages;
}
/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp the QP on which the fault occurred.
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
                                   struct mlx5_pagefault *pfault,
                                   struct mlx5_ib_qp *qp, void *wqe,
                                   void *wqe_end, u32 *bytes_mapped,
                                   u32 *total_wqe_bytes, int receive_queue)
{
        int ret = 0, npages = 0;
        u64 io_virt;
        u32 key;
        u32 byte_count;
        size_t bcnt;
        int inline_segment;

        /* Skip SRQ next-WQE segment. */
        if (receive_queue && qp->ibqp.srq)
                wqe += sizeof(struct mlx5_wqe_srq_next_seg);

        if (bytes_mapped)
                *bytes_mapped = 0;
        if (total_wqe_bytes)
                *total_wqe_bytes = 0;

        while (wqe < wqe_end) {
                struct mlx5_wqe_data_seg *dseg = wqe;

                io_virt = be64_to_cpu(dseg->addr);
                key = be32_to_cpu(dseg->lkey);
                byte_count = be32_to_cpu(dseg->byte_count);
                inline_segment = !!(byte_count & MLX5_INLINE_SEG);
                bcnt = byte_count & ~MLX5_INLINE_SEG;

                if (inline_segment) {
                        bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
                        wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
                                     16);
                } else {
                        wqe += sizeof(*dseg);
                }

                /* receive WQE end of sg list. */
                if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
                    io_virt == 0)
                        break;

                if (!inline_segment && total_wqe_bytes) {
                        *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
                                        pfault->bytes_committed);
                }

                /* A zero length data segment designates a length of 2GB. */
                if (bcnt == 0)
                        bcnt = 1U << 31;

                if (inline_segment || bcnt <= pfault->bytes_committed) {
                        pfault->bytes_committed -=
                                min_t(size_t, bcnt,
                                      pfault->bytes_committed);
                        continue;
                }

                ret = pagefault_single_data_segment(dev, key, io_virt, bcnt,
                                                    &pfault->bytes_committed,
                                                    bytes_mapped);
                if (ret < 0)
                        break;
                npages += ret;
        }

        return ret < 0 ? ret : npages;
}
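/*
 * ODP capability bit that a transport must advertise for each WQE opcode
 * that is allowed to page-fault, indexed by MLX5 opcode.
 */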
static const u32 mlx5_ib_odp_opcode_cap[] = {
        [MLX5_OPCODE_SEND]             = IB_ODP_SUPPORT_SEND,
        [MLX5_OPCODE_SEND_IMM]         = IB_ODP_SUPPORT_SEND,
        [MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
        [MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
        [MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
        [MLX5_OPCODE_RDMA_READ]        = IB_ODP_SUPPORT_READ,
        [MLX5_OPCODE_ATOMIC_CS]        = IB_ODP_SUPPORT_ATOMIC,
        [MLX5_OPCODE_ATOMIC_FA]        = IB_ODP_SUPPORT_ATOMIC,
};
/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
        struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
        struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
        struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
        u16 wqe_index = pfault->wqe.wqe_index;
        u32 transport_caps;
        struct mlx5_base_av *av;
        unsigned ds, opcode;
#if defined(DEBUG)
        u32 ctrl_wqe_index, ctrl_qpn;
#endif
        u32 qpn = qp->trans_qp.base.mqp.qpn;

        ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
        if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
                mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
                            ds, wqe_length);
                return -EFAULT;
        }

        if (ds == 0) {
                mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
                            wqe_index, qpn);
                return -EFAULT;
        }

#if defined(DEBUG)
        ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
                          MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
                         MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
        if (wqe_index != ctrl_wqe_index) {
                mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
                            wqe_index, qpn, ctrl_wqe_index);
                return -EFAULT;
        }

        ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
                   MLX5_WQE_CTRL_QPN_SHIFT;
        if (qpn != ctrl_qpn) {
                mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
                            wqe_index, qpn, ctrl_qpn);
                return -EFAULT;
        }
#endif /* DEBUG */

        *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
        *wqe += sizeof(*ctrl);

        opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
                 MLX5_WQE_CTRL_OPCODE_MASK;

        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
                transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
                break;
        case IB_QPT_UD:
                transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
                break;
        default:
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
                            qp->ibqp.qp_type);
                return -EFAULT;
        }

        if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
                     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
                            opcode);
                return -EFAULT;
        }

        if (qp->ibqp.qp_type != IB_QPT_RC) {
                av = *wqe;
                if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
                        *wqe += sizeof(struct mlx5_av);
                else
                        *wqe += sizeof(struct mlx5_base_av);
        }

        switch (opcode) {
        case MLX5_OPCODE_RDMA_WRITE:
        case MLX5_OPCODE_RDMA_WRITE_IMM:
        case MLX5_OPCODE_RDMA_READ:
                *wqe += sizeof(struct mlx5_wqe_raddr_seg);
                break;
        case MLX5_OPCODE_ATOMIC_CS:
        case MLX5_OPCODE_ATOMIC_FA:
                *wqe += sizeof(struct mlx5_wqe_raddr_seg);
                *wqe += sizeof(struct mlx5_wqe_atomic_seg);
                break;
        }

        return 0;
}
/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
        struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
        struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
        struct mlx5_ib_wq *wq = &qp->rq;
        int wqe_size = 1 << wq->wqe_shift;

        if (qp->ibqp.srq) {
                mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
                return -EFAULT;
        }

        if (qp->wq_sig) {
                mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
                return -EFAULT;
        }

        if (wqe_size > wqe_length) {
                mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
                return -EFAULT;
        }

        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
                if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
                      IB_ODP_SUPPORT_RECV))
                        goto invalid_transport_or_opcode;
                break;
        default:
invalid_transport_or_opcode:
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
                            qp->ibqp.qp_type);
                return -EFAULT;
        }

        *wqe_end = *wqe + wqe_size;

        return 0;
}
static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev,
                                              u32 wq_num)
{
        struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num);

        if (!mqp) {
                mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num);
                return NULL;
        }

        return to_mibqp(mqp);
}
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
                                          struct mlx5_pagefault *pfault)
{
        int ret;
        void *wqe, *wqe_end;
        u32 bytes_mapped, total_wqe_bytes;
        char *buffer = NULL;
        int resume_with_error = 1;
        u16 wqe_index = pfault->wqe.wqe_index;
        int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
        struct mlx5_ib_qp *qp;

        buffer = (char *)__get_free_page(GFP_KERNEL);
        if (!buffer) {
                mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
                goto resolve_page_fault;
        }

        qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num);
        if (!qp)
                goto resolve_page_fault;

        ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
                                    PAGE_SIZE, &qp->trans_qp.base);
        if (ret < 0) {
                mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
                            ret, wqe_index, pfault->token);
                goto resolve_page_fault;
        }

        wqe = buffer;
        if (requestor)
                ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe,
                                                          &wqe_end, ret);
        else
                ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe,
                                                          &wqe_end, ret);
        if (ret < 0)
                goto resolve_page_fault;

        if (wqe >= wqe_end) {
                mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
                goto resolve_page_fault;
        }

        ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end,
                                      &bytes_mapped, &total_wqe_bytes,
                                      !requestor);
        if (ret == -EAGAIN) {
                resume_with_error = 0;
                goto resolve_page_fault;
        } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
                goto resolve_page_fault;
        }

        resume_with_error = 0;
resolve_page_fault:
        mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
                    pfault->wqe.wq_num, resume_with_error,
                    pfault->type);
        free_page((unsigned long)buffer);
}
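/* Number of OS pages spanned by [address, address + length). */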
static int pages_in_range(u64 address, u32 length)
{
        return (ALIGN(address + length, PAGE_SIZE) -
                (address & PAGE_MASK)) >> PAGE_SHIFT;
}
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
                                           struct mlx5_pagefault *pfault)
{
        u64 address;
        u32 length;
        u32 prefetch_len = pfault->bytes_committed;
        int prefetch_activated = 0;
        u32 rkey = pfault->rdma.r_key;
        int ret;

        /* The RDMA responder handler handles the page fault in two parts.
         * First it brings the necessary pages for the current packet
         * (and uses the pfault context), and then (after resuming the QP)
         * prefetches more pages. The second operation cannot use the pfault
         * context and therefore uses the dummy_pfault context allocated on
         * the stack */
        pfault->rdma.rdma_va += pfault->bytes_committed;
        pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
                                        pfault->rdma.rdma_op_len);
        pfault->bytes_committed = 0;

        address = pfault->rdma.rdma_va;
        length = pfault->rdma.rdma_op_len;

        /* For some operations, the hardware cannot tell the exact message
         * length, and in those cases it reports zero. Use prefetch
         * logic. */
        if (length == 0) {
                prefetch_activated = 1;
                length = pfault->rdma.packet_size;
                prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
        }

        ret = pagefault_single_data_segment(dev, rkey, address, length,
                                            &pfault->bytes_committed, NULL);
        if (ret == -EAGAIN) {
                /* We're racing with an invalidation, don't prefetch */
                prefetch_activated = 0;
        } else if (ret < 0 || pages_in_range(address, length) > ret) {
                mlx5_ib_page_fault_resume(dev, pfault, 1);
                if (ret != -ENOENT)
                        mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
                                    ret, pfault->token, pfault->type);
                return;
        }

        mlx5_ib_page_fault_resume(dev, pfault, 0);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
                    pfault->token, pfault->type,
                    prefetch_activated);

        /* At this point, there might be a new pagefault already arriving in
         * the eq, switch to the dummy pagefault for the rest of the
         * processing. We're still OK with the objects being alive as the
         * work-queue is being fenced. */

        if (prefetch_activated) {
                u32 bytes_committed = 0;

                ret = pagefault_single_data_segment(dev, rkey, address,
                                                    prefetch_len,
                                                    &bytes_committed, NULL);
                if (ret < 0 && ret != -EAGAIN) {
                        mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
                                    ret, pfault->token, address, prefetch_len);
                }
        }
}
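/*
 * Entry point from the mlx5 core EQ handler: dispatch a page-fault event to
 * the WQE or RDMA handler; unknown subtypes are resumed with an error.
 */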
void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
                    struct mlx5_pagefault *pfault)
{
        struct mlx5_ib_dev *dev = context;
        u8 event_subtype = pfault->event_subtype;

        switch (event_subtype) {
        case MLX5_PFAULT_SUBTYPE_WQE:
                mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
                break;
        case MLX5_PFAULT_SUBTYPE_RDMA:
                mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
                break;
        default:
                mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
                            event_subtype);
                mlx5_ib_page_fault_resume(dev, pfault, 1);
        }
}
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
        if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
                return;

        switch (ent->order - 2) {
        case MLX5_IMR_MTT_CACHE_ENTRY:
                ent->page = PAGE_SHIFT;
                ent->xlt = MLX5_IMR_MTT_ENTRIES *
                           sizeof(struct mlx5_mtt) /
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
                ent->limit = 0;
                break;

        case MLX5_IMR_KSM_CACHE_ENTRY:
                ent->page = MLX5_KSM_PAGE_SHIFT;
                ent->xlt = mlx5_imr_ksm_entries *
                           sizeof(struct mlx5_klm) /
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
                ent->limit = 0;
                break;
        }
}
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
        int ret = 0;

        if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
                ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
                if (ret) {
                        mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
                        return ret;
                }
        }

        return 0;
}
int mlx5_ib_odp_init(void)
{
        mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
                                       MLX5_IMR_MTT_BITS);

        return 0;
}