/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"

#include <linux/mlx5/eq.h>
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u32			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
	};

	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};
#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;
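
/*
 * Sizing sketch (assuming a 4 KiB PAGE_SIZE, i.e. PAGE_SHIFT == 12):
 * MLX5_IMR_MTT_BITS is 18, so each implicit-MR leaf covers
 * MLX5_IMR_MTT_SIZE = 1 GiB of virtual address space with
 * MLX5_IMR_MTT_ENTRIES = 256K page-sized MTT entries, and the top-level
 * KSM needs one entry per GiB of TASK_SIZE (mlx5_imr_ksm_entries,
 * computed in mlx5_ib_odp_init()).
 */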
static int check_parent(struct ib_umem_odp *odp,
			struct mlx5_ib_mr *parent)
{
	struct mlx5_ib_mr *mr = odp->private;

	return mr && mr->parent == parent && !odp->dying;
}
static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
{
	if (WARN_ON(!mr || !is_odp_mr(mr)))
		return NULL;

	return to_ib_umem_odp(mr->umem)->per_mm;
}
static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
{
	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
	struct ib_ucontext_per_mm *per_mm = odp->per_mm;
	struct rb_node *rb;

	down_read(&per_mm->umem_rwsem);
	while (1) {
		rb = rb_next(&odp->interval_tree.rb);
		if (!rb)
			goto not_found;
		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
		if (check_parent(odp, parent))
			goto end;
	}
not_found:
	odp = NULL;
end:
	up_read(&per_mm->umem_rwsem);
	return odp;
}
static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
				      struct mlx5_ib_mr *parent)
{
	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent);
	struct ib_umem_odp *odp;
	struct rb_node *rb;

	down_read(&per_mm->umem_rwsem);
	odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length);
	if (!odp)
		goto end;

	while (1) {
		if (check_parent(odp, parent))
			goto end;
		rb = rb_next(&odp->interval_tree.rb);
		if (!rb)
			goto not_found;
		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
		if (ib_umem_start(odp) > start + length)
			goto not_found;
	}
not_found:
	odp = NULL;
end:
	up_read(&per_mm->umem_rwsem);
	return odp;
}
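
/*
 * Fill a run of KSM entries of an implicit MR's top-level indirect key.
 * Entries that have a child MTT MR point at that child's lkey; entries
 * without a child (or a ZAP request) point at the device's null_mkey so
 * that an access faults instead of hitting a stale translation.
 */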
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
{
	struct ib_pd *pd = mr->ibmr.pd;
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	unsigned long va;
	int i;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (i = 0; i < nentries; i++, pklm++) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = cpu_to_be32(dev->null_mkey);
			pklm->va = 0;
		}
		return;
	}

	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
			 nentries * MLX5_IMR_MTT_SIZE, mr);

	for (i = 0; i < nentries; i++, pklm++) {
		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		va = (offset + i) * MLX5_IMR_MTT_SIZE;
		if (odp && odp->umem.address == va) {
			struct mlx5_ib_mr *mtt = odp->private;

			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			odp = odp_next(odp);
		} else {
			pklm->key = cpu_to_be32(dev->null_mkey);
		}
		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
			    i, va, be32_to_cpu(pklm->key));
	}
}
static void mr_leaf_free_action(struct work_struct *work)
{
	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;

	mr->parent = NULL;
	synchronize_srcu(&mr->dev->mr_srcu);

	ib_umem_release(&odp->umem);
	if (imr->live)
		mlx5_ib_update_xlt(imr, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mlx5_mr_cache_free(mr->dev, mr);

	if (atomic_dec_and_test(&imr->num_leaf_free))
		wake_up(&imr->q_leaf_free);
}
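
/*
 * MMU-notifier invalidation callback for an ODP umem: zap the affected
 * MTTs in chunks so the HCA can no longer access the range, then unmap
 * the DMA pages. An implicit-MR leaf that becomes empty is scheduled for
 * freeing via mr_leaf_free_action().
 */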
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
			      unsigned long end)
{
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	int in_block = 0;
	u64 addr;

	if (!umem_odp) {
		pr_err("invalidation called on NULL umem or non-ODP umem\n");
		return;
	}

	mr = umem_odp->private;

	if (!mr || !mr->ibmr.pd)
		return;

	start = max_t(u64, ib_umem_start(umem_odp), start);
	end = min_t(u64, ib_umem_end(umem_odp), end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	mutex_lock(&umem_odp->umem_mutex);
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&umem_odp->umem_mutex);
	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem_odp->npages && mr->parent &&
		     !umem_odp->dying)) {
		WRITE_ONCE(umem_odp->dying, 1);
		atomic_inc(&mr->parent->num_leaf_free);
		schedule_work(&umem_odp->work);
	}
}
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
	    !mlx5_ib_can_use_umr(dev, true))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;

	return;
}
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = { };
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
	MLX5_SET(page_fault_resume_in, in, error, !!error);

	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}
static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
					    struct ib_umem *umem,
					    bool ksm, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	int err;

	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
					    MLX5_IMR_MTT_CACHE_ENTRY);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;

	mr->dev = dev;
	mr->access_flags = access_flags;
	mr->mmkey.iova = 0;
	mr->umem = umem;

	if (ksm) {
		err = mlx5_ib_update_xlt(mr, 0,
					 mlx5_imr_ksm_entries,
					 MLX5_KSM_PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE);
	} else {
		err = mlx5_ib_update_xlt(mr, 0,
					 MLX5_IMR_MTT_ENTRIES,
					 PAGE_SHIFT,
					 MLX5_IB_UPD_XLT_ZAP |
					 MLX5_IB_UPD_XLT_ENABLE |
					 MLX5_IB_UPD_XLT_ATOMIC);
	}

	if (err)
		goto fail;

	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	mr->live = 1;

	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
		    mr->mmkey.key, dev->mdev, mr);

	return mr;

fail:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, mr);

	return ERR_PTR(err);
}
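
/*
 * Make sure every MLX5_IMR_MTT_SIZE-aligned chunk in [io_virt, io_virt + bcnt)
 * has a child MTT MR under the implicit parent, allocating missing leaves and
 * linking them into the parent's KSM. Returns the ODP umem of the first chunk;
 * the caller walks the remaining chunks with odp_next().
 */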
static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
						u64 io_virt, size_t bcnt)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
	struct ib_umem_odp *odp, *result = NULL;
	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
	int nentries = 0, start_idx = 0, ret;
	struct mlx5_ib_mr *mtt;

	mutex_lock(&odp_mr->umem_mutex);
	odp = odp_lookup(addr, 1, mr);

	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
		    io_virt, bcnt, addr, odp);

next_mr:
	if (likely(odp)) {
		if (nentries)
			nentries++;
	} else {
		odp = ib_alloc_odp_umem(odp_mr, addr,
					MLX5_IMR_MTT_SIZE);
		if (IS_ERR(odp)) {
			mutex_unlock(&odp_mr->umem_mutex);
			return ERR_CAST(odp);
		}

		mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0,
					mr->access_flags);
		if (IS_ERR(mtt)) {
			mutex_unlock(&odp_mr->umem_mutex);
			ib_umem_release(&odp->umem);
			return ERR_CAST(mtt);
		}

		odp->private = mtt;
		mtt->umem = &odp->umem;
		mtt->mmkey.iova = addr;
		mtt->parent = mr;
		INIT_WORK(&odp->work, mr_leaf_free_action);

		if (!nentries)
			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
		nentries++;
	}

	/* Return first odp if region not covered by single one */
	if (likely(!result))
		result = odp;

	addr += MLX5_IMR_MTT_SIZE;
	if (unlikely(addr < io_virt + bcnt)) {
		odp = odp_next(odp);
		if (odp && odp->umem.address != addr)
			odp = NULL;
		goto next_mr;
	}

	if (unlikely(nentries)) {
		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
					 MLX5_IB_UPD_XLT_INDIRECT |
					 MLX5_IB_UPD_XLT_ATOMIC);
		if (ret) {
			mlx5_ib_err(dev, "Failed to update PAS\n");
			result = ERR_PTR(ret);
		}
	}

	mutex_unlock(&odp_mr->umem_mutex);
	return result;
}
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     struct ib_udata *udata,
					     int access_flags)
{
	struct mlx5_ib_mr *imr;
	struct ib_umem *umem;

	umem = ib_umem_get(udata, 0, 0, access_flags, 0);
	if (IS_ERR(umem))
		return ERR_CAST(umem);

	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
	if (IS_ERR(imr)) {
		ib_umem_release(umem);
		return ERR_CAST(imr);
	}

	imr->umem = umem;
	init_waitqueue_head(&imr->q_leaf_free);
	atomic_set(&imr->num_leaf_free, 0);
	atomic_set(&imr->num_pending_prefetch, 0);

	return imr;
}
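
/*
 * Per-leaf callback used when tearing down an implicit MR: unmap the
 * leaf's DMA pages and, if it is not already dying, mark it dying and
 * schedule mr_leaf_free_action() to release it.
 */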
static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end,
			void *cookie)
{
	struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie;

	if (mr->parent != imr)
		return 0;

	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
				    ib_umem_end(umem_odp));

	if (umem_odp->dying)
		return 0;

	WRITE_ONCE(umem_odp->dying, 1);
	atomic_inc(&imr->num_leaf_free);
	schedule_work(&umem_odp->work);

	return 0;
}
void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);

	down_read(&per_mm->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX,
				      mr_leaf_free, true, imr);
	up_read(&per_mm->umem_rwsem);

	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
}
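
/*
 * Flags for pagefault_mr()/pagefault_single_data_segment():
 * PREFETCH marks a software-initiated prefetch rather than a HW fault,
 * DOWNGRADE requests read-only mappings even for a writable MR.
 */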
#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
			u32 flags)
{
	int npages = 0, current_seq, page_shift, ret, np;
	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
	u64 access_mask;
	u64 start_idx, page_mask;
	struct ib_umem_odp *odp;
	size_t size;

	if (!odp_mr->page_list) {
		odp = implicit_mr_get_data(mr, io_virt, bcnt);

		if (IS_ERR(odp))
			return PTR_ERR(odp);
		mr = odp->private;
	} else {
		odp = odp_mr;
	}

next_mr:
	size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt);

	page_shift = odp->page_shift;
	page_mask = ~(BIT(page_shift) - 1);
	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	if (prefetch && !downgrade && !mr->umem->writable) {
		/* prefetch with write-access must
		 * be supported by the MR
		 */
		ret = -EINVAL;
		goto out;
	}

	if (mr->umem->writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	current_seq = READ_ONCE(odp->notifiers_seq);
	/*
	 * Ensure the sequence number is valid for some time before we call
	 * gup.
	 */
	smp_rmb();

	ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size,
					access_mask, current_seq);
	if (ret < 0)
		goto out;

	np = ret;

	mutex_lock(&odp->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem),
					current_seq)) {
		/*
		 * No need to check whether the MTTs really belong to
		 * this MR, since ib_umem_odp_map_dma_pages already
		 * checks this.
		 */
		ret = mlx5_ib_update_xlt(mr, start_idx, np,
					 page_shift, MLX5_IB_UPD_XLT_ATOMIC);
	} else {
		ret = -EAGAIN;
	}
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(dev, "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(io_virt - round_down(io_virt, 1 << page_shift));
		*bytes_mapped += min_t(u32, new_mappings, size);
	}

	npages += np << (page_shift - PAGE_SHIFT);
	bcnt -= size;

	if (unlikely(bcnt)) {
		struct ib_umem_odp *next;

		io_virt += size;
		next = odp_next(odp);
		if (unlikely(!next || next->umem.address != io_virt)) {
			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
				    io_virt, next);
			return -EAGAIN;
		}
		odp = next;
		mr = odp->private;
		goto next_mr;
	}

	return npages;

out:
	if (ret == -EAGAIN) {
		unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

		if (!wait_for_completion_timeout(&odp->notifier_completion,
						 timeout)) {
			mlx5_ib_warn(dev,
				     "timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
				     current_seq, odp->notifiers_seq,
				     odp->notifiers_count);
		}
	}

	return ret;
}
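
/*
 * Stack frame for iteratively resolving faults through indirect mkeys
 * (memory windows / DEVX indirect keys): each frame records one klm
 * entry that still needs to be faulted in.
 */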
struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};
static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
{
	if (!mmkey)
		return false;
	if (mmkey->type == MLX5_MKEY_MW)
		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
	return mmkey->key == key;
}
static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
{
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_devx_mr *devx_mr;

	if (mmkey->type == MLX5_MKEY_MW) {
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
		return mw->ndescs;
	}

	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
			       mmkey);
	return devx_mr->ndescs;
}
/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 struct ib_pd *pd, u32 key,
					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped, u32 flags)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;
	int ndescs;

	srcu_key = srcu_read_lock(&dev->mr_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
	if (!mkey_is_eq(mmkey, key)) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
		ret = -EINVAL;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		if (!mr->live || !mr->ibmr.pd) {
			mlx5_ib_dbg(dev, "got dead MR\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		if (prefetch) {
			if (!is_odp_mr(mr) ||
			    mr->ibmr.pd != pd) {
				mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
					    is_odp_mr(mr) ? "MR is not ODP" :
					    "PD is not of the MR");
				ret = -EINVAL;
				goto srcu_unlock;
			}
		}

		if (!is_odp_mr(mr)) {
			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
				    key);
			if (bytes_mapped)
				*bytes_mapped += bcnt;
			ret = 0;
			goto srcu_unlock;
		}

		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
		if (ret < 0)
			goto srcu_unlock;

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
	case MLX5_MKEY_INDIRECT_DEVX:
		ndescs = get_indirect_num_descs(mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->mr_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}
/*
 * Parse a series of data segments for page fault handling.
 *
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, bool receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, NULL, key,
						    io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped, 0);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}
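
/*
 * Maps an initiator WQE opcode to the ODP transport capability bit that
 * must be set for a page fault on that opcode to be serviceable.
 */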
static const u32 mlx5_ib_odp_opcode_cap[] = {
	[MLX5_OPCODE_SEND]	       = IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_IMM]	       = IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_SEND_INVAL]       = IB_ODP_SUPPORT_SEND,
	[MLX5_OPCODE_RDMA_WRITE]       = IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_WRITE_IMM]   = IB_ODP_SUPPORT_WRITE,
	[MLX5_OPCODE_RDMA_READ]	       = IB_ODP_SUPPORT_READ,
	[MLX5_OPCODE_ATOMIC_CS]	       = IB_ODP_SUPPORT_ATOMIC,
	[MLX5_OPCODE_ATOMIC_FA]	       = IB_ODP_SUPPORT_ATOMIC,
};
/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	u32 transport_caps;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
#if defined(DEBUG)
	u32 ctrl_wqe_index, ctrl_qpn;
#endif
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

#if defined(DEBUG)
	ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
			  MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			 MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	if (wqe_index != ctrl_wqe_index) {
		mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
			    wqe_index, qpn, ctrl_wqe_index);
		return -EFAULT;
	}

	ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
		   MLX5_WQE_CTRL_QPN_SHIFT;
	if (qpn != ctrl_qpn) {
		mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
			    wqe_index, qpn, ctrl_qpn);
		return -EFAULT;
	}
#endif /* DEBUG */

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	switch (qp->ibqp.qp_type) {
	case IB_QPT_XRC_INI:
		*wqe += sizeof(struct mlx5_wqe_xrc_seg);
		transport_caps = dev->odp_caps.per_transport_caps.xrc_odp_caps;
		break;
	case IB_QPT_RC:
		transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps;
		break;
	case IB_QPT_UD:
		transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps;
		break;
	default:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	if (unlikely(opcode >= ARRAY_SIZE(mlx5_ib_odp_opcode_cap) ||
		     !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) {
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n",
			    opcode);
		return -EFAULT;
	}

	if (qp->ibqp.qp_type == IB_QPT_UD) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}
/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_srq *srq,
						   void **wqe, void **wqe_end,
						   int wqe_length)
{
	int wqe_size = 1 << srq->msrq.wqe_shift;

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;
	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	return 0;
}
static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
						  struct mlx5_ib_qp *qp,
						  void *wqe, void **wqe_end,
						  int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->wq_sig) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	switch (qp->ibqp.qp_type) {
	case IB_QPT_RC:
		if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
		      IB_ODP_SUPPORT_RECV))
			goto invalid_transport_or_opcode;
		break;
	default:
invalid_transport_or_opcode:
		mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
			    qp->ibqp.qp_type);
		return -EFAULT;
	}

	*wqe_end = wqe + wqe_size;

	return 0;
}
static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
							u32 wq_num, int pf_type)
{
	struct mlx5_core_rsc_common *common = NULL;
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		if (srq)
			common = &srq->common;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP);
		break;
	default:
		break;
	}

	return common;
}
static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_srq *msrq =
		container_of(res, struct mlx5_core_srq, common);

	return to_mibsrq(msrq);
}
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	wqe = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					       &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					       &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else {
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
						&bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out;

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)wqe);
}
static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length  = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL,
					    0);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL,
						    0);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}
static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}
static void mlx5_ib_eqe_pf_action(struct work_struct *work)
{
	struct mlx5_pagefault *pfault = container_of(work,
						     struct mlx5_pagefault,
						     work);
	struct mlx5_ib_pf_eq *eq = pfault->eq;

	mlx5_ib_pfault(eq->dev, pfault);
	mempool_free(pfault, eq->pool);
}
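
/*
 * Drain the page-fault EQ: for each EQE, copy the fault details into a
 * mempool-backed struct mlx5_pagefault and queue it on eq->wq, where
 * mlx5_ib_eqe_pf_action() resolves it in process context.
 */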
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0;

	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;
		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);

		mlx5_ib_dbg(eq->dev,
			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
			    eqe->sub_type, pfault->bytes_committed);

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
				    pfault->type, pfault->token,
				    pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				    pfault->type, pfault->token,
				    pfault->wqe.wq_num,
				    pfault->wqe.wqe_index);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}

	mlx5_eq_update_ci(eq->core, cc, 1);
}
static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
			     void *data)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}
/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}
static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);

	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}
enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};
static int
mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err;

	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool)
		return -ENOMEM;

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto err_mempool;
	}

	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
	param = (struct mlx5_eq_param) {
		.nent = MLX5_IB_NUM_PF_EQE,
	};
	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
	}
	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
	if (err) {
		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
		goto err_eq;
	}

	return 0;
err_eq:
	mlx5_eq_destroy_generic(dev->mdev, eq->core);
err_wq:
	destroy_workqueue(eq->wq);
err_mempool:
	mempool_destroy(eq->pool);
	return err;
}
static int
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	int err;

	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
	cancel_work_sync(&eq->work);
	destroy_workqueue(eq->wq);
	mempool_destroy(eq->pool);

	return err;
}
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return;

	switch (ent->order - 2) {
	case MLX5_IMR_MTT_CACHE_ENTRY:
		ent->page = PAGE_SHIFT;
		ent->xlt = MLX5_IMR_MTT_ENTRIES *
			   sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		ent->limit = 0;
		break;

	case MLX5_IMR_KSM_CACHE_ENTRY:
		ent->page = MLX5_KSM_PAGE_SHIFT;
		ent->xlt = mlx5_imr_ksm_entries *
			   sizeof(struct mlx5_klm) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
	.advise_mr = mlx5_ib_advise_mr,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret = 0;

	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return ret;

	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);

	return ret;
}
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return;

	mlx5_ib_destroy_pf_eq(dev, &dev->odp_pf_eq);
}
int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}
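
/*
 * Deferred prefetch request, queued by mlx5_ib_advise_mr_prefetch() when
 * the caller does not ask for a synchronous (FLUSH) prefetch.
 */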
struct prefetch_mr_work {
	struct work_struct work;
	struct ib_pd *pd;
	u32 pf_flags;
	u32 num_sge;
	struct ib_sge sg_list[0];
};
static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
				     struct ib_sge *sg_list, u32 num_sge,
				     u32 from)
{
	u32 i;
	int srcu_key;

	srcu_key = srcu_read_lock(&dev->mr_srcu);

	for (i = from; i < num_sge; ++i) {
		struct mlx5_core_mkey *mmkey;
		struct mlx5_ib_mr *mr;

		mmkey = xa_load(&dev->mdev->priv.mkey_table,
				mlx5_base_mkey(sg_list[i].lkey));
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		atomic_dec(&mr->num_pending_prefetch);
	}

	srcu_read_unlock(&dev->mr_srcu, srcu_key);
}
static bool num_pending_prefetch_inc(struct ib_pd *pd,
				     struct ib_sge *sg_list, u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	bool ret = true;
	u32 i;

	for (i = 0; i < num_sge; ++i) {
		struct mlx5_core_mkey *mmkey;
		struct mlx5_ib_mr *mr;

		mmkey = xa_load(&dev->mdev->priv.mkey_table,
				mlx5_base_mkey(sg_list[i].lkey));
		if (!mmkey || mmkey->key != sg_list[i].lkey) {
			ret = false;
			break;
		}

		if (mmkey->type != MLX5_MKEY_MR) {
			ret = false;
			break;
		}

		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		if (mr->ibmr.pd != pd) {
			ret = false;
			break;
		}

		if (!mr->live) {
			ret = false;
			break;
		}

		atomic_inc(&mr->num_pending_prefetch);
	}

	if (!ret)
		num_pending_prefetch_dec(dev, sg_list, i, 0);

	return ret;
}
static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags,
				    struct ib_sge *sg_list, u32 num_sge)
{
	u32 i;
	int ret = 0;
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	for (i = 0; i < num_sge; ++i) {
		struct ib_sge *sg = &sg_list[i];
		int bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr,
						    sg->length,
						    &bytes_committed, NULL,
						    pf_flags);
		if (ret < 0)
			break;
	}

	return ret < 0 ? ret : 0;
}
static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
{
	struct prefetch_mr_work *w =
		container_of(work, struct prefetch_mr_work, work);

	if (ib_device_try_get(w->pd->device)) {
		mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list,
					 w->num_sge);
		ib_device_put(w->pd->device);
	}

	num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list,
				 w->num_sge, 0);
	kvfree(w);
}
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
	struct prefetch_mr_work *work;
	bool valid_req;
	int srcu_key;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list,
						num_sge);

	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));

	/* It is guaranteed that the pd when work is executed is the pd when
	 * work was queued since pd can't be destroyed while it holds MRs and
	 * destroying a MR leads to flushing the workqueue
	 */
	work->pd = pd;
	work->pf_flags = pf_flags;
	work->num_sge = num_sge;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);

	srcu_key = srcu_read_lock(&dev->mr_srcu);

	valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
	if (valid_req)
		queue_work(system_unbound_wq, &work->work);
	else
		kvfree(work);

	srcu_read_unlock(&dev->mr_srcu, srcu_key);

	return valid_req ? 0 : -EINVAL;
}