]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* SPDX-License-Identifier: BSD-3-Clause |
2 | * Copyright 2017 6WIND S.A. | |
3 | * Copyright 2017 Mellanox Technologies, Ltd | |
4 | */ | |
5 | ||
6 | /** | |
7 | * @file | |
8 | * Memory management functions for mlx4 driver. | |
9 | */ | |
10 | ||
11 | #include <assert.h> | |
12 | #include <errno.h> | |
13 | #include <inttypes.h> | |
14 | #include <stddef.h> | |
15 | #include <stdint.h> | |
16 | #include <string.h> | |
17 | ||
18 | /* Verbs headers do not support -pedantic. */ | |
19 | #ifdef PEDANTIC | |
20 | #pragma GCC diagnostic ignored "-Wpedantic" | |
21 | #endif | |
22 | #include <infiniband/verbs.h> | |
23 | #ifdef PEDANTIC | |
24 | #pragma GCC diagnostic error "-Wpedantic" | |
25 | #endif | |
26 | ||
27 | #include <rte_branch_prediction.h> | |
28 | #include <rte_common.h> | |
29 | #include <rte_errno.h> | |
30 | #include <rte_malloc.h> | |
31 | #include <rte_memory.h> | |
32 | #include <rte_mempool.h> | |
33 | #include <rte_rwlock.h> | |
34 | ||
35 | #include "mlx4_glue.h" | |
36 | #include "mlx4_mr.h" | |
37 | #include "mlx4_rxtx.h" | |
38 | #include "mlx4_utils.h" | |
39 | ||
/*
 * Search context handed to mr_find_contig_memsegs_cb() through
 * rte_memseg_contig_walk().
 */
struct mr_find_contig_memsegs_data {
	/* Input: virtual address to locate. */
	uintptr_t addr;
	/* Output: first address of the contiguous chunk containing addr. */
	uintptr_t start;
	/* Output: end of that chunk (exclusive). */
	uintptr_t end;
	/* Output: memseg list the chunk was found in. */
	const struct rte_memseg_list *msl;
};
46 | ||
/*
 * Context bundle for a mempool update operation.
 * NOTE(review): the users of this structure are outside the visible part of
 * the file; the field descriptions below follow the declared types only.
 */
struct mr_update_mp_data {
	/* Ethernet device the operation applies to. */
	struct rte_eth_dev *dev;
	/* MR control structure being updated. */
	struct mlx4_mr_ctrl *mr_ctrl;
	/* Result code; presumably reported back by the callback - confirm. */
	int ret;
};
52 | ||
53 | /** | |
54 | * Expand B-tree table to a given size. Can't be called with holding | |
55 | * memory_hotplug_lock or priv->mr.rwlock due to rte_realloc(). | |
56 | * | |
57 | * @param bt | |
58 | * Pointer to B-tree structure. | |
59 | * @param n | |
60 | * Number of entries for expansion. | |
61 | * | |
62 | * @return | |
63 | * 0 on success, -1 on failure. | |
64 | */ | |
65 | static int | |
66 | mr_btree_expand(struct mlx4_mr_btree *bt, int n) | |
67 | { | |
68 | void *mem; | |
69 | int ret = 0; | |
70 | ||
71 | if (n <= bt->size) | |
72 | return ret; | |
73 | /* | |
74 | * Downside of directly using rte_realloc() is that SOCKET_ID_ANY is | |
75 | * used inside if there's no room to expand. Because this is a quite | |
76 | * rare case and a part of very slow path, it is very acceptable. | |
77 | * Initially cache_bh[] will be given practically enough space and once | |
78 | * it is expanded, expansion wouldn't be needed again ever. | |
79 | */ | |
80 | mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0); | |
81 | if (mem == NULL) { | |
82 | /* Not an error, B-tree search will be skipped. */ | |
83 | WARN("failed to expand MR B-tree (%p) table", (void *)bt); | |
84 | ret = -1; | |
85 | } else { | |
86 | DEBUG("expanded MR B-tree table (size=%u)", n); | |
87 | bt->table = mem; | |
88 | bt->size = n; | |
89 | } | |
90 | return ret; | |
91 | } | |
92 | ||
93 | /** | |
94 | * Look up LKey from given B-tree lookup table, store the last index and return | |
95 | * searched LKey. | |
96 | * | |
97 | * @param bt | |
98 | * Pointer to B-tree structure. | |
99 | * @param[out] idx | |
100 | * Pointer to index. Even on search failure, returns index where it stops | |
101 | * searching so that index can be used when inserting a new entry. | |
102 | * @param addr | |
103 | * Search key. | |
104 | * | |
105 | * @return | |
106 | * Searched LKey on success, UINT32_MAX on no match. | |
107 | */ | |
108 | static uint32_t | |
109 | mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr) | |
110 | { | |
111 | struct mlx4_mr_cache *lkp_tbl; | |
112 | uint16_t n; | |
113 | uint16_t base = 0; | |
114 | ||
115 | assert(bt != NULL); | |
116 | lkp_tbl = *bt->table; | |
117 | n = bt->len; | |
118 | /* First entry must be NULL for comparison. */ | |
119 | assert(bt->len > 0 || (lkp_tbl[0].start == 0 && | |
120 | lkp_tbl[0].lkey == UINT32_MAX)); | |
121 | /* Binary search. */ | |
122 | do { | |
123 | register uint16_t delta = n >> 1; | |
124 | ||
125 | if (addr < lkp_tbl[base + delta].start) { | |
126 | n = delta; | |
127 | } else { | |
128 | base += delta; | |
129 | n -= delta; | |
130 | } | |
131 | } while (n > 1); | |
132 | assert(addr >= lkp_tbl[base].start); | |
133 | *idx = base; | |
134 | if (addr < lkp_tbl[base].end) | |
135 | return lkp_tbl[base].lkey; | |
136 | /* Not found. */ | |
137 | return UINT32_MAX; | |
138 | } | |
139 | ||
140 | /** | |
141 | * Insert an entry to B-tree lookup table. | |
142 | * | |
143 | * @param bt | |
144 | * Pointer to B-tree structure. | |
145 | * @param entry | |
146 | * Pointer to new entry to insert. | |
147 | * | |
148 | * @return | |
149 | * 0 on success, -1 on failure. | |
150 | */ | |
151 | static int | |
152 | mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry) | |
153 | { | |
154 | struct mlx4_mr_cache *lkp_tbl; | |
155 | uint16_t idx = 0; | |
156 | size_t shift; | |
157 | ||
158 | assert(bt != NULL); | |
159 | assert(bt->len <= bt->size); | |
160 | assert(bt->len > 0); | |
161 | lkp_tbl = *bt->table; | |
162 | /* Find out the slot for insertion. */ | |
163 | if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) { | |
164 | DEBUG("abort insertion to B-tree(%p): already exist at" | |
165 | " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", | |
166 | (void *)bt, idx, entry->start, entry->end, entry->lkey); | |
167 | /* Already exist, return. */ | |
168 | return 0; | |
169 | } | |
170 | /* If table is full, return error. */ | |
171 | if (unlikely(bt->len == bt->size)) { | |
172 | bt->overflow = 1; | |
173 | return -1; | |
174 | } | |
175 | /* Insert entry. */ | |
176 | ++idx; | |
177 | shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache); | |
178 | if (shift) | |
179 | memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift); | |
180 | lkp_tbl[idx] = *entry; | |
181 | bt->len++; | |
182 | DEBUG("inserted B-tree(%p)[%u]," | |
183 | " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", | |
184 | (void *)bt, idx, entry->start, entry->end, entry->lkey); | |
185 | return 0; | |
186 | } | |
187 | ||
188 | /** | |
189 | * Initialize B-tree and allocate memory for lookup table. | |
190 | * | |
191 | * @param bt | |
192 | * Pointer to B-tree structure. | |
193 | * @param n | |
194 | * Number of entries to allocate. | |
195 | * @param socket | |
196 | * NUMA socket on which memory must be allocated. | |
197 | * | |
198 | * @return | |
199 | * 0 on success, a negative errno value otherwise and rte_errno is set. | |
200 | */ | |
201 | int | |
202 | mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket) | |
203 | { | |
204 | if (bt == NULL) { | |
205 | rte_errno = EINVAL; | |
206 | return -rte_errno; | |
207 | } | |
208 | memset(bt, 0, sizeof(*bt)); | |
209 | bt->table = rte_calloc_socket("B-tree table", | |
210 | n, sizeof(struct mlx4_mr_cache), | |
211 | 0, socket); | |
212 | if (bt->table == NULL) { | |
213 | rte_errno = ENOMEM; | |
214 | ERROR("failed to allocate memory for btree cache on socket %d", | |
215 | socket); | |
216 | return -rte_errno; | |
217 | } | |
218 | bt->size = n; | |
219 | /* First entry must be NULL for binary search. */ | |
220 | (*bt->table)[bt->len++] = (struct mlx4_mr_cache) { | |
221 | .lkey = UINT32_MAX, | |
222 | }; | |
223 | DEBUG("initialized B-tree %p with table %p", | |
224 | (void *)bt, (void *)bt->table); | |
225 | return 0; | |
226 | } | |
227 | ||
228 | /** | |
229 | * Free B-tree resources. | |
230 | * | |
231 | * @param bt | |
232 | * Pointer to B-tree structure. | |
233 | */ | |
234 | void | |
235 | mlx4_mr_btree_free(struct mlx4_mr_btree *bt) | |
236 | { | |
237 | if (bt == NULL) | |
238 | return; | |
239 | DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table); | |
240 | rte_free(bt->table); | |
241 | memset(bt, 0, sizeof(*bt)); | |
242 | } | |
243 | ||
244 | #ifndef NDEBUG | |
245 | /** | |
246 | * Dump all the entries in a B-tree | |
247 | * | |
248 | * @param bt | |
249 | * Pointer to B-tree structure. | |
250 | */ | |
251 | void | |
252 | mlx4_mr_btree_dump(struct mlx4_mr_btree *bt) | |
253 | { | |
254 | int idx; | |
255 | struct mlx4_mr_cache *lkp_tbl; | |
256 | ||
257 | if (bt == NULL) | |
258 | return; | |
259 | lkp_tbl = *bt->table; | |
260 | for (idx = 0; idx < bt->len; ++idx) { | |
261 | struct mlx4_mr_cache *entry = &lkp_tbl[idx]; | |
262 | ||
263 | DEBUG("B-tree(%p)[%u]," | |
264 | " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x", | |
265 | (void *)bt, idx, entry->start, entry->end, entry->lkey); | |
266 | } | |
267 | } | |
268 | #endif | |
269 | ||
270 | /** | |
271 | * Find virtually contiguous memory chunk in a given MR. | |
272 | * | |
273 | * @param dev | |
274 | * Pointer to MR structure. | |
275 | * @param[out] entry | |
276 | * Pointer to returning MR cache entry. If not found, this will not be | |
277 | * updated. | |
278 | * @param start_idx | |
279 | * Start index of the memseg bitmap. | |
280 | * | |
281 | * @return | |
282 | * Next index to go on lookup. | |
283 | */ | |
284 | static int | |
285 | mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry, | |
286 | int base_idx) | |
287 | { | |
288 | uintptr_t start = 0; | |
289 | uintptr_t end = 0; | |
290 | uint32_t idx = 0; | |
291 | ||
9f95a23c TL |
292 | /* MR for external memory doesn't have memseg list. */ |
293 | if (mr->msl == NULL) { | |
294 | struct ibv_mr *ibv_mr = mr->ibv_mr; | |
295 | ||
296 | assert(mr->ms_bmp_n == 1); | |
297 | assert(mr->ms_n == 1); | |
298 | assert(base_idx == 0); | |
299 | /* | |
300 | * Can't search it from memseg list but get it directly from | |
301 | * verbs MR as there's only one chunk. | |
302 | */ | |
303 | entry->start = (uintptr_t)ibv_mr->addr; | |
304 | entry->end = (uintptr_t)ibv_mr->addr + mr->ibv_mr->length; | |
305 | entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); | |
306 | /* Returning 1 ends iteration. */ | |
307 | return 1; | |
308 | } | |
11fdf7f2 TL |
309 | for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) { |
310 | if (rte_bitmap_get(mr->ms_bmp, idx)) { | |
311 | const struct rte_memseg_list *msl; | |
312 | const struct rte_memseg *ms; | |
313 | ||
314 | msl = mr->msl; | |
315 | ms = rte_fbarray_get(&msl->memseg_arr, | |
316 | mr->ms_base_idx + idx); | |
317 | assert(msl->page_sz == ms->hugepage_sz); | |
318 | if (!start) | |
319 | start = ms->addr_64; | |
320 | end = ms->addr_64 + ms->hugepage_sz; | |
321 | } else if (start) { | |
322 | /* Passed the end of a fragment. */ | |
323 | break; | |
324 | } | |
325 | } | |
326 | if (start) { | |
327 | /* Found one chunk. */ | |
328 | entry->start = start; | |
329 | entry->end = end; | |
330 | entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey); | |
331 | } | |
332 | return idx; | |
333 | } | |
334 | ||
335 | /** | |
336 | * Insert a MR to the global B-tree cache. It may fail due to low-on-memory. | |
337 | * Then, this entry will have to be searched by mr_lookup_dev_list() in | |
338 | * mlx4_mr_create() on miss. | |
339 | * | |
340 | * @param dev | |
341 | * Pointer to Ethernet device. | |
342 | * @param mr | |
343 | * Pointer to MR to insert. | |
344 | * | |
345 | * @return | |
346 | * 0 on success, -1 on failure. | |
347 | */ | |
348 | static int | |
349 | mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr) | |
350 | { | |
9f95a23c | 351 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
352 | unsigned int n; |
353 | ||
354 | DEBUG("port %u inserting MR(%p) to global cache", | |
355 | dev->data->port_id, (void *)mr); | |
356 | for (n = 0; n < mr->ms_bmp_n; ) { | |
9f95a23c | 357 | struct mlx4_mr_cache entry; |
11fdf7f2 | 358 | |
9f95a23c | 359 | memset(&entry, 0, sizeof(entry)); |
11fdf7f2 TL |
360 | /* Find a contiguous chunk and advance the index. */ |
361 | n = mr_find_next_chunk(mr, &entry, n); | |
362 | if (!entry.end) | |
363 | break; | |
364 | if (mr_btree_insert(&priv->mr.cache, &entry) < 0) { | |
365 | /* | |
366 | * Overflowed, but the global table cannot be expanded | |
367 | * because of deadlock. | |
368 | */ | |
369 | return -1; | |
370 | } | |
371 | } | |
372 | return 0; | |
373 | } | |
374 | ||
375 | /** | |
376 | * Look up address in the original global MR list. | |
377 | * | |
378 | * @param dev | |
379 | * Pointer to Ethernet device. | |
380 | * @param[out] entry | |
381 | * Pointer to returning MR cache entry. If no match, this will not be updated. | |
382 | * @param addr | |
383 | * Search key. | |
384 | * | |
385 | * @return | |
386 | * Found MR on match, NULL otherwise. | |
387 | */ | |
388 | static struct mlx4_mr * | |
389 | mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, | |
390 | uintptr_t addr) | |
391 | { | |
9f95a23c | 392 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
393 | struct mlx4_mr *mr; |
394 | ||
395 | /* Iterate all the existing MRs. */ | |
396 | LIST_FOREACH(mr, &priv->mr.mr_list, mr) { | |
397 | unsigned int n; | |
398 | ||
399 | if (mr->ms_n == 0) | |
400 | continue; | |
401 | for (n = 0; n < mr->ms_bmp_n; ) { | |
9f95a23c | 402 | struct mlx4_mr_cache ret; |
11fdf7f2 | 403 | |
9f95a23c | 404 | memset(&ret, 0, sizeof(ret)); |
11fdf7f2 TL |
405 | n = mr_find_next_chunk(mr, &ret, n); |
406 | if (addr >= ret.start && addr < ret.end) { | |
407 | /* Found. */ | |
408 | *entry = ret; | |
409 | return mr; | |
410 | } | |
411 | } | |
412 | } | |
413 | return NULL; | |
414 | } | |
415 | ||
416 | /** | |
417 | * Look up address on device. | |
418 | * | |
419 | * @param dev | |
420 | * Pointer to Ethernet device. | |
421 | * @param[out] entry | |
422 | * Pointer to returning MR cache entry. If no match, this will not be updated. | |
423 | * @param addr | |
424 | * Search key. | |
425 | * | |
426 | * @return | |
427 | * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. | |
428 | */ | |
429 | static uint32_t | |
430 | mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, | |
431 | uintptr_t addr) | |
432 | { | |
9f95a23c | 433 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
434 | uint16_t idx; |
435 | uint32_t lkey = UINT32_MAX; | |
436 | struct mlx4_mr *mr; | |
437 | ||
438 | /* | |
439 | * If the global cache has overflowed since it failed to expand the | |
440 | * B-tree table, it can't have all the existing MRs. Then, the address | |
441 | * has to be searched by traversing the original MR list instead, which | |
442 | * is very slow path. Otherwise, the global cache is all inclusive. | |
443 | */ | |
444 | if (!unlikely(priv->mr.cache.overflow)) { | |
445 | lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); | |
446 | if (lkey != UINT32_MAX) | |
447 | *entry = (*priv->mr.cache.table)[idx]; | |
448 | } else { | |
449 | /* Falling back to the slowest path. */ | |
450 | mr = mr_lookup_dev_list(dev, entry, addr); | |
451 | if (mr != NULL) | |
452 | lkey = entry->lkey; | |
453 | } | |
454 | assert(lkey == UINT32_MAX || (addr >= entry->start && | |
455 | addr < entry->end)); | |
456 | return lkey; | |
457 | } | |
458 | ||
459 | /** | |
460 | * Free MR resources. MR lock must not be held to avoid a deadlock. rte_free() | |
461 | * can raise memory free event and the callback function will spin on the lock. | |
462 | * | |
463 | * @param mr | |
464 | * Pointer to MR to free. | |
465 | */ | |
466 | static void | |
467 | mr_free(struct mlx4_mr *mr) | |
468 | { | |
469 | if (mr == NULL) | |
470 | return; | |
471 | DEBUG("freeing MR(%p):", (void *)mr); | |
472 | if (mr->ibv_mr != NULL) | |
473 | claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr)); | |
474 | if (mr->ms_bmp != NULL) | |
475 | rte_bitmap_free(mr->ms_bmp); | |
476 | rte_free(mr); | |
477 | } | |
478 | ||
479 | /** | |
9f95a23c | 480 | * Release resources of detached MR having no online entry. |
11fdf7f2 TL |
481 | * |
482 | * @param dev | |
483 | * Pointer to Ethernet device. | |
484 | */ | |
485 | static void | |
486 | mlx4_mr_garbage_collect(struct rte_eth_dev *dev) | |
487 | { | |
9f95a23c | 488 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
489 | struct mlx4_mr *mr_next; |
490 | struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list); | |
491 | ||
9f95a23c TL |
492 | /* Must be called from the primary process. */ |
493 | assert(rte_eal_process_type() == RTE_PROC_PRIMARY); | |
11fdf7f2 TL |
494 | /* |
495 | * MR can't be freed with holding the lock because rte_free() could call | |
496 | * memory free callback function. This will be a deadlock situation. | |
497 | */ | |
498 | rte_rwlock_write_lock(&priv->mr.rwlock); | |
499 | /* Detach the whole free list and release it after unlocking. */ | |
500 | free_list = priv->mr.mr_free_list; | |
501 | LIST_INIT(&priv->mr.mr_free_list); | |
502 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
503 | /* Release resources. */ | |
504 | mr_next = LIST_FIRST(&free_list); | |
505 | while (mr_next != NULL) { | |
506 | struct mlx4_mr *mr = mr_next; | |
507 | ||
508 | mr_next = LIST_NEXT(mr, mr); | |
509 | mr_free(mr); | |
510 | } | |
511 | } | |
512 | ||
513 | /* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */ | |
514 | static int | |
515 | mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl, | |
516 | const struct rte_memseg *ms, size_t len, void *arg) | |
517 | { | |
518 | struct mr_find_contig_memsegs_data *data = arg; | |
519 | ||
520 | if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len) | |
521 | return 0; | |
522 | /* Found, save it and stop walking. */ | |
523 | data->start = ms->addr_64; | |
524 | data->end = ms->addr_64 + len; | |
525 | data->msl = msl; | |
526 | return 1; | |
527 | } | |
528 | ||
529 | /** | |
9f95a23c TL |
530 | * Create a new global Memory Region (MR) for a missing virtual address. |
531 | * This API should be called on a secondary process, then a request is sent to | |
532 | * the primary process in order to create a MR for the address. As the global MR | |
533 | * list is on the shared memory, following LKey lookup should succeed unless the | |
534 | * request fails. | |
11fdf7f2 TL |
535 | * |
536 | * @param dev | |
537 | * Pointer to Ethernet device. | |
538 | * @param[out] entry | |
539 | * Pointer to returning MR cache entry, found in the global cache or newly | |
540 | * created. If failed to create one, this will not be updated. | |
541 | * @param addr | |
542 | * Target virtual address to register. | |
543 | * | |
544 | * @return | |
545 | * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. | |
546 | */ | |
547 | static uint32_t | |
9f95a23c TL |
548 | mlx4_mr_create_secondary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, |
549 | uintptr_t addr) | |
11fdf7f2 | 550 | { |
9f95a23c TL |
551 | struct mlx4_priv *priv = dev->data->dev_private; |
552 | int ret; | |
553 | ||
554 | DEBUG("port %u requesting MR creation for address (%p)", | |
555 | dev->data->port_id, (void *)addr); | |
556 | ret = mlx4_mp_req_mr_create(dev, addr); | |
557 | if (ret) { | |
558 | DEBUG("port %u fail to request MR creation for address (%p)", | |
559 | dev->data->port_id, (void *)addr); | |
560 | return UINT32_MAX; | |
561 | } | |
562 | rte_rwlock_read_lock(&priv->mr.rwlock); | |
563 | /* Fill in output data. */ | |
564 | mr_lookup_dev(dev, entry, addr); | |
565 | /* Lookup can't fail. */ | |
566 | assert(entry->lkey != UINT32_MAX); | |
567 | rte_rwlock_read_unlock(&priv->mr.rwlock); | |
568 | DEBUG("port %u MR CREATED by primary process for %p:\n" | |
569 | " [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x", | |
570 | dev->data->port_id, (void *)addr, | |
571 | entry->start, entry->end, entry->lkey); | |
572 | return entry->lkey; | |
573 | } | |
574 | ||
575 | /** | |
576 | * Create a new global Memory Region (MR) for a missing virtual address. | |
577 | * Register entire virtually contiguous memory chunk around the address. | |
578 | * This must be called from the primary process. | |
579 | * | |
580 | * @param dev | |
581 | * Pointer to Ethernet device. | |
582 | * @param[out] entry | |
583 | * Pointer to returning MR cache entry, found in the global cache or newly | |
584 | * created. If failed to create one, this will not be updated. | |
585 | * @param addr | |
586 | * Target virtual address to register. | |
587 | * | |
588 | * @return | |
589 | * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. | |
590 | */ | |
591 | uint32_t | |
592 | mlx4_mr_create_primary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, | |
593 | uintptr_t addr) | |
594 | { | |
595 | struct mlx4_priv *priv = dev->data->dev_private; | |
11fdf7f2 TL |
596 | struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; |
597 | const struct rte_memseg_list *msl; | |
598 | const struct rte_memseg *ms; | |
599 | struct mlx4_mr *mr = NULL; | |
600 | size_t len; | |
601 | uint32_t ms_n; | |
602 | uint32_t bmp_size; | |
603 | void *bmp_mem; | |
604 | int ms_idx_shift = -1; | |
605 | unsigned int n; | |
606 | struct mr_find_contig_memsegs_data data = { | |
607 | .addr = addr, | |
608 | }; | |
609 | struct mr_find_contig_memsegs_data data_re; | |
610 | ||
611 | DEBUG("port %u creating a MR using address (%p)", | |
612 | dev->data->port_id, (void *)addr); | |
613 | /* | |
614 | * Release detached MRs if any. This can't be called with holding either | |
615 | * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have | |
616 | * been detached by the memory free event but it couldn't be released | |
617 | * inside the callback due to deadlock. As a result, releasing resources | |
618 | * is quite opportunistic. | |
619 | */ | |
620 | mlx4_mr_garbage_collect(dev); | |
621 | /* | |
9f95a23c TL |
622 | * If enabled, find out a contiguous virtual address chunk in use, to |
623 | * which the given address belongs, in order to register maximum range. | |
624 | * In the best case where mempools are not dynamically recreated and | |
625 | * '--socket-mem' is specified as an EAL option, it is very likely to | |
11fdf7f2 | 626 | * have only one MR(LKey) per a socket and per a hugepage-size even |
9f95a23c TL |
627 | * though the system memory is highly fragmented. As the whole memory |
628 | * chunk will be pinned by kernel, it can't be reused unless entire | |
629 | * chunk is freed from EAL. | |
630 | * | |
631 | * If disabled, just register one memseg (page). Then, memory | |
632 | * consumption will be minimized but it may drop performance if there | |
633 | * are many MRs to lookup on the datapath. | |
11fdf7f2 | 634 | */ |
9f95a23c TL |
635 | if (!priv->mr_ext_memseg_en) { |
636 | data.msl = rte_mem_virt2memseg_list((void *)addr); | |
637 | data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz); | |
638 | data.end = data.start + data.msl->page_sz; | |
639 | } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) { | |
11fdf7f2 TL |
640 | WARN("port %u unable to find virtually contiguous" |
641 | " chunk for address (%p)." | |
642 | " rte_memseg_contig_walk() failed.", | |
643 | dev->data->port_id, (void *)addr); | |
644 | rte_errno = ENXIO; | |
645 | goto err_nolock; | |
646 | } | |
647 | alloc_resources: | |
648 | /* Addresses must be page-aligned. */ | |
649 | assert(rte_is_aligned((void *)data.start, data.msl->page_sz)); | |
650 | assert(rte_is_aligned((void *)data.end, data.msl->page_sz)); | |
651 | msl = data.msl; | |
652 | ms = rte_mem_virt2memseg((void *)data.start, msl); | |
653 | len = data.end - data.start; | |
654 | assert(msl->page_sz == ms->hugepage_sz); | |
655 | /* Number of memsegs in the range. */ | |
656 | ms_n = len / msl->page_sz; | |
657 | DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR ")," | |
658 | " page_sz=0x%" PRIx64 ", ms_n=%u", | |
659 | dev->data->port_id, (void *)addr, | |
660 | data.start, data.end, msl->page_sz, ms_n); | |
661 | /* Size of memory for bitmap. */ | |
662 | bmp_size = rte_bitmap_get_memory_footprint(ms_n); | |
663 | mr = rte_zmalloc_socket(NULL, | |
664 | RTE_ALIGN_CEIL(sizeof(*mr), | |
665 | RTE_CACHE_LINE_SIZE) + | |
666 | bmp_size, | |
667 | RTE_CACHE_LINE_SIZE, msl->socket_id); | |
668 | if (mr == NULL) { | |
669 | WARN("port %u unable to allocate memory for a new MR of" | |
670 | " address (%p).", | |
671 | dev->data->port_id, (void *)addr); | |
672 | rte_errno = ENOMEM; | |
673 | goto err_nolock; | |
674 | } | |
675 | mr->msl = msl; | |
676 | /* | |
677 | * Save the index of the first memseg and initialize memseg bitmap. To | |
678 | * see if a memseg of ms_idx in the memseg-list is still valid, check: | |
679 | * rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx) | |
680 | */ | |
681 | mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); | |
682 | bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE); | |
683 | mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size); | |
684 | if (mr->ms_bmp == NULL) { | |
9f95a23c | 685 | WARN("port %u unable to initialize bitmap for a new MR of" |
11fdf7f2 TL |
686 | " address (%p).", |
687 | dev->data->port_id, (void *)addr); | |
688 | rte_errno = EINVAL; | |
689 | goto err_nolock; | |
690 | } | |
691 | /* | |
692 | * Should recheck whether the extended contiguous chunk is still valid. | |
693 | * Because memory_hotplug_lock can't be held if there's any memory | |
694 | * related calls in a critical path, resource allocation above can't be | |
695 | * locked. If the memory has been changed at this point, try again with | |
696 | * just single page. If not, go on with the big chunk atomically from | |
697 | * here. | |
698 | */ | |
699 | rte_rwlock_read_lock(&mcfg->memory_hotplug_lock); | |
700 | data_re = data; | |
701 | if (len > msl->page_sz && | |
702 | !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) { | |
703 | WARN("port %u unable to find virtually contiguous" | |
704 | " chunk for address (%p)." | |
705 | " rte_memseg_contig_walk() failed.", | |
706 | dev->data->port_id, (void *)addr); | |
707 | rte_errno = ENXIO; | |
708 | goto err_memlock; | |
709 | } | |
710 | if (data.start != data_re.start || data.end != data_re.end) { | |
711 | /* | |
712 | * The extended contiguous chunk has been changed. Try again | |
713 | * with single memseg instead. | |
714 | */ | |
715 | data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz); | |
716 | data.end = data.start + msl->page_sz; | |
717 | rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); | |
718 | mr_free(mr); | |
719 | goto alloc_resources; | |
720 | } | |
721 | assert(data.msl == data_re.msl); | |
722 | rte_rwlock_write_lock(&priv->mr.rwlock); | |
723 | /* | |
724 | * Check the address is really missing. If other thread already created | |
725 | * one or it is not found due to overflow, abort and return. | |
726 | */ | |
727 | if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) { | |
728 | /* | |
729 | * Insert to the global cache table. It may fail due to | |
730 | * low-on-memory. Then, this entry will have to be searched | |
731 | * here again. | |
732 | */ | |
733 | mr_btree_insert(&priv->mr.cache, entry); | |
734 | DEBUG("port %u found MR for %p on final lookup, abort", | |
735 | dev->data->port_id, (void *)addr); | |
736 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
737 | rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); | |
738 | /* | |
739 | * Must be unlocked before calling rte_free() because | |
740 | * mlx4_mr_mem_event_free_cb() can be called inside. | |
741 | */ | |
742 | mr_free(mr); | |
743 | return entry->lkey; | |
744 | } | |
745 | /* | |
746 | * Trim start and end addresses for verbs MR. Set bits for registering | |
747 | * memsegs but exclude already registered ones. Bitmap can be | |
748 | * fragmented. | |
749 | */ | |
750 | for (n = 0; n < ms_n; ++n) { | |
751 | uintptr_t start; | |
9f95a23c | 752 | struct mlx4_mr_cache ret; |
11fdf7f2 | 753 | |
9f95a23c | 754 | memset(&ret, 0, sizeof(ret)); |
11fdf7f2 TL |
755 | start = data_re.start + n * msl->page_sz; |
756 | /* Exclude memsegs already registered by other MRs. */ | |
757 | if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) { | |
758 | /* | |
759 | * Start from the first unregistered memseg in the | |
760 | * extended range. | |
761 | */ | |
762 | if (ms_idx_shift == -1) { | |
763 | mr->ms_base_idx += n; | |
764 | data.start = start; | |
765 | ms_idx_shift = n; | |
766 | } | |
767 | data.end = start + msl->page_sz; | |
768 | rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift); | |
769 | ++mr->ms_n; | |
770 | } | |
771 | } | |
772 | len = data.end - data.start; | |
773 | mr->ms_bmp_n = len / msl->page_sz; | |
774 | assert(ms_idx_shift + mr->ms_bmp_n <= ms_n); | |
775 | /* | |
776 | * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be | |
777 | * called with holding the memory lock because it doesn't use | |
778 | * mlx4_alloc_buf_extern() which eventually calls rte_malloc_socket() | |
779 | * through mlx4_alloc_verbs_buf(). | |
780 | */ | |
781 | mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len, | |
782 | IBV_ACCESS_LOCAL_WRITE); | |
783 | if (mr->ibv_mr == NULL) { | |
784 | WARN("port %u fail to create a verbs MR for address (%p)", | |
785 | dev->data->port_id, (void *)addr); | |
786 | rte_errno = EINVAL; | |
787 | goto err_mrlock; | |
788 | } | |
789 | assert((uintptr_t)mr->ibv_mr->addr == data.start); | |
790 | assert(mr->ibv_mr->length == len); | |
791 | LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); | |
792 | DEBUG("port %u MR CREATED (%p) for %p:\n" | |
793 | " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," | |
794 | " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", | |
795 | dev->data->port_id, (void *)mr, (void *)addr, | |
796 | data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey), | |
797 | mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); | |
798 | /* Insert to the global cache table. */ | |
799 | mr_insert_dev_cache(dev, mr); | |
800 | /* Fill in output data. */ | |
801 | mr_lookup_dev(dev, entry, addr); | |
802 | /* Lookup can't fail. */ | |
803 | assert(entry->lkey != UINT32_MAX); | |
804 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
805 | rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); | |
806 | return entry->lkey; | |
807 | err_mrlock: | |
808 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
809 | err_memlock: | |
810 | rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock); | |
811 | err_nolock: | |
812 | /* | |
813 | * In case of error, as this can be called in a datapath, a warning | |
814 | * message per an error is preferable instead. Must be unlocked before | |
815 | * calling rte_free() because mlx4_mr_mem_event_free_cb() can be called | |
816 | * inside. | |
817 | */ | |
818 | mr_free(mr); | |
819 | return UINT32_MAX; | |
820 | } | |
821 | ||
9f95a23c TL |
822 | /** |
823 | * Create a new global Memory Region (MR) for a missing virtual address. | |
824 | * This can be called from primary and secondary process. | |
825 | * | |
826 | * @param dev | |
827 | * Pointer to Ethernet device. | |
828 | * @param[out] entry | |
829 | * Pointer to returning MR cache entry, found in the global cache or newly | |
830 | * created. If failed to create one, this will not be updated. | |
831 | * @param addr | |
832 | * Target virtual address to register. | |
833 | * | |
834 | * @return | |
835 | * Searched LKey on success, UINT32_MAX on failure and rte_errno is set. | |
836 | */ | |
837 | static uint32_t | |
838 | mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry, | |
839 | uintptr_t addr) | |
840 | { | |
841 | uint32_t ret = 0; | |
842 | ||
843 | switch (rte_eal_process_type()) { | |
844 | case RTE_PROC_PRIMARY: | |
845 | ret = mlx4_mr_create_primary(dev, entry, addr); | |
846 | break; | |
847 | case RTE_PROC_SECONDARY: | |
848 | ret = mlx4_mr_create_secondary(dev, entry, addr); | |
849 | break; | |
850 | default: | |
851 | break; | |
852 | } | |
853 | return ret; | |
854 | } | |
855 | ||
11fdf7f2 TL |
856 | /** |
857 | * Rebuild the global B-tree cache of device from the original MR list. | |
858 | * | |
859 | * @param dev | |
860 | * Pointer to Ethernet device. | |
861 | */ | |
862 | static void | |
863 | mr_rebuild_dev_cache(struct rte_eth_dev *dev) | |
864 | { | |
9f95a23c | 865 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
866 | struct mlx4_mr *mr; |
867 | ||
868 | DEBUG("port %u rebuild dev cache[]", dev->data->port_id); | |
869 | /* Flush cache to rebuild. */ | |
870 | priv->mr.cache.len = 1; | |
871 | priv->mr.cache.overflow = 0; | |
872 | /* Iterate all the existing MRs. */ | |
873 | LIST_FOREACH(mr, &priv->mr.mr_list, mr) | |
874 | if (mr_insert_dev_cache(dev, mr) < 0) | |
875 | return; | |
876 | } | |
877 | ||
878 | /** | |
879 | * Callback for memory free event. Iterate freed memsegs and check whether it | |
880 | * belongs to an existing MR. If found, clear the bit from bitmap of MR. As a | |
881 | * result, the MR would be fragmented. If it becomes empty, the MR will be freed | |
882 | * later by mlx4_mr_garbage_collect(). | |
883 | * | |
884 | * The global cache must be rebuilt if there's any change and this event has to | |
885 | * be propagated to dataplane threads to flush the local caches. | |
886 | * | |
887 | * @param dev | |
888 | * Pointer to Ethernet device. | |
889 | * @param addr | |
890 | * Address of freed memory. | |
891 | * @param len | |
892 | * Size of freed memory. | |
893 | */ | |
894 | static void | |
895 | mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len) | |
896 | { | |
9f95a23c | 897 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
898 | const struct rte_memseg_list *msl; |
899 | struct mlx4_mr *mr; | |
900 | int ms_n; | |
901 | int i; | |
902 | int rebuild = 0; | |
903 | ||
904 | DEBUG("port %u free callback: addr=%p, len=%zu", | |
905 | dev->data->port_id, addr, len); | |
906 | msl = rte_mem_virt2memseg_list(addr); | |
907 | /* addr and len must be page-aligned. */ | |
908 | assert((uintptr_t)addr == RTE_ALIGN((uintptr_t)addr, msl->page_sz)); | |
909 | assert(len == RTE_ALIGN(len, msl->page_sz)); | |
910 | ms_n = len / msl->page_sz; | |
911 | rte_rwlock_write_lock(&priv->mr.rwlock); | |
912 | /* Clear bits of freed memsegs from MR. */ | |
913 | for (i = 0; i < ms_n; ++i) { | |
914 | const struct rte_memseg *ms; | |
915 | struct mlx4_mr_cache entry; | |
916 | uintptr_t start; | |
917 | int ms_idx; | |
918 | uint32_t pos; | |
919 | ||
920 | /* Find MR having this memseg. */ | |
921 | start = (uintptr_t)addr + i * msl->page_sz; | |
922 | mr = mr_lookup_dev_list(dev, &entry, start); | |
923 | if (mr == NULL) | |
924 | continue; | |
9f95a23c | 925 | assert(mr->msl); /* Can't be external memory. */ |
11fdf7f2 TL |
926 | ms = rte_mem_virt2memseg((void *)start, msl); |
927 | assert(ms != NULL); | |
928 | assert(msl->page_sz == ms->hugepage_sz); | |
929 | ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); | |
930 | pos = ms_idx - mr->ms_base_idx; | |
931 | assert(rte_bitmap_get(mr->ms_bmp, pos)); | |
932 | assert(pos < mr->ms_bmp_n); | |
933 | DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p", | |
934 | dev->data->port_id, (void *)mr, pos, (void *)start); | |
935 | rte_bitmap_clear(mr->ms_bmp, pos); | |
936 | if (--mr->ms_n == 0) { | |
937 | LIST_REMOVE(mr, mr); | |
938 | LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); | |
939 | DEBUG("port %u remove MR(%p) from list", | |
940 | dev->data->port_id, (void *)mr); | |
941 | } | |
942 | /* | |
943 | * MR is fragmented or will be freed. the global cache must be | |
944 | * rebuilt. | |
945 | */ | |
946 | rebuild = 1; | |
947 | } | |
948 | if (rebuild) { | |
949 | mr_rebuild_dev_cache(dev); | |
950 | /* | |
951 | * Flush local caches by propagating invalidation across cores. | |
952 | * rte_smp_wmb() is enough to synchronize this event. If one of | |
953 | * freed memsegs is seen by other core, that means the memseg | |
954 | * has been allocated by allocator, which will come after this | |
955 | * free call. Therefore, this store instruction (incrementing | |
956 | * generation below) will be guaranteed to be seen by other core | |
957 | * before the core sees the newly allocated memory. | |
958 | */ | |
959 | ++priv->mr.dev_gen; | |
960 | DEBUG("broadcasting local cache flush, gen=%d", | |
961 | priv->mr.dev_gen); | |
962 | rte_smp_wmb(); | |
963 | } | |
964 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
965 | #ifndef NDEBUG | |
966 | if (rebuild) | |
967 | mlx4_mr_dump_dev(dev); | |
968 | #endif | |
969 | } | |
970 | ||
971 | /** | |
972 | * Callback for memory event. | |
973 | * | |
974 | * @param event_type | |
975 | * Memory event type. | |
976 | * @param addr | |
977 | * Address of memory. | |
978 | * @param len | |
979 | * Size of memory. | |
980 | */ | |
981 | void | |
982 | mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr, | |
983 | size_t len, void *arg __rte_unused) | |
984 | { | |
9f95a23c TL |
985 | struct mlx4_priv *priv; |
986 | struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list; | |
11fdf7f2 | 987 | |
9f95a23c TL |
988 | /* Must be called from the primary process. */ |
989 | assert(rte_eal_process_type() == RTE_PROC_PRIMARY); | |
11fdf7f2 TL |
990 | switch (event_type) { |
991 | case RTE_MEM_EVENT_FREE: | |
9f95a23c | 992 | rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock); |
11fdf7f2 | 993 | /* Iterate all the existing mlx4 devices. */ |
9f95a23c TL |
994 | LIST_FOREACH(priv, dev_list, mem_event_cb) |
995 | mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len); | |
996 | rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock); | |
11fdf7f2 TL |
997 | break; |
998 | case RTE_MEM_EVENT_ALLOC: | |
999 | default: | |
1000 | break; | |
1001 | } | |
1002 | } | |
1003 | ||
1004 | /** | |
1005 | * Look up address in the global MR cache table. If not found, create a new MR. | |
1006 | * Insert the found/created entry to local bottom-half cache table. | |
1007 | * | |
1008 | * @param dev | |
1009 | * Pointer to Ethernet device. | |
1010 | * @param mr_ctrl | |
1011 | * Pointer to per-queue MR control structure. | |
1012 | * @param[out] entry | |
1013 | * Pointer to returning MR cache entry, found in the global cache or newly | |
1014 | * created. If failed to create one, this is not written. | |
1015 | * @param addr | |
1016 | * Search key. | |
1017 | * | |
1018 | * @return | |
1019 | * Searched LKey on success, UINT32_MAX on no match. | |
1020 | */ | |
1021 | static uint32_t | |
1022 | mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, | |
1023 | struct mlx4_mr_cache *entry, uintptr_t addr) | |
1024 | { | |
9f95a23c | 1025 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
1026 | struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh; |
1027 | uint16_t idx; | |
1028 | uint32_t lkey; | |
1029 | ||
1030 | /* If local cache table is full, try to double it. */ | |
1031 | if (unlikely(bt->len == bt->size)) | |
1032 | mr_btree_expand(bt, bt->size << 1); | |
1033 | /* Look up in the global cache. */ | |
1034 | rte_rwlock_read_lock(&priv->mr.rwlock); | |
1035 | lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr); | |
1036 | if (lkey != UINT32_MAX) { | |
1037 | /* Found. */ | |
1038 | *entry = (*priv->mr.cache.table)[idx]; | |
1039 | rte_rwlock_read_unlock(&priv->mr.rwlock); | |
1040 | /* | |
1041 | * Update local cache. Even if it fails, return the found entry | |
1042 | * to update top-half cache. Next time, this entry will be found | |
1043 | * in the global cache. | |
1044 | */ | |
1045 | mr_btree_insert(bt, entry); | |
1046 | return lkey; | |
1047 | } | |
1048 | rte_rwlock_read_unlock(&priv->mr.rwlock); | |
1049 | /* First time to see the address? Create a new MR. */ | |
1050 | lkey = mlx4_mr_create(dev, entry, addr); | |
1051 | /* | |
1052 | * Update the local cache if successfully created a new global MR. Even | |
1053 | * if failed to create one, there's no action to take in this datapath | |
1054 | * code. As returning LKey is invalid, this will eventually make HW | |
1055 | * fail. | |
1056 | */ | |
1057 | if (lkey != UINT32_MAX) | |
1058 | mr_btree_insert(bt, entry); | |
1059 | return lkey; | |
1060 | } | |
1061 | ||
1062 | /** | |
1063 | * Bottom-half of LKey search on datapath. Firstly search in cache_bh[] and if | |
1064 | * misses, search in the global MR cache table and update the new entry to | |
1065 | * per-queue local caches. | |
1066 | * | |
1067 | * @param dev | |
1068 | * Pointer to Ethernet device. | |
1069 | * @param mr_ctrl | |
1070 | * Pointer to per-queue MR control structure. | |
1071 | * @param addr | |
1072 | * Search key. | |
1073 | * | |
1074 | * @return | |
1075 | * Searched LKey on success, UINT32_MAX on no match. | |
1076 | */ | |
1077 | static uint32_t | |
1078 | mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, | |
1079 | uintptr_t addr) | |
1080 | { | |
1081 | uint32_t lkey; | |
1082 | uint16_t bh_idx = 0; | |
1083 | /* Victim in top-half cache to replace with new entry. */ | |
1084 | struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head]; | |
1085 | ||
1086 | /* Binary-search MR translation table. */ | |
1087 | lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr); | |
1088 | /* Update top-half cache. */ | |
1089 | if (likely(lkey != UINT32_MAX)) { | |
1090 | *repl = (*mr_ctrl->cache_bh.table)[bh_idx]; | |
1091 | } else { | |
1092 | /* | |
1093 | * If missed in local lookup table, search in the global cache | |
1094 | * and local cache_bh[] will be updated inside if possible. | |
1095 | * Top-half cache entry will also be updated. | |
1096 | */ | |
1097 | lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr); | |
1098 | if (unlikely(lkey == UINT32_MAX)) | |
1099 | return UINT32_MAX; | |
1100 | } | |
1101 | /* Update the most recently used entry. */ | |
1102 | mr_ctrl->mru = mr_ctrl->head; | |
1103 | /* Point to the next victim, the oldest. */ | |
1104 | mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N; | |
1105 | return lkey; | |
1106 | } | |
1107 | ||
1108 | /** | |
1109 | * Bottom-half of LKey search on Rx. | |
1110 | * | |
1111 | * @param rxq | |
1112 | * Pointer to Rx queue structure. | |
1113 | * @param addr | |
1114 | * Search key. | |
1115 | * | |
1116 | * @return | |
1117 | * Searched LKey on success, UINT32_MAX on no match. | |
1118 | */ | |
1119 | uint32_t | |
1120 | mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr) | |
1121 | { | |
1122 | struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl; | |
9f95a23c | 1123 | struct mlx4_priv *priv = rxq->priv; |
11fdf7f2 | 1124 | |
9f95a23c | 1125 | return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); |
11fdf7f2 TL |
1126 | } |
1127 | ||
1128 | /** | |
1129 | * Bottom-half of LKey search on Tx. | |
1130 | * | |
1131 | * @param txq | |
1132 | * Pointer to Tx queue structure. | |
1133 | * @param addr | |
1134 | * Search key. | |
1135 | * | |
1136 | * @return | |
1137 | * Searched LKey on success, UINT32_MAX on no match. | |
1138 | */ | |
9f95a23c | 1139 | static uint32_t |
11fdf7f2 TL |
1140 | mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr) |
1141 | { | |
1142 | struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl; | |
9f95a23c TL |
1143 | struct mlx4_priv *priv = txq->priv; |
1144 | ||
1145 | return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr); | |
1146 | } | |
1147 | ||
1148 | /** | |
1149 | * Bottom-half of LKey search on Tx. If it can't be searched in the memseg | |
1150 | * list, register the mempool of the mbuf as externally allocated memory. | |
1151 | * | |
1152 | * @param txq | |
1153 | * Pointer to Tx queue structure. | |
1154 | * @param mb | |
1155 | * Pointer to mbuf. | |
1156 | * | |
1157 | * @return | |
1158 | * Searched LKey on success, UINT32_MAX on no match. | |
1159 | */ | |
1160 | uint32_t | |
1161 | mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb) | |
1162 | { | |
1163 | uintptr_t addr = (uintptr_t)mb->buf_addr; | |
1164 | uint32_t lkey; | |
11fdf7f2 | 1165 | |
9f95a23c TL |
1166 | lkey = mlx4_tx_addr2mr_bh(txq, addr); |
1167 | if (lkey == UINT32_MAX && rte_errno == ENXIO) { | |
1168 | /* Mempool may have externally allocated memory. */ | |
1169 | return mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb)); | |
1170 | } | |
1171 | return lkey; | |
11fdf7f2 TL |
1172 | } |
1173 | ||
1174 | /** | |
1175 | * Flush all of the local cache entries. | |
1176 | * | |
1177 | * @param mr_ctrl | |
1178 | * Pointer to per-queue MR control structure. | |
1179 | */ | |
1180 | void | |
1181 | mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl) | |
1182 | { | |
1183 | /* Reset the most-recently-used index. */ | |
1184 | mr_ctrl->mru = 0; | |
1185 | /* Reset the linear search array. */ | |
1186 | mr_ctrl->head = 0; | |
1187 | memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache)); | |
1188 | /* Reset the B-tree table. */ | |
1189 | mr_ctrl->cache_bh.len = 1; | |
1190 | mr_ctrl->cache_bh.overflow = 0; | |
1191 | /* Update the generation number. */ | |
1192 | mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr; | |
1193 | DEBUG("mr_ctrl(%p): flushed, cur_gen=%d", | |
1194 | (void *)mr_ctrl, mr_ctrl->cur_gen); | |
1195 | } | |
1196 | ||
9f95a23c TL |
1197 | /** |
1198 | * Called during rte_mempool_mem_iter() by mlx4_mr_update_ext_mp(). | |
1199 | * | |
1200 | * Externally allocated chunk is registered and a MR is created for the chunk. | |
1201 | * The MR object is added to the global list. If memseg list of a MR object | |
1202 | * (mr->msl) is null, the MR object can be regarded as externally allocated | |
1203 | * memory. | |
1204 | * | |
1205 | * Once external memory is registered, it should be static. If the memory is | |
1206 | * freed and the virtual address range has different physical memory mapped | |
1207 | * again, it may cause crash on device due to the wrong translation entry. PMD | |
1208 | * can't track the free event of the external memory for now. | |
1209 | */ | |
1210 | static void | |
1211 | mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque, | |
1212 | struct rte_mempool_memhdr *memhdr, | |
1213 | unsigned mem_idx __rte_unused) | |
1214 | { | |
1215 | struct mr_update_mp_data *data = opaque; | |
1216 | struct rte_eth_dev *dev = data->dev; | |
1217 | struct mlx4_priv *priv = dev->data->dev_private; | |
1218 | struct mlx4_mr_ctrl *mr_ctrl = data->mr_ctrl; | |
1219 | struct mlx4_mr *mr = NULL; | |
1220 | uintptr_t addr = (uintptr_t)memhdr->addr; | |
1221 | size_t len = memhdr->len; | |
1222 | struct mlx4_mr_cache entry; | |
1223 | uint32_t lkey; | |
1224 | ||
1225 | assert(rte_eal_process_type() == RTE_PROC_PRIMARY); | |
1226 | /* If already registered, it should return. */ | |
1227 | rte_rwlock_read_lock(&priv->mr.rwlock); | |
1228 | lkey = mr_lookup_dev(dev, &entry, addr); | |
1229 | rte_rwlock_read_unlock(&priv->mr.rwlock); | |
1230 | if (lkey != UINT32_MAX) | |
1231 | return; | |
1232 | mr = rte_zmalloc_socket(NULL, | |
1233 | RTE_ALIGN_CEIL(sizeof(*mr), | |
1234 | RTE_CACHE_LINE_SIZE), | |
1235 | RTE_CACHE_LINE_SIZE, mp->socket_id); | |
1236 | if (mr == NULL) { | |
1237 | WARN("port %u unable to allocate memory for a new MR of" | |
1238 | " mempool (%s).", | |
1239 | dev->data->port_id, mp->name); | |
1240 | data->ret = -1; | |
1241 | return; | |
1242 | } | |
1243 | DEBUG("port %u register MR for chunk #%d of mempool (%s)", | |
1244 | dev->data->port_id, mem_idx, mp->name); | |
1245 | mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)addr, len, | |
1246 | IBV_ACCESS_LOCAL_WRITE); | |
1247 | if (mr->ibv_mr == NULL) { | |
1248 | WARN("port %u fail to create a verbs MR for address (%p)", | |
1249 | dev->data->port_id, (void *)addr); | |
1250 | rte_free(mr); | |
1251 | data->ret = -1; | |
1252 | return; | |
1253 | } | |
1254 | mr->msl = NULL; /* Mark it is external memory. */ | |
1255 | mr->ms_bmp = NULL; | |
1256 | mr->ms_n = 1; | |
1257 | mr->ms_bmp_n = 1; | |
1258 | rte_rwlock_write_lock(&priv->mr.rwlock); | |
1259 | LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr); | |
1260 | DEBUG("port %u MR CREATED (%p) for external memory %p:\n" | |
1261 | " [0x%" PRIxPTR ", 0x%" PRIxPTR ")," | |
1262 | " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u", | |
1263 | dev->data->port_id, (void *)mr, (void *)addr, | |
1264 | addr, addr + len, rte_cpu_to_be_32(mr->ibv_mr->lkey), | |
1265 | mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n); | |
1266 | /* Insert to the global cache table. */ | |
1267 | mr_insert_dev_cache(dev, mr); | |
1268 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
1269 | /* Insert to the local cache table */ | |
1270 | mlx4_mr_addr2mr_bh(dev, mr_ctrl, addr); | |
1271 | } | |
1272 | ||
1273 | /** | |
1274 | * Register MR for entire memory chunks in a Mempool having externally allocated | |
1275 | * memory and fill in local cache. | |
1276 | * | |
1277 | * @param dev | |
1278 | * Pointer to Ethernet device. | |
1279 | * @param mr_ctrl | |
1280 | * Pointer to per-queue MR control structure. | |
1281 | * @param mp | |
1282 | * Pointer to registering Mempool. | |
1283 | * | |
1284 | * @return | |
1285 | * 0 on success, -1 on failure. | |
1286 | */ | |
1287 | static uint32_t | |
1288 | mlx4_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, | |
1289 | struct rte_mempool *mp) | |
1290 | { | |
1291 | struct mr_update_mp_data data = { | |
1292 | .dev = dev, | |
1293 | .mr_ctrl = mr_ctrl, | |
1294 | .ret = 0, | |
1295 | }; | |
1296 | ||
1297 | rte_mempool_mem_iter(mp, mlx4_mr_update_ext_mp_cb, &data); | |
1298 | return data.ret; | |
1299 | } | |
1300 | ||
1301 | /** | |
1302 | * Register MR entire memory chunks in a Mempool having externally allocated | |
1303 | * memory and search LKey of the address to return. | |
1304 | * | |
1305 | * @param dev | |
1306 | * Pointer to Ethernet device. | |
1307 | * @param addr | |
1308 | * Search key. | |
1309 | * @param mp | |
1310 | * Pointer to registering Mempool where addr belongs. | |
1311 | * | |
1312 | * @return | |
1313 | * LKey for address on success, UINT32_MAX on failure. | |
1314 | */ | |
1315 | uint32_t | |
1316 | mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp) | |
1317 | { | |
1318 | struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl; | |
1319 | struct mlx4_priv *priv = txq->priv; | |
1320 | ||
1321 | if (rte_eal_process_type() != RTE_PROC_PRIMARY) { | |
1322 | WARN("port %u using address (%p) from unregistered mempool" | |
1323 | " having externally allocated memory" | |
1324 | " in secondary process, please create mempool" | |
1325 | " prior to rte_eth_dev_start()", | |
1326 | PORT_ID(priv), (void *)addr); | |
1327 | return UINT32_MAX; | |
1328 | } | |
1329 | mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp); | |
1330 | return mlx4_tx_addr2mr_bh(txq, addr); | |
1331 | } | |
1332 | ||
11fdf7f2 TL |
1333 | /* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */ |
1334 | static void | |
1335 | mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque, | |
1336 | struct rte_mempool_memhdr *memhdr, | |
1337 | unsigned mem_idx __rte_unused) | |
1338 | { | |
1339 | struct mr_update_mp_data *data = opaque; | |
1340 | uint32_t lkey; | |
1341 | ||
1342 | /* Stop iteration if failed in the previous walk. */ | |
1343 | if (data->ret < 0) | |
1344 | return; | |
1345 | /* Register address of the chunk and update local caches. */ | |
1346 | lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl, | |
1347 | (uintptr_t)memhdr->addr); | |
1348 | if (lkey == UINT32_MAX) | |
1349 | data->ret = -1; | |
1350 | } | |
1351 | ||
1352 | /** | |
1353 | * Register entire memory chunks in a Mempool. | |
1354 | * | |
1355 | * @param dev | |
1356 | * Pointer to Ethernet device. | |
1357 | * @param mr_ctrl | |
1358 | * Pointer to per-queue MR control structure. | |
1359 | * @param mp | |
1360 | * Pointer to registering Mempool. | |
1361 | * | |
1362 | * @return | |
1363 | * 0 on success, -1 on failure. | |
1364 | */ | |
1365 | int | |
1366 | mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl, | |
1367 | struct rte_mempool *mp) | |
1368 | { | |
1369 | struct mr_update_mp_data data = { | |
1370 | .dev = dev, | |
1371 | .mr_ctrl = mr_ctrl, | |
1372 | .ret = 0, | |
1373 | }; | |
1374 | ||
1375 | rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data); | |
9f95a23c TL |
1376 | if (data.ret < 0 && rte_errno == ENXIO) { |
1377 | /* Mempool may have externally allocated memory. */ | |
1378 | return mlx4_mr_update_ext_mp(dev, mr_ctrl, mp); | |
1379 | } | |
11fdf7f2 TL |
1380 | return data.ret; |
1381 | } | |
1382 | ||
1383 | #ifndef NDEBUG | |
1384 | /** | |
1385 | * Dump all the created MRs and the global cache entries. | |
1386 | * | |
1387 | * @param dev | |
1388 | * Pointer to Ethernet device. | |
1389 | */ | |
1390 | void | |
1391 | mlx4_mr_dump_dev(struct rte_eth_dev *dev) | |
1392 | { | |
9f95a23c | 1393 | struct mlx4_priv *priv = dev->data->dev_private; |
11fdf7f2 TL |
1394 | struct mlx4_mr *mr; |
1395 | int mr_n = 0; | |
1396 | int chunk_n = 0; | |
1397 | ||
1398 | rte_rwlock_read_lock(&priv->mr.rwlock); | |
1399 | /* Iterate all the existing MRs. */ | |
1400 | LIST_FOREACH(mr, &priv->mr.mr_list, mr) { | |
1401 | unsigned int n; | |
1402 | ||
1403 | DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u", | |
1404 | dev->data->port_id, mr_n++, | |
1405 | rte_cpu_to_be_32(mr->ibv_mr->lkey), | |
1406 | mr->ms_n, mr->ms_bmp_n); | |
1407 | if (mr->ms_n == 0) | |
1408 | continue; | |
1409 | for (n = 0; n < mr->ms_bmp_n; ) { | |
9f95a23c | 1410 | struct mlx4_mr_cache ret; |
11fdf7f2 | 1411 | |
9f95a23c | 1412 | memset(&ret, 0, sizeof(ret)); |
11fdf7f2 TL |
1413 | n = mr_find_next_chunk(mr, &ret, n); |
1414 | if (!ret.end) | |
1415 | break; | |
1416 | DEBUG(" chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")", | |
1417 | chunk_n++, ret.start, ret.end); | |
1418 | } | |
1419 | } | |
1420 | DEBUG("port %u dumping global cache", dev->data->port_id); | |
1421 | mlx4_mr_btree_dump(&priv->mr.cache); | |
1422 | rte_rwlock_read_unlock(&priv->mr.rwlock); | |
1423 | } | |
1424 | #endif | |
1425 | ||
1426 | /** | |
1427 | * Release all the created MRs and resources. Remove device from memory callback | |
1428 | * list. | |
1429 | * | |
1430 | * @param dev | |
1431 | * Pointer to Ethernet device. | |
1432 | */ | |
1433 | void | |
1434 | mlx4_mr_release(struct rte_eth_dev *dev) | |
1435 | { | |
9f95a23c TL |
1436 | struct mlx4_priv *priv = dev->data->dev_private; |
1437 | struct mlx4_mr *mr_next; | |
11fdf7f2 TL |
1438 | |
1439 | /* Remove from memory callback device list. */ | |
9f95a23c | 1440 | rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock); |
11fdf7f2 | 1441 | LIST_REMOVE(priv, mem_event_cb); |
9f95a23c | 1442 | rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock); |
11fdf7f2 TL |
1443 | #ifndef NDEBUG |
1444 | mlx4_mr_dump_dev(dev); | |
1445 | #endif | |
1446 | rte_rwlock_write_lock(&priv->mr.rwlock); | |
1447 | /* Detach from MR list and move to free list. */ | |
9f95a23c | 1448 | mr_next = LIST_FIRST(&priv->mr.mr_list); |
11fdf7f2 TL |
1449 | while (mr_next != NULL) { |
1450 | struct mlx4_mr *mr = mr_next; | |
1451 | ||
1452 | mr_next = LIST_NEXT(mr, mr); | |
1453 | LIST_REMOVE(mr, mr); | |
1454 | LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr); | |
1455 | } | |
1456 | LIST_INIT(&priv->mr.mr_list); | |
1457 | /* Free global cache. */ | |
1458 | mlx4_mr_btree_free(&priv->mr.cache); | |
1459 | rte_rwlock_write_unlock(&priv->mr.rwlock); | |
1460 | /* Free all remaining MRs. */ | |
1461 | mlx4_mr_garbage_collect(dev); | |
1462 | } |