// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/gfp.h>
#include <rdma/ib_verbs.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/resource.h>

#include "siw.h"
#include "siw_mem.h"

/*
 * STag lookup is based on its index part only (24 bits).
 * The code avoids the special STag of zero and tries to randomize
 * STag values between 1 and SIW_STAG_MAX_INDEX.
 */
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
{
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
			    GFP_KERNEL) < 0)
		return -ENOMEM;

	/* Set the STag index part */
	m->stag = id << 8;

	siw_dbg_mem(m, "new MEM object\n");

	return 0;
}
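
/*
 * Note on the STag layout used throughout this file: the 24-bit xarray
 * id forms the upper bits of the STag (stag = id << 8), while the low
 * 8 bits hold the STag key, which is zero at allocation time. Lookups
 * therefore recover the xarray id with 'stag >> 8', as done in
 * siw_mem_id2obj() and siw_invalidate_stag() below.
 */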

/*
 * siw_mem_id2obj()
 *
 * Resolves memory from an STag given by its index part. May be called
 * from:
 * o process context before sending out of an SGL, or
 * o softirq context when resolving target memory
 */
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
{
	struct siw_mem *mem;

	rcu_read_lock();
	mem = xa_load(&sdev->mem_xa, stag_index);
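	/*
	 * Only hand out the object if it still holds at least one
	 * reference: kref_get_unless_zero() fails once the last
	 * reference is concurrently being dropped.
	 */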
	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
		rcu_read_unlock();
		return mem;
	}
	rcu_read_unlock();

	return NULL;
}

static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
			   bool dirty)
{
	unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
}

void siw_umem_release(struct siw_umem *umem, bool dirty)
{
	struct mm_struct *mm_s = umem->owning_mm;
	int i, num_pages = umem->num_pages;

	for (i = 0; num_pages; i++) {
		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);

		siw_free_plist(&umem->page_chunk[i], to_free,
			       umem->writable && dirty);
		kfree(umem->page_chunk[i].plist);
		num_pages -= to_free;
	}
	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);

	mmdrop(mm_s);
	kfree(umem->page_chunk);
	kfree(umem);
}

int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
		   u64 start, u64 len, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	if (!mem)
		return -ENOMEM;

	mem->mem_obj = mem_obj;
	mem->stag_valid = 0;
	mem->sdev = sdev;
	mem->va = start;
	mem->len = len;
	mem->pd = pd;
	mem->perms = rights & IWARP_ACCESS_MASK;
	kref_init(&mem->ref);

	mr->mem = mem;

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
			    GFP_KERNEL) < 0) {
		kfree(mem);
		return -ENOMEM;
	}
	/* Set the STag index part */
	mem->stag = id << 8;
	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;

	return 0;
}

void siw_mr_drop_mem(struct siw_mr *mr)
{
	struct siw_mem *mem = mr->mem, *found;

	mem->stag_valid = 0;

	/* make STag invalidation visible asap */
	smp_mb();

	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
	WARN_ON(found != mem);
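	/* Drop the initial reference taken at kref_init() time. */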
	siw_mem_put(mem);
}

void siw_free_mem(struct kref *ref)
{
	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);

	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");

	if (!mem->is_mw && mem->mem_obj) {
		if (mem->is_pbl == 0)
			siw_umem_release(mem->umem, true);
		else
			kfree(mem->pbl);
	}
	kfree(mem);
}

/*
 * siw_check_mem()
 *
 * Check protection domain, STag state, access permissions and
 * address range for a memory object.
 *
 * @pd: Protection Domain the memory should belong to
 * @mem: memory to be checked
 * @addr: starting address of the access
 * @perms: requested access permissions
 * @len: length of the memory interval to be checked
 *
 */
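/*
 * Worked example of the range check below: for a region registered
 * with va = 0x1000 and len = 0x2000, an access at addr = 0x2800 with
 * len = 0x900 fails, since addr + len = 0x3100 exceeds va + len = 0x3000.
 */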
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
		  enum ib_access_flags perms, int len)
{
	if (!mem->stag_valid) {
		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
		return -E_STAG_INVALID;
	}
	if (mem->pd != pd) {
		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
		return -E_PD_MISMATCH;
	}
	/*
	 * check access permissions
	 */
	if ((mem->perms & perms) < perms) {
		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
			   mem->perms, perms);
		return -E_ACCESS_PERM;
	}
	/*
	 * Check if access falls into valid memory interval.
	 */
	if (addr < mem->va || addr + len > mem->va + mem->len) {
		siw_dbg_pd(pd, "MEM interval len %d\n", len);
		siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
			   (void *)(uintptr_t)addr,
			   (void *)(uintptr_t)(addr + len));
		siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
			   (void *)(uintptr_t)mem->va,
			   (void *)(uintptr_t)(mem->va + mem->len),
			   mem->stag);

		return -E_BASE_BOUNDS;
	}
	return E_ACCESS_OK;
}

/*
 * siw_check_sge()
 *
 * Check SGE for access rights within the given interval.
 *
 * @pd: Protection Domain the memory should belong to
 * @sge: SGE to be checked
 * @mem: location of memory reference within array
 * @perms: requested access permissions
 * @off: starting offset in SGE
 * @len: length of the memory interval to be checked
 *
 * NOTE: Function references the SGE's memory object (*mem)
 * if not yet done. The new reference is kept if the check succeeded
 * and released if it failed. If *mem is already valid, no new lookup
 * is done and the memory is not released if the check fails.
 */
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
		  enum ib_access_flags perms, u32 off, int len)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *new = NULL;
	int rv = E_ACCESS_OK;

	if (len + off > sge->length) {
		rv = -E_BASE_BOUNDS;
		goto fail;
	}
	if (*mem == NULL) {
		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
		if (unlikely(!new)) {
			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
			rv = -E_STAG_INVALID;
			goto fail;
		}
		*mem = new;
	}
	/* Check if user re-registered with different STag key */
	if (unlikely((*mem)->stag != sge->lkey)) {
		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
		rv = -E_STAG_INVALID;
		goto fail;
	}
	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
	if (unlikely(rv))
		goto fail;

	return 0;

fail:
	if (new) {
		*mem = NULL;
		siw_mem_put(new);
	}
	return rv;
}

void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
{
	switch (op) {
	case SIW_OP_SEND:
	case SIW_OP_WRITE:
	case SIW_OP_SEND_WITH_IMM:
	case SIW_OP_SEND_REMOTE_INV:
	case SIW_OP_READ:
	case SIW_OP_READ_LOCAL_INV:
		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
		break;

	case SIW_OP_RECEIVE:
		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
		break;

	case SIW_OP_READ_RESPONSE:
		siw_unref_mem_sgl(wqe->mem, 1);
		break;

	default:
		/*
		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
		 * do not hold memory references
		 */
		break;
	}
}

int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
	int rv = 0;

	if (unlikely(!mem)) {
		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
		return -EINVAL;
	}
	if (unlikely(mem->pd != pd)) {
		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
		rv = -EACCES;
		goto out;
	}
	/*
	 * Per RDMA verbs definition, an STag may already be in invalid
	 * state if invalidation is requested. So no state check here.
	 */
	mem->stag_valid = 0;

	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
out:
	siw_mem_put(mem);
	return rv;
}

/*
 * Gets the physical address backing a PBL element. The address is
 * referenced by a linear byte offset into the list of variably sized
 * PB elements. Optionally provides the remaining length within the
 * current element, and the current PBL index for a later resume at
 * the same element.
 */
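/*
 * For illustration: with two PB elements of sizes 0x1000 and 0x3000
 * (pbl_off 0 and 0x1000), an offset of 0x1800 resolves to the second
 * element, returning pble->addr + 0x800 with a remaining *len of 0x2800.
 */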
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
{
	int i = idx ? *idx : 0;

	while (i < pbl->num_buf) {
		struct siw_pble *pble = &pbl->pbe[i];

		if (pble->pbl_off + pble->size > off) {
			u64 pble_off = off - pble->pbl_off;

			if (len)
				*len = pble->size - pble_off;
			if (idx)
				*idx = i;

			return pble->addr + pble_off;
		}
		i++;
	}
	if (len)
		*len = 0;
	return 0;
}

struct siw_pbl *siw_pbl_alloc(u32 num_buf)
{
	struct siw_pbl *pbl;

	if (num_buf == 0)
		return ERR_PTR(-EINVAL);

	pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL);
	if (!pbl)
		return ERR_PTR(-ENOMEM);

	pbl->max_buf = num_buf;

	return pbl;
}

struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
	struct siw_umem *umem;
	struct mm_struct *mm_s;
	u64 first_page_va;
	unsigned long mlock_limit;
	unsigned int foll_flags = FOLL_WRITE;
	int num_pages, num_chunks, i, rv = 0;

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (!len)
		return ERR_PTR(-EINVAL);

	first_page_va = start & PAGE_MASK;
	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	mm_s = current->mm;
	umem->owning_mm = mm_s;
	umem->writable = writable;

	mmgrab(mm_s);

	if (!writable)
		foll_flags |= FOLL_FORCE;

	mmap_read_lock(mm_s);

	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	umem->fp_addr = first_page_va;

	umem->page_chunk =
		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
	if (!umem->page_chunk) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
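	/*
	 * Pin the user pages chunk by chunk: each chunk tracks up to
	 * PAGES_PER_CHUNK page pointers, and the final chunk may be
	 * only partially filled.
	 */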
	for (i = 0; num_pages; i++) {
		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);

		umem->page_chunk[i].plist =
			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
		if (!umem->page_chunk[i].plist) {
			rv = -ENOMEM;
			goto out_sem_up;
		}
		got = 0;
		while (nents) {
			struct page **plist = &umem->page_chunk[i].plist[got];

			rv = pin_user_pages(first_page_va, nents,
					    foll_flags | FOLL_LONGTERM,
					    plist, NULL);
			if (rv < 0)
				goto out_sem_up;

			umem->num_pages += rv;
			atomic64_add(rv, &mm_s->pinned_vm);
			first_page_va += rv * PAGE_SIZE;
			nents -= rv;
			got += rv;
		}
		num_pages -= got;
	}
out_sem_up:
	mmap_read_unlock(mm_s);

	if (rv > 0)
		return umem;

	siw_umem_release(umem, false);

	return ERR_PTR(rv);
}