// SPDX-License-Identifier: BSD-3-Clause
/* Copyright 2014-2020, Intel Corporation */

/*
 * mmap.c -- mmap utilities
 */

#include <errno.h>
#include <inttypes.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#include "file.h"
#include "queue.h"
#include "mmap.h"
#include "sys_util.h"
#include "os.h"
#include "alloc.h"
#include "libpmem2.h"

int Mmap_no_random;
void *Mmap_hint;
static os_rwlock_t Mmap_list_lock;

static PMDK_SORTEDQ_HEAD(map_list_head, map_tracker) Mmap_list =
        PMDK_SORTEDQ_HEAD_INITIALIZER(Mmap_list);

/*
 * util_mmap_init -- initialize the mmap utils
 *
 * This is called from the library initialization code.
 */
void
util_mmap_init(void)
{
        LOG(3, NULL);

        util_rwlock_init(&Mmap_list_lock);

        /*
         * For testing, allow overriding the default mmap() hint address.
         * If a hint address is defined, it also disables address
         * randomization.
         */
        char *e = os_getenv("PMEM_MMAP_HINT");
        if (e) {
                char *endp;
                errno = 0;
                unsigned long long val = strtoull(e, &endp, 16);

                if (errno || endp == e) {
                        LOG(2, "Invalid PMEM_MMAP_HINT");
                } else if (os_access(OS_MAPFILE, R_OK)) {
                        LOG(2, "No /proc, PMEM_MMAP_HINT ignored");
                } else {
                        Mmap_hint = (void *)val;
                        Mmap_no_random = 1;
                        LOG(3, "PMEM_MMAP_HINT set to %p", Mmap_hint);
                }
        }
}
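
/*
 * Example (illustrative): PMEM_MMAP_HINT is parsed above as a hexadecimal
 * address, so a test run that pins mappings to a fixed address might look
 * like:
 *
 *        $ PMEM_MMAP_HINT=0x10000000000 ./pmem_test
 *
 * The value shown is arbitrary; any page-aligned address that does not
 * collide with existing mappings would do. "./pmem_test" is a made-up
 * program name, not part of this module.
 */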

/*
 * util_mmap_fini -- clean up the mmap utils
 *
 * This is called before the process exits.
 */
void
util_mmap_fini(void)
{
        LOG(3, NULL);

        util_rwlock_destroy(&Mmap_list_lock);
}

/*
 * util_map -- memory map a file
 *
 * This is just a convenience function that calls mmap() with the
 * appropriate arguments and includes our trace points.
 */
void *
util_map(int fd, os_off_t off, size_t len, int flags, int rdonly,
        size_t req_align, int *map_sync)
{
        LOG(3, "fd %d len %zu flags %d rdonly %d req_align %zu map_sync %p",
                fd, len, flags, rdonly, req_align, map_sync);

        void *base;
        void *addr = util_map_hint(len, req_align);
        if (addr == MAP_FAILED) {
                LOG(1, "cannot find a contiguous region of given size");
                return NULL;
        }

        if (req_align)
                ASSERTeq((uintptr_t)addr % req_align, 0);

        int proto = rdonly ? PROT_READ : PROT_READ|PROT_WRITE;
        base = util_map_sync(addr, len, proto, flags, fd, off, map_sync);
        if (base == MAP_FAILED) {
                ERR("!mmap %zu bytes", len);
                return NULL;
        }

        LOG(3, "mapped at %p", base);

        return base;
}
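
/*
 * Usage sketch (hypothetical caller, for illustration only -- names such
 * as "fd" and "filesize" are assumptions, not part of this module):
 *
 *        int map_sync;
 *        void *base = util_map(fd, 0, filesize, MAP_SHARED, 0, 0,
 *                        &map_sync);
 *        if (base == NULL)
 *                return -1;    errno is left set by the failing mmap()
 *
 * With req_align == 0 no particular alignment is requested; if map_sync
 * is non-NULL, it is expected to report whether a MAP_SYNC mapping was
 * obtained.
 */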

/*
 * util_unmap -- unmap a file
 *
 * This is just a convenience function that calls munmap() with the
 * appropriate arguments and includes our trace points.
 */
int
util_unmap(void *addr, size_t len)
{
        LOG(3, "addr %p len %zu", addr, len);

        /*
         * XXX Workaround for https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=169608
         */
#ifdef __FreeBSD__
        if (!IS_PAGE_ALIGNED((uintptr_t)addr)) {
                errno = EINVAL;
                ERR("!munmap");
                return -1;
        }
#endif
        int retval = munmap(addr, len);
        if (retval < 0)
                ERR("!munmap");

        return retval;
}

/*
 * util_range_ro -- set a memory range read-only
 */
int
util_range_ro(void *addr, size_t len)
{
        LOG(3, "addr %p len %zu", addr, len);

        uintptr_t uptr;
        int retval;

        /*
         * mprotect requires addr to be a multiple of the page size, so
         * adjust addr and len to represent the full pages covering the
         * given range.
         */

        /* increase len by the amount we gain when we round addr down */
        len += (uintptr_t)addr & (Pagesize - 1);

        /* round addr down to page boundary */
        uptr = (uintptr_t)addr & ~(Pagesize - 1);

        if ((retval = mprotect((void *)uptr, len, PROT_READ)) < 0)
                ERR("!mprotect: PROT_READ");

        return retval;
}
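
/*
 * Worked example for the rounding above (assuming Pagesize == 0x1000):
 * for addr == 0x1010 and len == 0x2000,
 *
 *        len  += 0x1010 & 0xfff;        now len  == 0x2010
 *        uptr  = 0x1010 & ~0xfff;       now uptr == 0x1000
 *
 * so mprotect() starts at 0x1000 and, since the kernel rounds the length
 * up to a page boundary, covers all three pages that contain any byte of
 * the original [0x1010, 0x3010) range.
 */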

/*
 * util_range_rw -- set a memory range read-write
 */
int
util_range_rw(void *addr, size_t len)
{
        LOG(3, "addr %p len %zu", addr, len);

        uintptr_t uptr;
        int retval;

        /*
         * mprotect requires addr to be a multiple of the page size, so
         * adjust addr and len to represent the full pages covering the
         * given range.
         */

        /* increase len by the amount we gain when we round addr down */
        len += (uintptr_t)addr & (Pagesize - 1);

        /* round addr down to page boundary */
        uptr = (uintptr_t)addr & ~(Pagesize - 1);

        if ((retval = mprotect((void *)uptr, len, PROT_READ|PROT_WRITE)) < 0)
                ERR("!mprotect: PROT_READ|PROT_WRITE");

        return retval;
}

/*
 * util_range_none -- set a memory range to allow no access
 */
int
util_range_none(void *addr, size_t len)
{
        LOG(3, "addr %p len %zu", addr, len);

        uintptr_t uptr;
        int retval;

        /*
         * mprotect requires addr to be a multiple of the page size, so
         * adjust addr and len to represent the full pages covering the
         * given range.
         */

        /* increase len by the amount we gain when we round addr down */
        len += (uintptr_t)addr & (Pagesize - 1);

        /* round addr down to page boundary */
        uptr = (uintptr_t)addr & ~(Pagesize - 1);

        if ((retval = mprotect((void *)uptr, len, PROT_NONE)) < 0)
                ERR("!mprotect: PROT_NONE");

        return retval;
}

/*
 * util_range_comparer -- (internal) compare two mapping trackers
 */
static intptr_t
util_range_comparer(struct map_tracker *a, struct map_tracker *b)
{
        return ((intptr_t)a->base_addr - (intptr_t)b->base_addr);
}

/*
 * util_range_find_unlocked -- (internal) find the map tracker
 * for a given address range
 *
 * Returns the first entry at least partially overlapping the given range.
 * It's up to the caller to check whether the entry exactly matches the
 * range, or whether the range spans multiple entries.
 */
static struct map_tracker *
util_range_find_unlocked(uintptr_t addr, size_t len)
{
        LOG(10, "addr 0x%016" PRIxPTR " len %zu", addr, len);

        uintptr_t end = addr + len;

        struct map_tracker *mt;

        PMDK_SORTEDQ_FOREACH(mt, &Mmap_list, entry) {
                if (addr < mt->end_addr &&
                    (addr >= mt->base_addr || end > mt->base_addr))
                        goto out;

                /* break if there is no chance to find a matching entry */
                if (addr < mt->base_addr)
                        break;
        }
        mt = NULL;

out:
        return mt;
}
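
/*
 * Overlap example (illustrative): for a tracker covering
 * [0x1000, 0x3000), the predicate above matches a query with
 * addr == 0x0800 and len == 0x1000 (the query end 0x1800 falls past
 * base_addr) as well as addr == 0x2000 and len == 0x2000 (addr lies
 * inside the tracker), but not addr == 0x3000, which starts exactly
 * at end_addr.
 */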

/*
 * util_range_find -- find the map tracker for a given address range
 *
 * Same as util_range_find_unlocked, but acquires the list lock.
 */
struct map_tracker *
util_range_find(uintptr_t addr, size_t len)
{
        LOG(10, "addr 0x%016" PRIxPTR " len %zu", addr, len);

        util_rwlock_rdlock(&Mmap_list_lock);

        struct map_tracker *mt = util_range_find_unlocked(addr, len);

        util_rwlock_unlock(&Mmap_list_lock);
        return mt;
}

/*
 * util_range_register -- add a memory range to the map tracking list
 */
int
util_range_register(const void *addr, size_t len, const char *path,
        enum pmem_map_type type)
{
        LOG(3, "addr %p len %zu path %s type %d", addr, len, path, type);

        /* check if not tracked already */
        if (util_range_find((uintptr_t)addr, len) != NULL) {
                ERR(
                    "duplicated persistent memory range; presumably unmapped with munmap() instead of pmem_unmap(): addr %p len %zu",
                    addr, len);
                errno = ENOMEM;
                return -1;
        }

        struct map_tracker *mt;
        mt = Malloc(sizeof(struct map_tracker));
        if (mt == NULL) {
                ERR("!Malloc");
                return -1;
        }

        mt->base_addr = (uintptr_t)addr;
        mt->end_addr = mt->base_addr + len;
        mt->type = type;
        if (type == PMEM_DEV_DAX) {
                unsigned region_id;
                int ret = util_ddax_region_find(path, &region_id);
                if (ret < 0) {
                        ERR("Cannot find DAX device region id");
                        Free(mt);       /* don't leak the tracker on failure */
                        return -1;
                }
                mt->region_id = region_id;
        }

        util_rwlock_wrlock(&Mmap_list_lock);

        PMDK_SORTEDQ_INSERT(&Mmap_list, mt, entry, struct map_tracker,
                util_range_comparer);

        util_rwlock_unlock(&Mmap_list_lock);

        return 0;
}
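
/*
 * Usage sketch (hypothetical, for illustration only): a caller that maps
 * a file and wants the range recognized by the pmem detection code might
 * pair the two calls like this:
 *
 *        void *base = util_map(fd, 0, len, MAP_SHARED, 0, 0, NULL);
 *        if (base != NULL &&
 *            util_range_register(base, len, path, PMEM_MAP_SYNC) != 0) {
 *                util_unmap(base, len);
 *                base = NULL;
 *        }
 *
 * PMEM_MAP_SYNC is used here only as an example of a pmem_map_type
 * value; for device DAX mappings, PMEM_DEV_DAX would be passed along
 * with the device path.
 */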

/*
 * util_range_split -- (internal) remove or split a map tracking entry
 */
static int
util_range_split(struct map_tracker *mt, const void *addrp, const void *endp)
{
        LOG(3, "begin %p end %p", addrp, endp);

        uintptr_t addr = (uintptr_t)addrp;
        uintptr_t end = (uintptr_t)endp;
        ASSERTne(mt, NULL);
        if (addr == end || addr % Mmap_align != 0 || end % Mmap_align != 0) {
                ERR(
                    "invalid munmap range, must be non-empty with aligned begin and end");
                return -1;
        }

        struct map_tracker *mtb = NULL;
        struct map_tracker *mte = NULL;

        /*
         * 1)      b     e             b     e
         *    xxxxxxxxxxxxxx  =>  xxxxx......xxx  -  mtb + mte
         * 2)      b        e          b        e
         *    xxxxxxxxxxxxxx  =>  xxxxx.........  -  mtb
         * 3) b        e          b        e
         *    xxxxxxxxxxxxxx  =>  .........xxxxx  -  mte
         * 4) b             e     b             e
         *    xxxxxxxxxxxxxx  =>  ..............  -  <none>
         */

        if (addr > mt->base_addr) {
                /* case #1/2 */
                /* new mapping at the beginning */
                mtb = Malloc(sizeof(struct map_tracker));
                if (mtb == NULL) {
                        ERR("!Malloc");
                        goto err;
                }

                mtb->base_addr = mt->base_addr;
                mtb->end_addr = addr;
                mtb->region_id = mt->region_id;
                mtb->type = mt->type;
        }

        if (end < mt->end_addr) {
                /* case #1/3 */
                /* new mapping at the end */
                mte = Malloc(sizeof(struct map_tracker));
                if (mte == NULL) {
                        ERR("!Malloc");
                        goto err;
                }

                mte->base_addr = end;
                mte->end_addr = mt->end_addr;
                mte->region_id = mt->region_id;
                mte->type = mt->type;
        }

        PMDK_SORTEDQ_REMOVE(&Mmap_list, mt, entry);

        if (mtb) {
                PMDK_SORTEDQ_INSERT(&Mmap_list, mtb, entry,
                        struct map_tracker, util_range_comparer);
        }

        if (mte) {
                PMDK_SORTEDQ_INSERT(&Mmap_list, mte, entry,
                        struct map_tracker, util_range_comparer);
        }

        /* free the entry for the original mapping */
        Free(mt);
        return 0;

err:
        Free(mtb);
        Free(mte);
        return -1;
}
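
/*
 * Split example (illustrative): with a tracker covering [0x1000, 0x5000),
 * unregistering [0x2000, 0x3000) is case #1 above: mtb keeps
 * [0x1000, 0x2000), mte keeps [0x3000, 0x5000), and the original tracker
 * is removed and freed. Unregistering the whole [0x1000, 0x5000) is
 * case #4: neither mtb nor mte is created and the entry simply
 * disappears from the list.
 */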

/*
 * util_range_unregister -- remove a memory range
 * from the map tracking list
 *
 * Removes the region [begin, end). If it falls in the middle of an
 * existing mapping, the result is two new map trackers.
 */
int
util_range_unregister(const void *addr, size_t len)
{
        LOG(3, "addr %p len %zu", addr, len);

        int ret = 0;

        util_rwlock_wrlock(&Mmap_list_lock);

        /*
         * Changes in the map tracker list must match the underlying
         * behavior.
         *
         * $ man 2 munmap:
         * The address addr must be a multiple of the page size (but length
         * need not be). All pages containing a part of the indicated range
         * are unmapped.
         *
         * This means that we must align the length up to the page size.
         */
        len = PAGE_ALIGNED_UP_SIZE(len);

        void *end = (char *)addr + len;

        /* XXX optimize the loop */
        struct map_tracker *mt;
        while ((mt = util_range_find_unlocked((uintptr_t)addr, len)) != NULL) {
                if (util_range_split(mt, addr, end) != 0) {
                        ret = -1;
                        break;
                }
        }

        util_rwlock_unlock(&Mmap_list_lock);
        return ret;
}
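
/*
 * Alignment example (assuming a 4 KiB page size): PAGE_ALIGNED_UP_SIZE
 * rounds the length up, so unregistering addr 0x1000 with len 0x1001
 * behaves as if len were 0x2000 and drops both affected pages from the
 * tracker list, matching what munmap() itself does with the same
 * arguments.
 */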

/*
 * util_range_is_pmem -- return true if the entire range
 * is persistent memory
 */
int
util_range_is_pmem(const void *addrp, size_t len)
{
        LOG(10, "addr %p len %zu", addrp, len);

        uintptr_t addr = (uintptr_t)addrp;
        int retval = 1;

        util_rwlock_rdlock(&Mmap_list_lock);

        do {
                /* the list lock is already held, so use the unlocked find */
                struct map_tracker *mt = util_range_find_unlocked(addr, len);
                if (mt == NULL) {
                        LOG(4, "address not found 0x%016" PRIxPTR, addr);
                        retval = 0;
                        break;
                }

                LOG(10, "range found - begin 0x%016" PRIxPTR
                        " end 0x%016" PRIxPTR,
                        mt->base_addr, mt->end_addr);

                if (mt->base_addr > addr) {
                        LOG(10, "base address doesn't match: "
                                "0x%" PRIxPTR " > 0x%" PRIxPTR,
                                mt->base_addr, addr);
                        retval = 0;
                        break;
                }

                uintptr_t map_len = mt->end_addr - addr;
                if (map_len > len)
                        map_len = len;
                len -= map_len;
                addr += map_len;
        } while (len > 0);

        util_rwlock_unlock(&Mmap_list_lock);

        return retval;
}
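
/*
 * Usage sketch (hypothetical, for illustration only): deciding how to
 * flush a buffer that may span several registered mappings:
 *
 *        if (util_range_is_pmem(buf, bufsize))
 *                pmem_persist(buf, bufsize);
 *        else
 *                pmem_msync(buf, bufsize);
 *
 * The loop above walks the tracker list piecewise, so the result is 1
 * only if every byte of [buf, buf + bufsize) lies inside some tracked
 * range.
 */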