1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2015-2020, Intel Corporation */
4 * Copyright (c) 2016, Microsoft Corporation. All rights reserved.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
18 * * Neither the name of the copyright holder nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 * set.c -- pool set utilities
56 #include <linux/limits.h>
68 #include "valgrind_internal.h"
70 #include "util_pmem.h"
73 #include "set_badblocks.h"
75 #define LIBRARY_REMOTE "librpmem.so.1"
76 #define SIZE_AUTODETECT_STR "AUTO"
78 #define PMEM_EXT ".pmem"
79 #define PMEM_EXT_LEN sizeof(PMEM_EXT)
80 #define PMEM_FILE_PADDING 6
81 #define PMEM_FILE_NAME_MAX_LEN 20
82 #define PMEM_FILE_MAX_LEN (PMEM_FILE_NAME_MAX_LEN + PMEM_FILE_PADDING)
84 static RPMEMpool
*(*Rpmem_create
)(const char *target
, const char *pool_set_name
,
85 void *pool_addr
, size_t pool_size
, unsigned *nlanes
,
86 const struct rpmem_pool_attr
*rpmem_attr
);
87 static RPMEMpool
*(*Rpmem_open
)(const char *target
, const char *pool_set_name
,
88 void *pool_addr
, size_t pool_size
, unsigned *nlanes
,
89 struct rpmem_pool_attr
*rpmem_attr
);
90 int (*Rpmem_close
)(RPMEMpool
*rpp
);
91 int (*Rpmem_persist
)(RPMEMpool
*rpp
, size_t offset
, size_t length
,
92 unsigned lane
, unsigned flags
);
93 int (*Rpmem_deep_persist
)(RPMEMpool
*rpp
, size_t offset
, size_t length
,
95 int (*Rpmem_read
)(RPMEMpool
*rpp
, void *buff
, size_t offset
,
96 size_t length
, unsigned lane
);
97 int (*Rpmem_remove
)(const char *target
, const char *pool_set_name
, int flags
);
98 int (*Rpmem_set_attr
)(RPMEMpool
*rpp
, const struct rpmem_pool_attr
*rattr
);
100 static int Remote_replication_available
;
101 static os_mutex_t Remote_lock
;
102 static void *Rpmem_handle_remote
;
104 int Prefault_at_open
= 0;
105 int Prefault_at_create
= 0;
106 int SDS_at_create
= POOL_FEAT_INCOMPAT_DEFAULT
& POOL_E_FEAT_SDS
? 1 : 0;
107 int Fallocate_at_create
= 1;
110 /* list of pool set option names and flags */
111 static const struct pool_set_option Options
[] = {
112 { "SINGLEHDR", OPTION_SINGLEHDR
},
114 { "NOHDRS", OPTION_NOHDRS
},
116 { NULL
, OPTION_UNKNOWN
}
120 * util_remote_init -- initialize remote replication
123 util_remote_init(void)
127 /* XXX Is duplicate initialization really okay? */
128 if (!Remote_replication_available
) {
129 util_mutex_init(&Remote_lock
);
130 Remote_replication_available
= 1;
135 * util_remote_fini -- finalize remote replication
138 util_remote_fini(void)
142 util_remote_unload();
144 /* XXX Okay to be here if not initialized? */
145 if (Remote_replication_available
) {
146 Remote_replication_available
= 0;
147 util_mutex_destroy(&Remote_lock
);
152 * util_dl_check_error -- check libdl error
155 util_dl_check_error(void *handle
, const char *func
)
157 LOG(15, "handle %p func %s", handle
, func
);
159 if (handle
== NULL
) {
160 char *errstr
= util_dlerror();
162 ERR("%s(): %s", func
, errstr
);
170 * util_remote_unload_core -- (internal) unload remote library (core function)
173 util_remote_unload_core(void)
175 if (Rpmem_handle_remote
!= NULL
) {
176 util_dlclose(Rpmem_handle_remote
);
177 Rpmem_handle_remote
= NULL
;
182 Rpmem_persist
= NULL
;
183 Rpmem_deep_persist
= NULL
;
186 Rpmem_set_attr
= NULL
;
190 * util_remote_unload -- unload remote library
193 util_remote_unload(void)
197 if (!Remote_replication_available
)
200 util_mutex_lock(&Remote_lock
);
202 util_remote_unload_core();
204 util_mutex_unlock(&Remote_lock
);
208 * util_remote_load -- load remote library
211 util_remote_load(void)
215 if (!Remote_replication_available
) {
216 ERR("remote replication is not available");
220 CHECK_FUNC_COMPATIBLE(rpmem_create
, *Rpmem_create
);
221 CHECK_FUNC_COMPATIBLE(rpmem_open
, *Rpmem_open
);
222 CHECK_FUNC_COMPATIBLE(rpmem_close
, *Rpmem_close
);
223 CHECK_FUNC_COMPATIBLE(rpmem_persist
, *Rpmem_persist
);
224 CHECK_FUNC_COMPATIBLE(rpmem_deep_persist
, *Rpmem_deep_persist
);
225 CHECK_FUNC_COMPATIBLE(rpmem_read
, *Rpmem_read
);
226 CHECK_FUNC_COMPATIBLE(rpmem_remove
, *Rpmem_remove
);
228 util_mutex_lock(&Remote_lock
);
230 if (Rpmem_handle_remote
)
233 Rpmem_handle_remote
= util_dlopen(LIBRARY_REMOTE
);
234 if (util_dl_check_error(Rpmem_handle_remote
, "dlopen")) {
235 ERR("the pool set requires a remote replica, "
236 "but the '%s' library cannot be loaded",
241 Rpmem_create
= util_dlsym(Rpmem_handle_remote
, "rpmem_create");
242 if (util_dl_check_error(Rpmem_create
, "dlsym")) {
243 ERR("symbol 'rpmem_create' not found");
247 Rpmem_open
= util_dlsym(Rpmem_handle_remote
, "rpmem_open");
248 if (util_dl_check_error(Rpmem_open
, "dlsym")) {
249 ERR("symbol 'rpmem_open' not found");
253 Rpmem_close
= util_dlsym(Rpmem_handle_remote
, "rpmem_close");
254 if (util_dl_check_error(Rpmem_close
, "dlsym")) {
255 ERR("symbol 'rpmem_close' not found");
259 Rpmem_persist
= util_dlsym(Rpmem_handle_remote
, "rpmem_persist");
260 if (util_dl_check_error(Rpmem_persist
, "dlsym")) {
261 ERR("symbol 'rpmem_persist' not found");
265 Rpmem_deep_persist
= util_dlsym(Rpmem_handle_remote
,
266 "rpmem_deep_persist");
267 if (util_dl_check_error(Rpmem_deep_persist
, "dlsym")) {
268 ERR("symbol 'rpmem_deep_persist' not found");
272 Rpmem_read
= util_dlsym(Rpmem_handle_remote
, "rpmem_read");
273 if (util_dl_check_error(Rpmem_read
, "dlsym")) {
274 ERR("symbol 'rpmem_read' not found");
278 Rpmem_remove
= util_dlsym(Rpmem_handle_remote
, "rpmem_remove");
279 if (util_dl_check_error(Rpmem_remove
, "dlsym")) {
280 ERR("symbol 'rpmem_remove' not found");
284 Rpmem_set_attr
= util_dlsym(Rpmem_handle_remote
, "rpmem_set_attr");
285 if (util_dl_check_error(Rpmem_set_attr
, "dlsym")) {
286 ERR("symbol 'rpmem_set_attr' not found");
291 util_mutex_unlock(&Remote_lock
);
295 LOG(4, "error clean up");
296 util_remote_unload_core();
297 util_mutex_unlock(&Remote_lock
);
301 /* reserve space for size, path and some whitespace and/or comment */
307 PARSER_INVALID_TOKEN
,
308 PARSER_REMOTE_REPLICA_EXPECTED
,
310 PARSER_CANNOT_READ_SIZE
,
311 PARSER_ABSOLUTE_PATH_EXPECTED
,
312 PARSER_RELATIVE_PATH_EXPECTED
,
315 PARSER_REMOTE_REP_UNEXPECTED_PARTS
,
316 PARSER_SIZE_MISMATCH
,
317 PARSER_OUT_OF_MEMORY
,
318 PARSER_OPTION_UNKNOWN
,
319 PARSER_OPTION_EXPECTED
,
324 static const char *parser_errstr
[PARSER_MAX_CODE
] = {
326 "the first line must be exactly 'PMEMPOOLSET'",
327 "exactly 'REPLICA' expected",
328 "invalid token found in the current line",
329 "address of remote node and descriptor of remote pool set expected",
330 "incorrect format of size",
331 "cannot determine size of a part",
332 "incorrect path (must be an absolute one)",
333 "incorrect descriptor (must be a relative path)",
336 "unexpected parts for remote replica",
337 "sizes of pool set and replica mismatch",
338 "allocating memory failed",
340 "missing option name",
341 "" /* format correct */
345 * util_replica_force_page_allocation - (internal) forces page allocation for
349 util_replica_force_page_allocation(struct pool_replica
*rep
)
351 volatile char *cur_addr
= rep
->part
[0].addr
;
352 char *addr_end
= (char *)cur_addr
+ rep
->resvsize
;
353 for (; cur_addr
< addr_end
; cur_addr
+= Pagesize
) {
354 *cur_addr
= *cur_addr
;
355 VALGRIND_SET_CLEAN(cur_addr
, 1);
360 * util_map_hdr -- map a header of a pool set
363 util_map_hdr(struct pool_set_part
*part
, int flags
, int rdonly
)
365 LOG(3, "part %p flags %d", part
, flags
);
367 COMPILE_ERROR_ON(POOL_HDR_SIZE
== 0);
368 ASSERTeq(POOL_HDR_SIZE
% Pagesize
, 0);
371 * Workaround for Device DAX not allowing to map a portion
372 * of the device if offset/length are not aligned to the internal
373 * device alignment (page size). I.e. if the device alignment
374 * is 2M, we cannot map the 4K header, but need to align the mapping
377 * According to mmap(2), system should automatically align mapping
378 * length to be a multiple of the underlying page size, but it's
379 * not true for Device DAX.
381 size_t hdrsize
= part
->alignment
> POOL_HDR_SIZE
382 ? part
->alignment
: POOL_HDR_SIZE
;
386 #if VG_MEMCHECK_ENABLED
388 /* this is required only for Device DAX & memcheck */
389 addr
= util_map_hint(hdrsize
, hdrsize
);
390 if (addr
== MAP_FAILED
) {
391 LOG(1, "cannot find a contiguous region of given size");
392 /* there's nothing we can do */
398 int prot
= rdonly
? PROT_READ
: PROT_READ
|PROT_WRITE
;
399 void *hdrp
= util_map_sync(addr
, hdrsize
, prot
, flags
,
400 part
->fd
, 0, &part
->hdr_map_sync
);
401 if (hdrp
== MAP_FAILED
) {
402 ERR("!mmap: %s", part
->path
);
406 part
->hdrsize
= hdrsize
;
409 VALGRIND_REGISTER_PMEM_MAPPING(part
->hdr
, part
->hdrsize
);
410 VALGRIND_REGISTER_PMEM_FILE(part
->fd
, part
->hdr
, part
->hdrsize
, 0);
416 * util_unmap_hdr -- unmap pool set part header
419 util_unmap_hdr(struct pool_set_part
*part
)
421 if (part
->hdr
== NULL
|| part
->hdrsize
== 0)
424 LOG(4, "munmap: addr %p size %zu", part
->hdr
, part
->hdrsize
);
425 VALGRIND_REMOVE_PMEM_MAPPING(part
->hdr
, part
->hdrsize
);
426 if (munmap(part
->hdr
, part
->hdrsize
) != 0)
427 /* this means there's a bug on the caller side */
428 FATAL("!munmap: %s", part
->path
);
434 * util_map_part -- map a part of a pool set
437 util_map_part(struct pool_set_part
*part
, void *addr
, size_t size
,
438 size_t offset
, int flags
, int rdonly
)
440 LOG(3, "part %p addr %p size %zu offset %zu flags %d",
441 part
, addr
, size
, offset
, flags
);
443 ASSERTeq((uintptr_t)addr
% Mmap_align
, 0);
444 ASSERTeq(offset
% Mmap_align
, 0);
445 ASSERTeq(size
% Mmap_align
, 0);
446 ASSERT(((os_off_t
)offset
) >= 0);
447 ASSERTeq(offset
% part
->alignment
, 0);
448 ASSERT(offset
< part
->filesize
);
451 size
= (part
->filesize
- offset
) & ~(part
->alignment
- 1);
453 size
= roundup(size
, part
->alignment
);
455 int prot
= rdonly
? PROT_READ
: PROT_READ
| PROT_WRITE
;
456 void *addrp
= util_map_sync(addr
, size
, prot
, flags
, part
->fd
,
457 (os_off_t
)offset
, &part
->map_sync
);
458 if (addrp
== MAP_FAILED
) {
459 ERR("!mmap: %s", part
->path
);
463 if (addr
!= NULL
&& (flags
& MAP_FIXED
) && addrp
!= addr
) {
464 ERR("unable to map at requested address %p", addr
);
472 VALGRIND_REGISTER_PMEM_MAPPING(part
->addr
, part
->size
);
473 VALGRIND_REGISTER_PMEM_FILE(part
->fd
, part
->addr
, part
->size
, offset
);
479 * util_unmap_part -- unmap a part of a pool set
482 util_unmap_part(struct pool_set_part
*part
)
484 LOG(3, "part %p", part
);
486 if (part
->addr
!= NULL
&& part
->size
!= 0) {
487 LOG(4, "munmap: addr %p size %zu", part
->addr
, part
->size
);
488 VALGRIND_REMOVE_PMEM_MAPPING(part
->addr
, part
->size
);
489 if (munmap(part
->addr
, part
->size
) != 0) {
490 ERR("!munmap: %s", part
->path
);
501 * util_unmap_parts -- unmap parts from start_index to the end_index
504 util_unmap_parts(struct pool_replica
*rep
, unsigned start_index
,
507 LOG(3, "rep: %p, start_index: %u, end_index: %u", rep
, start_index
,
510 for (unsigned p
= start_index
; p
<= end_index
; p
++)
511 util_unmap_part(&rep
->part
[p
]);
517 * util_poolset_free -- free pool set info
520 util_poolset_free(struct pool_set
*set
)
522 LOG(3, "set %p", set
);
524 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
525 struct pool_replica
*rep
= set
->replica
[r
];
526 if (rep
->remote
== NULL
) {
527 /* only local replicas have paths */
528 for (unsigned p
= 0; p
< rep
->nallocated
; p
++) {
529 Free((void *)(rep
->part
[p
].path
));
533 ASSERTeq(rep
->nparts
, 1);
534 Free(rep
->remote
->node_addr
);
535 Free(rep
->remote
->pool_desc
);
538 struct pool_set_directory
*d
;
539 VEC_FOREACH_BY_PTR(d
, &rep
->directory
) {
540 Free((void *)d
->path
);
542 VEC_DELETE(&rep
->directory
);
543 Free(set
->replica
[r
]);
550 * util_poolset_open -- open all replicas from a poolset
553 util_poolset_open(struct pool_set
*set
)
555 for (unsigned r
= 0; r
< set
->nreplicas
; ++r
) {
556 if (util_replica_open(set
, r
, MAP_SHARED
)) {
557 LOG(2, "replica open failed: replica %u", r
);
567 * util_replica_close_local -- close local replica, optionally delete the
571 util_replica_close_local(struct pool_replica
*rep
, unsigned repn
,
572 enum del_parts_mode del
)
574 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
575 if (rep
->part
[p
].fd
!= -1)
576 (void) os_close(rep
->part
[p
].fd
);
578 if ((del
== DELETE_CREATED_PARTS
&& rep
->part
[p
].created
) ||
579 del
== DELETE_ALL_PARTS
) {
580 LOG(4, "unlink %s", rep
->part
[p
].path
);
581 int olderrno
= errno
;
582 if (util_unlink(rep
->part
[p
].path
) && errno
!= ENOENT
) {
583 ERR("!unlink %s failed (part %u, replica %u)",
584 rep
->part
[p
].path
, p
, repn
);
594 * util_replica_close_remote -- close remote replica, optionally delete the
598 util_replica_close_remote(struct pool_replica
*rep
, unsigned repn
,
599 enum del_parts_mode del
)
604 if (rep
->remote
->rpp
) {
605 LOG(4, "closing remote replica #%u", repn
);
606 Rpmem_close(rep
->remote
->rpp
);
607 rep
->remote
->rpp
= NULL
;
610 if ((del
== DELETE_CREATED_PARTS
&& rep
->part
[0].created
) ||
611 del
== DELETE_ALL_PARTS
) {
612 LOG(4, "removing remote replica #%u", repn
);
613 int ret
= Rpmem_remove(rep
->remote
->node_addr
,
614 rep
->remote
->pool_desc
, 0);
616 LOG(1, "!removing remote replica #%u failed", repn
);
624 * util_poolset_close -- unmap and close all the parts of the pool set,
625 * optionally delete parts
628 util_poolset_close(struct pool_set
*set
, enum del_parts_mode del
)
630 LOG(3, "set %p del %d", set
, del
);
634 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
635 util_replica_close(set
, r
);
637 struct pool_replica
*rep
= set
->replica
[r
];
639 (void) util_replica_close_local(rep
, r
, del
);
641 (void) util_replica_close_remote(rep
, r
, del
);
645 * XXX On FreeBSD, mmap()ing a file does not increment the flock()
646 * reference count, so we had to keep the files open until now.
649 util_poolset_fdclose_always(set
);
651 util_poolset_free(set
);
657 * util_poolset_chmod -- change mode for all created files related to pool set
660 util_poolset_chmod(struct pool_set
*set
, mode_t mode
)
662 LOG(3, "set %p mode %o", set
, mode
);
664 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
665 struct pool_replica
*rep
= set
->replica
[r
];
667 /* skip remote replicas */
668 if (rep
->remote
!= NULL
)
671 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
672 struct pool_set_part
*part
= &rep
->part
[p
];
674 /* skip not created or closed parts */
675 if (!part
->created
|| part
->fd
== -1)
679 if (os_fstat(part
->fd
, &stbuf
) != 0) {
680 ERR("!fstat %d %s", part
->fd
, part
->path
);
684 if (stbuf
.st_mode
& ~(unsigned)S_IFMT
) {
685 LOG(1, "file permissions changed during pool "
686 "initialization, file: %s (%o)",
688 stbuf
.st_mode
& ~(unsigned)S_IFMT
);
691 if (os_chmod(part
->path
, mode
)) {
692 ERR("!chmod %u/%u/%s", r
, p
, part
->path
);
702 * util_poolset_fdclose_always -- close file descriptors related to pool set
705 util_poolset_fdclose_always(struct pool_set
*set
)
707 LOG(3, "set %p", set
);
709 for (unsigned r
= 0; r
< set
->nreplicas
; r
++)
710 util_replica_fdclose(set
->replica
[r
]);
714 * util_poolset_fdclose -- close pool set file descriptors if not FreeBSD
716 * XXX On FreeBSD, mmap()ing a file does not increment the flock()
717 * reference count, so we need to keep the files open.
720 util_poolset_fdclose(struct pool_set
*set
)
723 LOG(3, "set %p: holding open", set
);
725 util_poolset_fdclose_always(set
);
730 * util_autodetect_size -- (internal) retrieves size of an existing file
733 util_autodetect_size(const char *path
)
735 enum file_type type
= util_file_get_type(path
);
739 if (type
== TYPE_NORMAL
) {
740 ERR("size autodetection is supported only for device dax");
744 return util_file_get_size(path
);
748 * parser_read_line -- (internal) read line and validate size and path
749 * from a pool set file
751 static enum parser_codes
752 parser_read_line(char *line
, size_t *size
, char **path
)
758 char *saveptr
= NULL
; /* must be NULL initialized on Windows */
760 size_str
= strtok_r(line
, " \t", &saveptr
);
761 path_str
= strtok_r(NULL
, " \t", &saveptr
);
762 rest_str
= strtok_r(NULL
, " \t", &saveptr
);
764 if (!size_str
|| !path_str
|| rest_str
)
765 return PARSER_INVALID_TOKEN
;
767 LOG(10, "size '%s' path '%s'", size_str
, path_str
);
770 * A format of the size is checked in detail. As regards the path,
771 * it is checked only if the read path is an absolute path.
772 * The rest should be checked during creating/opening the file.
775 /* check if the read path is an absolute path */
776 if (!util_is_absolute_path(path_str
))
777 return PARSER_ABSOLUTE_PATH_EXPECTED
;
779 *path
= Strdup(path_str
);
782 return PARSER_OUT_OF_MEMORY
;
785 if (strcmp(SIZE_AUTODETECT_STR
, size_str
) == 0) {
787 * XXX: this should be done after the parsing completes, but
788 * currently this operation is performed in simply too many
789 * places in the code to move this someplace else.
791 ssize_t s
= util_autodetect_size(path_str
);
795 return PARSER_CANNOT_READ_SIZE
;
800 return PARSER_CONTINUE
;
803 ret
= util_parse_size(size_str
, size
);
804 if (ret
!= 0 || *size
== 0) {
807 return PARSER_WRONG_SIZE
;
810 return PARSER_CONTINUE
;
814 * parser_read_replica -- (internal) read line and validate remote replica
815 * from a pool set file
817 static enum parser_codes
818 parser_read_replica(char *line
, char **node_addr
, char **pool_desc
)
823 char *saveptr
= NULL
; /* must be NULL initialized on Windows */
825 addr_str
= strtok_r(line
, " \t", &saveptr
);
826 desc_str
= strtok_r(NULL
, " \t", &saveptr
);
827 rest_str
= strtok_r(NULL
, " \t", &saveptr
);
829 if (!addr_str
|| !desc_str
)
830 return PARSER_REMOTE_REPLICA_EXPECTED
;
833 return PARSER_INVALID_TOKEN
;
835 LOG(10, "node address '%s' pool set descriptor '%s'",
838 /* check if the descriptor is a relative path */
839 if (util_is_absolute_path(desc_str
))
840 return PARSER_RELATIVE_PATH_EXPECTED
;
842 *node_addr
= Strdup(addr_str
);
843 *pool_desc
= Strdup(desc_str
);
845 if (!(*node_addr
) || !(*pool_desc
)) {
851 return PARSER_OUT_OF_MEMORY
;
854 return PARSER_CONTINUE
;
858 * parser_read_options -- (internal) read line and validate options
860 static enum parser_codes
861 parser_read_options(char *line
, unsigned *options
)
863 LOG(3, "line '%s'", line
);
866 char *saveptr
= NULL
; /* must be NULL initialized on Windows */
868 char *opt_str
= strtok_r(line
, " \t", &saveptr
);
869 while (opt_str
!= NULL
) {
870 LOG(4, "option '%s'", opt_str
);
873 while (Options
[i
].name
&& strcmp(opt_str
, Options
[i
].name
) != 0)
876 if (Options
[i
].name
== NULL
) {
877 LOG(4, "unknown option '%s'", opt_str
);
878 return PARSER_OPTION_UNKNOWN
;
881 if (*options
& Options
[i
].flag
)
882 LOG(4, "duplicated option '%s'", opt_str
);
884 *options
|= Options
[i
].flag
;
887 opt_str
= strtok_r(NULL
, " \t", &saveptr
);
891 return PARSER_OPTION_EXPECTED
;
893 return PARSER_CONTINUE
;
897 * util_replica_reserve -- reserves part slots capacity in a replica
900 util_replica_reserve(struct pool_replica
**repp
, unsigned n
)
902 LOG(3, "replica %p n %u", *repp
, n
);
904 struct pool_replica
*rep
= *repp
;
905 if (rep
->nallocated
>= n
)
908 rep
= Realloc(rep
, sizeof(struct pool_replica
) +
909 (n
) * sizeof(struct pool_set_part
));
915 size_t nsize
= sizeof(struct pool_set_part
) * (n
- rep
->nallocated
);
916 memset(rep
->part
+ rep
->nallocated
, 0, nsize
);
925 * util_replica_add_part_by_idx -- (internal) allocates, initializes and adds a
926 * part structure at the provided location in the replica info
929 util_replica_add_part_by_idx(struct pool_replica
**repp
,
930 const char *path
, size_t filesize
, unsigned p
)
932 LOG(3, "replica %p path %s filesize %zu", *repp
, path
, filesize
);
934 if (util_replica_reserve(repp
, p
+ 1) != 0)
937 struct pool_replica
*rep
= *repp
;
943 enum file_type type
= util_file_get_type(path
);
944 if (type
== OTHER_ERROR
)
947 is_dev_dax
= type
== TYPE_DEVDAX
;
950 rep
->part
[p
].path
= path
;
951 rep
->part
[p
].filesize
= filesize
;
952 rep
->part
[p
].fd
= -1;
953 rep
->part
[p
].is_dev_dax
= is_dev_dax
;
954 rep
->part
[p
].created
= 0;
955 rep
->part
[p
].hdr
= NULL
;
956 rep
->part
[p
].addr
= NULL
;
957 rep
->part
[p
].remote_hdr
= NULL
;
958 rep
->part
[p
].has_bad_blocks
= 0;
961 rep
->part
[p
].alignment
= util_file_device_dax_alignment(path
);
963 rep
->part
[p
].alignment
= Mmap_align
;
965 ASSERTne(rep
->part
[p
].alignment
, 0);
973 * util_replica_add_part -- adds a next part in replica info
976 util_replica_add_part(struct pool_replica
**repp
,
977 const char *path
, size_t filesize
)
979 LOG(3, "replica %p path \"%s\" filesize %zu", *repp
, path
, filesize
);
981 return util_replica_add_part_by_idx(repp
, path
,
982 filesize
, (*repp
)->nparts
);
986 * util_parse_add_part -- (internal) add a new part file to the replica info
989 util_parse_add_part(struct pool_set
*set
, const char *path
, size_t filesize
)
991 LOG(3, "set %p path %s filesize %zu", set
, path
, filesize
);
995 if (set
->directory_based
) {
996 ERR("cannot mix directories and files in a set");
1001 return util_replica_add_part(&set
->replica
[set
->nreplicas
- 1],
1006 * util_parse_add_directory --
1007 * (internal) add a new directory to the replica info
1010 util_parse_add_directory(struct pool_set
*set
, const char *path
,
1013 LOG(3, "set %p path %s filesize %zu", set
, path
, filesize
);
1015 ASSERTne(set
, NULL
);
1017 struct pool_replica
*rep
= set
->replica
[set
->nreplicas
- 1];
1018 ASSERTne(rep
, NULL
);
1020 if (set
->directory_based
== 0) {
1021 if (rep
->nparts
> 0 || set
->nreplicas
> 1) {
1022 ERR("cannot mix directories and files in a set");
1026 set
->directory_based
= 1;
1029 char *rpath
= util_part_realpath(path
);
1030 if (rpath
== NULL
) {
1031 ERR("cannot resolve realpath of new directory");
1035 for (unsigned i
= 0; i
< set
->nreplicas
; ++i
) {
1036 struct pool_replica
*r
= set
->replica
[i
];
1037 struct pool_set_directory
*dir
;
1039 VEC_FOREACH_BY_PTR(dir
, &r
->directory
) {
1040 dpath
= util_part_realpath(dir
->path
);
1041 ASSERTne(dpath
, NULL
); /* must have been resolved */
1042 if (strcmp(rpath
, dpath
) == 0) {
1043 ERR("cannot use the same directory twice");
1054 struct pool_set_directory d
;
1056 d
.resvsize
= filesize
;
1058 if (VEC_PUSH_BACK(&rep
->directory
, d
) != 0)
1061 rep
->resvsize
+= filesize
;
1067 * util_parse_add_element --
1068 * (internal) add a new element to the replica info
1071 util_parse_add_element(struct pool_set
*set
, const char *path
, size_t filesize
)
1073 LOG(3, "set %p path %s filesize %zu", set
, path
, filesize
);
1077 int olderrno
= errno
;
1079 if (os_stat(path
, &stat
) == 0 && S_ISDIR(stat
.st_mode
))
1080 return util_parse_add_directory(set
, path
, filesize
);
1084 return util_parse_add_part(set
, path
, filesize
);
1088 * util_parse_add_replica -- (internal) add a new replica to the pool set info
1091 util_parse_add_replica(struct pool_set
**setp
)
1093 LOG(3, "setp %p", setp
);
1095 ASSERTne(setp
, NULL
);
1097 struct pool_set
*set
= *setp
;
1098 ASSERTne(set
, NULL
);
1100 set
= Realloc(set
, sizeof(struct pool_set
) +
1101 (set
->nreplicas
+ 1) * sizeof(struct pool_replica
*));
1108 struct pool_replica
*rep
;
1109 rep
= Zalloc(sizeof(struct pool_replica
));
1115 VEC_INIT(&rep
->directory
);
1117 unsigned r
= set
->nreplicas
++;
1119 set
->replica
[r
] = rep
;
1125 * util_replica_check_map_sync -- (internal) check MAP_SYNC restrictions
1128 util_replica_check_map_sync(struct pool_set
*set
, unsigned repidx
,
1131 LOG(3, "set %p repidx %u", set
, repidx
);
1133 struct pool_replica
*rep
= set
->replica
[repidx
];
1135 int map_sync
= rep
->part
[0].map_sync
;
1137 for (unsigned p
= 1; p
< rep
->nparts
; p
++) {
1138 if (map_sync
!= rep
->part
[p
].map_sync
) {
1139 ERR("replica #%u part %u %smapped with MAP_SYNC",
1140 repidx
, p
, rep
->part
[p
].map_sync
? "" : "not");
1146 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
1147 if (map_sync
!= rep
->part
[p
].hdr_map_sync
) {
1148 ERR("replica #%u part %u header %smapped "
1149 "with MAP_SYNC", repidx
, p
,
1150 rep
->part
[p
].hdr_map_sync
?
1161 * util_poolset_check_devdax -- (internal) check Device DAX restrictions
1164 util_poolset_check_devdax(struct pool_set
*set
)
1166 LOG(3, "set %p", set
);
1168 if (set
->directory_based
)
1171 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1172 struct pool_replica
*rep
= set
->replica
[r
];
1173 int is_dev_dax
= rep
->part
[0].is_dev_dax
;
1175 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
1176 if (rep
->part
[p
].is_dev_dax
!= is_dev_dax
) {
1178 "either all the parts must be Device DAX or none");
1182 if (is_dev_dax
&& rep
->nparts
> 1 &&
1183 (set
->options
& (OPTION_SINGLEHDR
|
1184 OPTION_NOHDRS
)) == 0 &&
1185 util_file_device_dax_alignment(rep
->part
[p
].path
)
1188 "Multiple DAX devices with alignment other than 4KB. Use the SINGLEHDR poolset option.");
1197 * util_poolset_check_options -- (internal) check if poolset options are
1201 util_poolset_check_options(struct pool_set
*set
)
1203 LOG(3, "set %p", set
);
1204 if ((set
->options
& OPTION_SINGLEHDR
) &&
1205 (set
->options
& OPTION_NOHDRS
)) {
1207 "both SINGLEHDR and NOHDR poolset options used at the same time");
1214 * util_poolset_set_size -- (internal) calculate pool size
1217 util_poolset_set_size(struct pool_set
*set
)
1219 LOG(3, "set %p", set
);
1221 set
->poolsize
= SIZE_MAX
;
1222 set
->resvsize
= SIZE_MAX
;
1224 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1225 struct pool_replica
*rep
= set
->replica
[r
];
1227 if (set
->options
& OPTION_SINGLEHDR
)
1229 else if (set
->options
& OPTION_NOHDRS
)
1232 rep
->nhdrs
= rep
->nparts
;
1235 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
1237 (rep
->part
[p
].filesize
& ~(Mmap_align
- 1));
1240 rep
->repsize
-= (rep
->nhdrs
- 1) * Mmap_align
;
1242 if (rep
->resvsize
== 0)
1243 rep
->resvsize
= rep
->repsize
;
1246 * Calculate pool size - choose the smallest replica size.
1247 * Ignore remote replicas.
1249 if (rep
->remote
== NULL
&& rep
->repsize
< set
->poolsize
)
1250 set
->poolsize
= rep
->repsize
;
1251 if (rep
->remote
== NULL
&& rep
->resvsize
< set
->resvsize
)
1252 set
->resvsize
= rep
->resvsize
;
1255 LOG(3, "pool size set to %zu", set
->poolsize
);
1259 * util_parse_add_remote_replica -- (internal) add a new remote replica
1260 * to the pool set info
1263 util_parse_add_remote_replica(struct pool_set
**setp
, char *node_addr
,
1266 LOG(3, "setp %p node_addr %s pool_desc %s", setp
, node_addr
, pool_desc
);
1268 ASSERTne(setp
, NULL
);
1269 ASSERTne(node_addr
, NULL
);
1270 ASSERTne(pool_desc
, NULL
);
1272 int ret
= util_parse_add_replica(setp
);
1277 * A remote replica has one fake part of size equal twice pool header
1278 * size for storing pool header and pool descriptor.
1280 ret
= util_parse_add_part(*setp
, NULL
, 2 * POOL_HDR_SIZE
);
1284 struct pool_set
*set
= *setp
;
1285 struct pool_replica
*rep
= set
->replica
[set
->nreplicas
- 1];
1286 ASSERTne(rep
, NULL
);
1288 rep
->remote
= Zalloc(sizeof(struct remote_replica
));
1289 if (rep
->remote
== NULL
) {
1293 rep
->remote
->node_addr
= node_addr
;
1294 rep
->remote
->pool_desc
= pool_desc
;
1301 * util_part_idx_by_file_name -- (internal) retrieves the part index from a
1302 * name of the file that is an element of a directory poolset
1305 util_part_idx_by_file_name(const char *filename
)
1307 LOG(3, "filename \"%s\"", filename
);
1309 int olderrno
= errno
;
1311 long part_idx
= strtol(filename
, NULL
, 10);
1321 * util_poolset_directory_load -- (internal) loads and initializes all
1322 * existing parts in a single directory
1325 util_poolset_directory_load(struct pool_replica
**repp
, const char *directory
)
1327 LOG(3, "rep %p dir \"%s\"", *repp
, directory
);
1329 struct fs
*f
= fs_new(directory
);
1331 ERR("!fs_new: \"%s\"", directory
);
1338 struct fs_entry
*entry
;
1339 while ((entry
= fs_read(f
)) != NULL
) {
1340 if (entry
->level
!= 1)
1342 if (entry
->type
!= FS_ENTRY_FILE
)
1344 if (entry
->namelen
< PMEM_EXT_LEN
)
1346 const char *ext
= entry
->path
+ entry
->pathlen
-
1348 if (strcmp(PMEM_EXT
, ext
) != 0)
1351 long part_idx
= util_part_idx_by_file_name(entry
->name
);
1355 ssize_t size
= util_file_get_size(entry
->path
);
1358 "cannot read size of file (%s) in a poolset directory",
1363 if ((path
= Strdup(entry
->path
)) == NULL
) {
1368 if (util_replica_add_part_by_idx(repp
, path
,
1369 (size_t)size
, (unsigned)part_idx
) != 0) {
1370 ERR("unable to load part %s", entry
->path
);
1385 * util_poolset_directories_load -- (internal) loads and initializes all
1386 * existing parts in the poolset directories
1389 util_poolset_directories_load(struct pool_set
*set
)
1391 LOG(3, "set %p", set
);
1393 if (!set
->directory_based
)
1396 unsigned next_part_id
= 0;
1397 unsigned max_parts_rep
= 0;
1398 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1401 struct pool_set_directory
*d
;
1403 int prev_nparts
= 0;
1404 VEC_FOREACH_BY_PTR(d
, &set
->replica
[r
]->directory
) {
1405 prev_nparts
= nparts
;
1406 nparts
= util_poolset_directory_load(&set
->replica
[r
],
1409 ERR("failed to load parts from directory %s",
1414 next_part_id
+= (unsigned)nparts
;
1416 /* always try to evenly spread files across dirs */
1417 if (r
== 0 && prev_nparts
> nparts
)
1418 set
->next_directory_id
++;
1421 if (next_part_id
> set
->replica
[max_parts_rep
]->nparts
)
1425 set
->next_id
= next_part_id
;
1429 * In order to maintain the same semantics of poolset parsing for
1430 * regular poolsets and directory poolsets, we need to speculatively
1431 * recreate the information regarding any missing parts in replicas.
1433 struct pool_replica
*rep
;
1434 struct pool_replica
*mrep
= set
->replica
[max_parts_rep
];
1436 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1437 if (set
->replica
[r
]->nparts
== mrep
->nparts
)
1440 if (VEC_SIZE(&set
->replica
[r
]->directory
) == 0) {
1442 ERR("!no directories in replica");
1446 if (util_replica_reserve(&set
->replica
[r
], mrep
->nparts
) != 0)
1449 rep
= set
->replica
[r
];
1451 struct pool_set_directory
*d
= VEC_GET(&rep
->directory
, 0);
1453 for (unsigned pidx
= 0; pidx
< rep
->nallocated
; ++pidx
) {
1454 struct pool_set_part
*p
= &rep
->part
[pidx
];
1455 *p
= mrep
->part
[pidx
];
1457 size_t path_len
= strlen(d
->path
) + PMEM_FILE_MAX_LEN
;
1458 if ((p
->path
= Malloc(path_len
)) == NULL
) {
1463 snprintf((char *)p
->path
, path_len
,
1464 "%s" OS_DIR_SEP_STR
"%0*u%s",
1465 d
->path
, PMEM_FILE_PADDING
,
1468 rep
->nparts
= mrep
->nparts
;
1475 * util_poolset_parse -- parse pool set config file
1477 * Returns 0 if the file is a valid poolset config file,
1478 * and -1 in case of any error.
1480 * XXX: use memory mapped file
1483 util_poolset_parse(struct pool_set
**setp
, const char *path
, int fd
)
1485 LOG(3, "setp %p path %s fd %d", setp
, path
, fd
);
1487 struct pool_set
*set
= NULL
;
1488 enum parser_codes result
;
1498 if (os_lseek(fd
, 0, SEEK_SET
) != 0) {
1499 ERR("!lseek %d", fd
);
1509 /* associate a stream with the file descriptor */
1510 if ((fs
= os_fdopen(fd
, "r")) == NULL
) {
1511 ERR("!fdopen %d", fd
);
1516 unsigned nlines
= 0;
1517 unsigned nparts
= 0; /* number of parts in current replica */
1519 /* read the first line */
1520 line
= util_readline(fs
);
1522 ERR("!Reading poolset file");
1527 set
= Zalloc(sizeof(struct pool_set
));
1529 ERR("!Malloc for pool set");
1533 set
->path
= Strdup(path
);
1534 if (set
->path
== NULL
) {
1539 /* check also if the last character is '\n' */
1540 if (strncmp(line
, POOLSET_HDR_SIG
, POOLSET_HDR_SIG_LEN
) == 0 &&
1541 line
[POOLSET_HDR_SIG_LEN
] == '\n') {
1542 /* 'PMEMPOOLSET' signature detected */
1543 LOG(10, "PMEMPOOLSET");
1545 int ret
= util_parse_add_replica(&set
);
1550 result
= PARSER_CONTINUE
;
1552 result
= PARSER_PMEMPOOLSET
;
1555 while (result
== PARSER_CONTINUE
) {
1557 /* read next line */
1558 line
= util_readline(fs
);
1562 /* chop off newline and comments */
1563 if ((cp
= strchr(line
, '\n')) != NULL
)
1565 if (cp
!= line
&& (cp
= strchr(line
, '#')) != NULL
)
1568 /* skip comments and blank lines */
1575 result
= PARSER_FORMAT_OK
;
1577 if (set
->nreplicas
== 1)
1578 result
= PARSER_SET_NO_PARTS
;
1580 result
= PARSER_REP_NO_PARTS
;
1582 } else if (strncmp(line
, POOLSET_OPTION_SIG
,
1583 POOLSET_OPTION_SIG_LEN
) == 0) {
1584 result
= parser_read_options(
1585 line
+ POOLSET_OPTION_SIG_LEN
,
1587 if (result
== PARSER_CONTINUE
) {
1588 LOG(10, "OPTIONS: %x", set
->options
);
1590 } else if (strncmp(line
, POOLSET_REPLICA_SIG
,
1591 POOLSET_REPLICA_SIG_LEN
) == 0) {
1592 if (line
[POOLSET_REPLICA_SIG_LEN
] != '\0') {
1593 /* something more than 'REPLICA' */
1594 char c
= line
[POOLSET_REPLICA_SIG_LEN
];
1595 if (!isblank((unsigned char)c
)) {
1596 result
= PARSER_REPLICA
;
1599 /* check if it is a remote replica */
1600 result
= parser_read_replica(
1601 line
+ POOLSET_REPLICA_SIG_LEN
,
1602 &node_addr
, &pool_desc
);
1603 if (result
== PARSER_CONTINUE
) {
1604 /* remote REPLICA */
1605 LOG(10, "REMOTE REPLICA "
1606 "node address '%s' "
1607 "pool set descriptor '%s'",
1608 node_addr
, pool_desc
);
1609 if (util_parse_add_remote_replica(&set
,
1610 node_addr
, pool_desc
))
1613 } else if (nparts
>= 1) {
1614 /* 'REPLICA' signature detected */
1617 int ret
= util_parse_add_replica(&set
);
1622 result
= PARSER_CONTINUE
;
1624 if (set
->nreplicas
== 1)
1625 result
= PARSER_SET_NO_PARTS
;
1627 result
= PARSER_REP_NO_PARTS
;
1630 /* there could be no parts for remote replicas */
1631 if (set
->replica
[set
->nreplicas
- 1]->remote
) {
1632 result
= PARSER_REMOTE_REP_UNEXPECTED_PARTS
;
1636 /* read size and path */
1637 result
= parser_read_line(line
, &psize
, &ppath
);
1638 if (result
== PARSER_CONTINUE
) {
1639 /* add a new pool's part to the list */
1640 int ret
= util_parse_add_element(set
,
1651 if (result
!= PARSER_FORMAT_OK
) {
1652 ERR("%s [%s:%d]", path
, parser_errstr
[result
], nlines
);
1654 case PARSER_CANNOT_READ_SIZE
:
1655 case PARSER_OUT_OF_MEMORY
:
1656 /* do not overwrite errno */
1664 if (util_poolset_check_devdax(set
) != 0) {
1669 if (util_poolset_directories_load(set
) != 0) {
1670 ERR("cannot load part files from directories");
1674 LOG(4, "set file format correct (%s)", path
);
1675 (void) os_fclose(fs
);
1677 util_poolset_check_options(set
);
1678 util_poolset_set_size(set
);
1685 (void) os_fclose(fs
);
1687 util_poolset_free(set
);
1693 * util_poolset_single -- (internal) create a one-part pool set
1695 * On success returns a pointer to a newly allocated and initialized
1696 * pool set structure. Otherwise, NULL is returned.
1698 static struct pool_set
*
1699 util_poolset_single(const char *path
, size_t filesize
, int create
,
1702 LOG(3, "path %s filesize %zu create %d",
1703 path
, filesize
, create
);
1705 enum file_type type
= util_file_get_type(path
);
1706 if (type
== OTHER_ERROR
)
1709 struct pool_set
*set
;
1710 set
= Zalloc(sizeof(struct pool_set
) +
1711 sizeof(struct pool_replica
*));
1713 ERR("!Malloc for pool set");
1717 set
->path
= Strdup(path
);
1718 if (set
->path
== NULL
) {
1724 struct pool_replica
*rep
;
1725 rep
= Zalloc(sizeof(struct pool_replica
) +
1726 sizeof(struct pool_set_part
));
1728 ERR("!Malloc for pool set replica");
1734 VEC_INIT(&rep
->directory
);
1736 set
->replica
[0] = rep
;
1738 rep
->part
[0].filesize
= filesize
;
1739 rep
->part
[0].path
= Strdup(path
);
1740 rep
->part
[0].fd
= -1; /* will be filled out by util_poolset_file() */
1741 rep
->part
[0].is_dev_dax
= type
== TYPE_DEVDAX
;
1742 rep
->part
[0].created
= create
;
1743 rep
->part
[0].hdr
= NULL
;
1744 rep
->part
[0].addr
= NULL
;
1745 rep
->part
[0].has_bad_blocks
= 0;
1747 if (rep
->part
[0].is_dev_dax
)
1748 rep
->part
[0].alignment
= util_file_device_dax_alignment(path
);
1750 rep
->part
[0].alignment
= Mmap_align
;
1752 ASSERTne(rep
->part
[0].alignment
, 0);
1754 rep
->nallocated
= 1;
1758 /* it does not have a remote replica */
1762 /* round down to the nearest mapping alignment boundary */
1763 rep
->repsize
= rep
->part
[0].filesize
& ~(rep
->part
[0].alignment
- 1);
1764 rep
->resvsize
= rep
->repsize
;
1766 set
->poolsize
= rep
->repsize
;
1767 set
->resvsize
= rep
->resvsize
;
1770 set
->ignore_sds
= ignore_sds
|| (set
->options
& OPTION_NOHDRS
);
1776 * util_part_open -- open or create a single part file
1779 util_part_open(struct pool_set_part
*part
, size_t minsize
, int create_part
)
1781 LOG(3, "part %p minsize %zu create %d", part
, minsize
, create_part
);
1783 int exists
= util_file_exists(part
->path
);
1787 int create_file
= create_part
;
1794 part
->fd
= util_file_create(part
->path
, part
->filesize
,
1796 if (part
->fd
== -1) {
1797 LOG(2, "failed to create file: %s", part
->path
);
1804 part
->fd
= util_file_open(part
->path
, &size
, minsize
, flags
);
1805 if (part
->fd
== -1) {
1806 LOG(2, "failed to open file: %s", part
->path
);
1810 if (Fallocate_at_create
&& create_part
&& !part
->is_dev_dax
) {
1811 int ret
= os_posix_fallocate(part
->fd
, 0,
1815 ERR("!posix_fallocate \"%s\", %zu", part
->path
,
1821 /* check if filesize matches */
1822 if (part
->filesize
!= size
) {
1823 ERR("file size does not match config: %s, %zu != %zu",
1824 part
->path
, size
, part
->filesize
);
1834 * util_part_fdclose -- close part file
1837 util_part_fdclose(struct pool_set_part
*part
)
1839 LOG(3, "part %p", part
);
1841 if (part
->fd
!= -1) {
1842 (void) os_close(part
->fd
);
1848 * util_set_rpmem_attr -- (internal) overwrite existing pool attributes
1850 * does not set uuid, next_part_uuid, prev_part_uuid
1853 util_set_rpmem_attr(struct pool_hdr
*hdrp
, const struct rpmem_pool_attr
*rattr
)
1855 LOG(5, "hdrp %p rattr %p", hdrp
, rattr
);
1856 memcpy(hdrp
->signature
, rattr
->signature
, POOL_HDR_SIG_LEN
);
1857 hdrp
->major
= rattr
->major
;
1858 hdrp
->features
.compat
= rattr
->compat_features
;
1859 hdrp
->features
.incompat
= rattr
->incompat_features
;
1860 hdrp
->features
.ro_compat
= rattr
->ro_compat_features
;
1861 memcpy(hdrp
->poolset_uuid
, rattr
->poolset_uuid
, POOL_HDR_UUID_LEN
);
1862 memcpy(hdrp
->next_repl_uuid
, rattr
->next_uuid
, POOL_HDR_UUID_LEN
);
1863 memcpy(hdrp
->prev_repl_uuid
, rattr
->prev_uuid
, POOL_HDR_UUID_LEN
);
1864 memcpy(&hdrp
->arch_flags
, rattr
->user_flags
, sizeof(struct arch_flags
));
1868 * util_get_rpmem_attr -- (internal) get attributes for remote replica header
1871 util_get_rpmem_attr(struct rpmem_pool_attr
*rattr
, const struct pool_hdr
*hdrp
)
1873 LOG(5, "rpmem_attr %p hdrp %p", rattr
, hdrp
);
1874 ASSERTne(rattr
, NULL
);
1875 memcpy(rattr
->signature
, hdrp
->signature
, POOL_HDR_SIG_LEN
);
1876 rattr
->major
= hdrp
->major
;
1877 rattr
->compat_features
= hdrp
->features
.compat
;
1878 rattr
->incompat_features
= hdrp
->features
.incompat
;
1879 rattr
->ro_compat_features
= hdrp
->features
.ro_compat
;
1880 memcpy(rattr
->poolset_uuid
, hdrp
->poolset_uuid
, POOL_HDR_UUID_LEN
);
1881 memcpy(rattr
->uuid
, hdrp
->uuid
, POOL_HDR_UUID_LEN
);
1882 memcpy(rattr
->next_uuid
, hdrp
->next_repl_uuid
, POOL_HDR_UUID_LEN
);
1883 memcpy(rattr
->prev_uuid
, hdrp
->prev_repl_uuid
, POOL_HDR_UUID_LEN
);
1884 memcpy(rattr
->user_flags
, &hdrp
->arch_flags
, sizeof(struct arch_flags
));
1888 * util_remote_store_attr -- (internal) store attributes read from remote
1889 * replica in the local volatile pool header
1892 util_remote_store_attr(struct pool_hdr
*hdrp
,
1893 const struct rpmem_pool_attr
*rattr
)
1895 LOG(4, "hdrp %p rpmem_attr %p", hdrp
, rattr
);
1897 util_set_rpmem_attr(hdrp
, rattr
);
1898 memcpy(hdrp
->uuid
, rattr
->uuid
, POOL_HDR_UUID_LEN
);
1899 memcpy(hdrp
->next_part_uuid
, rattr
->uuid
, POOL_HDR_UUID_LEN
);
1900 memcpy(hdrp
->prev_part_uuid
, rattr
->uuid
, POOL_HDR_UUID_LEN
);
1904 * util_update_remote_header -- update attributes of a remote replica;
1905 * the remote replica must be open
1908 util_update_remote_header(struct pool_set
*set
, unsigned repn
)
1910 LOG(3, "set %p, repn %u", set
, repn
);
1912 ASSERTne(REP(set
, repn
)->remote
, NULL
);
1913 ASSERTne(REP(set
, repn
)->remote
->rpp
, NULL
);
1915 struct pool_replica
*rep
= REP(set
, repn
);
1916 struct pool_hdr
*hdr
= HDR(rep
, 0);
1918 /* get attributes from the local pool header */
1919 struct rpmem_pool_attr attributes
;
1920 util_get_rpmem_attr(&attributes
, hdr
);
1922 /* push the attributes to the remote replica */
1923 RPMEMpool
*rpp
= rep
->remote
->rpp
;
1924 int ret
= Rpmem_set_attr(rpp
, &attributes
);
1926 ERR("!Rpmem_set_attr");
1933 * util_pool_close_remote -- close a remote replica
1936 util_pool_close_remote(RPMEMpool
*rpp
)
1938 LOG(3, "rpp %p", rpp
);
1940 return Rpmem_close(rpp
);
1944 * util_poolset_remote_open -- open or create a remote replica
1947 util_poolset_remote_open(struct pool_replica
*rep
, unsigned repidx
,
1948 size_t minsize
, int create
, void *pool_addr
,
1949 size_t pool_size
, unsigned *nlanes
)
1951 LOG(3, "rep %p repidx %u minsize %zu create %d "
1952 "pool_addr %p pool_size %zu nlanes %p",
1953 rep
, repidx
, minsize
, create
,
1954 pool_addr
, pool_size
, nlanes
);
1956 ASSERTne(nlanes
, NULL
);
1958 if (!Rpmem_handle_remote
) {
1962 unsigned remote_nlanes
= *nlanes
;
1965 struct rpmem_pool_attr rpmem_attr_create
;
1966 util_get_rpmem_attr(&rpmem_attr_create
, rep
->part
[0].hdr
);
1968 rep
->remote
->rpp
= Rpmem_create(rep
->remote
->node_addr
,
1969 rep
->remote
->pool_desc
,
1973 &rpmem_attr_create
);
1974 if (rep
->remote
->rpp
== NULL
) {
1975 ERR("creating remote replica #%u failed", repidx
);
1978 rep
->part
[0].created
= 1;
1980 struct rpmem_pool_attr rpmem_attr_open
;
1982 rep
->remote
->rpp
= Rpmem_open(rep
->remote
->node_addr
,
1983 rep
->remote
->pool_desc
,
1988 if (rep
->remote
->rpp
== NULL
) {
1989 ERR("opening remote replica #%u failed", repidx
);
1993 util_remote_store_attr(rep
->part
[0].hdr
, &rpmem_attr_open
);
1996 if (remote_nlanes
< *nlanes
)
1997 *nlanes
= remote_nlanes
;
2003 * util_poolset_files_local -- (internal) open or create all the local
2004 * part files of a pool set and replica sets
2007 util_poolset_files_local(struct pool_set
*set
, size_t minpartsize
, int create
)
2009 LOG(3, "set %p minpartsize %zu create %d", set
, minpartsize
, create
);
2011 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
2012 struct pool_replica
*rep
= set
->replica
[r
];
2014 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
2015 if (util_part_open(&rep
->part
[p
], minpartsize
,
2026 * util_poolset_remote_replica_open -- open remote replica
2029 util_poolset_remote_replica_open(struct pool_set
*set
, unsigned repidx
,
2030 size_t minsize
, int create
, unsigned *nlanes
)
2034 * This is a workaround for an issue with using device dax with
2035 * libibverbs. To handle fork() function calls correctly libfabric use
2036 * ibv_fork_init(3) which makes all registered memory being madvised
2037 * with MADV_DONTFORK flag. In libpmemobj the remote replication is
2038 * performed without pool header (first 4k). In such case the address
2039 * passed to madvise(2) is aligned to 4k, but device dax can require
2040 * different alignment (default is 2MB). This workaround madvises the
2041 * entire memory region before registering it by fi_mr_reg(3).
2043 * The librpmem client requires fork() support to work correctly.
2045 if (set
->replica
[0]->part
[0].is_dev_dax
) {
2046 int ret
= os_madvise(set
->replica
[0]->part
[0].addr
,
2047 set
->replica
[0]->part
[0].filesize
,
2056 void *pool_addr
= (void *)((uintptr_t)set
->replica
[0]->part
[0].addr
);
2058 return util_poolset_remote_open(set
->replica
[repidx
], repidx
, minsize
,
2059 create
, pool_addr
, set
->poolsize
, nlanes
);
2063 * util_poolset_files_remote -- (internal) open or create all the remote
2064 * part files of a pool set and replica sets
2067 util_poolset_files_remote(struct pool_set
*set
, size_t minsize
,
2068 unsigned *nlanes
, int create
)
2070 LOG(3, "set %p minsize %zu nlanes %p create %d",
2071 set
, minsize
, nlanes
, create
);
2073 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
2074 struct pool_replica
*rep
= set
->replica
[r
];
2076 if (util_poolset_remote_replica_open(set
, r
,
2077 minsize
, create
, nlanes
))
2086 * util_poolset_read -- read memory pool set file
2088 * On success returns 0 and a pointer to a newly allocated structure
2089 * containing the info of all the parts of the pool set and replicas.
2092 util_poolset_read(struct pool_set
**setp
, const char *path
)
2094 LOG(3, "setp %p path %s", setp
, path
);
2100 if ((fd
= os_open(path
, O_RDONLY
)) < 0) {
2101 ERR("!open: path \"%s\"", path
);
2105 ret
= util_poolset_parse(setp
, path
, fd
);
2108 (void) os_close(fd
);
2114 * util_poolset_create_set -- create a new pool set structure
2116 * On success returns 0 and a pointer to a newly allocated structure
2117 * containing the info of all the parts of the pool set and replicas.
2120 util_poolset_create_set(struct pool_set
**setp
, const char *path
,
2121 size_t poolsize
, size_t minsize
, int ignore_sds
)
2123 LOG(3, "setp %p path %s poolsize %zu minsize %zu",
2124 setp
, path
, poolsize
, minsize
);
2131 enum file_type type
= util_file_get_type(path
);
2132 if (type
== OTHER_ERROR
)
2135 if (poolsize
!= 0) {
2136 if (type
== TYPE_DEVDAX
) {
2137 ERR("size must be zero for device dax");
2140 *setp
= util_poolset_single(path
, poolsize
, 1, ignore_sds
);
2147 /* do not check minsize */
2148 if ((fd
= util_file_open(path
, &size
, 0, O_RDONLY
)) == -1)
2151 char signature
[POOLSET_HDR_SIG_LEN
];
2152 if (type
== TYPE_NORMAL
) {
2154 * read returns ssize_t, but we know it will return value
2155 * between -1 and POOLSET_HDR_SIG_LEN (11), so we can safely
2158 ret
= (int)read(fd
, signature
, POOLSET_HDR_SIG_LEN
);
2160 ERR("!read %d", fd
);
2165 if (type
== TYPE_DEVDAX
|| ret
< POOLSET_HDR_SIG_LEN
||
2166 strncmp(signature
, POOLSET_HDR_SIG
, POOLSET_HDR_SIG_LEN
)) {
2167 LOG(4, "not a pool set header");
2168 (void) os_close(fd
);
2170 if (size
< minsize
) {
2171 ERR("file is not a poolset file and its size (%zu)"
2172 " is smaller than %zu", size
, minsize
);
2176 *setp
= util_poolset_single(path
, size
, 0, ignore_sds
);
2183 ret
= util_poolset_parse(setp
, path
, fd
);
2187 (*setp
)->ignore_sds
= ignore_sds
|| ((*setp
)->options
& OPTION_NOHDRS
);
2189 /* remote replication is not supported on Windows */
2190 if ((*setp
)->remote
) {
2191 util_poolset_free(*setp
);
2192 ERR("remote replication is not supported on Windows");
2201 (void) os_close(fd
);
2207 * util_poolset_check_header_options -- (internal) check if poolset options
2211 util_poolset_check_header_options(struct pool_set
*set
, uint32_t incompat
)
2213 LOG(3, "set %p, incompat %#x", set
, incompat
);
2215 if (((set
->options
& OPTION_SINGLEHDR
) == 0) !=
2216 ((incompat
& POOL_FEAT_SINGLEHDR
) == 0)) {
2218 "poolset file options (%u) do not match incompat feature flags (%#x)",
2219 set
->options
, incompat
);
2227 * util_header_create -- create header of a single pool set file
2230 util_header_create(struct pool_set
*set
, unsigned repidx
, unsigned partidx
,
2231 const struct pool_attr
*attr
, int overwrite
)
2233 LOG(3, "set %p repidx %u partidx %u attr %p overwrite %d", set
, repidx
,
2234 partidx
, attr
, overwrite
);
2236 ASSERTne(attr
, NULL
);
2238 struct pool_replica
*rep
= set
->replica
[repidx
];
2240 /* opaque info lives at the beginning of mapped memory pool */
2241 struct pool_hdr
*hdrp
= rep
->part
[partidx
].hdr
;
2243 /* check if the pool header is all zeros */
2244 if (!util_is_zeroed(hdrp
, sizeof(*hdrp
)) && !overwrite
) {
2245 ERR("Non-empty file detected");
2250 /* create pool's header */
2251 util_pool_attr2hdr(hdrp
, attr
);
2253 if (set
->options
& OPTION_SINGLEHDR
)
2254 hdrp
->features
.incompat
|= POOL_FEAT_SINGLEHDR
;
2256 memcpy(hdrp
->poolset_uuid
, set
->uuid
, POOL_HDR_UUID_LEN
);
2257 memcpy(hdrp
->uuid
, PART(rep
, partidx
)->uuid
, POOL_HDR_UUID_LEN
);
2260 if (set
->options
& OPTION_SINGLEHDR
) {
2261 /* next/prev part point to part #0 */
2262 ASSERTeq(partidx
, 0);
2263 memcpy(hdrp
->prev_part_uuid
, PART(rep
, 0)->uuid
,
2265 memcpy(hdrp
->next_part_uuid
, PART(rep
, 0)->uuid
,
2268 memcpy(hdrp
->prev_part_uuid
, PARTP(rep
, partidx
)->uuid
,
2270 memcpy(hdrp
->next_part_uuid
, PARTN(rep
, partidx
)->uuid
,
2275 if (!util_is_zeroed(attr
->prev_repl_uuid
, POOL_HDR_UUID_LEN
)) {
2276 memcpy(hdrp
->prev_repl_uuid
, attr
->prev_repl_uuid
,
2279 memcpy(hdrp
->prev_repl_uuid
, PART(REPP(set
, repidx
), 0)->uuid
,
2282 if (!util_is_zeroed(attr
->next_repl_uuid
, POOL_HDR_UUID_LEN
)) {
2283 memcpy(hdrp
->next_repl_uuid
, attr
->next_repl_uuid
,
2286 memcpy(hdrp
->next_repl_uuid
, PART(REPN(set
, repidx
), 0)->uuid
,
2293 if (os_fstat(rep
->part
[partidx
].fd
, &stbuf
) != 0) {
2297 ASSERT(stbuf
.st_ctime
);
2298 hdrp
->crtime
= (uint64_t)stbuf
.st_ctime
;
2301 int arch_is_zeroed
= util_is_zeroed(attr
->arch_flags
,
2304 util_get_arch_flags(&hdrp
->arch_flags
);
2306 util_convert2le_hdr(hdrp
);
2308 if (!arch_is_zeroed
) {
2309 memcpy(&hdrp
->arch_flags
, attr
->arch_flags
, POOL_HDR_ARCH_LEN
);
2312 if (!set
->ignore_sds
&& partidx
== 0 && !rep
->remote
) {
2313 shutdown_state_init(&hdrp
->sds
, rep
);
2314 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
2315 if (shutdown_state_add_part(&hdrp
->sds
,
2316 PART(rep
, p
)->fd
, rep
))
2319 shutdown_state_set_dirty(&hdrp
->sds
, rep
);
2322 util_checksum(hdrp
, sizeof(*hdrp
), &hdrp
->checksum
,
2323 1, POOL_HDR_CSUM_END_OFF(hdrp
));
2325 /* store pool's header */
2326 util_persist_auto(rep
->is_pmem
, hdrp
, sizeof(*hdrp
));
2332 * util_header_check -- (internal) validate header of a single pool set file
2335 util_header_check(struct pool_set
*set
, unsigned repidx
, unsigned partidx
,
2336 const struct pool_attr
*attr
)
2338 LOG(3, "set %p repidx %u partidx %u attr %p", set
, repidx
, partidx
,
2341 ASSERTne(attr
, NULL
);
2343 struct pool_replica
*rep
= set
->replica
[repidx
];
2345 /* opaque info lives at the beginning of mapped memory pool */
2346 struct pool_hdr
*hdrp
= rep
->part
[partidx
].hdr
;
2347 struct pool_hdr hdr
;
2349 memcpy(&hdr
, hdrp
, sizeof(hdr
));
2351 /* local copy of a remote header does not need to be converted */
2352 if (rep
->remote
== NULL
)
2353 util_convert2h_hdr_nocheck(&hdr
);
2355 /* to be valid, a header must have a major version of at least 1 */
2356 if (hdr
.major
== 0) {
2357 ERR("invalid major version (0)");
2362 /* check signature */
2363 if (memcmp(hdr
.signature
, attr
->signature
, POOL_HDR_SIG_LEN
)) {
2364 ERR("wrong pool type: \"%.8s\"", hdr
.signature
);
2369 /* check format version number */
2370 if (hdr
.major
!= attr
->major
) {
2371 ERR("pool version %d (library expects %d)", hdr
.major
,
2373 if (hdr
.major
< attr
->major
)
2375 "Please run the pmdk-convert utility to upgrade the pool.");
2380 rep
->part
[partidx
].rdonly
= 0;
2382 int retval
= util_feature_check(&hdr
, attr
->features
);
2387 rep
->part
[partidx
].rdonly
= 1;
2389 if (rep
->remote
== NULL
) {
2391 * and to be valid, the fields must checksum correctly
2393 * NOTE: checksum validation is performed after format version
2394 * and feature check, because if POOL_FEAT_CKSUM_2K flag is set,
2395 * we want to report it as incompatible feature, rather than
2398 if (!util_checksum(&hdr
, sizeof(hdr
), &hdr
.checksum
,
2399 0, POOL_HDR_CSUM_END_OFF(&hdr
))) {
2400 ERR("invalid checksum of pool header");
2405 LOG(3, "valid header, signature \"%.8s\"", hdr
.signature
);
2408 if (util_check_arch_flags(&hdr
.arch_flags
)) {
2409 ERR("wrong architecture flags");
2414 /* check pool set UUID */
2415 if (memcmp(HDR(REP(set
, 0), 0)->poolset_uuid
, hdr
.poolset_uuid
,
2416 POOL_HDR_UUID_LEN
)) {
2417 ERR("wrong pool set UUID");
2422 /* check pool set linkage */
2423 if (memcmp(HDRP(rep
, partidx
)->uuid
, hdr
.prev_part_uuid
,
2424 POOL_HDR_UUID_LEN
) ||
2425 memcmp(HDRN(rep
, partidx
)->uuid
, hdr
.next_part_uuid
,
2426 POOL_HDR_UUID_LEN
)) {
2427 ERR("wrong part UUID");
2432 /* check format version */
2433 if (HDR(rep
, 0)->major
!= hdrp
->major
) {
2434 ERR("incompatible pool format");
2439 /* check compatibility features */
2440 if (HDR(rep
, 0)->features
.compat
!= hdrp
->features
.compat
||
2441 HDR(rep
, 0)->features
.incompat
!= hdrp
->features
.incompat
||
2442 HDR(rep
, 0)->features
.ro_compat
!= hdrp
->features
.ro_compat
) {
2443 ERR("incompatible feature flags");
2448 /* check poolset options */
2449 if (util_poolset_check_header_options(set
,
2450 HDR(rep
, 0)->features
.incompat
))
2457 * util_header_check_remote -- (internal) validate header of a remote
2461 util_header_check_remote(struct pool_set
*set
, unsigned partidx
)
2463 LOG(3, "set %p partidx %u ", set
, partidx
);
2465 /* there is only one replica in remote poolset */
2466 struct pool_replica
*rep
= set
->replica
[0];
2467 /* opaque info lives at the beginning of mapped memory pool */
2468 struct pool_hdr
*hdrp
= rep
->part
[partidx
].hdr
;
2469 struct pool_hdr hdr
;
2471 if (util_is_zeroed(hdrp
, sizeof(*hdrp
))) {
2472 ERR("pool header zeroed");
2477 memcpy(&hdr
, hdrp
, sizeof(hdr
));
2479 util_convert2h_hdr_nocheck(&hdr
);
2481 /* valid header found */
2482 if (memcmp(HDR(rep
, 0)->signature
, hdrp
->signature
, POOL_HDR_SIG_LEN
)) {
2483 ERR("pool signature mismatch in part %d", partidx
);
2488 /* check format version */
2489 if (HDR(rep
, 0)->major
!= hdrp
->major
) {
2490 ERR("pool version mismatch in part %d", partidx
);
2495 /* check compatibility features */
2496 if (HDR(rep
, 0)->features
.compat
!= hdrp
->features
.compat
) {
2497 ERR("'may have' compatibility flags mismatch in part %d",
2502 if (HDR(rep
, 0)->features
.incompat
!= hdrp
->features
.incompat
) {
2503 ERR("'must support' compatibility flags mismatch in part %d",
2508 if (HDR(rep
, 0)->features
.ro_compat
!= hdrp
->features
.ro_compat
) {
2509 ERR("'force read-only' compatibility flags mismatch in part %d",
2516 * and to be valid, the fields must checksum correctly
2518 * NOTE: checksum validation is performed after format version and
2519 * feature check, because if POOL_FEAT_CKSUM_2K flag is set,
2520 * we want to report it as incompatible feature, rather than invalid
2523 if (!util_checksum(&hdr
, sizeof(hdr
), &hdr
.checksum
,
2524 0, POOL_HDR_CSUM_END_OFF(&hdr
))) {
2525 ERR("invalid checksum of pool header");
2529 LOG(3, "valid header, signature \"%.8s\"", hdr
.signature
);
2531 /* check pool set UUID */
2532 if (memcmp(HDR(rep
, 0)->poolset_uuid
, hdrp
->poolset_uuid
,
2533 POOL_HDR_UUID_LEN
)) {
2534 ERR("wrong pool set UUID in part %d", partidx
);
2539 /* check previous replica UUID */
2540 if (memcmp(HDR(rep
, 0)->prev_repl_uuid
, hdrp
->prev_repl_uuid
,
2541 POOL_HDR_UUID_LEN
)) {
2542 ERR("wrong previous replica UUID in part %d", partidx
);
2547 /* check next replica UUID */
2548 if (memcmp(HDR(rep
, 0)->next_repl_uuid
, hdrp
->next_repl_uuid
,
2549 POOL_HDR_UUID_LEN
)) {
2550 ERR("wrong next replica UUID in part %d", partidx
);
2555 if (memcmp(&HDR(rep
, 0)->arch_flags
, &hdrp
->arch_flags
,
2556 sizeof(hdrp
->arch_flags
))) {
2557 ERR("wrong architecture flags");
2562 /* check pool set linkage */
2563 if (memcmp(HDRP(rep
, partidx
)->uuid
, hdrp
->prev_part_uuid
,
2564 POOL_HDR_UUID_LEN
) ||
2565 memcmp(HDRN(rep
, partidx
)->uuid
, hdrp
->next_part_uuid
,
2566 POOL_HDR_UUID_LEN
)) {
2567 ERR("wrong part UUID in part %d", partidx
);
2572 /* read shutdown state toggle from header */
2573 set
->ignore_sds
|= IGNORE_SDS(HDR(rep
, 0));
2575 if (!set
->ignore_sds
&& partidx
== 0) {
2576 struct shutdown_state sds
;
2577 shutdown_state_init(&sds
, NULL
);
2578 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
2579 if (shutdown_state_add_part(&sds
,
2580 PART(rep
, p
)->fd
, NULL
))
2584 if (shutdown_state_check(&sds
, &hdrp
->sds
, rep
)) {
2589 shutdown_state_set_dirty(&hdrp
->sds
, rep
);
2592 rep
->part
[partidx
].rdonly
= 0;
2598 * util_replica_set_is_pmem -- sets per-replica is_pmem flag
2600 * The replica is PMEM if:
2601 * - all parts are on device dax, or
2602 * - all parts are mapped with MAP_SYNC.
2604 * It's enough to check only first part because it's already verified
2605 * that either all or none parts are device dax or mapped with MAP_SYNC.
2608 util_replica_set_is_pmem(struct pool_replica
*rep
)
2610 rep
->is_pmem
= rep
->part
[0].is_dev_dax
|| rep
->part
[0].map_sync
||
2611 pmem_is_pmem(rep
->part
[0].addr
, rep
->resvsize
);
2615 * util_replica_map_local -- (internal) map memory pool for local replica
2618 util_replica_map_local(struct pool_set
*set
, unsigned repidx
, int flags
)
2620 LOG(3, "set %p repidx %u flags %d", set
, repidx
, flags
);
2623 * XXX: Like we reserve space for all parts in this replica when we map
2624 * the first part, we need to reserve the space for all replicas
2625 * upfront. It is not necessary that the replicas are contiguous but
2626 * that way we would not fragment the memory much. I think we should
2627 * leave this to MM, but let's have a note as per our collective minds.
2631 int remaining_retries
= 0;
2633 int remaining_retries
= 10;
2635 int retry_for_contiguous_addr
;
2637 /* header size for all headers but the first one */
2638 size_t hdrsize
= (set
->options
& (OPTION_SINGLEHDR
| OPTION_NOHDRS
)) ?
2641 struct pool_replica
*rep
= set
->replica
[repidx
];
2643 ASSERTeq(rep
->remote
, NULL
);
2644 ASSERTne(rep
->part
, NULL
);
2647 retry_for_contiguous_addr
= 0;
2648 mapsize
= rep
->part
[0].filesize
& ~(Mmap_align
- 1);
2650 /* determine a hint address for mmap() */
2651 addr
= util_map_hint(rep
->resvsize
, 0);
2652 if (addr
== MAP_FAILED
) {
2653 LOG(1, "cannot find a contiguous region of given size");
2657 /* map the first part and reserve space for remaining parts */
2658 if (util_map_part(&rep
->part
[0], addr
, rep
->resvsize
, 0,
2660 LOG(2, "pool mapping failed - replica #%u part #0",
2665 VALGRIND_REGISTER_PMEM_MAPPING(rep
->part
[0].addr
,
2667 VALGRIND_REGISTER_PMEM_FILE(rep
->part
[0].fd
,
2668 rep
->part
[0].addr
, rep
->part
[0].size
, 0);
2670 set
->zeroed
&= rep
->part
[0].created
;
2672 addr
= (char *)rep
->part
[0].addr
+ mapsize
;
2675 * map the remaining parts of the usable pool space
2676 * (aligned to memory mapping granularity)
2678 for (unsigned p
= 1; p
< rep
->nparts
; p
++) {
2680 if (util_map_part(&rep
->part
[p
], addr
, 0, hdrsize
,
2681 flags
| MAP_FIXED
, 0) != 0) {
2683 * if we can't map the part at the address we
2684 * asked for, unmap all the parts that are
2685 * mapped and remap at a different address.
2687 if ((errno
== EINVAL
) &&
2688 (remaining_retries
> 0)) {
2689 LOG(2, "usable space mapping failed - "
2690 "part #%d - retrying", p
);
2691 retry_for_contiguous_addr
= 1;
2692 remaining_retries
--;
2694 util_unmap_parts(rep
, 0, p
- 1);
2696 /* release rest of the VA reserved */
2697 ASSERTne(addr
, NULL
);
2698 ASSERTne(addr
, MAP_FAILED
);
2699 munmap(addr
, rep
->resvsize
- mapsize
);
2702 LOG(2, "usable space mapping failed - part #%d",
2707 VALGRIND_REGISTER_PMEM_FILE(rep
->part
[p
].fd
,
2708 rep
->part
[p
].addr
, rep
->part
[p
].size
,
2711 mapsize
+= rep
->part
[p
].size
;
2712 set
->zeroed
&= rep
->part
[p
].created
;
2713 addr
= (char *)addr
+ rep
->part
[p
].size
;
2715 } while (retry_for_contiguous_addr
);
2718 * Initially part[0].size is the size of address space
2719 * reservation for all parts from given replica. After
2720 * mapping that space we need to overwrite part[0].size
2721 * with its actual size to be consistent - size for each
2722 * part should be the actual mapping size of this part
2723 * only - it simplifies future calculations.
2725 rep
->part
[0].size
= rep
->part
[0].filesize
& ~(Mmap_align
- 1);
2727 if (util_replica_check_map_sync(set
, repidx
, 0))
2730 util_replica_set_is_pmem(rep
);
2732 if (Prefault_at_create
)
2733 util_replica_force_page_allocation(rep
);
2735 ASSERTeq(mapsize
, rep
->repsize
);
2737 LOG(3, "replica #%u addr %p", repidx
, rep
->part
[0].addr
);
2742 LOG(4, "error clean up");
2744 if (mapsize
< rep
->repsize
) {
2745 ASSERTne(rep
->part
[0].addr
, NULL
);
2746 ASSERTne(rep
->part
[0].addr
, MAP_FAILED
);
2747 munmap(rep
->part
[0].addr
, rep
->resvsize
- mapsize
);
2749 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
2750 util_unmap_part(&rep
->part
[p
]);
2757 * util_replica_init_headers_local -- (internal) initialize pool headers
2760 util_replica_init_headers_local(struct pool_set
*set
, unsigned repidx
,
2761 int flags
, const struct pool_attr
*attr
)
2763 LOG(3, "set %p repidx %u flags %d attr %p", set
, repidx
, flags
, attr
);
2765 struct pool_replica
*rep
= set
->replica
[repidx
];
2767 /* map all headers - don't care about the address */
2768 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
2769 if (util_map_hdr(&rep
->part
[p
], flags
, 0) != 0) {
2770 LOG(2, "header mapping failed - part #%d", p
);
2775 /* create headers, set UUID's */
2776 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
2777 if (util_header_create(set
, repidx
, p
, attr
, 0) != 0) {
2778 LOG(2, "header creation failed - part #%d", p
);
2783 /* unmap all headers */
2784 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++)
2785 util_unmap_hdr(&rep
->part
[p
]);
2790 LOG(4, "error clean up");
2792 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
2793 util_unmap_hdr(&rep
->part
[p
]);
2800 * util_replica_create_local -- (internal) create a new memory pool for local
2804 util_replica_create_local(struct pool_set
*set
, unsigned repidx
, int flags
,
2805 const struct pool_attr
*attr
)
2807 LOG(3, "set %p repidx %u flags %d attr %p", set
, repidx
, flags
, attr
);
2810 * the first replica has to be mapped prior to remote ones so if
2811 * a replica is already mapped skip mapping creation
2813 if (PART(REP(set
, repidx
), 0)->addr
== NULL
) {
2814 if (util_replica_map_local(set
, repidx
, flags
) != 0) {
2815 LOG(2, "replica #%u map failed", repidx
);
2823 if (util_replica_init_headers_local(set
, repidx
, flags
, attr
) != 0) {
2824 LOG(2, "replica #%u headers initialization failed", repidx
);
2831 * util_replica_create_remote -- (internal) create a new memory pool
2832 * for remote replica
2835 util_replica_create_remote(struct pool_set
*set
, unsigned repidx
, int flags
,
2836 const struct pool_attr
*attr
)
2838 LOG(3, "set %p repidx %u flags %d attr %p", set
, repidx
, flags
, attr
);
2840 struct pool_replica
*rep
= set
->replica
[repidx
];
2842 ASSERTne(rep
->remote
, NULL
);
2843 ASSERTne(rep
->part
, NULL
);
2844 ASSERTeq(rep
->nparts
, 1);
2845 ASSERTeq(rep
->nhdrs
, 1);
2846 ASSERTne(attr
, NULL
);
2848 struct pool_set_part
*part
= rep
->part
;
2851 * A remote replica has one fake part of size equal twice pool header
2852 * size for storing pool header and pool descriptor.
2854 part
->size
= rep
->repsize
;
2855 ASSERT(IS_PAGE_ALIGNED(part
->size
));
2856 part
->remote_hdr
= Zalloc(part
->size
+ Pagesize
);
2857 if (!part
->remote_hdr
) {
2862 part
->hdr
= PAGE_ALIGN_UP(part
->remote_hdr
);
2863 part
->addr
= PAGE_ALIGN_UP(part
->remote_hdr
);
2864 part
->hdrsize
= POOL_HDR_SIZE
;
2866 /* create header, set UUID's */
2867 if (util_header_create(set
, repidx
, 0, attr
, 0) != 0) {
2868 LOG(2, "header creation failed - part #0");
2869 Free(part
->remote_hdr
);
2873 LOG(3, "replica #%u addr %p", repidx
, rep
->part
[0].addr
);
2879 * util_replica_close -- close a memory pool replica
2881 * This function unmaps all mapped memory regions.
2884 util_replica_close(struct pool_set
*set
, unsigned repidx
)
2886 LOG(3, "set %p repidx %u", set
, repidx
);
2887 struct pool_replica
*rep
= set
->replica
[repidx
];
2889 if (rep
->remote
== NULL
) {
2890 struct pool_set_part
*part
= PART(rep
, 0);
2891 if (!set
->ignore_sds
&& part
->addr
!= NULL
&&
2893 struct pool_hdr
*hdr
= part
->addr
;
2894 RANGE_RW(hdr
, sizeof(*hdr
), part
->is_dev_dax
);
2896 * deep drain will call msync on one page in each
2897 * part in replica to trigger WPQ flush.
2898 * This pages may have been marked as
2899 * undefined/inaccessible, but msyncing such memory
2900 * is not a bug, so as a workaround temporarily
2901 * disable error reporting.
2903 VALGRIND_DO_DISABLE_ERROR_REPORTING
;
2904 util_replica_deep_drain(part
->addr
, rep
->repsize
,
2906 VALGRIND_DO_ENABLE_ERROR_REPORTING
;
2907 shutdown_state_clear_dirty(&hdr
->sds
, rep
);
2909 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++)
2910 util_unmap_hdr(&rep
->part
[p
]);
2912 rep
->part
[0].size
= rep
->resvsize
;
2913 util_unmap_part(&rep
->part
[0]);
2915 LOG(4, "freeing volatile header of remote replica #%u", repidx
);
2916 Free(rep
->part
[0].remote_hdr
);
2917 rep
->part
[0].remote_hdr
= NULL
;
2918 rep
->part
[0].hdr
= NULL
;
2919 rep
->part
[0].hdrsize
= 0;
2920 rep
->part
[0].addr
= NULL
;
2921 rep
->part
[0].size
= 0;
2928 * util_poolset_append_new_part -- (internal) creates a new part in each replica
2932 util_poolset_append_new_part(struct pool_set
*set
, size_t size
)
2934 LOG(3, "set %p size %zu", set
, size
);
2936 if (!set
->directory_based
)
2939 struct pool_set_directory
*d
;
2940 size_t directory_id
;
2945 for (r
= 0; r
< set
->nreplicas
; ++r
) {
2946 struct pool_replica
*rep
= set
->replica
[r
];
2948 directory_id
= set
->next_directory_id
%
2949 VEC_SIZE(&rep
->directory
);
2950 d
= VEC_GET(&rep
->directory
, directory_id
);
2952 path_len
= strlen(d
->path
) + PMEM_FILE_MAX_LEN
;
2953 if ((path
= Malloc(path_len
)) == NULL
) {
2958 snprintf(path
, path_len
, "%s" OS_DIR_SEP_STR
"%0*u%s",
2959 d
->path
, PMEM_FILE_PADDING
, set
->next_id
, PMEM_EXT
);
2961 if (util_replica_add_part(&set
->replica
[r
], path
, size
) != 0)
2962 FATAL("cannot add a new part to the replica info");
2965 set
->next_directory_id
+= 1;
2968 util_poolset_set_size(set
);
2973 /* for each replica 0..r-1 remove the last part */
2974 for (unsigned rn
= 0; rn
< r
; ++rn
) {
2975 struct pool_replica
*rep
= set
->replica
[rn
];
2976 unsigned pidx
= rep
->nparts
- 1;
2977 Free((void *)(rep
->part
[pidx
].path
));
2978 rep
->part
[pidx
].path
= NULL
;
2986 * util_pool_extend -- extends the poolset by the provided size
2989 util_pool_extend(struct pool_set
*set
, size_t *size
, size_t minpartsize
)
2991 LOG(3, "set %p size %zu minpartsize %zu", set
, *size
, minpartsize
);
2994 ERR("cannot extend pool by 0 bytes");
2998 if ((set
->options
& OPTION_SINGLEHDR
) == 0) {
3000 "extending the pool by appending parts with headers is not supported!");
3004 if (set
->poolsize
+ *size
> set
->resvsize
) {
3005 *size
= set
->resvsize
- set
->poolsize
;
3006 if (*size
< minpartsize
) {
3007 ERR("exceeded reservation size");
3010 LOG(4, "extend size adjusted to not exceed reservation size");
3013 size_t old_poolsize
= set
->poolsize
;
3015 if (util_poolset_append_new_part(set
, *size
) != 0) {
3016 ERR("unable to append a new part to the pool");
3020 size_t hdrsize
= (set
->options
& OPTION_SINGLEHDR
) ? 0 : Mmap_align
;
3022 void *addr_base
= NULL
;
3025 for (r
= 0; r
< set
->nreplicas
; r
++) {
3026 struct pool_replica
*rep
= set
->replica
[r
];
3027 unsigned pidx
= rep
->nparts
- 1;
3028 struct pool_set_part
*p
= &rep
->part
[pidx
];
3030 if (util_part_open(p
, 0, 1 /* create */) != 0) {
3031 ERR("cannot open the new part");
3035 addr
= (char *)rep
->part
[0].addr
+ old_poolsize
;
3036 if (addr_base
== NULL
)
3039 if (util_map_part(p
, addr
, 0, hdrsize
,
3040 MAP_SHARED
| MAP_FIXED
, 0) != 0) {
3041 ERR("cannot map the new part");
3046 * new part must be mapped the same way as all the rest
3049 if (p
->map_sync
!= rep
->part
[0].map_sync
) {
3051 ERR("new part cannot be mapped with MAP_SYNC");
3053 ERR("new part mapped with MAP_SYNC");
3058 /* XXX: mode should be the same as for pmemxxx_create() */
3059 if (util_poolset_chmod(set
, S_IWUSR
| S_IRUSR
))
3062 util_poolset_fdclose(set
);
3067 for (unsigned rn
= 0; rn
<= r
; ++rn
) {
3068 struct pool_replica
*rep
= set
->replica
[r
];
3069 unsigned pidx
= rep
->nparts
- 1;
3070 struct pool_set_part
*p
= &rep
->part
[pidx
];
3074 (void) os_close(p
->fd
);
3077 Free((void *)p
->path
);
3080 util_poolset_set_size(set
);
3086 * util_print_bad_files_cb -- (internal) callback printing names of pool files
3087 * containing bad blocks
3090 util_print_bad_files_cb(struct part_file
*pf
, void *arg
)
3092 if (!pf
->is_remote
&& pf
->part
&& pf
->part
->has_bad_blocks
)
3093 ERR("file contains bad blocks -- '%s'", pf
->part
->path
);
3099 * util_pool_create_uuids -- create a new memory pool (set or a single file)
3102 * On success returns 0 and a pointer to a newly allocated structure
3103 * containing the info of all the parts of the pool set and replicas.
3106 util_pool_create_uuids(struct pool_set
**setp
, const char *path
,
3107 size_t poolsize
, size_t minsize
, size_t minpartsize
,
3108 const struct pool_attr
*attr
, unsigned *nlanes
, int can_have_rep
,
3111 LOG(3, "setp %p path %s poolsize %zu minsize %zu minpartsize %zu "
3112 "pattr %p nlanes %p can_have_rep %i remote %i", setp
, path
,
3113 poolsize
, minsize
, minpartsize
, attr
, nlanes
, can_have_rep
,
3116 /* attributes cannot be NULL for local replicas */
3117 ASSERT(remote
|| attr
!= NULL
);
3119 int flags
= MAP_SHARED
;
3122 int exists
= util_file_exists(path
);
3126 /* check if file exists */
3127 if (poolsize
> 0 && exists
) {
3128 ERR("file %s already exists", path
);
3133 int ret
= util_poolset_create_set(setp
, path
, poolsize
, minsize
,
3136 LOG(2, "cannot create pool set -- '%s'", path
);
3140 struct pool_set
*set
= *setp
;
3142 ASSERT(set
->nreplicas
> 0);
3144 if (!remote
&& (set
->options
& OPTION_NOHDRS
)) {
3146 "the NOHDRS poolset option is not supported for local poolsets");
3148 goto err_poolset_free
;
3151 if ((attr
== NULL
) != ((set
->options
& OPTION_NOHDRS
) != 0)) {
3153 "pool attributes are not supported for poolsets without headers (with the NOHDRS option)");
3155 goto err_poolset_free
;
3158 if (set
->directory_based
&& ((set
->options
& OPTION_SINGLEHDR
) == 0)) {
3160 "directory based pools are not supported for poolsets with headers (without SINGLEHDR option)");
3162 goto err_poolset_free
;
3165 if (set
->resvsize
< minsize
) {
3166 ERR("reservation pool size %zu smaller than %zu",
3167 set
->resvsize
, minsize
);
3169 goto err_poolset_free
;
3172 if (set
->directory_based
&& set
->poolsize
== 0 &&
3173 util_poolset_append_new_part(set
, minsize
) != 0) {
3174 ERR("cannot create a new part in provided directories");
3175 goto err_poolset_free
;
3179 (attr
->features
.compat
& POOL_FEAT_CHECK_BAD_BLOCKS
)) {
3180 int bbs
= badblocks_check_poolset(set
, 1 /* create */);
3183 "failed to check pool set for bad blocks -- '%s'",
3185 goto err_poolset_free
;
3189 util_poolset_foreach_part_struct(set
,
3190 util_print_bad_files_cb
,
3193 "pool set contains bad blocks and cannot be created, run 'pmempool create --clear-bad-blocks' utility to clear bad blocks and create a pool");
3195 goto err_poolset_free
;
3199 if (set
->poolsize
< minsize
) {
3200 ERR("net pool size %zu smaller than %zu",
3201 set
->poolsize
, minsize
);
3203 goto err_poolset_free
;
3207 /* it is a remote replica - it cannot have replicas */
3208 if (set
->nreplicas
> 1) {
3209 LOG(2, "remote pool set cannot have replicas");
3211 goto err_poolset_free
;
3214 /* check if poolset options match remote pool attributes */
3216 ((set
->options
& OPTION_SINGLEHDR
) == 0) !=
3217 ((attr
->features
.incompat
&
3218 POOL_FEAT_SINGLEHDR
) == 0)) {
3220 "pool incompat feature flags and remote poolset options do not match");
3222 goto err_poolset_free
;
3226 if (!can_have_rep
&& set
->nreplicas
> 1) {
3227 ERR("replication not supported");
3229 goto err_poolset_free
;
3232 if (set
->remote
&& util_remote_load()) {
3234 "the pool set requires a remote replica, but the '%s' library cannot be loaded",
3236 goto err_poolset_free
;
3242 if (!util_is_zeroed(attr
->poolset_uuid
, POOL_HDR_UUID_LEN
)) {
3243 memcpy(set
->uuid
, attr
->poolset_uuid
,
3246 /* generate pool set UUID */
3247 ret
= util_uuid_generate(set
->uuid
);
3249 LOG(2, "cannot generate pool set UUID");
3254 /* generate UUID's for all the parts */
3255 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3256 struct pool_replica
*rep
= set
->replica
[r
];
3257 for (unsigned i
= 0; i
< rep
->nhdrs
; i
++) {
3258 ret
= util_uuid_generate(rep
->part
[i
].uuid
);
3261 "cannot generate pool set part UUID");
3267 /* overwrite UUID of the first part if given */
3268 if (!util_is_zeroed(attr
->first_part_uuid
, POOL_HDR_UUID_LEN
)) {
3269 memcpy(set
->replica
[0]->part
[0].uuid
,
3270 attr
->first_part_uuid
, POOL_HDR_UUID_LEN
);
3274 ret
= util_poolset_files_local(set
, minpartsize
, 1);
3278 /* map first local replica - it has to exist prior to remote ones */
3279 ret
= util_replica_map_local(set
, 0, flags
);
3283 /* prepare remote replicas first */
3285 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3286 if (REP(set
, r
)->remote
== NULL
) {
3289 if (util_replica_create_remote(set
, r
, flags
, attr
) !=
3291 LOG(2, "replica #%u creation failed", r
);
3296 ret
= util_poolset_files_remote(set
, minsize
, nlanes
,
3302 /* prepare local replicas */
3304 if (util_replica_create_local(set
, 0, flags
, attr
) != 0) {
3305 LOG(2, "replica #0 creation failed");
3309 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3310 if (REP(set
, r
)->remote
!= NULL
) {
3313 if (util_replica_create_local(set
, r
, flags
, attr
) !=
3315 LOG(2, "replica #%u creation failed", r
);
3325 for (unsigned r
= 0; r
< set
->nreplicas
; r
++)
3326 util_replica_close(set
, r
);
3330 util_poolset_close(set
, DELETE_CREATED_PARTS
);
3336 util_poolset_free(set
);
3342 * util_pool_create -- create a new memory pool (set or a single file)
3344 * On success returns 0 and a pointer to a newly allocated structure
3345 * containing the info of all the parts of the pool set and replicas.
3348 util_pool_create(struct pool_set
**setp
, const char *path
, size_t poolsize
,
3349 size_t minsize
, size_t minpartsize
, const struct pool_attr
*attr
,
3350 unsigned *nlanes
, int can_have_rep
)
3352 LOG(3, "setp %p path %s poolsize %zu minsize %zu minpartsize %zu "
3353 "attr %p nlanes %p can_have_rep %i", setp
, path
, poolsize
,
3354 minsize
, minpartsize
, attr
, nlanes
, can_have_rep
);
3356 return util_pool_create_uuids(setp
, path
, poolsize
, minsize
,
3357 minpartsize
, attr
, nlanes
, can_have_rep
, POOL_LOCAL
);
3361 * util_replica_open_local -- (internal) open a memory pool local replica
3364 util_replica_open_local(struct pool_set
*set
, unsigned repidx
, int flags
)
3366 LOG(3, "set %p repidx %u flags %d", set
, repidx
, flags
);
3368 int remaining_retries
= 10;
3369 int retry_for_contiguous_addr
;
3371 size_t hdrsize
= (set
->options
& (OPTION_SINGLEHDR
| OPTION_NOHDRS
)) ?
3373 struct pool_replica
*rep
= set
->replica
[repidx
];
3377 retry_for_contiguous_addr
= 0;
3379 /* determine a hint address for mmap() if not specified */
3381 addr
= util_map_hint(rep
->resvsize
, 0);
3382 if (addr
== MAP_FAILED
) {
3383 LOG(1, "cannot find a contiguous region of given size");
3387 mapsize
= rep
->part
[0].filesize
& ~(Mmap_align
- 1);
3389 /* map the first part and reserve space for remaining parts */
3390 if (util_map_part(&rep
->part
[0], addr
, rep
->resvsize
, 0,
3392 LOG(2, "pool mapping failed - replica #%u part #0",
3397 VALGRIND_REGISTER_PMEM_MAPPING(rep
->part
[0].addr
,
3399 VALGRIND_REGISTER_PMEM_FILE(rep
->part
[0].fd
,
3400 rep
->part
[0].addr
, rep
->resvsize
, 0);
3402 /* map all headers - don't care about the address */
3403 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
3404 if (util_map_hdr(&rep
->part
[p
], flags
, 0) != 0) {
3405 LOG(2, "header mapping failed - part #%d", p
);
3410 addr
= (char *)rep
->part
[0].addr
+ mapsize
;
3413 * map the remaining parts of the usable pool space
3414 * (aligned to memory mapping granularity)
3416 for (unsigned p
= 1; p
< rep
->nparts
; p
++) {
3417 struct pool_set_part
*part
= &rep
->part
[p
];
3418 size_t targetsize
= mapsize
+
3419 ALIGN_DOWN(part
->filesize
- hdrsize
,
3421 if (targetsize
> rep
->resvsize
) {
3423 "pool mapping failed - address space reservation too small");
3429 if (util_map_part(part
, addr
, 0, hdrsize
,
3430 flags
| MAP_FIXED
, 0) != 0) {
3432 * if we can't map the part at the address we
3433 * asked for, unmap all the parts that are
3434 * mapped and remap at a different address.
3436 if ((errno
== EINVAL
) &&
3437 (remaining_retries
> 0)) {
3438 LOG(2, "usable space mapping failed - "
3439 "part #%d - retrying", p
);
3440 retry_for_contiguous_addr
= 1;
3441 remaining_retries
--;
3443 util_unmap_parts(rep
, 0, p
- 1);
3445 /* release rest of the VA reserved */
3446 munmap(rep
->part
[0].addr
,
3450 LOG(2, "usable space mapping failed - part #%d",
3455 VALGRIND_REGISTER_PMEM_FILE(part
->fd
,
3456 part
->addr
, part
->size
, hdrsize
);
3458 mapsize
+= part
->size
;
3459 addr
= (char *)addr
+ part
->size
;
3461 } while (retry_for_contiguous_addr
);
3464 * Initially part[0].size is the size of address space
3465 * reservation for all parts from given replica. After
3466 * mapping that space we need to overwrite part[0].size
3467 * with its actual size to be consistent - size for each
3468 * part should be the actual mapping size of this part
3469 * only - it simplifies future calculations.
3471 rep
->part
[0].size
= rep
->part
[0].filesize
& ~(Mmap_align
- 1);
3473 if (util_replica_check_map_sync(set
, repidx
, 1))
3476 util_replica_set_is_pmem(rep
);
3478 if (Prefault_at_open
)
3479 util_replica_force_page_allocation(rep
);
3481 ASSERTeq(mapsize
, rep
->repsize
);
3483 /* calculate pool size - choose the smallest replica size */
3484 if (rep
->repsize
< set
->poolsize
)
3485 set
->poolsize
= rep
->repsize
;
3487 LOG(3, "replica addr %p", rep
->part
[0].addr
);
3491 LOG(4, "error clean up");
3493 if (mapsize
< rep
->repsize
) {
3494 ASSERTne(rep
->part
[0].addr
, NULL
);
3495 ASSERTne(rep
->part
[0].addr
, MAP_FAILED
);
3496 munmap(rep
->part
[0].addr
, rep
->resvsize
- mapsize
);
3498 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++)
3499 util_unmap_hdr(&rep
->part
[p
]);
3500 for (unsigned p
= 0; p
< rep
->nparts
; p
++)
3501 util_unmap_part(&rep
->part
[p
]);
3507 * util_replica_open_remote -- open a memory pool for remote replica
3510 util_replica_open_remote(struct pool_set
*set
, unsigned repidx
, int flags
)
3512 LOG(3, "set %p repidx %u flags %d", set
, repidx
, flags
);
3514 struct pool_replica
*rep
= set
->replica
[repidx
];
3516 ASSERTne(rep
->remote
, NULL
);
3517 ASSERTne(rep
->part
, NULL
);
3518 ASSERTeq(rep
->nparts
, 1);
3519 ASSERTeq(rep
->nhdrs
, 1);
3521 struct pool_set_part
*part
= rep
->part
;
3523 part
->size
= rep
->repsize
;
3524 ASSERT(IS_PAGE_ALIGNED(part
->size
));
3525 part
->remote_hdr
= Zalloc(part
->size
+ Pagesize
);
3526 if (!part
->remote_hdr
) {
3531 part
->hdr
= PAGE_ALIGN_UP(part
->remote_hdr
);
3532 part
->addr
= PAGE_ALIGN_UP(part
->remote_hdr
);
3533 part
->hdrsize
= POOL_HDR_SIZE
;
3535 LOG(3, "replica #%u addr %p", repidx
, rep
->part
[0].addr
);
3541 * util_replica_open -- open a memory pool replica
3544 util_replica_open(struct pool_set
*set
, unsigned repidx
, int flags
)
3546 LOG(3, "set %p repidx %u flags %d", set
, repidx
, flags
);
3548 if (set
->replica
[repidx
]->remote
)
3549 return util_replica_open_remote(set
, repidx
, flags
);
3551 return util_replica_open_local(set
, repidx
, flags
);
3555 * util_replica_set_attr -- overwrite existing replica attributes
3558 util_replica_set_attr(struct pool_replica
*rep
,
3559 const struct rpmem_pool_attr
*rattr
)
3561 LOG(3, "rep %p, rattr %p", rep
, rattr
);
3562 ASSERT(rattr
!= NULL
|| rep
->nhdrs
== 0);
3564 if (rattr
!= NULL
&& rep
->nhdrs
== 0) {
3566 "cannot set pool attributes for a replica without headers (with the NOHDRS option)");
3570 int flags
= MAP_SHARED
;
3572 /* map all headers - don't care about the address */
3573 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
3574 if (util_map_hdr(&rep
->part
[p
], flags
, 0) != 0) {
3575 LOG(2, "header mapping failed - part #%d", p
);
3580 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
3581 ASSERTne(rattr
, NULL
);
3583 struct pool_hdr
*hdrp
= HDR(rep
, p
);
3584 ASSERTne(hdrp
, NULL
);
3585 util_convert2h_hdr_nocheck(hdrp
);
3587 util_set_rpmem_attr(hdrp
, rattr
);
3589 if (hdrp
== HDR(rep
, 0))
3590 memcpy(hdrp
->uuid
, rattr
->uuid
, POOL_HDR_UUID_LEN
);
3591 if (hdrp
== HDRP(rep
, 0))
3592 memcpy(hdrp
->next_part_uuid
, rattr
->uuid
,
3594 if (hdrp
== HDRN(rep
, 0))
3595 memcpy(hdrp
->prev_part_uuid
, rattr
->uuid
,
3598 util_convert2le_hdr(hdrp
);
3600 util_checksum(hdrp
, sizeof(*hdrp
), &hdrp
->checksum
,
3601 1, POOL_HDR_CSUM_END_OFF(hdrp
));
3603 /* store pool's header */
3604 util_persist_auto(rep
->is_pmem
, hdrp
, sizeof(*hdrp
));
3607 /* unmap all headers */
3608 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++)
3609 util_unmap_hdr(&rep
->part
[p
]);
3613 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
3614 util_unmap_hdr(&rep
->part
[p
]);
3620 * util_get_attr_from_header -- get pool attributes from a pool header
3623 util_pool_hdr2attr(struct pool_attr
*attr
, struct pool_hdr
*hdr
)
3625 LOG(3, "attr %p, hdr %p", attr
, hdr
);
3626 ASSERTne(attr
, NULL
);
3627 ASSERTne(hdr
, NULL
);
3628 memset(attr
, 0, sizeof(*attr
));
3629 memcpy(attr
->signature
, hdr
->signature
, POOL_HDR_SIG_LEN
);
3630 attr
->major
= hdr
->major
;
3631 attr
->features
.compat
= hdr
->features
.compat
;
3632 attr
->features
.incompat
= hdr
->features
.incompat
;
3633 attr
->features
.ro_compat
= hdr
->features
.ro_compat
;
3634 memcpy(attr
->poolset_uuid
, hdr
->poolset_uuid
, POOL_HDR_UUID_LEN
);
3638 * util_copy_attr_to_header -- copy pool attributes into pool header
3641 util_pool_attr2hdr(struct pool_hdr
*hdr
, const struct pool_attr
*attr
)
3643 LOG(3, "hdr %p, attr %p", hdr
, attr
);
3644 ASSERTne(hdr
, NULL
);
3645 ASSERTne(attr
, NULL
);
3646 memcpy(hdr
->signature
, attr
->signature
, POOL_HDR_SIG_LEN
);
3647 hdr
->major
= attr
->major
;
3648 hdr
->features
.compat
= attr
->features
.compat
;
3649 hdr
->features
.incompat
= attr
->features
.incompat
;
3650 hdr
->features
.ro_compat
= attr
->features
.ro_compat
;
3654 * util_unmap_all_hdrs -- unmap all pool set headers
3657 util_unmap_all_hdrs(struct pool_set
*set
)
3659 LOG(3, "set %p", set
);
3661 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3662 struct pool_replica
*rep
= set
->replica
[r
];
3663 if (rep
->remote
== NULL
) {
3664 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++)
3665 util_unmap_hdr(&rep
->part
[p
]);
3668 * hdr & hdrsize were set only for util_header_check(),
3669 * they will not be used any more. The memory will be
3670 * freed by util_replica_close()
3672 rep
->part
[0].hdr
= NULL
;
3673 rep
->part
[0].hdrsize
= 0;
3679 * util_replica_check -- check headers, check UUID's, check replicas linkage
3682 util_replica_check(struct pool_set
*set
, const struct pool_attr
*attr
)
3684 LOG(3, "set %p attr %p", set
, attr
);
3686 /* read shutdown state toggle from header */
3687 set
->ignore_sds
|= IGNORE_SDS(HDR(REP(set
, 0), 0));
3689 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3690 struct pool_replica
*rep
= set
->replica
[r
];
3691 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
3692 if (util_header_check(set
, r
, p
, attr
) != 0) {
3693 LOG(2, "header check failed - part #%d", p
);
3696 set
->rdonly
|= rep
->part
[p
].rdonly
;
3699 if (memcmp(HDR(REPP(set
, r
), 0)->uuid
,
3700 HDR(REP(set
, r
), 0)->prev_repl_uuid
,
3701 POOL_HDR_UUID_LEN
) ||
3702 memcmp(HDR(REPN(set
, r
), 0)->uuid
,
3703 HDR(REP(set
, r
), 0)->next_repl_uuid
,
3704 POOL_HDR_UUID_LEN
)) {
3705 ERR("wrong replica UUID");
3709 if (!set
->ignore_sds
&& !rep
->remote
&& rep
->nhdrs
) {
3710 struct shutdown_state sds
;
3711 shutdown_state_init(&sds
, NULL
);
3712 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
3713 if (shutdown_state_add_part(&sds
,
3714 PART(rep
, p
)->fd
, NULL
))
3718 ASSERTne(rep
->nhdrs
, 0);
3719 ASSERTne(rep
->nparts
, 0);
3720 if (shutdown_state_check(&sds
, &HDR(rep
, 0)->sds
,
3722 LOG(2, "ADR failure detected");
3726 shutdown_state_set_dirty(&HDR(rep
, 0)->sds
,
3734 * util_pool_has_device_dax -- (internal) check if poolset has any device dax
3737 util_pool_has_device_dax(struct pool_set
*set
)
3739 for (unsigned r
= 0; r
< set
->nreplicas
; ++r
) {
3740 struct pool_replica
*rep
= REP(set
, r
);
3741 /* either all the parts must be Device DAX or none */
3742 if (PART(rep
, 0)->is_dev_dax
)
3749 * util_pool_open_nocheck -- open a memory pool (set or a single file)
3751 * This function opens a pool set without checking the header values.
3754 util_pool_open_nocheck(struct pool_set
*set
, unsigned flags
)
3756 LOG(3, "set %p flags 0x%x", set
, flags
);
3758 int cow
= flags
& POOL_OPEN_COW
;
3760 if (cow
&& util_pool_has_device_dax(set
)) {
3761 ERR("device dax cannot be mapped privately");
3766 int mmap_flags
= cow
? MAP_PRIVATE
|MAP_NORESERVE
: MAP_SHARED
;
3769 ASSERTne(set
, NULL
);
3770 ASSERT(set
->nreplicas
> 0);
3772 if (flags
& POOL_OPEN_CHECK_BAD_BLOCKS
) {
3773 /* check if any bad block recovery file exists */
3774 int bfe
= badblocks_recovery_file_exists(set
);
3777 "error: a bad block recovery file exists, run 'pmempool sync --bad-blocks' utility to try to recover the pool");
3783 "an error occurred when checking whether recovery file exists.");
3787 int bbs
= badblocks_check_poolset(set
, 0 /* not create */);
3789 LOG(1, "failed to check pool set for bad blocks");
3794 if (flags
& POOL_OPEN_IGNORE_BAD_BLOCKS
) {
3796 "WARNING: pool set contains bad blocks, ignoring");
3799 "pool set contains bad blocks and cannot be opened, run 'pmempool sync --bad-blocks' utility to try to recover the pool");
3806 if (set
->remote
&& util_remote_load()) {
3807 ERR("the pool set requires a remote replica, "
3808 "but the '%s' library cannot be loaded",
3813 int ret
= util_poolset_files_local(set
, 0 /* minpartsize */, 0);
3819 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3820 if (util_replica_open(set
, r
, mmap_flags
) != 0) {
3821 LOG(2, "replica #%u open failed", r
);
3827 ret
= util_poolset_files_remote(set
, 0, NULL
, 0);
3832 util_unmap_all_hdrs(set
);
3837 LOG(4, "error clean up");
3839 for (unsigned r
= 0; r
< set
->nreplicas
; r
++)
3840 util_replica_close(set
, r
);
3844 util_poolset_close(set
, DO_NOT_DELETE_PARTS
);
3850 * util_read_compat_features -- (internal) read compat features from the header
3853 util_read_compat_features(struct pool_set
*set
, uint32_t *compat_features
)
3855 LOG(3, "set %p pcompat_features %p", set
, compat_features
);
3857 *compat_features
= 0;
3859 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
3860 struct pool_replica
*rep
= set
->replica
[r
];
3865 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
3866 struct pool_set_part
*part
= &rep
->part
[p
];
3868 if (util_part_open(part
, 0, 0 /* create */)) {
3869 LOG(1, "!cannot open the part -- \"%s\"",
3871 /* try to open the next part */
3875 if (util_map_hdr(part
, MAP_SHARED
, 0) != 0) {
3876 LOG(1, "header mapping failed -- \"%s\"",
3878 util_part_fdclose(part
);
3882 struct pool_hdr
*hdrp
= part
->hdr
;
3883 *compat_features
= hdrp
->features
.compat
;
3885 util_unmap_hdr(part
);
3886 util_part_fdclose(part
);
3888 /* exit on the first successfully opened part */
3897 * unlink_remote_replicas -- removes remote replicas from poolset
3899 * It is necessary when COW flag is set because remote replicas
3900 * cannot be mapped privately
3903 unlink_remote_replicas(struct pool_set
*set
)
3906 while (i
< set
->nreplicas
) {
3907 if (set
->replica
[i
]->remote
== NULL
) {
3912 util_replica_close(set
, i
);
3913 int ret
= util_replica_close_remote(set
->replica
[i
], i
,
3914 DO_NOT_DELETE_PARTS
);
3918 size_t size
= sizeof(set
->replica
[i
]) *
3919 (set
->nreplicas
- i
- 1);
3920 memmove(&set
->replica
[i
], &set
->replica
[i
+ 1], size
);
3929 * util_pool_open -- open a memory pool (set or a single file)
3931 * This routine does all the work, but takes a rdonly flag so internal
3932 * calls can map a read-only pool if required.
3935 util_pool_open(struct pool_set
**setp
, const char *path
, size_t minpartsize
,
3936 const struct pool_attr
*attr
, unsigned *nlanes
, void *addr
,
3939 LOG(3, "setp %p path %s minpartsize %zu attr %p nlanes %p "
3940 "addr %p flags 0x%x ", setp
, path
, minpartsize
, attr
, nlanes
,
3943 int cow
= flags
& POOL_OPEN_COW
;
3944 int mmap_flags
= cow
? MAP_PRIVATE
|MAP_NORESERVE
: MAP_SHARED
;
3947 /* do not check minsize */
3948 int ret
= util_poolset_create_set(setp
, path
, 0, 0,
3949 flags
& POOL_OPEN_IGNORE_SDS
);
3951 LOG(2, "cannot open pool set -- '%s'", path
);
3955 if ((*setp
)->replica
[0]->nparts
== 0) {
3957 ERR("!no parts in replicas");
3958 goto err_poolset_free
;
3961 if (cow
&& (*setp
)->replica
[0]->part
[0].is_dev_dax
) {
3962 ERR("device dax cannot be mapped privately");
3964 goto err_poolset_free
;
3967 struct pool_set
*set
= *setp
;
3969 ASSERT(set
->nreplicas
> 0);
3971 uint32_t compat_features
;
3973 if (util_read_compat_features(set
, &compat_features
)) {
3974 LOG(1, "reading compat features failed");
3975 goto err_poolset_free
;
3978 if (compat_features
& POOL_FEAT_CHECK_BAD_BLOCKS
) {
3979 /* check if any bad block recovery file exists */
3980 int bfe
= badblocks_recovery_file_exists(set
);
3983 "error: a bad block recovery file exists, run 'pmempool sync --bad-blocks' utility to try to recover the pool");
3985 goto err_poolset_free
;
3990 "an error occurred when checking whether recovery file exists.");
3991 goto err_poolset_free
;
3994 int bbs
= badblocks_check_poolset(set
, 0 /* not create */);
3997 "failed to check pool set for bad blocks -- '%s'",
3999 goto err_poolset_free
;
4003 if (flags
& POOL_OPEN_IGNORE_BAD_BLOCKS
) {
4005 "WARNING: pool set contains bad blocks, ignoring -- '%s'",
4009 "pool set contains bad blocks and cannot be opened, run 'pmempool sync --bad-blocks' utility to try to recover the pool -- '%s'",
4012 goto err_poolset_free
;
4017 if (set
->remote
&& util_remote_load()) {
4019 "the pool set requires a remote replica, but the '%s' library cannot be loaded",
4021 goto err_poolset_free
;
4024 ret
= util_poolset_files_local(set
, minpartsize
, 0);
4028 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
4029 if (util_replica_open(set
, r
, mmap_flags
) != 0) {
4030 LOG(2, "replica #%u open failed", r
);
4036 /* do not check minsize */
4037 ret
= util_poolset_files_remote(set
, 0, nlanes
, 0);
4042 /* check headers, check UUID's, check replicas linkage */
4043 if (attr
!= NULL
&& util_replica_check(set
, attr
))
4046 /* unmap all headers */
4047 util_unmap_all_hdrs(set
);
4049 /* remove all remote replicas from poolset when cow */
4050 if (cow
&& set
->remote
) {
4051 ret
= unlink_remote_replicas(set
);
4059 LOG(4, "error clean up");
4061 for (unsigned r
= 0; r
< set
->nreplicas
; r
++)
4062 util_replica_close(set
, r
);
4066 util_poolset_close(set
, DO_NOT_DELETE_PARTS
);
4072 util_poolset_free(*setp
);
4078 * util_pool_open_remote -- open a remote pool set file
4080 * This routine does all the work, but takes a rdonly flag so internal
4081 * calls can map a read-only pool if required.
4084 util_pool_open_remote(struct pool_set
**setp
, const char *path
, int cow
,
4085 size_t minpartsize
, struct rpmem_pool_attr
*rattr
)
4087 LOG(3, "setp %p path %s cow %d minpartsize %zu rattr %p",
4088 setp
, path
, cow
, minpartsize
, rattr
);
4090 int flags
= cow
? MAP_PRIVATE
|MAP_NORESERVE
: MAP_SHARED
;
4093 /* do not check minsize */
4094 int ret
= util_poolset_create_set(setp
, path
, 0, 0, 0);
4096 LOG(2, "cannot open pool set -- '%s'", path
);
4100 if (cow
&& (*setp
)->replica
[0]->part
[0].is_dev_dax
) {
4101 ERR("device dax cannot be mapped privately");
4106 struct pool_set
*set
= *setp
;
4108 if (set
->nreplicas
> 1) {
4109 LOG(2, "remote pool set cannot have replicas");
4113 uint32_t compat_features
;
4115 if (util_read_compat_features(set
, &compat_features
)) {
4116 LOG(1, "reading compat features failed");
4120 if (compat_features
& POOL_FEAT_CHECK_BAD_BLOCKS
) {
4121 /* check if there are any bad blocks */
4122 int bbs
= badblocks_check_poolset(set
, 0 /* not create */);
4125 "failed to check the remote replica for bad blocks -- '%s'",
4132 "remote replica contains bad blocks and cannot be opened, run 'pmempool sync --bad-blocks' utility to recreate it -- '%s'",
4139 ret
= util_poolset_files_local(set
, minpartsize
, 0);
4143 if (util_replica_open(set
, 0, flags
) != 0) {
4144 LOG(2, "replica open failed");
4148 struct pool_replica
*rep
= set
->replica
[0];
4150 set
->rdonly
|= rep
->part
[0].rdonly
;
4152 /* check headers, check UUID's, check replicas linkage */
4153 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++) {
4154 if (util_header_check_remote(set
, p
) != 0) {
4155 LOG(2, "header check failed - part #%d", p
);
4158 set
->rdonly
|= rep
->part
[p
].rdonly
;
4161 if (rep
->nhdrs
> 0) {
4162 /* header exists, copy pool attributes */
4163 struct pool_hdr
*hdr
= rep
->part
[0].hdr
;
4164 util_get_rpmem_attr(rattr
, hdr
);
4166 /* header does not exist, zero pool attributes */
4167 memset(rattr
, 0, sizeof(*rattr
));
4170 /* unmap all headers */
4171 for (unsigned p
= 0; p
< rep
->nhdrs
; p
++)
4172 util_unmap_hdr(&rep
->part
[p
]);
4177 LOG(4, "error clean up");
4179 util_replica_close(set
, 0);
4183 util_poolset_close(set
, DO_NOT_DELETE_PARTS
);
4189 * util_is_poolset_file -- check if specified file is a poolset file
4197 util_is_poolset_file(const char *path
)
4199 enum file_type type
= util_file_get_type(path
);
4203 if (type
== TYPE_DEVDAX
)
4206 int fd
= util_file_open(path
, NULL
, 0, O_RDONLY
);
4212 char signature
[POOLSET_HDR_SIG_LEN
];
4215 sret
= util_read(fd
, &signature
[rd
], sizeof(signature
) - rd
);
4223 } else if (rd
!= sizeof(signature
)) {
4228 if (memcmp(signature
, POOLSET_HDR_SIG
, POOLSET_HDR_SIG_LEN
) == 0)
4235 * util_poolset_foreach_part_struct -- walk through all poolset file parts
4238 * Stops processing if callback returns non-zero value.
4239 * The value returned by callback is returned to the caller.
4242 util_poolset_foreach_part_struct(struct pool_set
*set
,
4243 int (*callback
)(struct part_file
*pf
, void *arg
), void *arg
)
4245 LOG(3, "set %p callback %p arg %p", set
, callback
, arg
);
4247 ASSERTne(callback
, NULL
);
4251 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
4252 struct part_file cbdata
;
4253 if (set
->replica
[r
]->remote
) {
4254 cbdata
.is_remote
= 1;
4255 cbdata
.remote
= set
->replica
[r
]->remote
;
4257 ret
= (*callback
)(&cbdata
, arg
);
4261 cbdata
.is_remote
= 0;
4262 cbdata
.remote
= NULL
;
4263 for (unsigned p
= 0; p
< set
->replica
[r
]->nparts
; p
++) {
4264 cbdata
.part
= &set
->replica
[r
]->part
[p
];
4265 ret
= (*callback
)(&cbdata
, arg
);
4276 * util_poolset_foreach_part -- walk through all poolset file parts
4278 * Stops processing if callback returns non-zero value.
4279 * The value returned by callback is returned to the caller.
4282 * 0 - all part files have been processed
4283 * -1 - parsing poolset file error
/*
 * NOTE(review): the error-path returns, the close of 'fd', and the
 * closing braces are elided from this extract.
 */
4286 util_poolset_foreach_part(const char *path
,
4287 int (*callback
)(struct part_file
*pf
, void *arg
), void *arg
)
4289 LOG(3, "path %s callback %p arg %p", path
, callback
, arg
);
4291 ASSERTne(callback
, NULL
);
/* open the poolset descriptor file itself */
4293 int fd
= os_open(path
, O_RDONLY
);
4295 ERR("!open: path \"%s\"", path
);
/* parse the descriptor into an in-memory pool_set structure */
4299 struct pool_set
*set
;
4300 int ret
= util_poolset_parse(&set
, path
, fd
);
4302 ERR("util_poolset_parse failed -- '%s'", path
);
/* delegate the actual iteration to util_poolset_foreach_part_struct */
4307 ret
= util_poolset_foreach_part_struct(set
, callback
, arg
);
4310 * Make sure callback does not return -1,
4311 * because this value is reserved for parsing
/* release the parsed pool_set before returning */
4315 util_poolset_free(set
);
4323 * util_poolset_size -- get size of poolset, returns 0 on error
/*
 * NOTE(review): the open-failure check, the declaration of 'size', the
 * error labels/returns and the close of 'fd' are elided from this extract.
 */
4326 util_poolset_size(const char *path
)
/* open and parse the poolset descriptor file */
4328 int fd
= os_open(path
, O_RDONLY
);
4333 struct pool_set
*set
;
4334 if (util_poolset_parse(&set
, path
, fd
))
/* total size of the poolset as computed by the parser */
4337 size
= set
->poolsize
;
4339 util_poolset_free(set
);
4346 * util_replica_fdclose -- close all parts of given replica
4349 util_replica_fdclose(struct pool_replica
*rep
)
4351 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
4352 struct pool_set_part
*part
= &rep
->part
[p
];
4353 util_part_fdclose(part
);
4358 * util_replica_deep_common -- performs common calculations
4359 * on all parts from replica to define intersection ranges
4360 * for final flushing operations that take place in
4361 * os_part_deep_common function.
/*
 * NOTE(review): the 'continue' for non-overlapping parts, the error
 * return inside the loop, the final return and braces are elided here.
 */
4364 util_replica_deep_common(const void *addr
, size_t len
, struct pool_set
*set
,
4365 unsigned replica_id
, int flush
)
4367 LOG(3, "addr %p len %zu set %p replica_id %u flush %d",
4368 addr
, len
, set
, replica_id
, flush
);
/* address range of the whole replica: [rep_start, rep_end) */
4370 struct pool_replica
*rep
= set
->replica
[replica_id
];
4371 uintptr_t rep_start
= (uintptr_t)rep
->part
[0].addr
;
4372 uintptr_t rep_end
= rep_start
+ rep
->repsize
;
/* requested range: [start, end) */
4373 uintptr_t start
= (uintptr_t)addr
;
4374 uintptr_t end
= start
+ len
;
/* the requested range must lie entirely within the replica */
4376 ASSERT(start
>= rep_start
);
4377 ASSERT(end
<= rep_end
);
/* intersect the requested range with each part of the replica */
4379 for (unsigned p
= 0; p
< rep
->nparts
; p
++) {
4380 struct pool_set_part
*part
= &rep
->part
[p
];
4381 uintptr_t part_start
= (uintptr_t)part
->addr
;
4382 uintptr_t part_end
= part_start
+ part
->size
;
4383 /* init intersection start and end addresses */
4384 uintptr_t range_start
= start
;
4385 uintptr_t range_end
= end
;
/* this part does not overlap the range (the skip is on an elided line) */
4387 if (part_start
> end
|| part_end
< start
)
4389 /* recalculate intersection addresses */
4390 if (part_start
> start
)
4391 range_start
= part_start
;
/* clamp the end of the range to the end of this part */
4393 range_end
= part_end
;
4394 size_t range_len
= range_end
- range_start
;
4396 LOG(15, "perform deep flushing for replica %u "
4397 "part %p, addr %p, len %lu",
4398 replica_id
, part
, (void *)range_start
, range_len
);
/* per-part deep flush; a non-zero result is logged as an error */
4399 if (os_part_deep_common(rep
, p
, (void *)range_start
,
4400 range_len
, flush
)) {
4401 LOG(1, "os_part_deep_common(%p, %p, %lu)",
4402 part
, (void *)range_start
, range_len
);
/*
 * util_replica_deep_persist -- wrapper for util_replica_deep_common
 * Calling the target precedes initialization of function that
 * partly defines way of deep replica flushing.
 */
int
util_replica_deep_persist(const void *addr, size_t len, struct pool_set *set,
	unsigned replica_id)
{
	LOG(3, "addr %p len %zu set %p replica_id %u",
		addr, len, set, replica_id);

	/* persist variant: ask the common routine to flush (flush != 0) */
	const int flush = 1;
	return util_replica_deep_common(addr, len, set, replica_id, flush);
}
/*
 * util_replica_deep_drain -- wrapper for util_replica_deep_common
 * Calling the target precedes initialization of function that
 * partly defines way of deep replica flushing.
 */
int
util_replica_deep_drain(const void *addr, size_t len, struct pool_set *set,
	unsigned replica_id)
{
	LOG(3, "addr %p len %zu set %p replica_id %u",
		addr, len, set, replica_id);

	/* drain variant: no flush requested from the common routine */
	const int flush = 0;
	return util_replica_deep_common(addr, len, set, replica_id, flush);
}