]> git.proxmox.com Git - ceph.git/blob - ceph/src/pmdk/src/common/set.c
import ceph 16.2.7
[ceph.git] / ceph / src / pmdk / src / common / set.c
1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2015-2020, Intel Corporation */
3 /*
4 * Copyright (c) 2016, Microsoft Corporation. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * * Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * * Neither the name of the copyright holder nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 */
34
35 /*
36 * set.c -- pool set utilities
37 */
38
39 #ifndef _GNU_SOURCE
40 #define _GNU_SOURCE
41 #endif
42
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include <sys/stat.h>
47 #include <sys/mman.h>
48 #include <fcntl.h>
49 #include <unistd.h>
50 #include <stdint.h>
51 #include <endian.h>
52 #include <errno.h>
53 #include <stddef.h>
54 #include <time.h>
55 #include <ctype.h>
56 #include <linux/limits.h>
57 #include <sys/mman.h>
58
59 #include "libpmem.h"
60 #include "librpmem.h"
61 #include "set.h"
62 #include "file.h"
63 #include "os.h"
64 #include "mmap.h"
65 #include "util.h"
66 #include "out.h"
67 #include "dlsym.h"
68 #include "valgrind_internal.h"
69 #include "sys_util.h"
70 #include "util_pmem.h"
71 #include "fs.h"
72 #include "os_deep.h"
73 #include "set_badblocks.h"
74
75 #define LIBRARY_REMOTE "librpmem.so.1"
76 #define SIZE_AUTODETECT_STR "AUTO"
77
78 #define PMEM_EXT ".pmem"
79 #define PMEM_EXT_LEN sizeof(PMEM_EXT)
80 #define PMEM_FILE_PADDING 6
81 #define PMEM_FILE_NAME_MAX_LEN 20
82 #define PMEM_FILE_MAX_LEN (PMEM_FILE_NAME_MAX_LEN + PMEM_FILE_PADDING)
83
/*
 * Entry points resolved at runtime from LIBRARY_REMOTE (librpmem); see
 * util_remote_load().  The library is dlopen()ed lazily so that remote
 * replication support is only required when a pool set actually uses it.
 */
static RPMEMpool *(*Rpmem_create)(const char *target, const char *pool_set_name,
	void *pool_addr, size_t pool_size, unsigned *nlanes,
	const struct rpmem_pool_attr *rpmem_attr);
static RPMEMpool *(*Rpmem_open)(const char *target, const char *pool_set_name,
	void *pool_addr, size_t pool_size, unsigned *nlanes,
	struct rpmem_pool_attr *rpmem_attr);
int (*Rpmem_close)(RPMEMpool *rpp);
int (*Rpmem_persist)(RPMEMpool *rpp, size_t offset, size_t length,
	unsigned lane, unsigned flags);
int (*Rpmem_deep_persist)(RPMEMpool *rpp, size_t offset, size_t length,
	unsigned lane);
int (*Rpmem_read)(RPMEMpool *rpp, void *buff, size_t offset,
	size_t length, unsigned lane);
int (*Rpmem_remove)(const char *target, const char *pool_set_name, int flags);
int (*Rpmem_set_attr)(RPMEMpool *rpp, const struct rpmem_pool_attr *rattr);

/* nonzero once util_remote_init() has initialized Remote_lock */
static int Remote_replication_available;
/* serializes loading/unloading of the librpmem handle below */
static os_mutex_t Remote_lock;
/* dlopen() handle for LIBRARY_REMOTE; NULL when the library is not loaded */
static void *Rpmem_handle_remote;

/* runtime knobs; presumably set via ctl/env from outside this file — confirm */
int Prefault_at_open = 0;
int Prefault_at_create = 0;
int SDS_at_create = POOL_FEAT_INCOMPAT_DEFAULT & POOL_E_FEAT_SDS ? 1 : 0;
int Fallocate_at_create = 1;
int COW_at_open = 0;

/* list of pool set option names and flags */
static const struct pool_set_option Options[] = {
	{ "SINGLEHDR", OPTION_SINGLEHDR },
#ifndef _WIN32
	{ "NOHDRS", OPTION_NOHDRS },
#endif
	{ NULL, OPTION_UNKNOWN }	/* sentinel terminating the lookup */
};
118
119 /*
120 * util_remote_init -- initialize remote replication
121 */
122 void
123 util_remote_init(void)
124 {
125 LOG(3, NULL);
126
127 /* XXX Is duplicate initialization really okay? */
128 if (!Remote_replication_available) {
129 util_mutex_init(&Remote_lock);
130 Remote_replication_available = 1;
131 }
132 }
133
134 /*
135 * util_remote_fini -- finalize remote replication
136 */
137 void
138 util_remote_fini(void)
139 {
140 LOG(3, NULL);
141
142 util_remote_unload();
143
144 /* XXX Okay to be here if not initialized? */
145 if (Remote_replication_available) {
146 Remote_replication_available = 0;
147 util_mutex_destroy(&Remote_lock);
148 }
149 }
150
151 /*
152 * util_dl_check_error -- check libdl error
153 */
154 static int
155 util_dl_check_error(void *handle, const char *func)
156 {
157 LOG(15, "handle %p func %s", handle, func);
158
159 if (handle == NULL) {
160 char *errstr = util_dlerror();
161 if (errstr)
162 ERR("%s(): %s", func, errstr);
163 errno = ELIBACC;
164 return -1;
165 }
166 return 0;
167 }
168
/*
 * util_remote_unload_core -- (internal) unload remote library (core function)
 *
 * Closes the dlopen() handle (if any) and resets every resolved Rpmem_*
 * function pointer.  Callers hold Remote_lock (util_remote_unload and the
 * error path of util_remote_load both lock before calling this).
 */
static void
util_remote_unload_core(void)
{
	if (Rpmem_handle_remote != NULL) {
		util_dlclose(Rpmem_handle_remote);
		Rpmem_handle_remote = NULL;
	}

	/* forget all symbols resolved from the now-unloaded library */
	Rpmem_create = NULL;
	Rpmem_open = NULL;
	Rpmem_close = NULL;
	Rpmem_persist = NULL;
	Rpmem_deep_persist = NULL;
	Rpmem_read = NULL;
	Rpmem_remove = NULL;
	Rpmem_set_attr = NULL;
}
188
/*
 * util_remote_unload -- unload remote library
 *
 * Locked wrapper around util_remote_unload_core().  A no-op when
 * util_remote_init() has not been called (Remote_lock would be invalid).
 */
void
util_remote_unload(void)
{
	LOG(3, NULL);

	/* nothing to do (and Remote_lock is not initialized) */
	if (!Remote_replication_available)
		return;

	util_mutex_lock(&Remote_lock);

	util_remote_unload_core();

	util_mutex_unlock(&Remote_lock);
}
206
/*
 * util_remote_load -- load remote library
 *
 * dlopen()s LIBRARY_REMOTE and resolves all Rpmem_* entry points.
 * Returns 0 on success (or if the library is already loaded) and -1 on
 * failure, in which case the library is unloaded again and every
 * function pointer is reset by util_remote_unload_core().
 * Requires util_remote_init() to have been called first.
 */
int
util_remote_load(void)
{
	LOG(3, NULL);

	if (!Remote_replication_available) {
		ERR("remote replication is not available");
		return -1;
	}

	/* compile-time checks that local prototypes match librpmem's API */
	CHECK_FUNC_COMPATIBLE(rpmem_create, *Rpmem_create);
	CHECK_FUNC_COMPATIBLE(rpmem_open, *Rpmem_open);
	CHECK_FUNC_COMPATIBLE(rpmem_close, *Rpmem_close);
	CHECK_FUNC_COMPATIBLE(rpmem_persist, *Rpmem_persist);
	CHECK_FUNC_COMPATIBLE(rpmem_deep_persist, *Rpmem_deep_persist);
	CHECK_FUNC_COMPATIBLE(rpmem_read, *Rpmem_read);
	CHECK_FUNC_COMPATIBLE(rpmem_remove, *Rpmem_remove);

	util_mutex_lock(&Remote_lock);

	/* already loaded by a previous call */
	if (Rpmem_handle_remote)
		goto end;

	Rpmem_handle_remote = util_dlopen(LIBRARY_REMOTE);
	if (util_dl_check_error(Rpmem_handle_remote, "dlopen")) {
		ERR("the pool set requires a remote replica, "
			"but the '%s' library cannot be loaded",
			LIBRARY_REMOTE);
		goto err;
	}

	Rpmem_create = util_dlsym(Rpmem_handle_remote, "rpmem_create");
	if (util_dl_check_error(Rpmem_create, "dlsym")) {
		ERR("symbol 'rpmem_create' not found");
		goto err;
	}

	Rpmem_open = util_dlsym(Rpmem_handle_remote, "rpmem_open");
	if (util_dl_check_error(Rpmem_open, "dlsym")) {
		ERR("symbol 'rpmem_open' not found");
		goto err;
	}

	Rpmem_close = util_dlsym(Rpmem_handle_remote, "rpmem_close");
	if (util_dl_check_error(Rpmem_close, "dlsym")) {
		ERR("symbol 'rpmem_close' not found");
		goto err;
	}

	Rpmem_persist = util_dlsym(Rpmem_handle_remote, "rpmem_persist");
	if (util_dl_check_error(Rpmem_persist, "dlsym")) {
		ERR("symbol 'rpmem_persist' not found");
		goto err;
	}

	Rpmem_deep_persist = util_dlsym(Rpmem_handle_remote,
			"rpmem_deep_persist");
	if (util_dl_check_error(Rpmem_deep_persist, "dlsym")) {
		ERR("symbol 'rpmem_deep_persist' not found");
		goto err;
	}

	Rpmem_read = util_dlsym(Rpmem_handle_remote, "rpmem_read");
	if (util_dl_check_error(Rpmem_read, "dlsym")) {
		ERR("symbol 'rpmem_read' not found");
		goto err;
	}

	Rpmem_remove = util_dlsym(Rpmem_handle_remote, "rpmem_remove");
	if (util_dl_check_error(Rpmem_remove, "dlsym")) {
		ERR("symbol 'rpmem_remove' not found");
		goto err;
	}

	Rpmem_set_attr = util_dlsym(Rpmem_handle_remote, "rpmem_set_attr");
	if (util_dl_check_error(Rpmem_set_attr, "dlsym")) {
		ERR("symbol 'rpmem_set_attr' not found");
		goto err;
	}

end:
	util_mutex_unlock(&Remote_lock);
	return 0;

err:
	LOG(4, "error clean up");
	/* drop the partially-resolved state before reporting failure */
	util_remote_unload_core();
	util_mutex_unlock(&Remote_lock);
	return -1;
}
300
301 /* reserve space for size, path and some whitespace and/or comment */
302
/*
 * Result codes of the pool set file parser helpers.  parser_errstr below
 * maps each code to its diagnostic text, so the two must stay in sync
 * (same order, one string per code).
 */
enum parser_codes {
	PARSER_CONTINUE = 0,	/* line accepted, keep parsing */
	PARSER_PMEMPOOLSET,
	PARSER_REPLICA,
	PARSER_INVALID_TOKEN,
	PARSER_REMOTE_REPLICA_EXPECTED,
	PARSER_WRONG_SIZE,
	PARSER_CANNOT_READ_SIZE,
	PARSER_ABSOLUTE_PATH_EXPECTED,
	PARSER_RELATIVE_PATH_EXPECTED,
	PARSER_SET_NO_PARTS,
	PARSER_REP_NO_PARTS,
	PARSER_REMOTE_REP_UNEXPECTED_PARTS,
	PARSER_SIZE_MISMATCH,
	PARSER_OUT_OF_MEMORY,
	PARSER_OPTION_UNKNOWN,
	PARSER_OPTION_EXPECTED,
	PARSER_FORMAT_OK,	/* whole file parsed successfully */
	PARSER_MAX_CODE
};

/* diagnostic messages, indexed by enum parser_codes */
static const char *parser_errstr[PARSER_MAX_CODE] = {
	"", /* parsing */
	"the first line must be exactly 'PMEMPOOLSET'",
	"exactly 'REPLICA' expected",
	"invalid token found in the current line",
	"address of remote node and descriptor of remote pool set expected",
	"incorrect format of size",
	"cannot determine size of a part",
	"incorrect path (must be an absolute one)",
	"incorrect descriptor (must be a relative path)",
	"no pool set parts",
	"no replica parts",
	"unexpected parts for remote replica",
	"sizes of pool set and replica mismatch",
	"allocating memory failed",
	"unknown option",
	"missing option name",
	"" /* format correct */
};
343
344 /*
345 * util_replica_force_page_allocation - (internal) forces page allocation for
346 * replica
347 */
348 static void
349 util_replica_force_page_allocation(struct pool_replica *rep)
350 {
351 volatile char *cur_addr = rep->part[0].addr;
352 char *addr_end = (char *)cur_addr + rep->resvsize;
353 for (; cur_addr < addr_end; cur_addr += Pagesize) {
354 *cur_addr = *cur_addr;
355 VALGRIND_SET_CLEAN(cur_addr, 1);
356 }
357 }
358
/*
 * util_map_hdr -- map a header of a pool set
 *
 * Maps the part's pool header (read-only when rdonly is set) and records
 * the mapping in part->hdr / part->hdrsize.  Returns 0 on success,
 * -1 on mapping failure.
 */
int
util_map_hdr(struct pool_set_part *part, int flags, int rdonly)
{
	LOG(3, "part %p flags %d", part, flags);

	COMPILE_ERROR_ON(POOL_HDR_SIZE == 0);
	ASSERTeq(POOL_HDR_SIZE % Pagesize, 0);

	/*
	 * Workaround for Device DAX not allowing to map a portion
	 * of the device if offset/length are not aligned to the internal
	 * device alignment (page size). I.e. if the device alignment
	 * is 2M, we cannot map the 4K header, but need to align the mapping
	 * length to 2M.
	 *
	 * According to mmap(2), system should automatically align mapping
	 * length to be a multiple of the underlying page size, but it's
	 * not true for Device DAX.
	 */
	size_t hdrsize = part->alignment > POOL_HDR_SIZE
			? part->alignment : POOL_HDR_SIZE;

	void *addr = NULL;

#if VG_MEMCHECK_ENABLED
	if (On_valgrind) {
		/* this is required only for Device DAX & memcheck */
		addr = util_map_hint(hdrsize, hdrsize);
		if (addr == MAP_FAILED) {
			LOG(1, "cannot find a contiguous region of given size");
			/* there's nothing we can do */
			return -1;
		}
	}
#endif

	int prot = rdonly ? PROT_READ : PROT_READ|PROT_WRITE;
	void *hdrp = util_map_sync(addr, hdrsize, prot, flags,
			part->fd, 0, &part->hdr_map_sync);
	if (hdrp == MAP_FAILED) {
		ERR("!mmap: %s", part->path);
		return -1;
	}

	part->hdrsize = hdrsize;
	part->hdr = hdrp;

	/* let valgrind's pmemcheck track stores into the header mapping */
	VALGRIND_REGISTER_PMEM_MAPPING(part->hdr, part->hdrsize);
	VALGRIND_REGISTER_PMEM_FILE(part->fd, part->hdr, part->hdrsize, 0);

	return 0;
}
414
415 /*
416 * util_unmap_hdr -- unmap pool set part header
417 */
418 void
419 util_unmap_hdr(struct pool_set_part *part)
420 {
421 if (part->hdr == NULL || part->hdrsize == 0)
422 return;
423
424 LOG(4, "munmap: addr %p size %zu", part->hdr, part->hdrsize);
425 VALGRIND_REMOVE_PMEM_MAPPING(part->hdr, part->hdrsize);
426 if (munmap(part->hdr, part->hdrsize) != 0)
427 /* this means there's a bug on the caller side */
428 FATAL("!munmap: %s", part->path);
429 part->hdr = NULL;
430 part->hdrsize = 0;
431 }
432
/*
 * util_map_part -- map a part of a pool set
 *
 * Maps [offset, offset+size) of the part's file at the hinted address.
 * A size of 0 means "the rest of the file" (rounded down to the part
 * alignment); a nonzero size is rounded up to the part alignment.
 * On success part->addr/part->size describe the new mapping and 0 is
 * returned; on failure returns -1.
 */
int
util_map_part(struct pool_set_part *part, void *addr, size_t size,
	size_t offset, int flags, int rdonly)
{
	LOG(3, "part %p addr %p size %zu offset %zu flags %d",
		part, addr, size, offset, flags);

	/* hint address, offset and size must all be mapping-aligned */
	ASSERTeq((uintptr_t)addr % Mmap_align, 0);
	ASSERTeq(offset % Mmap_align, 0);
	ASSERTeq(size % Mmap_align, 0);
	ASSERT(((os_off_t)offset) >= 0);
	ASSERTeq(offset % part->alignment, 0);
	ASSERT(offset < part->filesize);

	if (!size)
		size = (part->filesize - offset) & ~(part->alignment - 1);
	else
		size = roundup(size, part->alignment);

	int prot = rdonly ? PROT_READ : PROT_READ | PROT_WRITE;
	void *addrp = util_map_sync(addr, size, prot, flags, part->fd,
			(os_off_t)offset, &part->map_sync);
	if (addrp == MAP_FAILED) {
		ERR("!mmap: %s", part->path);
		return -1;
	}

	/* with MAP_FIXED the kernel must honor the requested address */
	if (addr != NULL && (flags & MAP_FIXED) && addrp != addr) {
		ERR("unable to map at requested address %p", addr);
		munmap(addrp, size);
		return -1;
	}

	part->addr = addrp;
	part->size = size;

	VALGRIND_REGISTER_PMEM_MAPPING(part->addr, part->size);
	VALGRIND_REGISTER_PMEM_FILE(part->fd, part->addr, part->size, offset);

	return 0;
}
477
478 /*
479 * util_unmap_part -- unmap a part of a pool set
480 */
481 int
482 util_unmap_part(struct pool_set_part *part)
483 {
484 LOG(3, "part %p", part);
485
486 if (part->addr != NULL && part->size != 0) {
487 LOG(4, "munmap: addr %p size %zu", part->addr, part->size);
488 VALGRIND_REMOVE_PMEM_MAPPING(part->addr, part->size);
489 if (munmap(part->addr, part->size) != 0) {
490 ERR("!munmap: %s", part->path);
491 }
492
493 part->addr = NULL;
494 part->size = 0;
495 }
496
497 return 0;
498 }
499
500 /*
501 * util_unmap_parts -- unmap parts from start_index to the end_index
502 */
503 int
504 util_unmap_parts(struct pool_replica *rep, unsigned start_index,
505 unsigned end_index)
506 {
507 LOG(3, "rep: %p, start_index: %u, end_index: %u", rep, start_index,
508 end_index);
509
510 for (unsigned p = start_index; p <= end_index; p++)
511 util_unmap_part(&rep->part[p]);
512
513 return 0;
514 }
515
/*
 * util_poolset_free -- free pool set info
 *
 * Releases everything owned by the pool set structure itself: part path
 * strings (local replicas), remote-replica descriptors, directory
 * vectors, the replica structs, the set path and finally the set.
 * Does NOT unmap or close anything -- see util_poolset_close().
 */
void
util_poolset_free(struct pool_set *set)
{
	LOG(3, "set %p", set);

	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		if (rep->remote == NULL) {
			/* only local replicas have paths */
			/* iterate over all allocated slots, not just nparts */
			for (unsigned p = 0; p < rep->nallocated; p++) {
				Free((void *)(rep->part[p].path));
			}
		} else {
			/* remote replica */
			ASSERTeq(rep->nparts, 1);
			Free(rep->remote->node_addr);
			Free(rep->remote->pool_desc);
			Free(rep->remote);
		}
		struct pool_set_directory *d;
		VEC_FOREACH_BY_PTR(d, &rep->directory) {
			Free((void *)d->path);
		}
		VEC_DELETE(&rep->directory);
		Free(set->replica[r]);
	}
	Free(set->path);
	Free(set);
}
548
549 /*
550 * util_poolset_open -- open all replicas from a poolset
551 */
552 int
553 util_poolset_open(struct pool_set *set)
554 {
555 for (unsigned r = 0; r < set->nreplicas; ++r) {
556 if (util_replica_open(set, r, MAP_SHARED)) {
557 LOG(2, "replica open failed: replica %u", r);
558 errno = EINVAL;
559 return -1;
560 }
561 }
562
563 return 0;
564 }
565
566 /*
567 * util_replica_close_local -- close local replica, optionally delete the
568 * replica's parts
569 */
570 int
571 util_replica_close_local(struct pool_replica *rep, unsigned repn,
572 enum del_parts_mode del)
573 {
574 for (unsigned p = 0; p < rep->nparts; p++) {
575 if (rep->part[p].fd != -1)
576 (void) os_close(rep->part[p].fd);
577
578 if ((del == DELETE_CREATED_PARTS && rep->part[p].created) ||
579 del == DELETE_ALL_PARTS) {
580 LOG(4, "unlink %s", rep->part[p].path);
581 int olderrno = errno;
582 if (util_unlink(rep->part[p].path) && errno != ENOENT) {
583 ERR("!unlink %s failed (part %u, replica %u)",
584 rep->part[p].path, p, repn);
585 return -1;
586 }
587 errno = olderrno;
588 }
589 }
590 return 0;
591 }
592
593 /*
594 * util_replica_close_remote -- close remote replica, optionally delete the
595 * replica
596 */
597 int
598 util_replica_close_remote(struct pool_replica *rep, unsigned repn,
599 enum del_parts_mode del)
600 {
601 if (!rep->remote)
602 return 0;
603
604 if (rep->remote->rpp) {
605 LOG(4, "closing remote replica #%u", repn);
606 Rpmem_close(rep->remote->rpp);
607 rep->remote->rpp = NULL;
608 }
609
610 if ((del == DELETE_CREATED_PARTS && rep->part[0].created) ||
611 del == DELETE_ALL_PARTS) {
612 LOG(4, "removing remote replica #%u", repn);
613 int ret = Rpmem_remove(rep->remote->node_addr,
614 rep->remote->pool_desc, 0);
615 if (ret) {
616 LOG(1, "!removing remote replica #%u failed", repn);
617 return -1;
618 }
619 }
620 return 0;
621 }
622
/*
 * util_poolset_close -- unmap and close all the parts of the pool set,
 *                       optionally delete parts
 *
 * Unmaps every replica, closes (and per 'del' possibly deletes) local
 * parts or remote replicas, then frees the pool set structure itself.
 * The caller's errno is preserved across the whole teardown.
 */
void
util_poolset_close(struct pool_set *set, enum del_parts_mode del)
{
	LOG(3, "set %p del %d", set, del);

	int oerrno = errno;

	for (unsigned r = 0; r < set->nreplicas; r++) {
		util_replica_close(set, r);

		struct pool_replica *rep = set->replica[r];
		/* errors here are deliberately ignored -- best-effort close */
		if (!rep->remote)
			(void) util_replica_close_local(rep, r, del);
		else
			(void) util_replica_close_remote(rep, r, del);
	}

	/*
	 * XXX On FreeBSD, mmap()ing a file does not increment the flock()
	 *	reference count, so we had to keep the files open until now.
	 */
#ifdef __FreeBSD__
	util_poolset_fdclose_always(set);
#endif
	util_poolset_free(set);

	errno = oerrno;
}
655
/*
 * util_poolset_chmod -- change mode for all created files related to pool set
 *
 * Applies 'mode' to every part file that was created by this process and
 * is still open.  Remote replicas are skipped.  Returns 0 on success,
 * -1 on the first fstat/chmod failure.
 */
int
util_poolset_chmod(struct pool_set *set, mode_t mode)
{
	LOG(3, "set %p mode %o", set, mode);

	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];

		/* skip remote replicas */
		if (rep->remote != NULL)
			continue;

		for (unsigned p = 0; p < rep->nparts; p++) {
			struct pool_set_part *part = &rep->part[p];

			/* skip not created or closed parts */
			if (!part->created || part->fd == -1)
				continue;

			os_stat_t stbuf;
			if (os_fstat(part->fd, &stbuf) != 0) {
				ERR("!fstat %d %s", part->fd, part->path);
				return -1;
			}

			/*
			 * NOTE(review): this warns whenever ANY permission
			 * bit is set -- presumably freshly-created parts have
			 * mode 0 until this point, so nonzero bits imply an
			 * external change; confirm against the part-creation
			 * path.
			 */
			if (stbuf.st_mode & ~(unsigned)S_IFMT) {
				LOG(1, "file permissions changed during pool "
					"initialization, file: %s (%o)",
					part->path,
					stbuf.st_mode & ~(unsigned)S_IFMT);
			}

			if (os_chmod(part->path, mode)) {
				ERR("!chmod %u/%u/%s", r, p, part->path);
				return -1;
			}
		}
	}

	return 0;
}
700
701 /*
702 * util_poolset_fdclose_always -- close file descriptors related to pool set
703 */
704 void
705 util_poolset_fdclose_always(struct pool_set *set)
706 {
707 LOG(3, "set %p", set);
708
709 for (unsigned r = 0; r < set->nreplicas; r++)
710 util_replica_fdclose(set->replica[r]);
711 }
712
/*
 * util_poolset_fdclose -- close pool set file descriptors if not FreeBSD
 *
 * XXX On FreeBSD, mmap()ing a file does not increment the flock()
 *	reference count, so we need to keep the files open.
 */
void
util_poolset_fdclose(struct pool_set *set)
{
#ifdef __FreeBSD__
	/* keep fds open; util_poolset_close() closes them at the end */
	LOG(3, "set %p: holding open", set);
#else
	util_poolset_fdclose_always(set);
#endif
}
728
729 /*
730 * util_autodetect_size -- (internal) retrieves size of an existing file
731 */
732 static ssize_t
733 util_autodetect_size(const char *path)
734 {
735 enum file_type type = util_file_get_type(path);
736 if (type < 0)
737 return -1;
738
739 if (type == TYPE_NORMAL) {
740 ERR("size autodetection is supported only for device dax");
741 return -1;
742 }
743
744 return util_file_get_size(path);
745 }
746
/*
 * parser_read_line -- (internal) read line and validate size and path
 *                     from a pool set file
 *
 * Expects exactly two whitespace-separated tokens: a size (or the AUTO
 * keyword) and an absolute path.  On PARSER_CONTINUE, *size is set and
 * *path holds a heap copy of the path owned by the caller; on every
 * error path the copy (if made) is freed and *path is NULL.
 */
static enum parser_codes
parser_read_line(char *line, size_t *size, char **path)
{
	int ret;
	char *size_str;
	char *path_str;
	char *rest_str;
	char *saveptr = NULL; /* must be NULL initialized on Windows */

	size_str = strtok_r(line, " \t", &saveptr);
	path_str = strtok_r(NULL, " \t", &saveptr);
	rest_str = strtok_r(NULL, " \t", &saveptr);

	/* a third token means trailing garbage on the line */
	if (!size_str || !path_str || rest_str)
		return PARSER_INVALID_TOKEN;

	LOG(10, "size '%s' path '%s'", size_str, path_str);

	/*
	 * A format of the size is checked in detail. As regards the path,
	 * it is checked only if the read path is an absolute path.
	 * The rest should be checked during creating/opening the file.
	 */

	/* check if the read path is an absolute path */
	if (!util_is_absolute_path(path_str))
		return PARSER_ABSOLUTE_PATH_EXPECTED;

	*path = Strdup(path_str);
	if (!(*path)) {
		ERR("!Strdup");
		return PARSER_OUT_OF_MEMORY;
	}

	if (strcmp(SIZE_AUTODETECT_STR, size_str) == 0) {
		/*
		 * XXX: this should be done after the parsing completes, but
		 * currently this operation is performed in simply too many
		 * places in the code to move this someplace else.
		 */
		ssize_t s = util_autodetect_size(path_str);
		if (s < 0) {
			Free(*path);
			*path = NULL;
			return PARSER_CANNOT_READ_SIZE;
		}

		*size = (size_t)s;

		return PARSER_CONTINUE;
	}

	ret = util_parse_size(size_str, size);
	if (ret != 0 || *size == 0) {
		Free(*path);
		*path = NULL;
		return PARSER_WRONG_SIZE;
	}

	return PARSER_CONTINUE;
}
812
813 /*
814 * parser_read_replica -- (internal) read line and validate remote replica
815 * from a pool set file
816 */
817 static enum parser_codes
818 parser_read_replica(char *line, char **node_addr, char **pool_desc)
819 {
820 char *addr_str;
821 char *desc_str;
822 char *rest_str;
823 char *saveptr = NULL; /* must be NULL initialized on Windows */
824
825 addr_str = strtok_r(line, " \t", &saveptr);
826 desc_str = strtok_r(NULL, " \t", &saveptr);
827 rest_str = strtok_r(NULL, " \t", &saveptr);
828
829 if (!addr_str || !desc_str)
830 return PARSER_REMOTE_REPLICA_EXPECTED;
831
832 if (rest_str)
833 return PARSER_INVALID_TOKEN;
834
835 LOG(10, "node address '%s' pool set descriptor '%s'",
836 addr_str, desc_str);
837
838 /* check if the descriptor is a relative path */
839 if (util_is_absolute_path(desc_str))
840 return PARSER_RELATIVE_PATH_EXPECTED;
841
842 *node_addr = Strdup(addr_str);
843 *pool_desc = Strdup(desc_str);
844
845 if (!(*node_addr) || !(*pool_desc)) {
846 ERR("!Strdup");
847 if (*node_addr)
848 Free(*node_addr);
849 if (*pool_desc)
850 Free(*pool_desc);
851 return PARSER_OUT_OF_MEMORY;
852 }
853
854 return PARSER_CONTINUE;
855 }
856
857 /*
858 * parser_read_options -- (internal) read line and validate options
859 */
860 static enum parser_codes
861 parser_read_options(char *line, unsigned *options)
862 {
863 LOG(3, "line '%s'", line);
864
865 int opt_cnt = 0;
866 char *saveptr = NULL; /* must be NULL initialized on Windows */
867
868 char *opt_str = strtok_r(line, " \t", &saveptr);
869 while (opt_str != NULL) {
870 LOG(4, "option '%s'", opt_str);
871
872 int i = 0;
873 while (Options[i].name && strcmp(opt_str, Options[i].name) != 0)
874 i++;
875
876 if (Options[i].name == NULL) {
877 LOG(4, "unknown option '%s'", opt_str);
878 return PARSER_OPTION_UNKNOWN;
879 }
880
881 if (*options & Options[i].flag)
882 LOG(4, "duplicated option '%s'", opt_str);
883
884 *options |= Options[i].flag;
885
886 opt_cnt++;
887 opt_str = strtok_r(NULL, " \t", &saveptr);
888 }
889
890 if (opt_cnt == 0)
891 return PARSER_OPTION_EXPECTED;
892
893 return PARSER_CONTINUE;
894 }
895
/*
 * util_replica_reserve -- reserves part slots capacity in a replica
 *
 * Grows the trailing part[] array of *repp to hold at least n entries,
 * zero-filling only the newly added slots.  The replica may be moved by
 * Realloc, in which case *repp is updated.  Returns 0 on success; on
 * allocation failure returns -1 and leaves *repp unmodified.
 */
static int
util_replica_reserve(struct pool_replica **repp, unsigned n)
{
	LOG(3, "replica %p n %u", *repp, n);

	struct pool_replica *rep = *repp;
	if (rep->nallocated >= n)
		return 0;

	rep = Realloc(rep, sizeof(struct pool_replica) +
		(n) * sizeof(struct pool_set_part));
	if (rep == NULL) {
		ERR("!Realloc");
		return -1;
	}

	/* zero the slots beyond the previously allocated count */
	size_t nsize = sizeof(struct pool_set_part) * (n - rep->nallocated);
	memset(rep->part + rep->nallocated, 0, nsize);

	rep->nallocated = n;
	*repp = rep;

	return 0;
}
923
/*
 * util_replica_add_part_by_idx -- (internal) allocates, initializes and adds a
 *                                 part structure at the provided location in
 *                                 the replica info
 *
 * Ensures capacity for slot p, initializes the slot fields (taking
 * ownership of 'path', which may be NULL) and bumps nparts.  For Device
 * DAX parts the mapping alignment is the device alignment, otherwise
 * Mmap_align.  Returns 0 on success, -1 on failure.
 */
static int
util_replica_add_part_by_idx(struct pool_replica **repp,
	const char *path, size_t filesize, unsigned p)
{
	LOG(3, "replica %p path %s filesize %zu", *repp, path, filesize);

	if (util_replica_reserve(repp, p + 1) != 0)
		return -1;

	struct pool_replica *rep = *repp;
	ASSERTne(rep, NULL);

	int is_dev_dax = 0;

	if (path != NULL) {
		enum file_type type = util_file_get_type(path);
		if (type == OTHER_ERROR)
			return -1;

		is_dev_dax = type == TYPE_DEVDAX;
	}

	rep->part[p].path = path;
	rep->part[p].filesize = filesize;
	rep->part[p].fd = -1;	/* not opened yet */
	rep->part[p].is_dev_dax = is_dev_dax;
	rep->part[p].created = 0;
	rep->part[p].hdr = NULL;
	rep->part[p].addr = NULL;
	rep->part[p].remote_hdr = NULL;
	rep->part[p].has_bad_blocks = 0;

	if (is_dev_dax)
		rep->part[p].alignment = util_file_device_dax_alignment(path);
	else
		rep->part[p].alignment = Mmap_align;

	ASSERTne(rep->part[p].alignment, 0);

	rep->nparts += 1;

	return 0;
}
971
972 /*
973 * util_replica_add_part -- adds a next part in replica info
974 */
975 static int
976 util_replica_add_part(struct pool_replica **repp,
977 const char *path, size_t filesize)
978 {
979 LOG(3, "replica %p path \"%s\" filesize %zu", *repp, path, filesize);
980
981 return util_replica_add_part_by_idx(repp, path,
982 filesize, (*repp)->nparts);
983 }
984
985 /*
986 * util_parse_add_part -- (internal) add a new part file to the replica info
987 */
988 static int
989 util_parse_add_part(struct pool_set *set, const char *path, size_t filesize)
990 {
991 LOG(3, "set %p path %s filesize %zu", set, path, filesize);
992
993 ASSERTne(set, NULL);
994
995 if (set->directory_based) {
996 ERR("cannot mix directories and files in a set");
997 errno = EINVAL;
998 return -1;
999 }
1000
1001 return util_replica_add_part(&set->replica[set->nreplicas - 1],
1002 path, filesize);
1003 }
1004
/*
 * util_parse_add_directory --
 *	(internal) add a new directory to the replica info
 *
 * Switches the set to directory-based mode (only allowed while no file
 * parts/replicas exist), rejects directories already used by any
 * replica (compared by realpath), then records the directory and adds
 * its reservation size to the replica.  Takes ownership of 'path' on
 * success.  Returns 0 on success, -1 on failure.
 */
static int
util_parse_add_directory(struct pool_set *set, const char *path,
	size_t filesize)
{
	LOG(3, "set %p path %s filesize %zu", set, path, filesize);

	ASSERTne(set, NULL);

	struct pool_replica *rep = set->replica[set->nreplicas - 1];
	ASSERTne(rep, NULL);

	if (set->directory_based == 0) {
		/* directories cannot be mixed with file parts */
		if (rep->nparts > 0 || set->nreplicas > 1) {
			ERR("cannot mix directories and files in a set");
			errno = EINVAL;
			return -1;
		}
		set->directory_based = 1;
	}

	char *rpath = util_part_realpath(path);
	if (rpath == NULL) {
		ERR("cannot resolve realpath of new directory");
		return -1;
	}

	/* make sure no replica already uses the same directory */
	for (unsigned i = 0; i < set->nreplicas; ++i) {
		struct pool_replica *r = set->replica[i];
		struct pool_set_directory *dir;
		char *dpath = NULL;
		VEC_FOREACH_BY_PTR(dir, &r->directory) {
			dpath = util_part_realpath(dir->path);
			ASSERTne(dpath, NULL); /* must have been resolved */
			if (strcmp(rpath, dpath) == 0) {
				ERR("cannot use the same directory twice");
				errno = EEXIST;
				free(dpath);
				free(rpath);
				return -1;
			}
			free(dpath);
		}
	}
	free(rpath);

	struct pool_set_directory d;
	d.path = path;
	d.resvsize = filesize;

	if (VEC_PUSH_BACK(&rep->directory, d) != 0)
		return -1;

	rep->resvsize += filesize;

	return 0;
}
1065
1066 /*
1067 * util_parse_add_element --
1068 * (internal) add a new element to the replica info
1069 */
1070 static int
1071 util_parse_add_element(struct pool_set *set, const char *path, size_t filesize)
1072 {
1073 LOG(3, "set %p path %s filesize %zu", set, path, filesize);
1074
1075 os_stat_t stat;
1076
1077 int olderrno = errno;
1078
1079 if (os_stat(path, &stat) == 0 && S_ISDIR(stat.st_mode))
1080 return util_parse_add_directory(set, path, filesize);
1081
1082 errno = olderrno;
1083
1084 return util_parse_add_part(set, path, filesize);
1085 }
1086
/*
 * util_parse_add_replica -- (internal) add a new replica to the pool set info
 *
 * Grows the set's trailing replica[] array by one (the set may move;
 * *setp is updated), allocates a zeroed replica with an empty directory
 * vector and appends it.  Returns 0 on success, -1 on allocation
 * failure (on Zalloc failure the enlarged set is kept; nreplicas is not
 * incremented).
 */
static int
util_parse_add_replica(struct pool_set **setp)
{
	LOG(3, "setp %p", setp);

	ASSERTne(setp, NULL);

	struct pool_set *set = *setp;
	ASSERTne(set, NULL);

	set = Realloc(set, sizeof(struct pool_set) +
		(set->nreplicas + 1) * sizeof(struct pool_replica *));
	if (set == NULL) {
		ERR("!Realloc");
		return -1;
	}
	*setp = set;

	struct pool_replica *rep;
	rep = Zalloc(sizeof(struct pool_replica));
	if (rep == NULL) {
		ERR("!Zalloc");
		return -1;
	}

	VEC_INIT(&rep->directory);

	unsigned r = set->nreplicas++;

	set->replica[r] = rep;

	return 0;
}
1123
1124 /*
1125 * util_replica_check_map_sync -- (internal) check MAP_SYNC restrictions
1126 */
1127 static int
1128 util_replica_check_map_sync(struct pool_set *set, unsigned repidx,
1129 int check_hdr)
1130 {
1131 LOG(3, "set %p repidx %u", set, repidx);
1132
1133 struct pool_replica *rep = set->replica[repidx];
1134
1135 int map_sync = rep->part[0].map_sync;
1136
1137 for (unsigned p = 1; p < rep->nparts; p++) {
1138 if (map_sync != rep->part[p].map_sync) {
1139 ERR("replica #%u part %u %smapped with MAP_SYNC",
1140 repidx, p, rep->part[p].map_sync ? "" : "not");
1141 return -1;
1142 }
1143 }
1144
1145 if (check_hdr) {
1146 for (unsigned p = 0; p < rep->nhdrs; p++) {
1147 if (map_sync != rep->part[p].hdr_map_sync) {
1148 ERR("replica #%u part %u header %smapped "
1149 "with MAP_SYNC", repidx, p,
1150 rep->part[p].hdr_map_sync ?
1151 "" : "not");
1152 return -1;
1153 }
1154 }
1155 }
1156
1157 return 0;
1158 }
1159
/*
 * util_poolset_check_devdax -- (internal) check Device DAX restrictions
 *
 * For each replica: all parts must be either Device DAX or regular
 * files (no mixing), and multiple DAX devices with non-4KB alignment
 * require the SINGLEHDR (or NOHDRS) option.  Directory-based sets are
 * exempt.  Returns 0 when admissible, -1 otherwise.
 */
static int
util_poolset_check_devdax(struct pool_set *set)
{
	LOG(3, "set %p", set);

	if (set->directory_based)
		return 0;

	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		/* part 0 determines the expected kind for the replica */
		int is_dev_dax = rep->part[0].is_dev_dax;

		for (unsigned p = 0; p < rep->nparts; p++) {
			if (rep->part[p].is_dev_dax != is_dev_dax) {
				ERR(
					"either all the parts must be Device DAX or none");
				return -1;
			}

			/*
			 * Without SINGLEHDR/NOHDRS each part gets its own
			 * header, which cannot be mapped on a DAX device
			 * whose alignment is larger than a page.
			 */
			if (is_dev_dax && rep->nparts > 1 &&
			    (set->options & (OPTION_SINGLEHDR |
					OPTION_NOHDRS)) == 0 &&
			    util_file_device_dax_alignment(rep->part[p].path)
					!= Pagesize) {
				ERR(
					"Multiple DAX devices with alignment other than 4KB. Use the SINGLEHDR poolset option.");
				return -1;
			}
		}
	}
	return 0;
}
1195
1196 /*
1197 * util_poolset_check_options -- (internal) check if poolset options are
1198 * admissible
1199 */
1200 static int
1201 util_poolset_check_options(struct pool_set *set)
1202 {
1203 LOG(3, "set %p", set);
1204 if ((set->options & OPTION_SINGLEHDR) &&
1205 (set->options & OPTION_NOHDRS)) {
1206 ERR(
1207 "both SINGLEHDR and NOHDR poolset options used at the same time");
1208 return -1;
1209 }
1210 return 0;
1211 }
1212
1213 /*
1214 * util_poolset_set_size -- (internal) calculate pool size
1215 */
1216 static void
1217 util_poolset_set_size(struct pool_set *set)
1218 {
1219 LOG(3, "set %p", set);
1220
1221 set->poolsize = SIZE_MAX;
1222 set->resvsize = SIZE_MAX;
1223
1224 for (unsigned r = 0; r < set->nreplicas; r++) {
1225 struct pool_replica *rep = set->replica[r];
1226
1227 if (set->options & OPTION_SINGLEHDR)
1228 rep->nhdrs = 1;
1229 else if (set->options & OPTION_NOHDRS)
1230 rep->nhdrs = 0;
1231 else
1232 rep->nhdrs = rep->nparts;
1233
1234 rep->repsize = 0;
1235 for (unsigned p = 0; p < rep->nparts; p++) {
1236 rep->repsize +=
1237 (rep->part[p].filesize & ~(Mmap_align - 1));
1238 }
1239 if (rep->nhdrs > 0)
1240 rep->repsize -= (rep->nhdrs - 1) * Mmap_align;
1241
1242 if (rep->resvsize == 0)
1243 rep->resvsize = rep->repsize;
1244
1245 /*
1246 * Calculate pool size - choose the smallest replica size.
1247 * Ignore remote replicas.
1248 */
1249 if (rep->remote == NULL && rep->repsize < set->poolsize)
1250 set->poolsize = rep->repsize;
1251 if (rep->remote == NULL && rep->resvsize < set->resvsize)
1252 set->resvsize = rep->resvsize;
1253 }
1254
1255 LOG(3, "pool size set to %zu", set->poolsize);
1256 }
1257
1258 /*
1259 * util_parse_add_remote_replica -- (internal) add a new remote replica
1260 * to the pool set info
1261 */
1262 static int
1263 util_parse_add_remote_replica(struct pool_set **setp, char *node_addr,
1264 char *pool_desc)
1265 {
1266 LOG(3, "setp %p node_addr %s pool_desc %s", setp, node_addr, pool_desc);
1267
1268 ASSERTne(setp, NULL);
1269 ASSERTne(node_addr, NULL);
1270 ASSERTne(pool_desc, NULL);
1271
1272 int ret = util_parse_add_replica(setp);
1273 if (ret != 0)
1274 return ret;
1275
1276 /*
1277 * A remote replica has one fake part of size equal twice pool header
1278 * size for storing pool header and pool descriptor.
1279 */
1280 ret = util_parse_add_part(*setp, NULL, 2 * POOL_HDR_SIZE);
1281 if (ret != 0)
1282 return ret;
1283
1284 struct pool_set *set = *setp;
1285 struct pool_replica *rep = set->replica[set->nreplicas - 1];
1286 ASSERTne(rep, NULL);
1287
1288 rep->remote = Zalloc(sizeof(struct remote_replica));
1289 if (rep->remote == NULL) {
1290 ERR("!Malloc");
1291 return -1;
1292 }
1293 rep->remote->node_addr = node_addr;
1294 rep->remote->pool_desc = pool_desc;
1295 set->remote = 1;
1296
1297 return 0;
1298 }
1299
/*
 * util_part_idx_by_file_name -- (internal) retrieves the part index from a
 * name of the file that is an element of a directory poolset
 *
 * Returns the non-negative part index parsed from the leading decimal
 * digits of 'filename', or -1 if the name does not start with a number
 * or the number is out of range.  errno is preserved on success.
 */
static long
util_part_idx_by_file_name(const char *filename)
{
	LOG(3, "filename \"%s\"", filename);

	int olderrno = errno;
	errno = 0;
	char *end;
	long part_idx = strtol(filename, &end, 10);
	if (errno != 0)
		return -1;

	errno = olderrno;

	/*
	 * strtol() returns 0 with no error when there are no digits at all;
	 * reject such names instead of treating them as part #0.
	 */
	if (end == filename)
		return -1;

	return part_idx;
}
1319
/*
 * util_poolset_directory_load -- (internal) loads and initializes all
 *	existing parts in a single directory
 *
 * Scans 'directory' for part files (direct children with the expected
 * extension and a numeric name) and registers each of them in the replica
 * under the index encoded in its file name.  Returns the number of parts
 * found, or -1 on error.
 */
static int
util_poolset_directory_load(struct pool_replica **repp, const char *directory)
{
	LOG(3, "rep %p dir \"%s\"", *repp, directory);

	struct fs *f = fs_new(directory);
	if (f == NULL) {
		ERR("!fs_new: \"%s\"", directory);
		return -1;
	}

	int nparts = 0;
	char *path = NULL;

	struct fs_entry *entry;
	while ((entry = fs_read(f)) != NULL) {
		/* consider only regular files directly inside the directory */
		if (entry->level != 1)
			continue;
		if (entry->type != FS_ENTRY_FILE)
			continue;
		if (entry->namelen < PMEM_EXT_LEN)
			continue;
		/* the file name must end with the part-file extension */
		const char *ext = entry->path + entry->pathlen -
			PMEM_EXT_LEN + 1;
		if (strcmp(PMEM_EXT, ext) != 0)
			continue;

		/* the file name encodes the index of the part */
		long part_idx = util_part_idx_by_file_name(entry->name);
		if (part_idx < 0)
			continue;

		ssize_t size = util_file_get_size(entry->path);
		if (size < 0) {
			LOG(2,
			"cannot read size of file (%s) in a poolset directory",
				entry->path);
			goto err;
		}

		/* the replica keeps its own copy of the path */
		if ((path = Strdup(entry->path)) == NULL) {
			ERR("!Strdup");
			goto err;
		}

		if (util_replica_add_part_by_idx(repp, path,
				(size_t)size, (unsigned)part_idx) != 0) {
			ERR("unable to load part %s", entry->path);
			goto err;
		}
		nparts++;
	}

	fs_delete(f);
	return nparts;

err:
	fs_delete(f);
	return -1;
}
1383
/*
 * util_poolset_directories_load -- (internal) loads and initializes all
 *	existing parts in the poolset directories
 *
 * After loading what exists on disk, part entries that are present in the
 * largest replica but missing from a smaller one are recreated with
 * generated paths (the files themselves are not created here), so that
 * all replicas describe the same number of parts.
 */
static int
util_poolset_directories_load(struct pool_set *set)
{
	LOG(3, "set %p", set);

	/* nothing to do for poolsets that list part files explicitly */
	if (!set->directory_based)
		return 0;

	unsigned next_part_id = 0;
	unsigned max_parts_rep = 0; /* index of the replica with most parts */
	for (unsigned r = 0; r < set->nreplicas; r++) {
		next_part_id = 0;

		struct pool_set_directory *d;
		int nparts = 0;
		int prev_nparts = 0;
		VEC_FOREACH_BY_PTR(d, &set->replica[r]->directory) {
			prev_nparts = nparts;
			nparts = util_poolset_directory_load(&set->replica[r],
				d->path);
			if (nparts < 0) {
				ERR("failed to load parts from directory %s",
					d->path);
				return -1;
			}

			next_part_id += (unsigned)nparts;

			/* always try to evenly spread files across dirs */
			if (r == 0 && prev_nparts > nparts)
				set->next_directory_id++;
		}

		if (next_part_id > set->replica[max_parts_rep]->nparts)
			max_parts_rep = r;

		if (r == 0)
			set->next_id = next_part_id;
	}

	/*
	 * In order to maintain the same semantics of poolset parsing for
	 * regular poolsets and directory poolsets, we need to speculatively
	 * recreate the information regarding any missing parts in replicas.
	 */
	struct pool_replica *rep;
	struct pool_replica *mrep = set->replica[max_parts_rep];

	for (unsigned r = 0; r < set->nreplicas; r++) {
		if (set->replica[r]->nparts == mrep->nparts)
			continue;

		if (VEC_SIZE(&set->replica[r]->directory) == 0) {
			errno = ENOENT;
			ERR("!no directories in replica");
			return -1;
		}

		/* make room for as many parts as the largest replica has */
		if (util_replica_reserve(&set->replica[r], mrep->nparts) != 0)
			return -1;

		rep = set->replica[r];

		/* paths for missing parts go into this replica's first dir */
		struct pool_set_directory *d = VEC_GET(&rep->directory, 0);

		for (unsigned pidx = 0; pidx < rep->nallocated; ++pidx) {
			struct pool_set_part *p = &rep->part[pidx];
			/* copy the part descriptor from the largest replica */
			*p = mrep->part[pidx];

			/* ...but generate a path inside this replica */
			size_t path_len = strlen(d->path) + PMEM_FILE_MAX_LEN;
			if ((p->path = Malloc(path_len)) == NULL) {
				ERR("!Malloc");
				return -1;
			}

			snprintf((char *)p->path, path_len,
				"%s" OS_DIR_SEP_STR "%0*u%s",
				d->path, PMEM_FILE_PADDING,
				pidx, PMEM_EXT);
		}
		rep->nparts = mrep->nparts;
	}

	return 0;
}
1473
/*
 * util_poolset_parse -- parse pool set config file
 *
 * Returns 0 if the file is a valid poolset config file,
 * and -1 in case of any error.
 *
 * The descriptor must start with the 'PMEMPOOLSET' signature line; the
 * following lines are OPTION lines, REPLICA lines (local or remote) and
 * "size path" part lines, with '#' comments and blank lines skipped.
 *
 * XXX: use memory mapped file
 */
int
util_poolset_parse(struct pool_set **setp, const char *path, int fd)
{
	LOG(3, "setp %p path %s fd %d", setp, path, fd);

	struct pool_set *set = NULL;
	enum parser_codes result;
	char *line;
	char *ppath;
	char *pool_desc;
	char *node_addr;
	char *cp;
	size_t psize;
	FILE *fs;
	int oerrno;

	if (os_lseek(fd, 0, SEEK_SET) != 0) {
		ERR("!lseek %d", fd);
		return -1;
	}

	/* work on a duplicate, so fclose() won't close the caller's fd */
	fd = dup(fd);
	if (fd < 0) {
		ERR("!dup");
		return -1;
	}

	/* associate a stream with the file descriptor */
	if ((fs = os_fdopen(fd, "r")) == NULL) {
		ERR("!fdopen %d", fd);
		os_close(fd);
		return -1;
	}

	unsigned nlines = 0;
	unsigned nparts = 0; /* number of parts in current replica */

	/* read the first line */
	line = util_readline(fs);
	if (line == NULL) {
		ERR("!Reading poolset file");
		goto err;
	}
	nlines++;

	set = Zalloc(sizeof(struct pool_set));
	if (set == NULL) {
		ERR("!Malloc for pool set");
		goto err;
	}

	set->path = Strdup(path);
	if (set->path == NULL) {
		ERR("!Strdup");
		goto err;
	}

	/* check also if the last character is '\n' */
	if (strncmp(line, POOLSET_HDR_SIG, POOLSET_HDR_SIG_LEN) == 0 &&
			line[POOLSET_HDR_SIG_LEN] == '\n') {
		/* 'PMEMPOOLSET' signature detected */
		LOG(10, "PMEMPOOLSET");

		/* the first (implicit) replica starts right away */
		int ret = util_parse_add_replica(&set);
		if (ret != 0)
			goto err;

		nparts = 0;
		result = PARSER_CONTINUE;
	} else {
		result = PARSER_PMEMPOOLSET;
	}

	/* main parsing loop; any result other than CONTINUE stops it */
	while (result == PARSER_CONTINUE) {
		Free(line);
		/* read next line */
		line = util_readline(fs);
		nlines++;

		if (line) {
			/* chop off newline and comments */
			if ((cp = strchr(line, '\n')) != NULL)
				*cp = '\0';
			if (cp != line && (cp = strchr(line, '#')) != NULL)
				*cp = '\0';

			/* skip comments and blank lines */
			if (cp == line)
				continue;
		}

		if (!line) {
			/* EOF: valid only if the last replica has parts */
			if (nparts >= 1) {
				result = PARSER_FORMAT_OK;
			} else {
				if (set->nreplicas == 1)
					result = PARSER_SET_NO_PARTS;
				else
					result = PARSER_REP_NO_PARTS;
			}
		} else if (strncmp(line, POOLSET_OPTION_SIG,
					POOLSET_OPTION_SIG_LEN) == 0) {
			/* an OPTION line updates set->options bit flags */
			result = parser_read_options(
					line + POOLSET_OPTION_SIG_LEN,
					&set->options);
			if (result == PARSER_CONTINUE) {
				LOG(10, "OPTIONS: %x", set->options);
			}
		} else if (strncmp(line, POOLSET_REPLICA_SIG,
					POOLSET_REPLICA_SIG_LEN) == 0) {
			if (line[POOLSET_REPLICA_SIG_LEN] != '\0') {
				/* something more than 'REPLICA' */
				char c = line[POOLSET_REPLICA_SIG_LEN];
				if (!isblank((unsigned char)c)) {
					result = PARSER_REPLICA;
					continue;
				}
				/* check if it is a remote replica */
				result = parser_read_replica(
						line + POOLSET_REPLICA_SIG_LEN,
						&node_addr, &pool_desc);
				if (result == PARSER_CONTINUE) {
					/* remote REPLICA */
					LOG(10, "REMOTE REPLICA "
						"node address '%s' "
						"pool set descriptor '%s'",
						node_addr, pool_desc);
					if (util_parse_add_remote_replica(&set,
							node_addr, pool_desc))
						goto err;
				}
			} else if (nparts >= 1) {
				/* 'REPLICA' signature detected */
				LOG(10, "REPLICA");

				int ret = util_parse_add_replica(&set);
				if (ret != 0)
					goto err;

				nparts = 0;
				result = PARSER_CONTINUE;
			} else {
				/* a new replica before any part was listed */
				if (set->nreplicas == 1)
					result = PARSER_SET_NO_PARTS;
				else
					result = PARSER_REP_NO_PARTS;
			}
		} else {
			/* there could be no parts for remote replicas */
			if (set->replica[set->nreplicas - 1]->remote) {
				result = PARSER_REMOTE_REP_UNEXPECTED_PARTS;
				continue;
			}

			/* read size and path */
			result = parser_read_line(line, &psize, &ppath);
			if (result == PARSER_CONTINUE) {
				/* add a new pool's part to the list */
				int ret = util_parse_add_element(set,
						ppath, psize);
				if (ret != 0) {
					Free(ppath);
					goto err;
				}
				nparts++;
			}
		}
	}

	if (result != PARSER_FORMAT_OK) {
		ERR("%s [%s:%d]", path, parser_errstr[result], nlines);
		switch (result) {
		case PARSER_CANNOT_READ_SIZE:
		case PARSER_OUT_OF_MEMORY:
			/* do not overwrite errno */
			break;
		default:
			errno = EINVAL;
		}
		goto err;
	}

	if (util_poolset_check_devdax(set) != 0) {
		errno = EINVAL;
		goto err;
	}

	if (util_poolset_directories_load(set) != 0) {
		ERR("cannot load part files from directories");
		goto err;
	}

	LOG(4, "set file format correct (%s)", path);
	(void) os_fclose(fs);
	Free(line);
	util_poolset_check_options(set);
	util_poolset_set_size(set);
	*setp = set;
	return 0;

err:
	/* keep the errno of the original failure across the cleanup calls */
	oerrno = errno;
	Free(line);
	(void) os_fclose(fs);
	if (set)
		util_poolset_free(set);
	errno = oerrno;
	return -1;
}
1691
1692 /*
1693 * util_poolset_single -- (internal) create a one-part pool set
1694 *
1695 * On success returns a pointer to a newly allocated and initialized
1696 * pool set structure. Otherwise, NULL is returned.
1697 */
1698 static struct pool_set *
1699 util_poolset_single(const char *path, size_t filesize, int create,
1700 int ignore_sds)
1701 {
1702 LOG(3, "path %s filesize %zu create %d",
1703 path, filesize, create);
1704
1705 enum file_type type = util_file_get_type(path);
1706 if (type == OTHER_ERROR)
1707 return NULL;
1708
1709 struct pool_set *set;
1710 set = Zalloc(sizeof(struct pool_set) +
1711 sizeof(struct pool_replica *));
1712 if (set == NULL) {
1713 ERR("!Malloc for pool set");
1714 return NULL;
1715 }
1716
1717 set->path = Strdup(path);
1718 if (set->path == NULL) {
1719 ERR("!Strdup");
1720 Free(set);
1721 return NULL;
1722 }
1723
1724 struct pool_replica *rep;
1725 rep = Zalloc(sizeof(struct pool_replica) +
1726 sizeof(struct pool_set_part));
1727 if (rep == NULL) {
1728 ERR("!Malloc for pool set replica");
1729 Free(set->path);
1730 Free(set);
1731 return NULL;
1732 }
1733
1734 VEC_INIT(&rep->directory);
1735
1736 set->replica[0] = rep;
1737
1738 rep->part[0].filesize = filesize;
1739 rep->part[0].path = Strdup(path);
1740 rep->part[0].fd = -1; /* will be filled out by util_poolset_file() */
1741 rep->part[0].is_dev_dax = type == TYPE_DEVDAX;
1742 rep->part[0].created = create;
1743 rep->part[0].hdr = NULL;
1744 rep->part[0].addr = NULL;
1745 rep->part[0].has_bad_blocks = 0;
1746
1747 if (rep->part[0].is_dev_dax)
1748 rep->part[0].alignment = util_file_device_dax_alignment(path);
1749 else
1750 rep->part[0].alignment = Mmap_align;
1751
1752 ASSERTne(rep->part[0].alignment, 0);
1753
1754 rep->nallocated = 1;
1755 rep->nparts = 1;
1756 rep->nhdrs = 1;
1757
1758 /* it does not have a remote replica */
1759 rep->remote = NULL;
1760 set->remote = 0;
1761
1762 /* round down to the nearest mapping alignment boundary */
1763 rep->repsize = rep->part[0].filesize & ~(rep->part[0].alignment - 1);
1764 rep->resvsize = rep->repsize;
1765
1766 set->poolsize = rep->repsize;
1767 set->resvsize = rep->resvsize;
1768
1769 set->nreplicas = 1;
1770 set->ignore_sds = ignore_sds || (set->options & OPTION_NOHDRS);
1771
1772 return set;
1773 }
1774
/*
 * util_part_open -- open or create a single part file
 *
 * On success part->fd holds an open descriptor and part->created tells
 * whether the file was created by this call.  Returns 0 on success, -1
 * on failure.
 *
 * NOTE(review): on the error paths taken after a successful open below,
 * part->fd is left open -- presumably the caller releases it via
 * util_part_fdclose(); verify against callers.
 */
int
util_part_open(struct pool_set_part *part, size_t minsize, int create_part)
{
	LOG(3, "part %p minsize %zu create %d", part, minsize, create_part);

	int exists = util_file_exists(part->path);
	if (exists < 0)
		return -1;

	int create_file = create_part;

	/* never try to create a file that is already there */
	if (exists)
		create_file = 0;

	part->created = 0;
	if (create_file) {
		part->fd = util_file_create(part->path, part->filesize,
				minsize);
		if (part->fd == -1) {
			LOG(2, "failed to create file: %s", part->path);
			return -1;
		}
		part->created = 1;
	} else {
		size_t size = 0;
		int flags = O_RDWR;
		part->fd = util_file_open(part->path, &size, minsize, flags);
		if (part->fd == -1) {
			LOG(2, "failed to open file: %s", part->path);
			return -1;
		}

		/*
		 * Pre-allocate blocks of the existing file when this call is
		 * part of pool creation; skipped for Device DAX.
		 */
		if (Fallocate_at_create && create_part && !part->is_dev_dax) {
			int ret = os_posix_fallocate(part->fd, 0,
					(os_off_t)size);
			if (ret != 0) {
				/* posix_fallocate returns the error code */
				errno = ret;
				ERR("!posix_fallocate \"%s\", %zu", part->path,
					size);
				return -1;
			}
		}

		/* check if filesize matches */
		if (part->filesize != size) {
			ERR("file size does not match config: %s, %zu != %zu",
				part->path, size, part->filesize);
			errno = EINVAL;
			return -1;
		}
	}

	return 0;
}
1832
1833 /*
1834 * util_part_fdclose -- close part file
1835 */
1836 void
1837 util_part_fdclose(struct pool_set_part *part)
1838 {
1839 LOG(3, "part %p", part);
1840
1841 if (part->fd != -1) {
1842 (void) os_close(part->fd);
1843 part->fd = -1;
1844 }
1845 }
1846
/*
 * util_set_rpmem_attr -- (internal) overwrite existing pool attributes
 *
 * Copies the remote pool attributes into the local pool header:
 * does not set uuid, next_part_uuid, prev_part_uuid
 */
static void
util_set_rpmem_attr(struct pool_hdr *hdrp, const struct rpmem_pool_attr *rattr)
{
	LOG(5, "hdrp %p rattr %p", hdrp, rattr);
	/* signature, format version and feature flags */
	memcpy(hdrp->signature, rattr->signature, POOL_HDR_SIG_LEN);
	hdrp->major = rattr->major;
	hdrp->features.compat = rattr->compat_features;
	hdrp->features.incompat = rattr->incompat_features;
	hdrp->features.ro_compat = rattr->ro_compat_features;
	/* poolset UUID and replica-chain linkage */
	memcpy(hdrp->poolset_uuid, rattr->poolset_uuid, POOL_HDR_UUID_LEN);
	memcpy(hdrp->next_repl_uuid, rattr->next_uuid, POOL_HDR_UUID_LEN);
	memcpy(hdrp->prev_repl_uuid, rattr->prev_uuid, POOL_HDR_UUID_LEN);
	/* arch flags are carried in the attributes' user_flags field */
	memcpy(&hdrp->arch_flags, rattr->user_flags, sizeof(struct arch_flags));
}
1866
/*
 * util_get_rpmem_attr -- (internal) get attributes for remote replica header
 *
 * Inverse of util_set_rpmem_attr(): fills 'rattr' from the local pool
 * header (this one also copies the part uuid).
 */
static void
util_get_rpmem_attr(struct rpmem_pool_attr *rattr, const struct pool_hdr *hdrp)
{
	LOG(5, "rpmem_attr %p hdrp %p", rattr, hdrp);
	ASSERTne(rattr, NULL);
	/* signature, format version and feature flags */
	memcpy(rattr->signature, hdrp->signature, POOL_HDR_SIG_LEN);
	rattr->major = hdrp->major;
	rattr->compat_features = hdrp->features.compat;
	rattr->incompat_features = hdrp->features.incompat;
	rattr->ro_compat_features = hdrp->features.ro_compat;
	/* poolset/part UUIDs and replica-chain linkage */
	memcpy(rattr->poolset_uuid, hdrp->poolset_uuid, POOL_HDR_UUID_LEN);
	memcpy(rattr->uuid, hdrp->uuid, POOL_HDR_UUID_LEN);
	memcpy(rattr->next_uuid, hdrp->next_repl_uuid, POOL_HDR_UUID_LEN);
	memcpy(rattr->prev_uuid, hdrp->prev_repl_uuid, POOL_HDR_UUID_LEN);
	/* arch flags travel in the attributes' user_flags field */
	memcpy(rattr->user_flags, &hdrp->arch_flags, sizeof(struct arch_flags));
}
1886
/*
 * util_remote_store_attr -- (internal) store attributes read from remote
 *                           replica in the local volatile pool header
 */
static void
util_remote_store_attr(struct pool_hdr *hdrp,
	const struct rpmem_pool_attr *rattr)
{
	LOG(4, "hdrp %p rpmem_attr %p", hdrp, rattr);

	util_set_rpmem_attr(hdrp, rattr);
	/* additionally set the fields util_set_rpmem_attr() leaves alone; */
	/* prev/next part UUIDs are set to the replica's own UUID */
	memcpy(hdrp->uuid, rattr->uuid, POOL_HDR_UUID_LEN);
	memcpy(hdrp->next_part_uuid, rattr->uuid, POOL_HDR_UUID_LEN);
	memcpy(hdrp->prev_part_uuid, rattr->uuid, POOL_HDR_UUID_LEN);
}
1902
1903 /*
1904 * util_update_remote_header -- update attributes of a remote replica;
1905 * the remote replica must be open
1906 */
1907 int
1908 util_update_remote_header(struct pool_set *set, unsigned repn)
1909 {
1910 LOG(3, "set %p, repn %u", set, repn);
1911
1912 ASSERTne(REP(set, repn)->remote, NULL);
1913 ASSERTne(REP(set, repn)->remote->rpp, NULL);
1914
1915 struct pool_replica *rep = REP(set, repn);
1916 struct pool_hdr *hdr = HDR(rep, 0);
1917
1918 /* get attributes from the local pool header */
1919 struct rpmem_pool_attr attributes;
1920 util_get_rpmem_attr(&attributes, hdr);
1921
1922 /* push the attributes to the remote replica */
1923 RPMEMpool *rpp = rep->remote->rpp;
1924 int ret = Rpmem_set_attr(rpp, &attributes);
1925 if (ret) {
1926 ERR("!Rpmem_set_attr");
1927 return -1;
1928 }
1929 return 0;
1930 }
1931
1932 /*
1933 * util_pool_close_remote -- close a remote replica
1934 */
1935 int
1936 util_pool_close_remote(RPMEMpool *rpp)
1937 {
1938 LOG(3, "rpp %p", rpp);
1939
1940 return Rpmem_close(rpp);
1941 }
1942
/*
 * util_poolset_remote_open -- open or create a remote replica
 *
 * On create, the local header's attributes are pushed to the remote side;
 * on open, the attributes read from the remote replica are stored back in
 * the local volatile header.  *nlanes may be lowered on return.
 */
int
util_poolset_remote_open(struct pool_replica *rep, unsigned repidx,
	size_t minsize, int create, void *pool_addr,
	size_t pool_size, unsigned *nlanes)
{
	LOG(3, "rep %p repidx %u minsize %zu create %d "
		"pool_addr %p pool_size %zu nlanes %p",
		rep, repidx, minsize, create,
		pool_addr, pool_size, nlanes);

	ASSERTne(nlanes, NULL);

	/* no remote handling available (librpmem not loaded?) -- fail */
	if (!Rpmem_handle_remote) {
		return -1;
	}

	/* the library may lower the number of lanes -- see below */
	unsigned remote_nlanes = *nlanes;

	if (create) {
		/* push the local header attributes to the remote side */
		struct rpmem_pool_attr rpmem_attr_create;
		util_get_rpmem_attr(&rpmem_attr_create, rep->part[0].hdr);

		rep->remote->rpp = Rpmem_create(rep->remote->node_addr,
						rep->remote->pool_desc,
						pool_addr,
						pool_size,
						&remote_nlanes,
						&rpmem_attr_create);
		if (rep->remote->rpp == NULL) {
			ERR("creating remote replica #%u failed", repidx);
			return -1;
		}
		rep->part[0].created = 1;
	} else { /* open */
		/* attributes are read back from the remote replica */
		struct rpmem_pool_attr rpmem_attr_open;

		rep->remote->rpp = Rpmem_open(rep->remote->node_addr,
						rep->remote->pool_desc,
						pool_addr,
						pool_size,
						&remote_nlanes,
						&rpmem_attr_open);
		if (rep->remote->rpp == NULL) {
			ERR("opening remote replica #%u failed", repidx);
			return -1;
		}

		/* store remote attributes in the local volatile header */
		util_remote_store_attr(rep->part[0].hdr, &rpmem_attr_open);
	}

	/* report back the (possibly reduced) number of lanes */
	if (remote_nlanes < *nlanes)
		*nlanes = remote_nlanes;

	return 0;
}
2001
2002 /*
2003 * util_poolset_files_local -- (internal) open or create all the local
2004 * part files of a pool set and replica sets
2005 */
2006 static int
2007 util_poolset_files_local(struct pool_set *set, size_t minpartsize, int create)
2008 {
2009 LOG(3, "set %p minpartsize %zu create %d", set, minpartsize, create);
2010
2011 for (unsigned r = 0; r < set->nreplicas; r++) {
2012 struct pool_replica *rep = set->replica[r];
2013 if (!rep->remote) {
2014 for (unsigned p = 0; p < rep->nparts; p++) {
2015 if (util_part_open(&rep->part[p], minpartsize,
2016 create))
2017 return -1;
2018 }
2019 }
2020 }
2021
2022 return 0;
2023 }
2024
/*
 * util_poolset_remote_replica_open -- open remote replica
 *
 * The remote replica is fed from replica #0's mapped address range.
 */
int
util_poolset_remote_replica_open(struct pool_set *set, unsigned repidx,
	size_t minsize, int create, unsigned *nlanes)
{
#ifndef _WIN32
	/*
	 * This is a workaround for an issue with using device dax with
	 * libibverbs. To handle fork() function calls correctly libfabric use
	 * ibv_fork_init(3) which makes all registered memory being madvised
	 * with MADV_DONTFORK flag. In libpmemobj the remote replication is
	 * performed without pool header (first 4k). In such case the address
	 * passed to madvise(2) is aligned to 4k, but device dax can require
	 * different alignment (default is 2MB). This workaround madvises the
	 * entire memory region before registering it by fi_mr_reg(3).
	 *
	 * The librpmem client requires fork() support to work correctly.
	 */
	if (set->replica[0]->part[0].is_dev_dax) {
		int ret = os_madvise(set->replica[0]->part[0].addr,
				set->replica[0]->part[0].filesize,
				MADV_DONTFORK);
		if (ret) {
			ERR("!madvise");
			return ret;
		}
	}
#endif

	/* the source of replication is the mapping of the first replica */
	void *pool_addr = (void *)((uintptr_t)set->replica[0]->part[0].addr);

	return util_poolset_remote_open(set->replica[repidx], repidx, minsize,
			create, pool_addr, set->poolsize, nlanes);
}
2061
2062 /*
2063 * util_poolset_files_remote -- (internal) open or create all the remote
2064 * part files of a pool set and replica sets
2065 */
2066 static int
2067 util_poolset_files_remote(struct pool_set *set, size_t minsize,
2068 unsigned *nlanes, int create)
2069 {
2070 LOG(3, "set %p minsize %zu nlanes %p create %d",
2071 set, minsize, nlanes, create);
2072
2073 for (unsigned r = 0; r < set->nreplicas; r++) {
2074 struct pool_replica *rep = set->replica[r];
2075 if (rep->remote) {
2076 if (util_poolset_remote_replica_open(set, r,
2077 minsize, create, nlanes))
2078 return -1;
2079 }
2080 }
2081
2082 return 0;
2083 }
2084
/*
 * util_poolset_read -- read memory pool set file
 *
 * On success returns 0 and a pointer to a newly allocated structure
 * containing the info of all the parts of the pool set and replicas.
 */
int
util_poolset_read(struct pool_set **setp, const char *path)
{
	LOG(3, "setp %p path %s", setp, path);

	int fd = os_open(path, O_RDONLY);
	if (fd < 0) {
		ERR("!open: path \"%s\"", path);
		return -1;
	}

	int ret = util_poolset_parse(setp, path, fd);

	/* keep the parser's errno across the close() call */
	int oerrno = errno;
	(void) os_close(fd);
	errno = oerrno;

	return ret;
}
2112
/*
 * util_poolset_create_set -- create a new pool set structure
 *
 * On success returns 0 and a pointer to a newly allocated structure
 * containing the info of all the parts of the pool set and replicas.
 *
 * 'path' may point either to a poolset descriptor file or to a plain
 * pool file (incl. Device DAX) -- the latter yields a one-part set.
 */
int
util_poolset_create_set(struct pool_set **setp, const char *path,
	size_t poolsize, size_t minsize, int ignore_sds)
{
	LOG(3, "setp %p path %s poolsize %zu minsize %zu",
		setp, path, poolsize, minsize);

	int oerrno;
	int ret = 0;
	int fd;
	size_t size = 0;

	enum file_type type = util_file_get_type(path);
	if (type == OTHER_ERROR)
		return -1;

	/* non-zero size means "create a single-file pool of that size" */
	if (poolsize != 0) {
		if (type == TYPE_DEVDAX) {
			ERR("size must be zero for device dax");
			return -1;
		}
		*setp = util_poolset_single(path, poolsize, 1, ignore_sds);
		if (*setp == NULL)
			return -1;

		return 0;
	}

	/* do not check minsize */
	if ((fd = util_file_open(path, &size, 0, O_RDONLY)) == -1)
		return -1;

	char signature[POOLSET_HDR_SIG_LEN];
	if (type == TYPE_NORMAL) {
		/*
		 * read returns ssize_t, but we know it will return value
		 * between -1 and POOLSET_HDR_SIG_LEN (11), so we can safely
		 * cast it to int
		 */
		ret = (int)read(fd, signature, POOLSET_HDR_SIG_LEN);
		if (ret < 0) {
			ERR("!read %d", fd);
			goto err;
		}
	}

	/* no poolset signature -- treat 'path' as a single-file pool */
	if (type == TYPE_DEVDAX || ret < POOLSET_HDR_SIG_LEN ||
	    strncmp(signature, POOLSET_HDR_SIG, POOLSET_HDR_SIG_LEN)) {
		LOG(4, "not a pool set header");
		(void) os_close(fd);

		if (size < minsize) {
			ERR("file is not a poolset file and its size (%zu)"
				" is smaller than %zu", size, minsize);
			errno = EINVAL;
			return -1;
		}
		*setp = util_poolset_single(path, size, 0, ignore_sds);
		if (*setp == NULL)
			return -1;

		return 0;
	}

	ret = util_poolset_parse(setp, path, fd);
	if (ret)
		goto err;

	(*setp)->ignore_sds = ignore_sds || ((*setp)->options & OPTION_NOHDRS);
#ifdef _WIN32
	/* remote replication is not supported on Windows */
	if ((*setp)->remote) {
		util_poolset_free(*setp);
		ERR("remote replication is not supported on Windows");
		errno = ENOTSUP;
		ret = -1;
		goto err;
	}
#endif /* _WIN32 */

err:
	/* the success path also falls through here with ret == 0 */
	oerrno = errno;
	(void) os_close(fd);
	errno = oerrno;
	return ret;
}
2205
2206 /*
2207 * util_poolset_check_header_options -- (internal) check if poolset options
2208 * match given flags
2209 */
2210 static int
2211 util_poolset_check_header_options(struct pool_set *set, uint32_t incompat)
2212 {
2213 LOG(3, "set %p, incompat %#x", set, incompat);
2214
2215 if (((set->options & OPTION_SINGLEHDR) == 0) !=
2216 ((incompat & POOL_FEAT_SINGLEHDR) == 0)) {
2217 ERR(
2218 "poolset file options (%u) do not match incompat feature flags (%#x)",
2219 set->options, incompat);
2220 errno = EINVAL;
2221 return -1;
2222 }
2223 return 0;
2224 }
2225
/*
 * util_header_create -- create header of a single pool set file
 *
 * Fills in, checksums and persists the pool header of part 'partidx' in
 * replica 'repidx'.  With 'overwrite' set, a non-zeroed existing header
 * is allowed to be replaced.  Returns 0 on success, -1 on failure.
 */
int
util_header_create(struct pool_set *set, unsigned repidx, unsigned partidx,
	const struct pool_attr *attr, int overwrite)
{
	LOG(3, "set %p repidx %u partidx %u attr %p overwrite %d", set, repidx,
		partidx, attr, overwrite);

	ASSERTne(attr, NULL);

	struct pool_replica *rep = set->replica[repidx];

	/* opaque info lives at the beginning of mapped memory pool */
	struct pool_hdr *hdrp = rep->part[partidx].hdr;

	/* check if the pool header is all zeros */
	if (!util_is_zeroed(hdrp, sizeof(*hdrp)) && !overwrite) {
		ERR("Non-empty file detected");
		errno = EEXIST;
		return -1;
	}

	/* create pool's header */
	util_pool_attr2hdr(hdrp, attr);

	if (set->options & OPTION_SINGLEHDR)
		hdrp->features.incompat |= POOL_FEAT_SINGLEHDR;

	memcpy(hdrp->poolset_uuid, set->uuid, POOL_HDR_UUID_LEN);
	memcpy(hdrp->uuid, PART(rep, partidx)->uuid, POOL_HDR_UUID_LEN);

	/* link parts */
	if (set->options & OPTION_SINGLEHDR) {
		/* next/prev part point to part #0 */
		ASSERTeq(partidx, 0);
		memcpy(hdrp->prev_part_uuid, PART(rep, 0)->uuid,
			POOL_HDR_UUID_LEN);
		memcpy(hdrp->next_part_uuid, PART(rep, 0)->uuid,
			POOL_HDR_UUID_LEN);
	} else {
		/* chain to the previous and next part of this replica */
		memcpy(hdrp->prev_part_uuid, PARTP(rep, partidx)->uuid,
			POOL_HDR_UUID_LEN);
		memcpy(hdrp->next_part_uuid, PARTN(rep, partidx)->uuid,
			POOL_HDR_UUID_LEN);
	}

	/* link replicas; attr-provided UUIDs (if any) take precedence */
	if (!util_is_zeroed(attr->prev_repl_uuid, POOL_HDR_UUID_LEN)) {
		memcpy(hdrp->prev_repl_uuid, attr->prev_repl_uuid,
			POOL_HDR_UUID_LEN);
	} else {
		memcpy(hdrp->prev_repl_uuid, PART(REPP(set, repidx), 0)->uuid,
			POOL_HDR_UUID_LEN);
	}
	if (!util_is_zeroed(attr->next_repl_uuid, POOL_HDR_UUID_LEN)) {
		memcpy(hdrp->next_repl_uuid, attr->next_repl_uuid,
			POOL_HDR_UUID_LEN);
	} else {
		memcpy(hdrp->next_repl_uuid, PART(REPN(set, repidx), 0)->uuid,
			POOL_HDR_UUID_LEN);
	}

	/* local replicas record the part file's st_ctime in hdr->crtime */
	if (!rep->remote) {
		os_stat_t stbuf;

		if (os_fstat(rep->part[partidx].fd, &stbuf) != 0) {
			ERR("!fstat");
			return -1;
		}
		ASSERT(stbuf.st_ctime);
		hdrp->crtime = (uint64_t)stbuf.st_ctime;
	}

	/*
	 * Use the current machine's arch flags unless the caller supplied
	 * explicit ones; supplied flags are copied in verbatim, i.e. after
	 * the little-endian conversion below.
	 */
	int arch_is_zeroed = util_is_zeroed(attr->arch_flags,
		POOL_HDR_ARCH_LEN);
	if (arch_is_zeroed)
		util_get_arch_flags(&hdrp->arch_flags);

	util_convert2le_hdr(hdrp);

	if (!arch_is_zeroed) {
		memcpy(&hdrp->arch_flags, attr->arch_flags, POOL_HDR_ARCH_LEN);
	}

	/* initialize shutdown state tracking in the first header only */
	if (!set->ignore_sds && partidx == 0 && !rep->remote) {
		shutdown_state_init(&hdrp->sds, rep);
		for (unsigned p = 0; p < rep->nparts; p++) {
			if (shutdown_state_add_part(&hdrp->sds,
					PART(rep, p)->fd, rep))
				return -1;
		}
		shutdown_state_set_dirty(&hdrp->sds, rep);
	}

	/* the checksum covers the header up to POOL_HDR_CSUM_END_OFF */
	util_checksum(hdrp, sizeof(*hdrp), &hdrp->checksum,
			1, POOL_HDR_CSUM_END_OFF(hdrp));

	/* store pool's header */
	util_persist_auto(rep->is_pmem, hdrp, sizeof(*hdrp));

	return 0;
}
2330
/*
 * util_header_check -- (internal) validate header of a single pool set file
 *
 * Verifies the on-media header of part 'partidx' in replica 'repidx'
 * against the expected pool attributes 'attr' and against the header of
 * the first part of the first replica: signature, format version,
 * feature flags, arch flags, checksum and pool-set/part UUID linkage.
 * May mark the part read-only if unknown ro-compat features are found.
 * Returns 0 on success; -1 with errno set (EINVAL) on validation failure.
 */
static int
util_header_check(struct pool_set *set, unsigned repidx, unsigned partidx,
	const struct pool_attr *attr)
{
	LOG(3, "set %p repidx %u partidx %u attr %p", set, repidx, partidx,
		attr);

	ASSERTne(attr, NULL);

	struct pool_replica *rep = set->replica[repidx];

	/* opaque info lives at the beginning of mapped memory pool */
	struct pool_hdr *hdrp = rep->part[partidx].hdr;
	struct pool_hdr hdr;

	/* work on a local copy so the mapped header is never modified here */
	memcpy(&hdr, hdrp, sizeof(hdr));

	/* local copy of a remote header does not need to be converted */
	if (rep->remote == NULL)
		util_convert2h_hdr_nocheck(&hdr);

	/* to be valid, a header must have a major version of at least 1 */
	if (hdr.major == 0) {
		ERR("invalid major version (0)");
		errno = EINVAL;
		return -1;
	}

	/* check signature */
	if (memcmp(hdr.signature, attr->signature, POOL_HDR_SIG_LEN)) {
		ERR("wrong pool type: \"%.8s\"", hdr.signature);
		errno = EINVAL;
		return -1;
	}

	/* check format version number */
	if (hdr.major != attr->major) {
		ERR("pool version %d (library expects %d)", hdr.major,
			attr->major);
		if (hdr.major < attr->major)
			ERR(
			"Please run the pmdk-convert utility to upgrade the pool.");
		errno = EINVAL;
		return -1;
	}

	rep->part[partidx].rdonly = 0;

	int retval = util_feature_check(&hdr, attr->features);
	if (retval < 0)
		return -1;

	/* retval == 0: unknown ro-compat features -- force read-only mode */
	if (retval == 0)
		rep->part[partidx].rdonly = 1;

	if (rep->remote == NULL) {
		/*
		 * and to be valid, the fields must checksum correctly
		 *
		 * NOTE: checksum validation is performed after format version
		 * and feature check, because if POOL_FEAT_CKSUM_2K flag is set,
		 * we want to report it as incompatible feature, rather than
		 * invalid checksum.
		 */
		if (!util_checksum(&hdr, sizeof(hdr), &hdr.checksum,
				0, POOL_HDR_CSUM_END_OFF(&hdr))) {
			ERR("invalid checksum of pool header");
			errno = EINVAL;
			return -1;
		}

		LOG(3, "valid header, signature \"%.8s\"", hdr.signature);
	}

	if (util_check_arch_flags(&hdr.arch_flags)) {
		ERR("wrong architecture flags");
		errno = EINVAL;
		return -1;
	}

	/* check pool set UUID */
	if (memcmp(HDR(REP(set, 0), 0)->poolset_uuid, hdr.poolset_uuid,
			POOL_HDR_UUID_LEN)) {
		ERR("wrong pool set UUID");
		errno = EINVAL;
		return -1;
	}

	/* check pool set linkage (prev/next part UUID chain) */
	if (memcmp(HDRP(rep, partidx)->uuid, hdr.prev_part_uuid,
			POOL_HDR_UUID_LEN) ||
	    memcmp(HDRN(rep, partidx)->uuid, hdr.next_part_uuid,
			POOL_HDR_UUID_LEN)) {
		ERR("wrong part UUID");
		errno = EINVAL;
		return -1;
	}

	/* check format version against the first part of this replica */
	if (HDR(rep, 0)->major != hdrp->major) {
		ERR("incompatible pool format");
		errno = EINVAL;
		return -1;
	}

	/* check compatibility features */
	if (HDR(rep, 0)->features.compat != hdrp->features.compat ||
	    HDR(rep, 0)->features.incompat != hdrp->features.incompat ||
	    HDR(rep, 0)->features.ro_compat != hdrp->features.ro_compat) {
		ERR("incompatible feature flags");
		errno = EINVAL;
		return -1;
	}

	/* check poolset options */
	if (util_poolset_check_header_options(set,
			HDR(rep, 0)->features.incompat))
		return -1;

	return 0;
}
2455
2456 /*
2457 * util_header_check_remote -- (internal) validate header of a remote
2458 * pool set file
2459 */
2460 static int
2461 util_header_check_remote(struct pool_set *set, unsigned partidx)
2462 {
2463 LOG(3, "set %p partidx %u ", set, partidx);
2464
2465 /* there is only one replica in remote poolset */
2466 struct pool_replica *rep = set->replica[0];
2467 /* opaque info lives at the beginning of mapped memory pool */
2468 struct pool_hdr *hdrp = rep->part[partidx].hdr;
2469 struct pool_hdr hdr;
2470
2471 if (util_is_zeroed(hdrp, sizeof(*hdrp))) {
2472 ERR("pool header zeroed");
2473 errno = EINVAL;
2474 return -1;
2475 }
2476
2477 memcpy(&hdr, hdrp, sizeof(hdr));
2478
2479 util_convert2h_hdr_nocheck(&hdr);
2480
2481 /* valid header found */
2482 if (memcmp(HDR(rep, 0)->signature, hdrp->signature, POOL_HDR_SIG_LEN)) {
2483 ERR("pool signature mismatch in part %d", partidx);
2484 errno = EINVAL;
2485 return -1;
2486 }
2487
2488 /* check format version */
2489 if (HDR(rep, 0)->major != hdrp->major) {
2490 ERR("pool version mismatch in part %d", partidx);
2491 errno = EINVAL;
2492 return -1;
2493 }
2494
2495 /* check compatibility features */
2496 if (HDR(rep, 0)->features.compat != hdrp->features.compat) {
2497 ERR("'may have' compatibility flags mismatch in part %d",
2498 partidx);
2499 errno = EINVAL;
2500 return -1;
2501 }
2502 if (HDR(rep, 0)->features.incompat != hdrp->features.incompat) {
2503 ERR("'must support' compatibility flags mismatch in part %d",
2504 partidx);
2505 errno = EINVAL;
2506 return -1;
2507 }
2508 if (HDR(rep, 0)->features.ro_compat != hdrp->features.ro_compat) {
2509 ERR("'force read-only' compatibility flags mismatch in part %d",
2510 partidx);
2511 errno = EINVAL;
2512 return -1;
2513 }
2514
2515 /*
2516 * and to be valid, the fields must checksum correctly
2517 *
2518 * NOTE: checksum validation is performed after format version and
2519 * feature check, because if POOL_FEAT_CKSUM_2K flag is set,
2520 * we want to report it as incompatible feature, rather than invalid
2521 * checksum.
2522 */
2523 if (!util_checksum(&hdr, sizeof(hdr), &hdr.checksum,
2524 0, POOL_HDR_CSUM_END_OFF(&hdr))) {
2525 ERR("invalid checksum of pool header");
2526 return -1;
2527 }
2528
2529 LOG(3, "valid header, signature \"%.8s\"", hdr.signature);
2530
2531 /* check pool set UUID */
2532 if (memcmp(HDR(rep, 0)->poolset_uuid, hdrp->poolset_uuid,
2533 POOL_HDR_UUID_LEN)) {
2534 ERR("wrong pool set UUID in part %d", partidx);
2535 errno = EINVAL;
2536 return -1;
2537 }
2538
2539 /* check previous replica UUID */
2540 if (memcmp(HDR(rep, 0)->prev_repl_uuid, hdrp->prev_repl_uuid,
2541 POOL_HDR_UUID_LEN)) {
2542 ERR("wrong previous replica UUID in part %d", partidx);
2543 errno = EINVAL;
2544 return -1;
2545 }
2546
2547 /* check next replica UUID */
2548 if (memcmp(HDR(rep, 0)->next_repl_uuid, hdrp->next_repl_uuid,
2549 POOL_HDR_UUID_LEN)) {
2550 ERR("wrong next replica UUID in part %d", partidx);
2551 errno = EINVAL;
2552 return -1;
2553 }
2554
2555 if (memcmp(&HDR(rep, 0)->arch_flags, &hdrp->arch_flags,
2556 sizeof(hdrp->arch_flags))) {
2557 ERR("wrong architecture flags");
2558 errno = EINVAL;
2559 return -1;
2560 }
2561
2562 /* check pool set linkage */
2563 if (memcmp(HDRP(rep, partidx)->uuid, hdrp->prev_part_uuid,
2564 POOL_HDR_UUID_LEN) ||
2565 memcmp(HDRN(rep, partidx)->uuid, hdrp->next_part_uuid,
2566 POOL_HDR_UUID_LEN)) {
2567 ERR("wrong part UUID in part %d", partidx);
2568 errno = EINVAL;
2569 return -1;
2570 }
2571
2572 /* read shutdown state toggle from header */
2573 set->ignore_sds |= IGNORE_SDS(HDR(rep, 0));
2574
2575 if (!set->ignore_sds && partidx == 0) {
2576 struct shutdown_state sds;
2577 shutdown_state_init(&sds, NULL);
2578 for (unsigned p = 0; p < rep->nparts; p++) {
2579 if (shutdown_state_add_part(&sds,
2580 PART(rep, p)->fd, NULL))
2581 return -1;
2582 }
2583
2584 if (shutdown_state_check(&sds, &hdrp->sds, rep)) {
2585 errno = EINVAL;
2586 return -1;
2587 }
2588
2589 shutdown_state_set_dirty(&hdrp->sds, rep);
2590 }
2591
2592 rep->part[partidx].rdonly = 0;
2593
2594 return 0;
2595 }
2596
2597 /*
2598 * util_replica_set_is_pmem -- sets per-replica is_pmem flag
2599 *
2600 * The replica is PMEM if:
2601 * - all parts are on device dax, or
2602 * - all parts are mapped with MAP_SYNC.
2603 *
2604 * It's enough to check only first part because it's already verified
2605 * that either all or none parts are device dax or mapped with MAP_SYNC.
2606 */
2607 static inline void
2608 util_replica_set_is_pmem(struct pool_replica *rep)
2609 {
2610 rep->is_pmem = rep->part[0].is_dev_dax || rep->part[0].map_sync ||
2611 pmem_is_pmem(rep->part[0].addr, rep->resvsize);
2612 }
2613
/*
 * util_replica_map_local -- (internal) map memory pool for local replica
 *
 * Reserves one contiguous region of rep->resvsize bytes of address
 * space while mapping the first part, then maps the remaining parts
 * (MAP_FIXED) immediately behind it.  If a remaining part cannot be
 * mapped at the requested address, all parts are unmapped and the
 * whole sequence is retried at a different hint address (retries are
 * enabled on Windows only).  Returns 0 on success, -1 with errno set
 * on failure.
 */
static int
util_replica_map_local(struct pool_set *set, unsigned repidx, int flags)
{
	LOG(3, "set %p repidx %u flags %d", set, repidx, flags);

	/*
	 * XXX: Like we reserve space for all parts in this replica when we map
	 * the first part, we need to reserve the space for all replicas
	 * upfront. It is not necessary that the replicas are contiguous but
	 * that way we would not fragment the memory much. I think we should
	 * leave this to MM, but let's have a note as per our collective minds.
	 */

#ifndef _WIN32
	int remaining_retries = 0;
#else
	int remaining_retries = 10;
#endif
	int retry_for_contiguous_addr;
	size_t mapsize;
	/* header size for all headers but the first one */
	size_t hdrsize = (set->options & (OPTION_SINGLEHDR | OPTION_NOHDRS)) ?
		0 : Mmap_align;
	void *addr;
	struct pool_replica *rep = set->replica[repidx];

	ASSERTeq(rep->remote, NULL);
	ASSERTne(rep->part, NULL);

	do {
		retry_for_contiguous_addr = 0;
		/* part 0 data size, aligned down to mapping granularity */
		mapsize = rep->part[0].filesize & ~(Mmap_align - 1);

		/* determine a hint address for mmap() */
		addr = util_map_hint(rep->resvsize, 0);
		if (addr == MAP_FAILED) {
			LOG(1, "cannot find a contiguous region of given size");
			return -1;
		}

		/* map the first part and reserve space for remaining parts */
		if (util_map_part(&rep->part[0], addr, rep->resvsize, 0,
				flags, 0) != 0) {
			LOG(2, "pool mapping failed - replica #%u part #0",
				repidx);
			return -1;
		}

		VALGRIND_REGISTER_PMEM_MAPPING(rep->part[0].addr,
			rep->part[0].size);
		VALGRIND_REGISTER_PMEM_FILE(rep->part[0].fd,
			rep->part[0].addr, rep->part[0].size, 0);

		/* the pool counts as zeroed only if every part was created */
		set->zeroed &= rep->part[0].created;

		addr = (char *)rep->part[0].addr + mapsize;

		/*
		 * map the remaining parts of the usable pool space
		 * (aligned to memory mapping granularity)
		 */
		for (unsigned p = 1; p < rep->nparts; p++) {
			/* map data part */
			if (util_map_part(&rep->part[p], addr, 0, hdrsize,
					flags | MAP_FIXED, 0) != 0) {
				/*
				 * if we can't map the part at the address we
				 * asked for, unmap all the parts that are
				 * mapped and remap at a different address.
				 */
				if ((errno == EINVAL) &&
				    (remaining_retries > 0)) {
					LOG(2, "usable space mapping failed - "
						"part #%d - retrying", p);
					retry_for_contiguous_addr = 1;
					remaining_retries--;

					util_unmap_parts(rep, 0, p - 1);

					/* release rest of the VA reserved */
					ASSERTne(addr, NULL);
					ASSERTne(addr, MAP_FAILED);
					munmap(addr, rep->resvsize - mapsize);
					break;
				}
				LOG(2, "usable space mapping failed - part #%d",
					p);
				goto err;
			}

			VALGRIND_REGISTER_PMEM_FILE(rep->part[p].fd,
				rep->part[p].addr, rep->part[p].size,
				hdrsize);

			mapsize += rep->part[p].size;
			set->zeroed &= rep->part[p].created;
			addr = (char *)addr + rep->part[p].size;
		}
	} while (retry_for_contiguous_addr);

	/*
	 * Initially part[0].size is the size of address space
	 * reservation for all parts from given replica. After
	 * mapping that space we need to overwrite part[0].size
	 * with its actual size to be consistent - size for each
	 * part should be the actual mapping size of this part
	 * only - it simplifies future calculations.
	 */
	rep->part[0].size = rep->part[0].filesize & ~(Mmap_align - 1);

	if (util_replica_check_map_sync(set, repidx, 0))
		goto err;

	util_replica_set_is_pmem(rep);

	if (Prefault_at_create)
		util_replica_force_page_allocation(rep);

	ASSERTeq(mapsize, rep->repsize);

	LOG(3, "replica #%u addr %p", repidx, rep->part[0].addr);

	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	/* release the not-yet-mapped tail of the reservation, then parts */
	if (mapsize < rep->repsize) {
		ASSERTne(rep->part[0].addr, NULL);
		ASSERTne(rep->part[0].addr, MAP_FAILED);
		munmap(rep->part[0].addr, rep->resvsize - mapsize);
	}
	for (unsigned p = 0; p < rep->nparts; p++) {
		util_unmap_part(&rep->part[p]);
	}
	errno = oerrno;
	return -1;
}
2755
2756 /*
2757 * util_replica_init_headers_local -- (internal) initialize pool headers
2758 */
2759 static int
2760 util_replica_init_headers_local(struct pool_set *set, unsigned repidx,
2761 int flags, const struct pool_attr *attr)
2762 {
2763 LOG(3, "set %p repidx %u flags %d attr %p", set, repidx, flags, attr);
2764
2765 struct pool_replica *rep = set->replica[repidx];
2766
2767 /* map all headers - don't care about the address */
2768 for (unsigned p = 0; p < rep->nhdrs; p++) {
2769 if (util_map_hdr(&rep->part[p], flags, 0) != 0) {
2770 LOG(2, "header mapping failed - part #%d", p);
2771 goto err;
2772 }
2773 }
2774
2775 /* create headers, set UUID's */
2776 for (unsigned p = 0; p < rep->nhdrs; p++) {
2777 if (util_header_create(set, repidx, p, attr, 0) != 0) {
2778 LOG(2, "header creation failed - part #%d", p);
2779 goto err;
2780 }
2781 }
2782
2783 /* unmap all headers */
2784 for (unsigned p = 0; p < rep->nhdrs; p++)
2785 util_unmap_hdr(&rep->part[p]);
2786
2787 return 0;
2788
2789 err:
2790 LOG(4, "error clean up");
2791 int oerrno = errno;
2792 for (unsigned p = 0; p < rep->nhdrs; p++) {
2793 util_unmap_hdr(&rep->part[p]);
2794 }
2795 errno = oerrno;
2796 return -1;
2797 }
2798
2799 /*
2800 * util_replica_create_local -- (internal) create a new memory pool for local
2801 * replica
2802 */
2803 static int
2804 util_replica_create_local(struct pool_set *set, unsigned repidx, int flags,
2805 const struct pool_attr *attr)
2806 {
2807 LOG(3, "set %p repidx %u flags %d attr %p", set, repidx, flags, attr);
2808
2809 /*
2810 * the first replica has to be mapped prior to remote ones so if
2811 * a replica is already mapped skip mapping creation
2812 */
2813 if (PART(REP(set, repidx), 0)->addr == NULL) {
2814 if (util_replica_map_local(set, repidx, flags) != 0) {
2815 LOG(2, "replica #%u map failed", repidx);
2816 return -1;
2817 }
2818 }
2819
2820 if (attr == NULL)
2821 return 0;
2822
2823 if (util_replica_init_headers_local(set, repidx, flags, attr) != 0) {
2824 LOG(2, "replica #%u headers initialization failed", repidx);
2825 return -1;
2826 }
2827 return 0;
2828 }
2829
2830 /*
2831 * util_replica_create_remote -- (internal) create a new memory pool
2832 * for remote replica
2833 */
2834 static int
2835 util_replica_create_remote(struct pool_set *set, unsigned repidx, int flags,
2836 const struct pool_attr *attr)
2837 {
2838 LOG(3, "set %p repidx %u flags %d attr %p", set, repidx, flags, attr);
2839
2840 struct pool_replica *rep = set->replica[repidx];
2841
2842 ASSERTne(rep->remote, NULL);
2843 ASSERTne(rep->part, NULL);
2844 ASSERTeq(rep->nparts, 1);
2845 ASSERTeq(rep->nhdrs, 1);
2846 ASSERTne(attr, NULL);
2847
2848 struct pool_set_part *part = rep->part;
2849
2850 /*
2851 * A remote replica has one fake part of size equal twice pool header
2852 * size for storing pool header and pool descriptor.
2853 */
2854 part->size = rep->repsize;
2855 ASSERT(IS_PAGE_ALIGNED(part->size));
2856 part->remote_hdr = Zalloc(part->size + Pagesize);
2857 if (!part->remote_hdr) {
2858 ERR("!Zalloc");
2859 return -1;
2860 }
2861
2862 part->hdr = PAGE_ALIGN_UP(part->remote_hdr);
2863 part->addr = PAGE_ALIGN_UP(part->remote_hdr);
2864 part->hdrsize = POOL_HDR_SIZE;
2865
2866 /* create header, set UUID's */
2867 if (util_header_create(set, repidx, 0, attr, 0) != 0) {
2868 LOG(2, "header creation failed - part #0");
2869 Free(part->remote_hdr);
2870 return -1;
2871 }
2872
2873 LOG(3, "replica #%u addr %p", repidx, rep->part[0].addr);
2874
2875 return 0;
2876 }
2877
/*
 * util_replica_close -- close a memory pool replica
 *
 * This function unmaps all mapped memory regions.
 * For a local replica it first (unless SDS is ignored) deep-drains the
 * pool and clears the "dirty" shutdown-state flag, then unmaps headers
 * and the whole VA reservation.  For a remote replica it only frees the
 * volatile header buffer.  Always returns 0.
 */
int
util_replica_close(struct pool_set *set, unsigned repidx)
{
	LOG(3, "set %p repidx %u", set, repidx);
	struct pool_replica *rep = set->replica[repidx];

	if (rep->remote == NULL) {
		struct pool_set_part *part = PART(rep, 0);
		/* SDS bookkeeping only applies to a mapped, non-empty part */
		if (!set->ignore_sds && part->addr != NULL &&
				part->size != 0) {
			struct pool_hdr *hdr = part->addr;
			RANGE_RW(hdr, sizeof(*hdr), part->is_dev_dax);
			/*
			 * deep drain will call msync on one page in each
			 * part in replica to trigger WPQ flush.
			 * This pages may have been marked as
			 * undefined/inaccessible, but msyncing such memory
			 * is not a bug, so as a workaround temporarily
			 * disable error reporting.
			 */
			VALGRIND_DO_DISABLE_ERROR_REPORTING;
			util_replica_deep_drain(part->addr, rep->repsize,
				set, repidx);
			VALGRIND_DO_ENABLE_ERROR_REPORTING;
			shutdown_state_clear_dirty(&hdr->sds, rep);
		}
		for (unsigned p = 0; p < rep->nhdrs; p++)
			util_unmap_hdr(&rep->part[p]);

		/* part[0] mapping spans the whole VA reservation */
		rep->part[0].size = rep->resvsize;
		util_unmap_part(&rep->part[0]);
	} else {
		LOG(4, "freeing volatile header of remote replica #%u", repidx);
		Free(rep->part[0].remote_hdr);
		rep->part[0].remote_hdr = NULL;
		rep->part[0].hdr = NULL;
		rep->part[0].hdrsize = 0;
		rep->part[0].addr = NULL;
		rep->part[0].size = 0;
	}

	return 0;
}
2926
/*
 * util_poolset_append_new_part -- (internal) creates a new part in each replica
 *                                 of the poolset
 *
 * Only valid for directory-based poolsets.  For every replica a new
 * part file path is generated in one of its directories (chosen
 * round-robin via set->next_directory_id) and registered in the
 * replica info; the file itself is created later.  On failure the
 * parts already added to preceding replicas are rolled back.
 * Returns 0 on success, -1 on failure.
 */
static int
util_poolset_append_new_part(struct pool_set *set, size_t size)
{
	LOG(3, "set %p size %zu", set, size);

	if (!set->directory_based)
		return -1;

	struct pool_set_directory *d;
	size_t directory_id;
	char *path;
	size_t path_len;

	unsigned r;
	for (r = 0; r < set->nreplicas; ++r) {
		struct pool_replica *rep = set->replica[r];

		/* pick the target directory round-robin per replica */
		directory_id = set->next_directory_id %
			VEC_SIZE(&rep->directory);
		d = VEC_GET(&rep->directory, directory_id);

		/* PMEM_FILE_MAX_LEN covers separator, number and extension */
		path_len = strlen(d->path) + PMEM_FILE_MAX_LEN;
		if ((path = Malloc(path_len)) == NULL) {
			ERR("!Malloc");
			goto err_part_init;
		}

		snprintf(path, path_len, "%s" OS_DIR_SEP_STR "%0*u%s",
			d->path, PMEM_FILE_PADDING, set->next_id, PMEM_EXT);

		if (util_replica_add_part(&set->replica[r], path, size) != 0)
			FATAL("cannot add a new part to the replica info");
	}

	set->next_directory_id += 1;
	set->next_id += 1;

	util_poolset_set_size(set);

	return 0;

err_part_init:
	/* for each replica 0..r-1 remove the last part */
	for (unsigned rn = 0; rn < r; ++rn) {
		struct pool_replica *rep = set->replica[rn];
		unsigned pidx = rep->nparts - 1;
		Free((void *)(rep->part[pidx].path));
		rep->part[pidx].path = NULL;
		rep->nparts--;
	}

	return -1;
}
2984
2985 /*
2986 * util_pool_extend -- extends the poolset by the provided size
2987 */
2988 void *
2989 util_pool_extend(struct pool_set *set, size_t *size, size_t minpartsize)
2990 {
2991 LOG(3, "set %p size %zu minpartsize %zu", set, *size, minpartsize);
2992
2993 if (*size == 0) {
2994 ERR("cannot extend pool by 0 bytes");
2995 return NULL;
2996 }
2997
2998 if ((set->options & OPTION_SINGLEHDR) == 0) {
2999 ERR(
3000 "extending the pool by appending parts with headers is not supported!");
3001 return NULL;
3002 }
3003
3004 if (set->poolsize + *size > set->resvsize) {
3005 *size = set->resvsize - set->poolsize;
3006 if (*size < minpartsize) {
3007 ERR("exceeded reservation size");
3008 return NULL;
3009 }
3010 LOG(4, "extend size adjusted to not exceed reservation size");
3011 }
3012
3013 size_t old_poolsize = set->poolsize;
3014
3015 if (util_poolset_append_new_part(set, *size) != 0) {
3016 ERR("unable to append a new part to the pool");
3017 return NULL;
3018 }
3019
3020 size_t hdrsize = (set->options & OPTION_SINGLEHDR) ? 0 : Mmap_align;
3021 void *addr = NULL;
3022 void *addr_base = NULL;
3023
3024 unsigned r;
3025 for (r = 0; r < set->nreplicas; r++) {
3026 struct pool_replica *rep = set->replica[r];
3027 unsigned pidx = rep->nparts - 1;
3028 struct pool_set_part *p = &rep->part[pidx];
3029
3030 if (util_part_open(p, 0, 1 /* create */) != 0) {
3031 ERR("cannot open the new part");
3032 goto err;
3033 }
3034
3035 addr = (char *)rep->part[0].addr + old_poolsize;
3036 if (addr_base == NULL)
3037 addr_base = addr;
3038
3039 if (util_map_part(p, addr, 0, hdrsize,
3040 MAP_SHARED | MAP_FIXED, 0) != 0) {
3041 ERR("cannot map the new part");
3042 goto err;
3043 }
3044
3045 /*
3046 * new part must be mapped the same way as all the rest
3047 * within a replica
3048 */
3049 if (p->map_sync != rep->part[0].map_sync) {
3050 if (p->map_sync)
3051 ERR("new part cannot be mapped with MAP_SYNC");
3052 else
3053 ERR("new part mapped with MAP_SYNC");
3054 goto err;
3055 }
3056 }
3057
3058 /* XXX: mode should be the same as for pmemxxx_create() */
3059 if (util_poolset_chmod(set, S_IWUSR | S_IRUSR))
3060 goto err;
3061
3062 util_poolset_fdclose(set);
3063
3064 return addr_base;
3065
3066 err:
3067 for (unsigned rn = 0; rn <= r; ++rn) {
3068 struct pool_replica *rep = set->replica[r];
3069 unsigned pidx = rep->nparts - 1;
3070 struct pool_set_part *p = &rep->part[pidx];
3071 rep->nparts--;
3072
3073 if (p->fd != 0)
3074 (void) os_close(p->fd);
3075 if (p->created)
3076 os_unlink(p->path);
3077 Free((void *)p->path);
3078 p->path = NULL;
3079 }
3080 util_poolset_set_size(set);
3081
3082 return NULL;
3083 }
3084
3085 /*
3086 * util_print_bad_files_cb -- (internal) callback printing names of pool files
3087 * containing bad blocks
3088 */
3089 static int
3090 util_print_bad_files_cb(struct part_file *pf, void *arg)
3091 {
3092 if (!pf->is_remote && pf->part && pf->part->has_bad_blocks)
3093 ERR("file contains bad blocks -- '%s'", pf->part->path);
3094
3095 return 0;
3096 }
3097
/*
 * util_pool_create_uuids -- create a new memory pool (set or a single file)
 *                           with given uuids
 *
 * On success returns 0 and a pointer to a newly allocated structure
 * containing the info of all the parts of the pool set and replicas.
 *
 * Pool-set/part UUIDs supplied in 'attr' are used when non-zero,
 * otherwise fresh UUIDs are generated.  Remote replicas (if any) are
 * created before local ones.  On failure returns -1 with errno set and
 * either frees the set (validation errors) or closes it deleting the
 * parts created so far (creation errors).
 */
int
util_pool_create_uuids(struct pool_set **setp, const char *path,
	size_t poolsize, size_t minsize, size_t minpartsize,
	const struct pool_attr *attr, unsigned *nlanes, int can_have_rep,
	int remote)
{
	LOG(3, "setp %p path %s poolsize %zu minsize %zu minpartsize %zu "
		"pattr %p nlanes %p can_have_rep %i remote %i", setp, path,
		poolsize, minsize, minpartsize, attr, nlanes, can_have_rep,
		remote);

	/* attributes cannot be NULL for local replicas */
	ASSERT(remote || attr != NULL);

	int flags = MAP_SHARED;
	int oerrno;

	int exists = util_file_exists(path);
	if (exists < 0)
		return -1;

	/* check if file exists */
	if (poolsize > 0 && exists) {
		ERR("file %s already exists", path);
		errno = EEXIST;
		return -1;
	}

	int ret = util_poolset_create_set(setp, path, poolsize, minsize,
		IGNORE_SDS(attr));
	if (ret < 0) {
		LOG(2, "cannot create pool set -- '%s'", path);
		return -1;
	}

	struct pool_set *set = *setp;

	ASSERT(set->nreplicas > 0);

	/* validate poolset options against the requested pool kind */
	if (!remote && (set->options & OPTION_NOHDRS)) {
		ERR(
			"the NOHDRS poolset option is not supported for local poolsets");
		errno = EINVAL;
		goto err_poolset_free;
	}

	/* NOHDRS and pool attributes are mutually exclusive */
	if ((attr == NULL) != ((set->options & OPTION_NOHDRS) != 0)) {
		ERR(
			"pool attributes are not supported for poolsets without headers (with the NOHDRS option)");
		errno = EINVAL;
		goto err_poolset_free;
	}

	if (set->directory_based && ((set->options & OPTION_SINGLEHDR) == 0)) {
		ERR(
			"directory based pools are not supported for poolsets with headers (without SINGLEHDR option)");
		errno = EINVAL;
		goto err_poolset_free;
	}

	if (set->resvsize < minsize) {
		ERR("reservation pool size %zu smaller than %zu",
			set->resvsize, minsize);
		errno = EINVAL;
		goto err_poolset_free;
	}

	/* an empty directory-based poolset needs its first part up front */
	if (set->directory_based && set->poolsize == 0 &&
			util_poolset_append_new_part(set, minsize) != 0) {
		ERR("cannot create a new part in provided directories");
		goto err_poolset_free;
	}

	if (attr != NULL &&
	    (attr->features.compat & POOL_FEAT_CHECK_BAD_BLOCKS)) {
		int bbs = badblocks_check_poolset(set, 1 /* create */);
		if (bbs < 0) {
			LOG(1,
				"failed to check pool set for bad blocks -- '%s'",
				path);
			goto err_poolset_free;
		}

		if (bbs > 0) {
			util_poolset_foreach_part_struct(set,
				util_print_bad_files_cb,
				NULL);
			ERR(
				"pool set contains bad blocks and cannot be created, run 'pmempool create --clear-bad-blocks' utility to clear bad blocks and create a pool");
			errno = EIO;
			goto err_poolset_free;
		}
	}

	if (set->poolsize < minsize) {
		ERR("net pool size %zu smaller than %zu",
			set->poolsize, minsize);
		errno = EINVAL;
		goto err_poolset_free;
	}

	if (remote) {
		/* it is a remote replica - it cannot have replicas */
		if (set->nreplicas > 1) {
			LOG(2, "remote pool set cannot have replicas");
			errno = EINVAL;
			goto err_poolset_free;
		}

		/* check if poolset options match remote pool attributes */
		if (attr != NULL &&
				((set->options & OPTION_SINGLEHDR) == 0) !=
				((attr->features.incompat &
						POOL_FEAT_SINGLEHDR) == 0)) {
			ERR(
				"pool incompat feature flags and remote poolset options do not match");
			errno = EINVAL;
			goto err_poolset_free;
		}
	}

	if (!can_have_rep && set->nreplicas > 1) {
		ERR("replication not supported");
		errno = ENOTSUP;
		goto err_poolset_free;
	}

	if (set->remote && util_remote_load()) {
		ERR(
			"the pool set requires a remote replica, but the '%s' library cannot be loaded",
			LIBRARY_REMOTE);
		goto err_poolset_free;
	}

	set->zeroed = 1;

	if (attr != NULL) {
		/* use the caller-provided pool set UUID if it is non-zero */
		if (!util_is_zeroed(attr->poolset_uuid, POOL_HDR_UUID_LEN)) {
			memcpy(set->uuid, attr->poolset_uuid,
				POOL_HDR_UUID_LEN);
		} else {
			/* generate pool set UUID */
			ret = util_uuid_generate(set->uuid);
			if (ret < 0) {
				LOG(2, "cannot generate pool set UUID");
				goto err_poolset;
			}
		}

		/* generate UUID's for all the parts */
		for (unsigned r = 0; r < set->nreplicas; r++) {
			struct pool_replica *rep = set->replica[r];
			for (unsigned i = 0; i < rep->nhdrs; i++) {
				ret = util_uuid_generate(rep->part[i].uuid);
				if (ret < 0) {
					LOG(2,
					"cannot generate pool set part UUID");
					goto err_poolset;
				}
			}
		}

		/* overwrite UUID of the first part if given */
		if (!util_is_zeroed(attr->first_part_uuid, POOL_HDR_UUID_LEN)) {
			memcpy(set->replica[0]->part[0].uuid,
				attr->first_part_uuid, POOL_HDR_UUID_LEN);
		}
	}

	ret = util_poolset_files_local(set, minpartsize, 1);
	if (ret != 0)
		goto err_poolset;

	/* map first local replica - it has to exist prior to remote ones */
	ret = util_replica_map_local(set, 0, flags);
	if (ret != 0)
		goto err_poolset;

	/* prepare remote replicas first */
	if (set->remote) {
		for (unsigned r = 0; r < set->nreplicas; r++) {
			if (REP(set, r)->remote == NULL) {
				continue;
			}
			if (util_replica_create_remote(set, r, flags, attr) !=
					0) {
				LOG(2, "replica #%u creation failed", r);
				goto err_create;
			}
		}

		ret = util_poolset_files_remote(set, minsize, nlanes,
				1 /* create */);
		if (ret != 0)
			goto err_create;
	}

	/* prepare local replicas */
	if (remote) {
		if (util_replica_create_local(set, 0, flags, attr) != 0) {
			LOG(2, "replica #0 creation failed");
			goto err_create;
		}
	} else {
		for (unsigned r = 0; r < set->nreplicas; r++) {
			if (REP(set, r)->remote != NULL) {
				continue;
			}
			if (util_replica_create_local(set, r, flags, attr) !=
					0) {
				LOG(2, "replica #%u creation failed", r);
				goto err_create;
			}
		}
	}

	return 0;

err_create:
	/* replicas were (partially) created - close them first */
	oerrno = errno;
	for (unsigned r = 0; r < set->nreplicas; r++)
		util_replica_close(set, r);
	errno = oerrno;
err_poolset:
	/* part files may exist - close the set deleting created parts */
	oerrno = errno;
	util_poolset_close(set, DELETE_CREATED_PARTS);
	errno = oerrno;
	return -1;

err_poolset_free:
	/* nothing was created on media yet - just free the set struct */
	oerrno = errno;
	util_poolset_free(set);
	errno = oerrno;
	return -1;
}
3340
3341 /*
3342 * util_pool_create -- create a new memory pool (set or a single file)
3343 *
3344 * On success returns 0 and a pointer to a newly allocated structure
3345 * containing the info of all the parts of the pool set and replicas.
3346 */
3347 int
3348 util_pool_create(struct pool_set **setp, const char *path, size_t poolsize,
3349 size_t minsize, size_t minpartsize, const struct pool_attr *attr,
3350 unsigned *nlanes, int can_have_rep)
3351 {
3352 LOG(3, "setp %p path %s poolsize %zu minsize %zu minpartsize %zu "
3353 "attr %p nlanes %p can_have_rep %i", setp, path, poolsize,
3354 minsize, minpartsize, attr, nlanes, can_have_rep);
3355
3356 return util_pool_create_uuids(setp, path, poolsize, minsize,
3357 minpartsize, attr, nlanes, can_have_rep, POOL_LOCAL);
3358 }
3359
/*
 * util_replica_open_local -- (internal) open a memory pool local replica
 *
 * Counterpart of util_replica_map_local() for an existing pool:
 * reserves rep->resvsize bytes of address space with the first part,
 * maps all part headers, then maps the remaining data parts
 * (MAP_FIXED) behind part 0, retrying at a different hint address up
 * to 10 times if the chosen region turns out not to be free.
 * Returns 0 on success, -1 with errno set on failure.
 */
static int
util_replica_open_local(struct pool_set *set, unsigned repidx, int flags)
{
	LOG(3, "set %p repidx %u flags %d", set, repidx, flags);

	int remaining_retries = 10;
	int retry_for_contiguous_addr;
	size_t mapsize;
	/* header size for all headers but the first one */
	size_t hdrsize = (set->options & (OPTION_SINGLEHDR | OPTION_NOHDRS)) ?
		0 : Mmap_align;
	struct pool_replica *rep = set->replica[repidx];
	void *addr = NULL;

	do {
		retry_for_contiguous_addr = 0;

		/* determine a hint address for mmap() if not specified */
		if (addr == NULL)
			addr = util_map_hint(rep->resvsize, 0);
		if (addr == MAP_FAILED) {
			LOG(1, "cannot find a contiguous region of given size");
			return -1;
		}

		/* part 0 data size, aligned down to mapping granularity */
		mapsize = rep->part[0].filesize & ~(Mmap_align - 1);

		/* map the first part and reserve space for remaining parts */
		if (util_map_part(&rep->part[0], addr, rep->resvsize, 0,
				flags, 0) != 0) {
			LOG(2, "pool mapping failed - replica #%u part #0",
				repidx);
			return -1;
		}

		VALGRIND_REGISTER_PMEM_MAPPING(rep->part[0].addr,
			rep->resvsize);
		VALGRIND_REGISTER_PMEM_FILE(rep->part[0].fd,
			rep->part[0].addr, rep->resvsize, 0);

		/* map all headers - don't care about the address */
		for (unsigned p = 0; p < rep->nhdrs; p++) {
			if (util_map_hdr(&rep->part[p], flags, 0) != 0) {
				LOG(2, "header mapping failed - part #%d", p);
				goto err;
			}
		}

		addr = (char *)rep->part[0].addr + mapsize;

		/*
		 * map the remaining parts of the usable pool space
		 * (aligned to memory mapping granularity)
		 */
		for (unsigned p = 1; p < rep->nparts; p++) {
			struct pool_set_part *part = &rep->part[p];
			/* verify the part still fits into the reservation */
			size_t targetsize = mapsize +
				ALIGN_DOWN(part->filesize - hdrsize,
				part->alignment);
			if (targetsize > rep->resvsize) {
				ERR(
				"pool mapping failed - address space reservation too small");
				errno = EINVAL;
				goto err;
			}

			/* map data part */
			if (util_map_part(part, addr, 0, hdrsize,
					flags | MAP_FIXED, 0) != 0) {
				/*
				 * if we can't map the part at the address we
				 * asked for, unmap all the parts that are
				 * mapped and remap at a different address.
				 */
				if ((errno == EINVAL) &&
				    (remaining_retries > 0)) {
					LOG(2, "usable space mapping failed - "
						"part #%d - retrying", p);
					retry_for_contiguous_addr = 1;
					remaining_retries--;

					util_unmap_parts(rep, 0, p - 1);

					/* release rest of the VA reserved */
					munmap(rep->part[0].addr,
						rep->resvsize);
					break;
				}
				LOG(2, "usable space mapping failed - part #%d",
					p);
				goto err;
			}

			VALGRIND_REGISTER_PMEM_FILE(part->fd,
				part->addr, part->size, hdrsize);

			mapsize += part->size;
			addr = (char *)addr + part->size;
		}
	} while (retry_for_contiguous_addr);

	/*
	 * Initially part[0].size is the size of address space
	 * reservation for all parts from given replica. After
	 * mapping that space we need to overwrite part[0].size
	 * with its actual size to be consistent - size for each
	 * part should be the actual mapping size of this part
	 * only - it simplifies future calculations.
	 */
	rep->part[0].size = rep->part[0].filesize & ~(Mmap_align - 1);

	if (util_replica_check_map_sync(set, repidx, 1))
		goto err;

	util_replica_set_is_pmem(rep);

	if (Prefault_at_open)
		util_replica_force_page_allocation(rep);

	ASSERTeq(mapsize, rep->repsize);

	/* calculate pool size - choose the smallest replica size */
	if (rep->repsize < set->poolsize)
		set->poolsize = rep->repsize;

	LOG(3, "replica addr %p", rep->part[0].addr);

	return 0;
err:
	LOG(4, "error clean up");
	int oerrno = errno;
	/* release the not-yet-mapped tail, then headers and data parts */
	if (mapsize < rep->repsize) {
		ASSERTne(rep->part[0].addr, NULL);
		ASSERTne(rep->part[0].addr, MAP_FAILED);
		munmap(rep->part[0].addr, rep->resvsize - mapsize);
	}
	for (unsigned p = 0; p < rep->nhdrs; p++)
		util_unmap_hdr(&rep->part[p]);
	for (unsigned p = 0; p < rep->nparts; p++)
		util_unmap_part(&rep->part[p]);
	errno = oerrno;
	return -1;
}
3505
3506 /*
3507 * util_replica_open_remote -- open a memory pool for remote replica
3508 */
3509 int
3510 util_replica_open_remote(struct pool_set *set, unsigned repidx, int flags)
3511 {
3512 LOG(3, "set %p repidx %u flags %d", set, repidx, flags);
3513
3514 struct pool_replica *rep = set->replica[repidx];
3515
3516 ASSERTne(rep->remote, NULL);
3517 ASSERTne(rep->part, NULL);
3518 ASSERTeq(rep->nparts, 1);
3519 ASSERTeq(rep->nhdrs, 1);
3520
3521 struct pool_set_part *part = rep->part;
3522
3523 part->size = rep->repsize;
3524 ASSERT(IS_PAGE_ALIGNED(part->size));
3525 part->remote_hdr = Zalloc(part->size + Pagesize);
3526 if (!part->remote_hdr) {
3527 ERR("!Zalloc");
3528 return -1;
3529 }
3530
3531 part->hdr = PAGE_ALIGN_UP(part->remote_hdr);
3532 part->addr = PAGE_ALIGN_UP(part->remote_hdr);
3533 part->hdrsize = POOL_HDR_SIZE;
3534
3535 LOG(3, "replica #%u addr %p", repidx, rep->part[0].addr);
3536
3537 return 0;
3538 }
3539
3540 /*
3541 * util_replica_open -- open a memory pool replica
3542 */
3543 int
3544 util_replica_open(struct pool_set *set, unsigned repidx, int flags)
3545 {
3546 LOG(3, "set %p repidx %u flags %d", set, repidx, flags);
3547
3548 if (set->replica[repidx]->remote)
3549 return util_replica_open_remote(set, repidx, flags);
3550
3551 return util_replica_open_local(set, repidx, flags);
3552 }
3553
/*
 * util_replica_set_attr -- overwrite existing replica attributes
 *
 * Maps all part headers, rewrites the rpmem attributes (plus the UUID
 * links between headers) in each pool header, recomputes the checksum,
 * persists each header and finally unmaps them all.
 * Returns 0 on success, -1 (with errno set) on error.
 */
int
util_replica_set_attr(struct pool_replica *rep,
	const struct rpmem_pool_attr *rattr)
{
	LOG(3, "rep %p, rattr %p", rep, rattr);
	ASSERT(rattr != NULL || rep->nhdrs == 0);

	if (rattr != NULL && rep->nhdrs == 0) {
		ERR(
			"cannot set pool attributes for a replica without headers (with the NOHDRS option)");
		errno = EINVAL;
		return -1;
	}
	int flags = MAP_SHARED;

	/* map all headers - don't care about the address */
	/*
	 * NOTE(review): this loop runs over nparts while the loops below
	 * run over nhdrs -- presumably nparts == nhdrs for replicas that
	 * reach this function (no SINGLEHDR layout); confirm with callers.
	 */
	for (unsigned p = 0; p < rep->nparts; p++) {
		if (util_map_hdr(&rep->part[p], flags, 0) != 0) {
			LOG(2, "header mapping failed - part #%d", p);
			goto err;
		}
	}

	for (unsigned p = 0; p < rep->nhdrs; p++) {
		ASSERTne(rattr, NULL);

		struct pool_hdr *hdrp = HDR(rep, p);
		ASSERTne(hdrp, NULL);
		/* headers are stored little-endian; convert before editing */
		util_convert2h_hdr_nocheck(hdrp);

		util_set_rpmem_attr(hdrp, rattr);

		/* refresh UUID links for the first/prev/next part headers */
		if (hdrp == HDR(rep, 0))
			memcpy(hdrp->uuid, rattr->uuid, POOL_HDR_UUID_LEN);
		if (hdrp == HDRP(rep, 0))
			memcpy(hdrp->next_part_uuid, rattr->uuid,
				POOL_HDR_UUID_LEN);
		if (hdrp == HDRN(rep, 0))
			memcpy(hdrp->prev_part_uuid, rattr->uuid,
				POOL_HDR_UUID_LEN);

		util_convert2le_hdr(hdrp);

		/* recompute the checksum over the modified header */
		util_checksum(hdrp, sizeof(*hdrp), &hdrp->checksum,
			1, POOL_HDR_CSUM_END_OFF(hdrp));

		/* store pool's header */
		util_persist_auto(rep->is_pmem, hdrp, sizeof(*hdrp));
	}

	/* unmap all headers */
	for (unsigned p = 0; p < rep->nhdrs; p++)
		util_unmap_hdr(&rep->part[p]);

	return 0;
err:
	/* util_unmap_hdr is a no-op for parts whose header never mapped */
	for (unsigned p = 0; p < rep->nhdrs; p++) {
		util_unmap_hdr(&rep->part[p]);
	}
	return -1;
}
3618
3619 /*
3620 * util_get_attr_from_header -- get pool attributes from a pool header
3621 */
3622 void
3623 util_pool_hdr2attr(struct pool_attr *attr, struct pool_hdr *hdr)
3624 {
3625 LOG(3, "attr %p, hdr %p", attr, hdr);
3626 ASSERTne(attr, NULL);
3627 ASSERTne(hdr, NULL);
3628 memset(attr, 0, sizeof(*attr));
3629 memcpy(attr->signature, hdr->signature, POOL_HDR_SIG_LEN);
3630 attr->major = hdr->major;
3631 attr->features.compat = hdr->features.compat;
3632 attr->features.incompat = hdr->features.incompat;
3633 attr->features.ro_compat = hdr->features.ro_compat;
3634 memcpy(attr->poolset_uuid, hdr->poolset_uuid, POOL_HDR_UUID_LEN);
3635 }
3636
3637 /*
3638 * util_copy_attr_to_header -- copy pool attributes into pool header
3639 */
3640 void
3641 util_pool_attr2hdr(struct pool_hdr *hdr, const struct pool_attr *attr)
3642 {
3643 LOG(3, "hdr %p, attr %p", hdr, attr);
3644 ASSERTne(hdr, NULL);
3645 ASSERTne(attr, NULL);
3646 memcpy(hdr->signature, attr->signature, POOL_HDR_SIG_LEN);
3647 hdr->major = attr->major;
3648 hdr->features.compat = attr->features.compat;
3649 hdr->features.incompat = attr->features.incompat;
3650 hdr->features.ro_compat = attr->features.ro_compat;
3651 }
3652
3653 /*
3654 * util_unmap_all_hdrs -- unmap all pool set headers
3655 */
3656 static void
3657 util_unmap_all_hdrs(struct pool_set *set)
3658 {
3659 LOG(3, "set %p", set);
3660
3661 for (unsigned r = 0; r < set->nreplicas; r++) {
3662 struct pool_replica *rep = set->replica[r];
3663 if (rep->remote == NULL) {
3664 for (unsigned p = 0; p < rep->nhdrs; p++)
3665 util_unmap_hdr(&rep->part[p]);
3666 } else {
3667 /*
3668 * hdr & hdrsize were set only for util_header_check(),
3669 * they will not be used any more. The memory will be
3670 * freed by util_replica_close()
3671 */
3672 rep->part[0].hdr = NULL;
3673 rep->part[0].hdrsize = 0;
3674 }
3675 }
3676 }
3677
/*
 * util_replica_check -- check headers, check UUID's, check replicas linkage
 *
 * For every replica: validates each part header against the expected pool
 * attributes, verifies the prev/next replica UUID links, and (unless
 * shutdown-state tracking is disabled) compares the recorded shutdown
 * state against the current one to detect an unsafe shutdown (ADR
 * failure).  Returns 0 on success, -1 (with errno set) on error.
 */
static int
util_replica_check(struct pool_set *set, const struct pool_attr *attr)
{
	LOG(3, "set %p attr %p", set, attr);

	/* read shutdown state toggle from header */
	set->ignore_sds |= IGNORE_SDS(HDR(REP(set, 0), 0));

	for (unsigned r = 0; r < set->nreplicas; r++) {
		struct pool_replica *rep = set->replica[r];
		for (unsigned p = 0; p < rep->nhdrs; p++) {
			if (util_header_check(set, r, p, attr) != 0) {
				LOG(2, "header check failed - part #%d", p);
				return -1;
			}
			/* the whole set is read-only if any part is */
			set->rdonly |= rep->part[p].rdonly;
		}

		/*
		 * Replica linkage: the previous replica's UUID must equal
		 * our prev_repl_uuid and the next replica's UUID must equal
		 * our next_repl_uuid (the lists are circular via REPP/REPN).
		 */
		if (memcmp(HDR(REPP(set, r), 0)->uuid,
			HDR(REP(set, r), 0)->prev_repl_uuid,
			POOL_HDR_UUID_LEN) ||
			memcmp(HDR(REPN(set, r), 0)->uuid,
			HDR(REP(set, r), 0)->next_repl_uuid,
			POOL_HDR_UUID_LEN)) {
			ERR("wrong replica UUID");
			errno = EINVAL;
			return -1;
		}
		if (!set->ignore_sds && !rep->remote && rep->nhdrs) {
			/* rebuild current shutdown state from the part fds */
			struct shutdown_state sds;
			shutdown_state_init(&sds, NULL);
			for (unsigned p = 0; p < rep->nparts; p++) {
				if (shutdown_state_add_part(&sds,
					PART(rep, p)->fd, NULL))
					return -1;
			}

			ASSERTne(rep->nhdrs, 0);
			ASSERTne(rep->nparts, 0);
			/* compare against the state stored in the header */
			if (shutdown_state_check(&sds, &HDR(rep, 0)->sds,
				rep)) {
				LOG(2, "ADR failure detected");
				errno = EINVAL;
				return -1;
			}
			/* mark the pool dirty while it is open */
			shutdown_state_set_dirty(&HDR(rep, 0)->sds,
				rep);
		}
	}
	return 0;
}
3732
3733 /*
3734 * util_pool_has_device_dax -- (internal) check if poolset has any device dax
3735 */
3736 int
3737 util_pool_has_device_dax(struct pool_set *set)
3738 {
3739 for (unsigned r = 0; r < set->nreplicas; ++r) {
3740 struct pool_replica *rep = REP(set, r);
3741 /* either all the parts must be Device DAX or none */
3742 if (PART(rep, 0)->is_dev_dax)
3743 return 1;
3744 }
3745 return 0;
3746 }
3747
/*
 * util_pool_open_nocheck -- open a memory pool (set or a single file)
 *
 * This function opens a pool set without checking the header values.
 *
 * Flow: reject COW on Device DAX, optionally check for bad blocks, load
 * the remote-replica library if needed, open all local part files, map
 * every replica, open remote replicas, and finally unmap all headers.
 * Returns 0 on success, -1 (with errno set) on error; on error all
 * replicas are closed and the pool set is closed without deleting parts.
 */
int
util_pool_open_nocheck(struct pool_set *set, unsigned flags)
{
	LOG(3, "set %p flags 0x%x", set, flags);

	int cow = flags & POOL_OPEN_COW;

	/* Device DAX cannot be mapped MAP_PRIVATE */
	if (cow && util_pool_has_device_dax(set)) {
		ERR("device dax cannot be mapped privately");
		errno = ENOTSUP;
		return -1;
	}

	int mmap_flags = cow ? MAP_PRIVATE|MAP_NORESERVE : MAP_SHARED;
	int oerrno;

	ASSERTne(set, NULL);
	ASSERT(set->nreplicas > 0);

	if (flags & POOL_OPEN_CHECK_BAD_BLOCKS) {
		/* check if any bad block recovery file exists */
		int bfe = badblocks_recovery_file_exists(set);
		if (bfe > 0) {
			ERR(
				"error: a bad block recovery file exists, run 'pmempool sync --bad-blocks' utility to try to recover the pool");
			errno = EINVAL;
			return -1;
		}
		if (bfe < 0) {
			LOG(1,
				"an error occurred when checking whether recovery file exists.");
			return -1;
		}

		int bbs = badblocks_check_poolset(set, 0 /* not create */);
		if (bbs < 0) {
			LOG(1, "failed to check pool set for bad blocks");
			return -1;
		}

		if (bbs > 0) {
			/* bad blocks are fatal unless explicitly ignored */
			if (flags & POOL_OPEN_IGNORE_BAD_BLOCKS) {
				LOG(1,
					"WARNING: pool set contains bad blocks, ignoring");
			} else {
				ERR(
					"pool set contains bad blocks and cannot be opened, run 'pmempool sync --bad-blocks' utility to try to recover the pool");
				errno = EIO;
				return -1;
			}
		}
	}

	/* remote replicas require the librpmem library to be loadable */
	if (set->remote && util_remote_load()) {
		ERR("the pool set requires a remote replica, "
			"but the '%s' library cannot be loaded",
			LIBRARY_REMOTE);
		return -1;
	}

	int ret = util_poolset_files_local(set, 0 /* minpartsize */, 0);
	if (ret != 0)
		goto err_poolset;

	set->rdonly = 0;

	for (unsigned r = 0; r < set->nreplicas; r++) {
		if (util_replica_open(set, r, mmap_flags) != 0) {
			LOG(2, "replica #%u open failed", r);
			goto err_replica;
		}
	}

	if (set->remote) {
		ret = util_poolset_files_remote(set, 0, NULL, 0);
		if (ret != 0)
			goto err_replica;
	}

	util_unmap_all_hdrs(set);

	return 0;

err_replica:
	LOG(4, "error clean up");
	oerrno = errno;
	for (unsigned r = 0; r < set->nreplicas; r++)
		util_replica_close(set, r);
	errno = oerrno;
err_poolset:
	/* preserve the original errno across cleanup calls */
	oerrno = errno;
	util_poolset_close(set, DO_NOT_DELETE_PARTS);
	errno = oerrno;
	return -1;
}
3848
3849 /*
3850 * util_read_compat_features -- (internal) read compat features from the header
3851 */
3852 static int
3853 util_read_compat_features(struct pool_set *set, uint32_t *compat_features)
3854 {
3855 LOG(3, "set %p pcompat_features %p", set, compat_features);
3856
3857 *compat_features = 0;
3858
3859 for (unsigned r = 0; r < set->nreplicas; r++) {
3860 struct pool_replica *rep = set->replica[r];
3861
3862 if (rep->remote)
3863 continue;
3864
3865 for (unsigned p = 0; p < rep->nparts; p++) {
3866 struct pool_set_part *part = &rep->part[p];
3867
3868 if (util_part_open(part, 0, 0 /* create */)) {
3869 LOG(1, "!cannot open the part -- \"%s\"",
3870 part->path);
3871 /* try to open the next part */
3872 continue;
3873 }
3874
3875 if (util_map_hdr(part, MAP_SHARED, 0) != 0) {
3876 LOG(1, "header mapping failed -- \"%s\"",
3877 part->path);
3878 util_part_fdclose(part);
3879 return -1;
3880 }
3881
3882 struct pool_hdr *hdrp = part->hdr;
3883 *compat_features = hdrp->features.compat;
3884
3885 util_unmap_hdr(part);
3886 util_part_fdclose(part);
3887
3888 /* exit on the first successfully opened part */
3889 return 0;
3890 }
3891 }
3892
3893 return 0;
3894 }
3895
3896 /*
3897 * unlink_remote_replicas -- removes remote replicas from poolset
3898 *
3899 * It is necessary when COW flag is set because remote replicas
3900 * cannot be mapped privately
3901 */
3902 static int
3903 unlink_remote_replicas(struct pool_set *set)
3904 {
3905 unsigned i = 0;
3906 while (i < set->nreplicas) {
3907 if (set->replica[i]->remote == NULL) {
3908 i++;
3909 continue;
3910 }
3911
3912 util_replica_close(set, i);
3913 int ret = util_replica_close_remote(set->replica[i], i,
3914 DO_NOT_DELETE_PARTS);
3915 if (ret != 0)
3916 return ret;
3917
3918 size_t size = sizeof(set->replica[i]) *
3919 (set->nreplicas - i - 1);
3920 memmove(&set->replica[i], &set->replica[i + 1], size);
3921 set->nreplicas--;
3922 }
3923
3924 set->remote = 0;
3925 return 0;
3926 }
3927
/*
 * util_pool_open -- open a memory pool (set or a single file)
 *
 * This routine does all the work, but takes a rdonly flag so internal
 * calls can map a read-only pool if required.
 *
 * Flow: parse the poolset, reject COW on Device DAX, read the compat
 * features (to decide whether to run bad-block checks), open all local
 * and remote part files, map the replicas, verify headers/UUIDs/linkage
 * when attr is given, unmap the headers and, for COW, drop remote
 * replicas.  Returns 0 on success, -1 (with errno set) on error.
 * Three error exits: err_poolset_free (set parsed, files not opened),
 * err_poolset (files opened), err_replica (replicas mapped).
 */
int
util_pool_open(struct pool_set **setp, const char *path, size_t minpartsize,
	const struct pool_attr *attr, unsigned *nlanes, void *addr,
	unsigned flags)
{
	LOG(3, "setp %p path %s minpartsize %zu attr %p nlanes %p "
		"addr %p flags 0x%x ", setp, path, minpartsize, attr, nlanes,
		addr, flags);

	int cow = flags & POOL_OPEN_COW;
	int mmap_flags = cow ? MAP_PRIVATE|MAP_NORESERVE : MAP_SHARED;
	int oerrno;

	/* do not check minsize */
	int ret = util_poolset_create_set(setp, path, 0, 0,
		flags & POOL_OPEN_IGNORE_SDS);
	if (ret < 0) {
		LOG(2, "cannot open pool set -- '%s'", path);
		return -1;
	}

	if ((*setp)->replica[0]->nparts == 0) {
		errno = ENOENT;
		ERR("!no parts in replicas");
		goto err_poolset_free;
	}

	/* Device DAX cannot be mapped MAP_PRIVATE */
	if (cow && (*setp)->replica[0]->part[0].is_dev_dax) {
		ERR("device dax cannot be mapped privately");
		errno = ENOTSUP;
		goto err_poolset_free;
	}

	struct pool_set *set = *setp;

	ASSERT(set->nreplicas > 0);

	uint32_t compat_features;

	if (util_read_compat_features(set, &compat_features)) {
		LOG(1, "reading compat features failed");
		goto err_poolset_free;
	}

	if (compat_features & POOL_FEAT_CHECK_BAD_BLOCKS) {
		/* check if any bad block recovery file exists */
		int bfe = badblocks_recovery_file_exists(set);
		if (bfe > 0) {
			ERR(
				"error: a bad block recovery file exists, run 'pmempool sync --bad-blocks' utility to try to recover the pool");
			errno = EINVAL;
			goto err_poolset_free;
		}

		if (bfe < 0) {
			LOG(1,
				"an error occurred when checking whether recovery file exists.");
			goto err_poolset_free;
		}

		int bbs = badblocks_check_poolset(set, 0 /* not create */);
		if (bbs < 0) {
			LOG(1,
				"failed to check pool set for bad blocks -- '%s'",
				path);
			goto err_poolset_free;
		}

		if (bbs > 0) {
			/* bad blocks are fatal unless explicitly ignored */
			if (flags & POOL_OPEN_IGNORE_BAD_BLOCKS) {
				LOG(1,
					"WARNING: pool set contains bad blocks, ignoring -- '%s'",
					path);
			} else {
				ERR(
					"pool set contains bad blocks and cannot be opened, run 'pmempool sync --bad-blocks' utility to try to recover the pool -- '%s'",
					path);
				errno = EIO;
				goto err_poolset_free;
			}
		}
	}

	/* remote replicas require the librpmem library to be loadable */
	if (set->remote && util_remote_load()) {
		ERR(
			"the pool set requires a remote replica, but the '%s' library cannot be loaded",
			LIBRARY_REMOTE);
		goto err_poolset_free;
	}

	ret = util_poolset_files_local(set, minpartsize, 0);
	if (ret != 0)
		goto err_poolset;

	for (unsigned r = 0; r < set->nreplicas; r++) {
		if (util_replica_open(set, r, mmap_flags) != 0) {
			LOG(2, "replica #%u open failed", r);
			goto err_replica;
		}
	}

	if (set->remote) {
		/* do not check minsize */
		ret = util_poolset_files_remote(set, 0, nlanes, 0);
		if (ret != 0)
			goto err_replica;
	}

	/* check headers, check UUID's, check replicas linkage */
	if (attr != NULL && util_replica_check(set, attr))
		goto err_replica;

	/* unmap all headers */
	util_unmap_all_hdrs(set);

	/* remove all remote replicas from poolset when cow */
	if (cow && set->remote) {
		ret = unlink_remote_replicas(set);
		if (ret != 0)
			goto err_replica;
	}

	return 0;

err_replica:
	LOG(4, "error clean up");
	oerrno = errno;
	for (unsigned r = 0; r < set->nreplicas; r++)
		util_replica_close(set, r);
	errno = oerrno;
err_poolset:
	/* preserve the original errno across cleanup calls */
	oerrno = errno;
	util_poolset_close(set, DO_NOT_DELETE_PARTS);
	errno = oerrno;
	return -1;

err_poolset_free:
	oerrno = errno;
	util_poolset_free(*setp);
	errno = oerrno;
	return -1;
}
4076
4077 /*
4078 * util_pool_open_remote -- open a remote pool set file
4079 *
4080 * This routine does all the work, but takes a rdonly flag so internal
4081 * calls can map a read-only pool if required.
4082 */
4083 int
4084 util_pool_open_remote(struct pool_set **setp, const char *path, int cow,
4085 size_t minpartsize, struct rpmem_pool_attr *rattr)
4086 {
4087 LOG(3, "setp %p path %s cow %d minpartsize %zu rattr %p",
4088 setp, path, cow, minpartsize, rattr);
4089
4090 int flags = cow ? MAP_PRIVATE|MAP_NORESERVE : MAP_SHARED;
4091 int oerrno;
4092
4093 /* do not check minsize */
4094 int ret = util_poolset_create_set(setp, path, 0, 0, 0);
4095 if (ret < 0) {
4096 LOG(2, "cannot open pool set -- '%s'", path);
4097 return -1;
4098 }
4099
4100 if (cow && (*setp)->replica[0]->part[0].is_dev_dax) {
4101 ERR("device dax cannot be mapped privately");
4102 errno = ENOTSUP;
4103 return -1;
4104 }
4105
4106 struct pool_set *set = *setp;
4107
4108 if (set->nreplicas > 1) {
4109 LOG(2, "remote pool set cannot have replicas");
4110 goto err_poolset;
4111 }
4112
4113 uint32_t compat_features;
4114
4115 if (util_read_compat_features(set, &compat_features)) {
4116 LOG(1, "reading compat features failed");
4117 goto err_poolset;
4118 }
4119
4120 if (compat_features & POOL_FEAT_CHECK_BAD_BLOCKS) {
4121 /* check if there are any bad blocks */
4122 int bbs = badblocks_check_poolset(set, 0 /* not create */);
4123 if (bbs < 0) {
4124 LOG(1,
4125 "failed to check the remote replica for bad blocks -- '%s'",
4126 path);
4127 goto err_poolset;
4128 }
4129
4130 if (bbs > 0) {
4131 ERR(
4132 "remote replica contains bad blocks and cannot be opened, run 'pmempool sync --bad-blocks' utility to recreate it -- '%s'",
4133 path);
4134 errno = EIO;
4135 goto err_poolset;
4136 }
4137 }
4138
4139 ret = util_poolset_files_local(set, minpartsize, 0);
4140 if (ret != 0)
4141 goto err_poolset;
4142
4143 if (util_replica_open(set, 0, flags) != 0) {
4144 LOG(2, "replica open failed");
4145 goto err_replica;
4146 }
4147
4148 struct pool_replica *rep = set->replica[0];
4149
4150 set->rdonly |= rep->part[0].rdonly;
4151
4152 /* check headers, check UUID's, check replicas linkage */
4153 for (unsigned p = 0; p < rep->nhdrs; p++) {
4154 if (util_header_check_remote(set, p) != 0) {
4155 LOG(2, "header check failed - part #%d", p);
4156 goto err_replica;
4157 }
4158 set->rdonly |= rep->part[p].rdonly;
4159 }
4160
4161 if (rep->nhdrs > 0) {
4162 /* header exists, copy pool attributes */
4163 struct pool_hdr *hdr = rep->part[0].hdr;
4164 util_get_rpmem_attr(rattr, hdr);
4165 } else {
4166 /* header does not exist, zero pool attributes */
4167 memset(rattr, 0, sizeof(*rattr));
4168 }
4169
4170 /* unmap all headers */
4171 for (unsigned p = 0; p < rep->nhdrs; p++)
4172 util_unmap_hdr(&rep->part[p]);
4173
4174 return 0;
4175
4176 err_replica:
4177 LOG(4, "error clean up");
4178 oerrno = errno;
4179 util_replica_close(set, 0);
4180 errno = oerrno;
4181 err_poolset:
4182 oerrno = errno;
4183 util_poolset_close(set, DO_NOT_DELETE_PARTS);
4184 errno = oerrno;
4185 return -1;
4186 }
4187
4188 /*
4189 * util_is_poolset_file -- check if specified file is a poolset file
4190 *
4191 * Return value:
4192 * -1 - error
4193 * 0 - not a poolset
4194 * 1 - is a poolset
4195 */
4196 int
4197 util_is_poolset_file(const char *path)
4198 {
4199 enum file_type type = util_file_get_type(path);
4200 if (type < 0)
4201 return -1;
4202
4203 if (type == TYPE_DEVDAX)
4204 return 0;
4205
4206 int fd = util_file_open(path, NULL, 0, O_RDONLY);
4207 if (fd < 0)
4208 return -1;
4209
4210 int ret = 0;
4211 ssize_t sret;
4212 char signature[POOLSET_HDR_SIG_LEN];
4213 size_t rd = 0;
4214 do {
4215 sret = util_read(fd, &signature[rd], sizeof(signature) - rd);
4216 if (sret > 0)
4217 rd += (size_t)sret;
4218 } while (sret > 0);
4219 if (sret < 0) {
4220 ERR("!read");
4221 ret = -1;
4222 goto out;
4223 } else if (rd != sizeof(signature)) {
4224 ret = 0;
4225 goto out;
4226 }
4227
4228 if (memcmp(signature, POOLSET_HDR_SIG, POOLSET_HDR_SIG_LEN) == 0)
4229 ret = 1;
4230 out:
4231 os_close(fd);
4232 return ret;
4233 }
4234 /*
4235 * util_poolset_foreach_part_struct -- walk through all poolset file parts
4236 * of the given set
4237 *
4238 * Stops processing if callback returns non-zero value.
4239 * The value returned by callback is returned to the caller.
4240 */
4241 int
4242 util_poolset_foreach_part_struct(struct pool_set *set,
4243 int (*callback)(struct part_file *pf, void *arg), void *arg)
4244 {
4245 LOG(3, "set %p callback %p arg %p", set, callback, arg);
4246
4247 ASSERTne(callback, NULL);
4248
4249 int ret;
4250
4251 for (unsigned r = 0; r < set->nreplicas; r++) {
4252 struct part_file cbdata;
4253 if (set->replica[r]->remote) {
4254 cbdata.is_remote = 1;
4255 cbdata.remote = set->replica[r]->remote;
4256 cbdata.part = NULL;
4257 ret = (*callback)(&cbdata, arg);
4258 if (ret)
4259 return ret;
4260 } else {
4261 cbdata.is_remote = 0;
4262 cbdata.remote = NULL;
4263 for (unsigned p = 0; p < set->replica[r]->nparts; p++) {
4264 cbdata.part = &set->replica[r]->part[p];
4265 ret = (*callback)(&cbdata, arg);
4266 if (ret)
4267 return ret;
4268 }
4269 }
4270 }
4271
4272 return 0;
4273 }
4274
4275 /*
4276 * util_poolset_foreach_part -- walk through all poolset file parts
4277 *
4278 * Stops processing if callback returns non-zero value.
4279 * The value returned by callback is returned to the caller.
4280 *
4281 * Return value:
4282 * 0 - all part files have been processed
4283 * -1 - parsing poolset file error
4284 */
4285 int
4286 util_poolset_foreach_part(const char *path,
4287 int (*callback)(struct part_file *pf, void *arg), void *arg)
4288 {
4289 LOG(3, "path %s callback %p arg %p", path, callback, arg);
4290
4291 ASSERTne(callback, NULL);
4292
4293 int fd = os_open(path, O_RDONLY);
4294 if (fd < 0) {
4295 ERR("!open: path \"%s\"", path);
4296 return -1;
4297 }
4298
4299 struct pool_set *set;
4300 int ret = util_poolset_parse(&set, path, fd);
4301 if (ret) {
4302 ERR("util_poolset_parse failed -- '%s'", path);
4303 ret = -1;
4304 goto err_close;
4305 }
4306
4307 ret = util_poolset_foreach_part_struct(set, callback, arg);
4308
4309 /*
4310 * Make sure callback does not return -1,
4311 * because this value is reserved for parsing
4312 * error.
4313 */
4314 ASSERTne(ret, -1);
4315 util_poolset_free(set);
4316
4317 err_close:
4318 os_close(fd);
4319 return ret;
4320 }
4321
4322 /*
4323 * util_poolset_size -- get size of poolset, returns 0 on error
4324 */
4325 size_t
4326 util_poolset_size(const char *path)
4327 {
4328 int fd = os_open(path, O_RDONLY);
4329 if (fd < 0)
4330 return 0;
4331
4332 size_t size = 0;
4333 struct pool_set *set;
4334 if (util_poolset_parse(&set, path, fd))
4335 goto err_close;
4336
4337 size = set->poolsize;
4338
4339 util_poolset_free(set);
4340 err_close:
4341 os_close(fd);
4342 return size;
4343 }
4344
4345 /*
4346 * util_replica_fdclose -- close all parts of given replica
4347 */
4348 void
4349 util_replica_fdclose(struct pool_replica *rep)
4350 {
4351 for (unsigned p = 0; p < rep->nparts; p++) {
4352 struct pool_set_part *part = &rep->part[p];
4353 util_part_fdclose(part);
4354 }
4355 }
4356
4357 /*
4358 * util_replica_deep_common -- performs common calculations
4359 * on all parts from replica to define intersection ranges
4360 * for final flushing operations that take place in
4361 * os_part_deep_common function.
4362 */
4363 int
4364 util_replica_deep_common(const void *addr, size_t len, struct pool_set *set,
4365 unsigned replica_id, int flush)
4366 {
4367 LOG(3, "addr %p len %zu set %p replica_id %u flush %d",
4368 addr, len, set, replica_id, flush);
4369
4370 struct pool_replica *rep = set->replica[replica_id];
4371 uintptr_t rep_start = (uintptr_t)rep->part[0].addr;
4372 uintptr_t rep_end = rep_start + rep->repsize;
4373 uintptr_t start = (uintptr_t)addr;
4374 uintptr_t end = start + len;
4375
4376 ASSERT(start >= rep_start);
4377 ASSERT(end <= rep_end);
4378
4379 for (unsigned p = 0; p < rep->nparts; p++) {
4380 struct pool_set_part *part = &rep->part[p];
4381 uintptr_t part_start = (uintptr_t)part->addr;
4382 uintptr_t part_end = part_start + part->size;
4383 /* init intersection start and end addresses */
4384 uintptr_t range_start = start;
4385 uintptr_t range_end = end;
4386
4387 if (part_start > end || part_end < start)
4388 continue;
4389 /* recalculate intersection addresses */
4390 if (part_start > start)
4391 range_start = part_start;
4392 if (part_end < end)
4393 range_end = part_end;
4394 size_t range_len = range_end - range_start;
4395
4396 LOG(15, "perform deep flushing for replica %u "
4397 "part %p, addr %p, len %lu",
4398 replica_id, part, (void *)range_start, range_len);
4399 if (os_part_deep_common(rep, p, (void *)range_start,
4400 range_len, flush)) {
4401 LOG(1, "os_part_deep_common(%p, %p, %lu)",
4402 part, (void *)range_start, range_len);
4403 return -1;
4404 }
4405 }
4406 return 0;
4407 }
4408
/*
 * util_replica_deep_persist -- wrapper for util_replica_deep_common
 * Calling the target precedes initialization of function that
 * partly defines way of deep replica flushing.
 */
int
util_replica_deep_persist(const void *addr, size_t len, struct pool_set *set,
	unsigned replica_id)
{
	LOG(3, "addr %p len %zu set %p replica_id %u",
		addr, len, set, replica_id);

	/* flush = 1: request a full deep persist */
	return util_replica_deep_common(addr, len, set, replica_id, 1);
}
4424
/*
 * util_replica_deep_drain -- wrapper for util_replica_deep_common
 * Calling the target precedes initialization of function that
 * partly defines way of deep replica flushing.
 */
int
util_replica_deep_drain(const void *addr, size_t len, struct pool_set *set,
	unsigned replica_id)
{
	LOG(3, "addr %p len %zu set %p replica_id %u",
		addr, len, set, replica_id);

	/* flush = 0: drain only, no flush */
	return util_replica_deep_common(addr, len, set, replica_id, 0);
}