1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2014-2020, Intel Corporation */
3
4 /*
5 * obj.c -- transactional object store implementation
6 */
7 #include <inttypes.h>
8 #include <limits.h>
9 #include <wchar.h>
10 #include <stdbool.h>
11
12 #include "valgrind_internal.h"
13 #include "libpmem.h"
14 #include "memblock.h"
15 #include "critnib.h"
16 #include "list.h"
17 #include "mmap.h"
18 #include "obj.h"
19 #include "ctl_global.h"
20 #include "ravl.h"
21
22 #include "heap_layout.h"
23 #include "os.h"
24 #include "os_thread.h"
25 #include "pmemops.h"
26 #include "set.h"
27 #include "sync.h"
28 #include "tx.h"
29 #include "sys_util.h"
30
31 /*
32 * The variable from which the config is directly loaded. The string
33  * cannot contain any comments or extraneous whitespace characters.
34 */
35 #define OBJ_CONFIG_ENV_VARIABLE "PMEMOBJ_CONF"
36
37 /*
38 * The variable that points to a config file from which the config is loaded.
39 */
40 #define OBJ_CONFIG_FILE_ENV_VARIABLE "PMEMOBJ_CONF_FILE"
41
42 /*
43  * The variable that overrides the number of lanes available at runtime.
44 */
45 #define OBJ_NLANES_ENV_VARIABLE "PMEMOBJ_NLANES"
46
47 #define OBJ_X_VALID_FLAGS PMEMOBJ_F_RELAXED
48
49 static const struct pool_attr Obj_create_attr = {
50 OBJ_HDR_SIG,
51 OBJ_FORMAT_MAJOR,
52 OBJ_FORMAT_FEAT_DEFAULT,
53 {0}, {0}, {0}, {0}, {0}
54 };
55
56 static const struct pool_attr Obj_open_attr = {
57 OBJ_HDR_SIG,
58 OBJ_FORMAT_MAJOR,
59 OBJ_FORMAT_FEAT_CHECK,
60 {0}, {0}, {0}, {0}, {0}
61 };
62
63 static struct critnib *pools_ht; /* hash table used for searching by UUID */
64 static struct critnib *pools_tree; /* tree used for searching by address */
65
66 int _pobj_cache_invalidate;
67
68 #ifndef _WIN32
69
70 __thread struct _pobj_pcache _pobj_cached_pool;
71
72 /*
73 * pmemobj_direct -- returns the direct pointer of an object
74 */
75 void *
76 pmemobj_direct(PMEMoid oid)
77 {
78 return pmemobj_direct_inline(oid);
79 }
80
81 #else /* _WIN32 */
82
83 /*
84 * XXX - this is a temporary implementation
85 *
86  * It seems we could still use TLS and simply substitute "__thread" with
87  * "__declspec(thread)"; however, it's not clear whether that would work
88  * correctly with Windows DLLs.
89  * This needs to be verified once the multi-threaded tests are ported.
90 */
91
92 struct _pobj_pcache {
93 PMEMobjpool *pop;
94 uint64_t uuid_lo;
95 int invalidate;
96 };
97
98 static os_once_t Cached_pool_key_once = OS_ONCE_INIT;
99 static os_tls_key_t Cached_pool_key;
100
101 /*
102 * _Cached_pool_key_alloc -- (internal) allocate pool cache pthread key
103 */
104 static void
105 _Cached_pool_key_alloc(void)
106 {
107 int pth_ret = os_tls_key_create(&Cached_pool_key, free);
108 if (pth_ret)
109 FATAL("!os_tls_key_create");
110 }
111
112 /*
113 * pmemobj_direct -- returns the direct pointer of an object
114 */
115 void *
116 pmemobj_direct(PMEMoid oid)
117 {
118 if (oid.off == 0 || oid.pool_uuid_lo == 0)
119 return NULL;
120
121 struct _pobj_pcache *pcache = os_tls_get(Cached_pool_key);
122 if (pcache == NULL) {
123 		pcache = calloc(1, sizeof(struct _pobj_pcache));
124 		if (pcache == NULL)
125 			FATAL("!pcache calloc");
126 int ret = os_tls_set(Cached_pool_key, pcache);
127 if (ret)
128 FATAL("!os_tls_set");
129 }
130
131 if (_pobj_cache_invalidate != pcache->invalidate ||
132 pcache->uuid_lo != oid.pool_uuid_lo) {
133 pcache->invalidate = _pobj_cache_invalidate;
134
135 if ((pcache->pop = pmemobj_pool_by_oid(oid)) == NULL) {
136 pcache->uuid_lo = 0;
137 return NULL;
138 }
139
140 pcache->uuid_lo = oid.pool_uuid_lo;
141 }
142
143 return (void *)((uintptr_t)pcache->pop + oid.off);
144 }
145
146 #endif /* _WIN32 */
147
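/*
 * Usage sketch (added for illustration; not part of the original source):
 * resolving a persistent object identifier to a direct pointer. "struct
 * my_root" is a hypothetical application-defined type.
 *
 *	PMEMoid root = pmemobj_root(pop, sizeof(struct my_root));
 *	struct my_root *rp = pmemobj_direct(root);
 *	if (rp == NULL) {
 *		// OID was OID_NULL or its pool is not open
 *	}
 */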
148 /*
149 * obj_ctl_init_and_load -- (static) initializes CTL and loads configuration
150 * from env variable and file
151 */
152 static int
153 obj_ctl_init_and_load(PMEMobjpool *pop)
154 {
155 LOG(3, "pop %p", pop);
156
157 if (pop != NULL && (pop->ctl = ctl_new()) == NULL) {
158 LOG(2, "!ctl_new");
159 return -1;
160 }
161
162 if (pop) {
163 tx_ctl_register(pop);
164 pmalloc_ctl_register(pop);
165 stats_ctl_register(pop);
166 debug_ctl_register(pop);
167 }
168
169 char *env_config = os_getenv(OBJ_CONFIG_ENV_VARIABLE);
170 if (env_config != NULL) {
171 if (ctl_load_config_from_string(pop ? pop->ctl : NULL,
172 pop, env_config) != 0) {
173 LOG(2, "unable to parse config stored in %s "
174 "environment variable",
175 OBJ_CONFIG_ENV_VARIABLE);
176 goto err;
177 }
178 }
179
180 char *env_config_file = os_getenv(OBJ_CONFIG_FILE_ENV_VARIABLE);
181 if (env_config_file != NULL && env_config_file[0] != '\0') {
182 if (ctl_load_config_from_file(pop ? pop->ctl : NULL,
183 pop, env_config_file) != 0) {
184 LOG(2, "unable to parse config stored in %s "
185 "file (from %s environment variable)",
186 env_config_file,
187 OBJ_CONFIG_FILE_ENV_VARIABLE);
188 goto err;
189 }
190 }
191
192 return 0;
193 err:
194 if (pop)
195 ctl_delete(pop->ctl);
196 return -1;
197 }
198
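/*
 * Example (added for illustration): the same CTL namespace that the
 * PMEMOBJ_CONF / PMEMOBJ_CONF_FILE variables feed can also be driven
 * programmatically. "stats.enabled" is an existing CTL entry point; the
 * snippet below is a sketch, not part of this file.
 *
 *	int enabled = 1;
 *	if (pmemobj_ctl_set(pop, "stats.enabled", &enabled) != 0) {
 *		// errno is set by the CTL machinery
 *	}
 *
 * The equivalent environment-based configuration is:
 *
 *	PMEMOBJ_CONF="stats.enabled=1" ./app
 */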
199 /*
200 * obj_pool_init -- (internal) allocate global structs holding all opened pools
201 *
202  * This is invoked on the first call to pmemobj_open() or pmemobj_create().
203  * Memory is released in the library destructor.
204  *
205  * This function needs to be thread-safe.
206 */
207 static void
208 obj_pool_init(void)
209 {
210 LOG(3, NULL);
211
212 struct critnib *c;
213
214 if (pools_ht == NULL) {
215 c = critnib_new();
216 if (c == NULL)
217 FATAL("!critnib_new for pools_ht");
218 if (!util_bool_compare_and_swap64(&pools_ht, NULL, c))
219 critnib_delete(c);
220 }
221
222 if (pools_tree == NULL) {
223 c = critnib_new();
224 if (c == NULL)
225 FATAL("!critnib_new for pools_tree");
226 if (!util_bool_compare_and_swap64(&pools_tree, NULL, c))
227 critnib_delete(c);
228 }
229 }
230
231 /*
232 * pmemobj_oid -- return a PMEMoid based on the virtual address
233 *
234 * If the address does not belong to any pool OID_NULL is returned.
235 */
236 PMEMoid
237 pmemobj_oid(const void *addr)
238 {
239 PMEMobjpool *pop = pmemobj_pool_by_ptr(addr);
240 if (pop == NULL)
241 return OID_NULL;
242
243 PMEMoid oid = {pop->uuid_lo, (uintptr_t)addr - (uintptr_t)pop};
244 return oid;
245 }
246
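/*
 * Round-trip sketch (added for illustration): for an address that belongs
 * to an open pool, pmemobj_oid() and pmemobj_direct() are inverses.
 *
 *	PMEMoid oid = pmemobj_oid(ptr);
 *	assert(OID_IS_NULL(oid) || pmemobj_direct(oid) == ptr);
 */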
247 /*
248 * obj_init -- initialization of obj
249 *
250 * Called by constructor.
251 */
252 void
253 obj_init(void)
254 {
255 LOG(3, NULL);
256
257 COMPILE_ERROR_ON(sizeof(struct pmemobjpool) !=
258 POOL_HDR_SIZE + POOL_DESC_SIZE);
259
260 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_NODRAIN != PMEM_F_MEM_NODRAIN);
261
262 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_NONTEMPORAL != PMEM_F_MEM_NONTEMPORAL);
263 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_TEMPORAL != PMEM_F_MEM_TEMPORAL);
264
265 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_WC != PMEM_F_MEM_WC);
266 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_WB != PMEM_F_MEM_WB);
267
268 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_NOFLUSH != PMEM_F_MEM_NOFLUSH);
269
270 #ifdef _WIN32
271 /* XXX - temporary implementation (see above) */
272 os_once(&Cached_pool_key_once, _Cached_pool_key_alloc);
273 #endif
274 /*
275 * Load global config, ignore any issues. They will be caught on the
276 * subsequent call to this function for individual pools.
277 */
278 ctl_global_register();
279
280 if (obj_ctl_init_and_load(NULL))
281 FATAL("error: %s", pmemobj_errormsg());
282
283 lane_info_boot();
284
285 util_remote_init();
286 }
287
288 /*
289 * obj_fini -- cleanup of obj
290 *
291 * Called by destructor.
292 */
293 void
294 obj_fini(void)
295 {
296 LOG(3, NULL);
297
298 if (pools_ht)
299 critnib_delete(pools_ht);
300 if (pools_tree)
301 critnib_delete(pools_tree);
302 lane_info_destroy();
303 util_remote_fini();
304
305 #ifdef _WIN32
306 (void) os_tls_key_delete(Cached_pool_key);
307 #endif
308 }
309
310 /*
311 * obj_drain_empty -- (internal) empty function for drain on non-pmem memory
312 */
313 static void
314 obj_drain_empty(void)
315 {
316 /* do nothing */
317 }
318
319 /*
320 * obj_msync_nofail -- (internal) pmem_msync wrapper that never fails from
321 * caller's perspective
322 */
323 static void
324 obj_msync_nofail(const void *addr, size_t size)
325 {
326 if (pmem_msync(addr, size))
327 FATAL("!pmem_msync");
328 }
329
330 /*
331 * obj_nopmem_memcpy -- (internal) memcpy followed by an msync
332 */
333 static void *
334 obj_nopmem_memcpy(void *dest, const void *src, size_t len, unsigned flags)
335 {
336 LOG(15, "dest %p src %p len %zu flags 0x%x", dest, src, len, flags);
337
338 /*
339 	 * Use pmem_memcpy instead of memcpy, because pmemobj_memcpy is supposed
340 	 * to guarantee that stores of a multiple of 8 bytes to 8-byte-aligned
341 	 * addresses are failsafe atomic. pmem_memcpy guarantees that, while
342 	 * libc memcpy does not.
343 */
344 pmem_memcpy(dest, src, len, PMEM_F_MEM_NOFLUSH);
345 obj_msync_nofail(dest, len);
346 return dest;
347 }
348
349 /*
350 * obj_nopmem_memmove -- (internal) memmove followed by an msync
351 */
352 static void *
353 obj_nopmem_memmove(void *dest, const void *src, size_t len, unsigned flags)
354 {
355 LOG(15, "dest %p src %p len %zu flags 0x%x", dest, src, len, flags);
356
357 /* see comment in obj_nopmem_memcpy */
358 pmem_memmove(dest, src, len, PMEM_F_MEM_NOFLUSH);
359 obj_msync_nofail(dest, len);
360 return dest;
361 }
362
363 /*
364 * obj_nopmem_memset -- (internal) memset followed by an msync
365 */
366 static void *
367 obj_nopmem_memset(void *dest, int c, size_t len, unsigned flags)
368 {
369 LOG(15, "dest %p c 0x%02x len %zu flags 0x%x", dest, c, len, flags);
370
371 /* see comment in obj_nopmem_memcpy */
372 pmem_memset(dest, c, len, PMEM_F_MEM_NOFLUSH);
373 obj_msync_nofail(dest, len);
374 return dest;
375 }
376
377 /*
378 * obj_remote_persist -- (internal) remote persist function
379 */
380 static int
381 obj_remote_persist(PMEMobjpool *pop, const void *addr, size_t len,
382 unsigned lane, unsigned flags)
383 {
384 LOG(15, "pop %p addr %p len %zu lane %u flags %u",
385 pop, addr, len, lane, flags);
386
387 ASSERTne(pop->rpp, NULL);
388
389 uintptr_t offset = (uintptr_t)addr - pop->remote_base;
390
391 unsigned rpmem_flags = 0;
392 if (flags & PMEMOBJ_F_RELAXED)
393 rpmem_flags |= RPMEM_PERSIST_RELAXED;
394
395 int rv = Rpmem_persist(pop->rpp, offset, len, lane, rpmem_flags);
396 if (rv) {
397 ERR("!rpmem_persist(rpp %p offset %zu length %zu lane %u)"
398 " FATAL ERROR (returned value %i)",
399 pop->rpp, offset, len, lane, rv);
400 return -1;
401 }
402
403 return 0;
404 }
405
406 /*
407 * XXX - Consider removing obj_norep_*() wrappers to call *_local()
408 * functions directly. Alternatively, always use obj_rep_*(), even
409 * if there are no replicas. Verify the performance penalty.
410 */
411
412 /*
413 * obj_norep_memcpy -- (internal) memcpy w/o replication
414 */
415 static void *
416 obj_norep_memcpy(void *ctx, void *dest, const void *src, size_t len,
417 unsigned flags)
418 {
419 PMEMobjpool *pop = ctx;
420 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop, dest, src, len,
421 flags);
422
423 return pop->memcpy_local(dest, src, len,
424 flags & PMEM_F_MEM_VALID_FLAGS);
425 }
426
427 /*
428 * obj_norep_memmove -- (internal) memmove w/o replication
429 */
430 static void *
431 obj_norep_memmove(void *ctx, void *dest, const void *src, size_t len,
432 unsigned flags)
433 {
434 PMEMobjpool *pop = ctx;
435 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop, dest, src, len,
436 flags);
437
438 return pop->memmove_local(dest, src, len,
439 flags & PMEM_F_MEM_VALID_FLAGS);
440 }
441
442 /*
443 * obj_norep_memset -- (internal) memset w/o replication
444 */
445 static void *
446 obj_norep_memset(void *ctx, void *dest, int c, size_t len, unsigned flags)
447 {
448 PMEMobjpool *pop = ctx;
449 LOG(15, "pop %p dest %p c 0x%02x len %zu flags 0x%x", pop, dest, c, len,
450 flags);
451
452 return pop->memset_local(dest, c, len, flags & PMEM_F_MEM_VALID_FLAGS);
453 }
454
455 /*
456 * obj_norep_persist -- (internal) persist w/o replication
457 */
458 static int
459 obj_norep_persist(void *ctx, const void *addr, size_t len, unsigned flags)
460 {
461 PMEMobjpool *pop = ctx;
462 LOG(15, "pop %p addr %p len %zu", pop, addr, len);
463
464 pop->persist_local(addr, len);
465
466 return 0;
467 }
468
469 /*
470 * obj_norep_flush -- (internal) flush w/o replication
471 */
472 static int
473 obj_norep_flush(void *ctx, const void *addr, size_t len, unsigned flags)
474 {
475 PMEMobjpool *pop = ctx;
476 LOG(15, "pop %p addr %p len %zu", pop, addr, len);
477
478 pop->flush_local(addr, len);
479
480 return 0;
481 }
482
483 /*
484 * obj_norep_drain -- (internal) drain w/o replication
485 */
486 static void
487 obj_norep_drain(void *ctx)
488 {
489 PMEMobjpool *pop = ctx;
490 LOG(15, "pop %p", pop);
491
492 pop->drain_local();
493 }
494
495 static void obj_pool_cleanup(PMEMobjpool *pop);
496
497 /*
498 * obj_handle_remote_persist_error -- (internal) handle remote persist
499 * fatal error
500 */
501 static void
502 obj_handle_remote_persist_error(PMEMobjpool *pop)
503 {
504 LOG(1, "pop %p", pop);
505
506 	ERR("cleaning up after a remote persist error...");
507 	obj_pool_cleanup(pop);
508 
509 	FATAL("Fatal remote persist error. Aborting...");
510 }
511
512 /*
513 * obj_rep_memcpy -- (internal) memcpy with replication
514 */
515 static void *
516 obj_rep_memcpy(void *ctx, void *dest, const void *src, size_t len,
517 unsigned flags)
518 {
519 PMEMobjpool *pop = ctx;
520 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop, dest, src, len,
521 flags);
522
523 unsigned lane = UINT_MAX;
524
525 if (pop->has_remote_replicas)
526 lane = lane_hold(pop, NULL);
527
528 void *ret = pop->memcpy_local(dest, src, len, flags);
529
530 PMEMobjpool *rep = pop->replica;
531 while (rep) {
532 void *rdest = (char *)rep + (uintptr_t)dest - (uintptr_t)pop;
533 if (rep->rpp == NULL) {
534 rep->memcpy_local(rdest, src, len,
535 flags & PMEM_F_MEM_VALID_FLAGS);
536 } else {
537 if (rep->persist_remote(rep, rdest, len, lane, flags))
538 obj_handle_remote_persist_error(pop);
539 }
540 rep = rep->replica;
541 }
542
543 if (pop->has_remote_replicas)
544 lane_release(pop);
545
546 return ret;
547 }
548
549 /*
550 * obj_rep_memmove -- (internal) memmove with replication
551 */
552 static void *
553 obj_rep_memmove(void *ctx, void *dest, const void *src, size_t len,
554 unsigned flags)
555 {
556 PMEMobjpool *pop = ctx;
557 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop, dest, src, len,
558 flags);
559
560 unsigned lane = UINT_MAX;
561
562 if (pop->has_remote_replicas)
563 lane = lane_hold(pop, NULL);
564
565 void *ret = pop->memmove_local(dest, src, len, flags);
566
567 PMEMobjpool *rep = pop->replica;
568 while (rep) {
569 void *rdest = (char *)rep + (uintptr_t)dest - (uintptr_t)pop;
570 if (rep->rpp == NULL) {
571 rep->memmove_local(rdest, src, len,
572 flags & PMEM_F_MEM_VALID_FLAGS);
573 } else {
574 if (rep->persist_remote(rep, rdest, len, lane, flags))
575 obj_handle_remote_persist_error(pop);
576 }
577 rep = rep->replica;
578 }
579
580 if (pop->has_remote_replicas)
581 lane_release(pop);
582
583 return ret;
584 }
585
586 /*
587 * obj_rep_memset -- (internal) memset with replication
588 */
589 static void *
590 obj_rep_memset(void *ctx, void *dest, int c, size_t len, unsigned flags)
591 {
592 PMEMobjpool *pop = ctx;
593 LOG(15, "pop %p dest %p c 0x%02x len %zu flags 0x%x", pop, dest, c, len,
594 flags);
595
596 unsigned lane = UINT_MAX;
597
598 if (pop->has_remote_replicas)
599 lane = lane_hold(pop, NULL);
600
601 void *ret = pop->memset_local(dest, c, len, flags);
602
603 PMEMobjpool *rep = pop->replica;
604 while (rep) {
605 void *rdest = (char *)rep + (uintptr_t)dest - (uintptr_t)pop;
606 if (rep->rpp == NULL) {
607 rep->memset_local(rdest, c, len,
608 flags & PMEM_F_MEM_VALID_FLAGS);
609 } else {
610 if (rep->persist_remote(rep, rdest, len, lane, flags))
611 obj_handle_remote_persist_error(pop);
612 }
613 rep = rep->replica;
614 }
615
616 if (pop->has_remote_replicas)
617 lane_release(pop);
618
619 return ret;
620 }
621
622 /*
623 * obj_rep_persist -- (internal) persist with replication
624 */
625 static int
626 obj_rep_persist(void *ctx, const void *addr, size_t len, unsigned flags)
627 {
628 PMEMobjpool *pop = ctx;
629 LOG(15, "pop %p addr %p len %zu", pop, addr, len);
630
631 unsigned lane = UINT_MAX;
632
633 if (pop->has_remote_replicas)
634 lane = lane_hold(pop, NULL);
635
636 pop->persist_local(addr, len);
637
638 PMEMobjpool *rep = pop->replica;
639 while (rep) {
640 void *raddr = (char *)rep + (uintptr_t)addr - (uintptr_t)pop;
641 if (rep->rpp == NULL) {
642 rep->memcpy_local(raddr, addr, len, 0);
643 } else {
644 if (rep->persist_remote(rep, raddr, len, lane, flags))
645 obj_handle_remote_persist_error(pop);
646 }
647 rep = rep->replica;
648 }
649
650 if (pop->has_remote_replicas)
651 lane_release(pop);
652
653 return 0;
654 }
655
656 /*
657 * obj_rep_flush -- (internal) flush with replication
658 */
659 static int
660 obj_rep_flush(void *ctx, const void *addr, size_t len, unsigned flags)
661 {
662 PMEMobjpool *pop = ctx;
663 LOG(15, "pop %p addr %p len %zu", pop, addr, len);
664
665 unsigned lane = UINT_MAX;
666
667 if (pop->has_remote_replicas)
668 lane = lane_hold(pop, NULL);
669
670 pop->flush_local(addr, len);
671
672 PMEMobjpool *rep = pop->replica;
673 while (rep) {
674 void *raddr = (char *)rep + (uintptr_t)addr - (uintptr_t)pop;
675 if (rep->rpp == NULL) {
676 rep->memcpy_local(raddr, addr, len,
677 PMEM_F_MEM_NODRAIN);
678 } else {
679 if (rep->persist_remote(rep, raddr, len, lane, flags))
680 obj_handle_remote_persist_error(pop);
681 }
682 rep = rep->replica;
683 }
684
685 if (pop->has_remote_replicas)
686 lane_release(pop);
687
688 return 0;
689 }
690
691 /*
692 * obj_rep_drain -- (internal) drain with replication
693 */
694 static void
695 obj_rep_drain(void *ctx)
696 {
697 PMEMobjpool *pop = ctx;
698 LOG(15, "pop %p", pop);
699
700 pop->drain_local();
701
702 PMEMobjpool *rep = pop->replica;
703 while (rep) {
704 if (rep->rpp == NULL)
705 rep->drain_local();
706 rep = rep->replica;
707 }
708 }
709
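/*
 * Note (added commentary): each obj_rep_*() function above translates the
 * master-replica address into the corresponding address within a replica
 * using fixed offset arithmetic:
 *
 *	rdest = (char *)rep + ((uintptr_t)dest - (uintptr_t)pop);
 *
 * This works because every replica is a byte-for-byte image of the master
 * pool, mapped at its own base address.
 */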
710 #if VG_MEMCHECK_ENABLED
711 /*
712  * Arbitrary value. When there are more undefined regions than MAX_UNDEFS,
713  * it's not worth reporting everything; the developer should fix the code.
714 */
715 #define MAX_UNDEFS 1000
716
717 /*
718 * obj_vg_check_no_undef -- (internal) check whether there are any undefined
719 * regions
720 */
721 static void
722 obj_vg_check_no_undef(struct pmemobjpool *pop)
723 {
724 LOG(4, "pop %p", pop);
725
726 struct {
727 void *start, *end;
728 } undefs[MAX_UNDEFS];
729 int num_undefs = 0;
730
731 VALGRIND_DO_DISABLE_ERROR_REPORTING;
732 char *addr_start = pop->addr;
733 char *addr_end = addr_start + pop->set->poolsize;
734
735 while (addr_start < addr_end) {
736 char *noaccess = (char *)VALGRIND_CHECK_MEM_IS_ADDRESSABLE(
737 addr_start, addr_end - addr_start);
738 if (noaccess == NULL)
739 noaccess = addr_end;
740
741 while (addr_start < noaccess) {
742 char *undefined =
743 (char *)VALGRIND_CHECK_MEM_IS_DEFINED(
744 addr_start, noaccess - addr_start);
745
746 if (undefined) {
747 addr_start = undefined;
748
749 #ifdef VALGRIND_CHECK_MEM_IS_UNDEFINED
750 addr_start = (char *)
751 VALGRIND_CHECK_MEM_IS_UNDEFINED(
752 addr_start, noaccess - addr_start);
753 if (addr_start == NULL)
754 addr_start = noaccess;
755 #else
756 while (addr_start < noaccess &&
757 VALGRIND_CHECK_MEM_IS_DEFINED(
758 addr_start, 1))
759 addr_start++;
760 #endif
761
762 if (num_undefs < MAX_UNDEFS) {
763 undefs[num_undefs].start = undefined;
764 undefs[num_undefs].end = addr_start - 1;
765 num_undefs++;
766 }
767 } else
768 addr_start = noaccess;
769 }
770
771 #ifdef VALGRIND_CHECK_MEM_IS_UNADDRESSABLE
772 addr_start = (char *)VALGRIND_CHECK_MEM_IS_UNADDRESSABLE(
773 addr_start, addr_end - addr_start);
774 if (addr_start == NULL)
775 addr_start = addr_end;
776 #else
777 while (addr_start < addr_end &&
778 (char *)VALGRIND_CHECK_MEM_IS_ADDRESSABLE(
779 addr_start, 1) == addr_start)
780 addr_start++;
781 #endif
782 }
783 VALGRIND_DO_ENABLE_ERROR_REPORTING;
784
785 if (num_undefs) {
786 /*
787 * How to resolve this error:
788 * If it's part of the free space Valgrind should be told about
789 * it by VALGRIND_DO_MAKE_MEM_NOACCESS request. If it's
790 * allocated - initialize it or use VALGRIND_DO_MAKE_MEM_DEFINED
791 * request.
792 */
793
794 		VALGRIND_PRINTF("Part of the pool is left in an undefined state"
795 			" on boot. This is a pmemobj bug.\nUndefined"
796 			" regions: [pool address: %p]\n", pop);
797 for (int i = 0; i < num_undefs; ++i)
798 VALGRIND_PRINTF(" [%p, %p]\n", undefs[i].start,
799 undefs[i].end);
800 if (num_undefs == MAX_UNDEFS)
801 VALGRIND_PRINTF(" ...\n");
802
803 /* Trigger error. */
804 VALGRIND_CHECK_MEM_IS_DEFINED(undefs[0].start, 1);
805 }
806 }
807
808 /*
809 * obj_vg_boot -- (internal) notify Valgrind about pool objects
810 */
811 static void
812 obj_vg_boot(struct pmemobjpool *pop)
813 {
814 if (!On_memcheck)
815 return;
816
817 LOG(4, "pop %p", pop);
818
819 if (os_getenv("PMEMOBJ_VG_CHECK_UNDEF"))
820 obj_vg_check_no_undef(pop);
821 }
822
823 #endif
824
825 /*
826 * obj_runtime_init_common -- (internal) runtime initialization
827 *
828 * Common routine for create/open and check.
829 */
830 static int
831 obj_runtime_init_common(PMEMobjpool *pop)
832 {
833 LOG(3, "pop %p", pop);
834
835 if ((errno = lane_boot(pop)) != 0) {
836 ERR("!lane_boot");
837 return errno;
838 }
839
840 if ((errno = lane_recover_and_section_boot(pop)) != 0) {
841 ERR("!lane_recover_and_section_boot");
842 return errno;
843 }
844
845 pop->conversion_flags = 0;
846 pmemops_persist(&pop->p_ops,
847 &pop->conversion_flags, sizeof(pop->conversion_flags));
848
849 return 0;
850 }
851
852 /*
853 * obj_runtime_cleanup_common -- (internal) runtime cleanup
854 *
855 * Common routine for create/open and check
856 */
857 static void
858 obj_runtime_cleanup_common(PMEMobjpool *pop)
859 {
860 lane_section_cleanup(pop);
861 lane_cleanup(pop);
862 }
863
864 /*
865 * obj_descr_create -- (internal) create obj pool descriptor
866 */
867 static int
868 obj_descr_create(PMEMobjpool *pop, const char *layout, size_t poolsize)
869 {
870 LOG(3, "pop %p layout %s poolsize %zu", pop, layout, poolsize);
871
872 ASSERTeq(poolsize % Pagesize, 0);
873
874 	/* opaque info lives at the beginning of the mapped memory pool */
875 void *dscp = (void *)((uintptr_t)pop + sizeof(struct pool_hdr));
876
877 /* create the persistent part of pool's descriptor */
878 memset(dscp, 0, OBJ_DSC_P_SIZE);
879 if (layout)
880 strncpy(pop->layout, layout, PMEMOBJ_MAX_LAYOUT - 1);
881 struct pmem_ops *p_ops = &pop->p_ops;
882
883 pop->lanes_offset = OBJ_LANES_OFFSET;
884 pop->nlanes = OBJ_NLANES;
885
886 /* zero all lanes */
887 lane_init_data(pop);
888
889 pop->heap_offset = pop->lanes_offset +
890 pop->nlanes * sizeof(struct lane_layout);
891 pop->heap_offset = (pop->heap_offset + Pagesize - 1) & ~(Pagesize - 1);
892
893 size_t heap_size = pop->set->poolsize - pop->heap_offset;
894
895 /* initialize heap prior to storing the checksum */
896 errno = palloc_init((char *)pop + pop->heap_offset, heap_size,
897 &pop->heap_size, p_ops);
898 if (errno != 0) {
899 ERR("!palloc_init");
900 return -1;
901 }
902
903 util_checksum(dscp, OBJ_DSC_P_SIZE, &pop->checksum, 1, 0);
904
905 /*
906 * store the persistent part of pool's descriptor (2kB)
907 *
908 * It's safe to use PMEMOBJ_F_RELAXED flag because the entire
909 * structure is protected by checksum.
910 */
911 pmemops_xpersist(p_ops, dscp, OBJ_DSC_P_SIZE, PMEMOBJ_F_RELAXED);
912
913 	/* initialize run_id; it will be incremented later */
914 pop->run_id = 0;
915 pmemops_persist(p_ops, &pop->run_id, sizeof(pop->run_id));
916
917 pop->root_offset = 0;
918 pmemops_persist(p_ops, &pop->root_offset, sizeof(pop->root_offset));
919 pop->root_size = 0;
920 pmemops_persist(p_ops, &pop->root_size, sizeof(pop->root_size));
921
922 pop->conversion_flags = 0;
923 pmemops_persist(p_ops, &pop->conversion_flags,
924 sizeof(pop->conversion_flags));
925
926 /*
927 * It's safe to use PMEMOBJ_F_RELAXED flag because the reserved
928 * area must be entirely zeroed.
929 */
930 pmemops_memset(p_ops, pop->pmem_reserved, 0,
931 sizeof(pop->pmem_reserved), PMEMOBJ_F_RELAXED);
932
933 return 0;
934 }
935
936 /*
937 * obj_descr_check -- (internal) validate obj pool descriptor
938 */
939 static int
940 obj_descr_check(PMEMobjpool *pop, const char *layout, size_t poolsize)
941 {
942 LOG(3, "pop %p layout %s poolsize %zu", pop, layout, poolsize);
943
944 void *dscp = (void *)((uintptr_t)pop + sizeof(struct pool_hdr));
945
946 if (pop->rpp) {
947 /* read remote descriptor */
948 if (obj_read_remote(pop->rpp, pop->remote_base, dscp, dscp,
949 OBJ_DSC_P_SIZE)) {
950 ERR("!obj_read_remote");
951 return -1;
952 }
953 }
954
955 if (!util_checksum(dscp, OBJ_DSC_P_SIZE, &pop->checksum, 0, 0)) {
956 ERR("invalid checksum of pool descriptor");
957 errno = EINVAL;
958 return -1;
959 }
960
961 if (layout &&
962 strncmp(pop->layout, layout, PMEMOBJ_MAX_LAYOUT)) {
963 ERR("wrong layout (\"%s\"), "
964 "pool created with layout \"%s\"",
965 layout, pop->layout);
966 errno = EINVAL;
967 return -1;
968 }
969
970 if (pop->heap_offset % Pagesize) {
971 ERR("unaligned heap: off %" PRIu64, pop->heap_offset);
972 errno = EINVAL;
973 return -1;
974 }
975
976 return 0;
977 }
978
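/*
 * Note (added commentary): util_checksum() is used above in two modes.
 * obj_descr_create() passes insert == 1 to compute and store the
 * descriptor checksum, while obj_descr_check() passes insert == 0 to
 * verify the stored value:
 *
 *	util_checksum(dscp, OBJ_DSC_P_SIZE, &pop->checksum, 1, 0);  // store
 *	util_checksum(dscp, OBJ_DSC_P_SIZE, &pop->checksum, 0, 0);  // verify
 */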
979 /*
980 * obj_replica_init_local -- (internal) initialize runtime part
981 * of the local replicas
982 */
983 static int
984 obj_replica_init_local(PMEMobjpool *rep, int is_pmem, size_t resvsize)
985 {
986 LOG(3, "rep %p is_pmem %d resvsize %zu", rep, is_pmem, resvsize);
987
988 /*
989 	 * Use some of the memory pool area for run-time info. This
990 	 * run-time state is never loaded from the file; it is always
991 	 * created here, so there is no need to worry about byte order.
992 */
993 rep->is_pmem = is_pmem;
994
995 /* init hooks */
996 rep->persist_remote = NULL;
997
998 /*
999 	 * All replicas, except for the master, are ignored as far as valgrind is
1000 * concerned. This is to save CPU time and lessen the complexity of
1001 * instrumentation.
1002 */
1003 if (!rep->is_master_replica)
1004 VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(rep, resvsize);
1005
1006 if (rep->is_pmem) {
1007 rep->persist_local = pmem_persist;
1008 rep->flush_local = pmem_flush;
1009 rep->drain_local = pmem_drain;
1010 rep->memcpy_local = pmem_memcpy;
1011 rep->memmove_local = pmem_memmove;
1012 rep->memset_local = pmem_memset;
1013 } else {
1014 rep->persist_local = obj_msync_nofail;
1015 rep->flush_local = obj_msync_nofail;
1016 rep->drain_local = obj_drain_empty;
1017 rep->memcpy_local = obj_nopmem_memcpy;
1018 rep->memmove_local = obj_nopmem_memmove;
1019 rep->memset_local = obj_nopmem_memset;
1020 }
1021
1022 return 0;
1023 }
1024
1025 /*
1026 * obj_replica_init_remote -- (internal) initialize runtime part
1027 * of a remote replica
1028 */
1029 static int
1030 obj_replica_init_remote(PMEMobjpool *rep, struct pool_set *set,
1031 unsigned repidx, int create)
1032 {
1033 LOG(3, "rep %p set %p repidx %u", rep, set, repidx);
1034
1035 struct pool_replica *repset = set->replica[repidx];
1036
1037 ASSERTne(repset->remote->rpp, NULL);
1038 ASSERTne(repset->remote->node_addr, NULL);
1039 ASSERTne(repset->remote->pool_desc, NULL);
1040
1041 rep->node_addr = Strdup(repset->remote->node_addr);
1042 if (rep->node_addr == NULL)
1043 return -1;
1044 rep->pool_desc = Strdup(repset->remote->pool_desc);
1045 if (rep->pool_desc == NULL) {
1046 Free(rep->node_addr);
1047 return -1;
1048 }
1049
1050 rep->rpp = repset->remote->rpp;
1051
1052 /* remote_base - beginning of the remote pool */
1053 rep->remote_base = (uintptr_t)rep->addr;
1054
1055 /* init hooks */
1056 rep->persist_remote = obj_remote_persist;
1057 rep->persist_local = NULL;
1058 rep->flush_local = NULL;
1059 rep->drain_local = NULL;
1060 rep->memcpy_local = NULL;
1061 rep->memmove_local = NULL;
1062 rep->memset_local = NULL;
1063
1064 rep->p_ops.remote.read = obj_read_remote;
1065 rep->p_ops.remote.ctx = rep->rpp;
1066 rep->p_ops.remote.base = rep->remote_base;
1067
1068 return 0;
1069 }
1070
1071 /*
1072 * obj_cleanup_remote -- (internal) clean up the remote pools data
1073 */
1074 static void
1075 obj_cleanup_remote(PMEMobjpool *pop)
1076 {
1077 LOG(3, "pop %p", pop);
1078
1079 for (; pop != NULL; pop = pop->replica) {
1080 if (pop->rpp != NULL) {
1081 Free(pop->node_addr);
1082 Free(pop->pool_desc);
1083 pop->rpp = NULL;
1084 }
1085 }
1086 }
1087
1088 /*
1089 * obj_replica_init -- (internal) initialize runtime part of the replica
1090 */
1091 static int
1092 obj_replica_init(PMEMobjpool *rep, struct pool_set *set, unsigned repidx,
1093 int create)
1094 {
1095 struct pool_replica *repset = set->replica[repidx];
1096
1097 if (repidx == 0) {
1098 /* master replica */
1099 rep->is_master_replica = 1;
1100 rep->has_remote_replicas = set->remote;
1101
1102 if (set->nreplicas > 1) {
1103 rep->p_ops.persist = obj_rep_persist;
1104 rep->p_ops.flush = obj_rep_flush;
1105 rep->p_ops.drain = obj_rep_drain;
1106 rep->p_ops.memcpy = obj_rep_memcpy;
1107 rep->p_ops.memmove = obj_rep_memmove;
1108 rep->p_ops.memset = obj_rep_memset;
1109 } else {
1110 rep->p_ops.persist = obj_norep_persist;
1111 rep->p_ops.flush = obj_norep_flush;
1112 rep->p_ops.drain = obj_norep_drain;
1113 rep->p_ops.memcpy = obj_norep_memcpy;
1114 rep->p_ops.memmove = obj_norep_memmove;
1115 rep->p_ops.memset = obj_norep_memset;
1116 }
1117 rep->p_ops.base = rep;
1118 } else {
1119 /* non-master replicas */
1120 rep->is_master_replica = 0;
1121 rep->has_remote_replicas = 0;
1122
1123 rep->p_ops.persist = NULL;
1124 rep->p_ops.flush = NULL;
1125 rep->p_ops.drain = NULL;
1126 rep->p_ops.memcpy = NULL;
1127 rep->p_ops.memmove = NULL;
1128 rep->p_ops.memset = NULL;
1129
1130 rep->p_ops.base = NULL;
1131 }
1132
1133 rep->is_dev_dax = set->replica[repidx]->part[0].is_dev_dax;
1134
1135 int ret;
1136 if (repset->remote)
1137 ret = obj_replica_init_remote(rep, set, repidx, create);
1138 else
1139 ret = obj_replica_init_local(rep, repset->is_pmem,
1140 set->resvsize);
1141 if (ret)
1142 return ret;
1143
1144 return 0;
1145 }
1146
1147 /*
1148 * obj_replica_fini -- (internal) deinitialize replica
1149 */
1150 static void
1151 obj_replica_fini(struct pool_replica *repset)
1152 {
1153 PMEMobjpool *rep = repset->part[0].addr;
1154
1155 if (repset->remote)
1156 obj_cleanup_remote(rep);
1157 }
1158
1159 /*
1160 * obj_runtime_init -- (internal) initialize runtime part of the pool header
1161 */
1162 static int
1163 obj_runtime_init(PMEMobjpool *pop, int rdonly, int boot, unsigned nlanes)
1164 {
1165 LOG(3, "pop %p rdonly %d boot %d", pop, rdonly, boot);
1166 struct pmem_ops *p_ops = &pop->p_ops;
1167
1168 	/* run_id is made unique by adding 2; it stays even (odd means inconsistent) */
1169 pop->run_id += 2;
1170 if (pop->run_id == 0)
1171 pop->run_id += 2;
1172 pmemops_persist(p_ops, &pop->run_id, sizeof(pop->run_id));
1173
1174 /*
1175 	 * Use some of the memory pool area for run-time info. This
1176 	 * run-time state is never loaded from the file; it is always
1177 	 * created here, so there is no need to worry about byte order.
1178 */
1179 pop->rdonly = rdonly;
1180
1181 pop->uuid_lo = pmemobj_get_uuid_lo(pop);
1182
1183 pop->lanes_desc.runtime_nlanes = nlanes;
1184
1185 pop->tx_params = tx_params_new();
1186 if (pop->tx_params == NULL)
1187 goto err_tx_params;
1188
1189 pop->stats = stats_new(pop);
1190 if (pop->stats == NULL)
1191 goto err_stat;
1192
1193 pop->user_data = NULL;
1194
1195 VALGRIND_REMOVE_PMEM_MAPPING(&pop->mutex_head,
1196 sizeof(pop->mutex_head));
1197 VALGRIND_REMOVE_PMEM_MAPPING(&pop->rwlock_head,
1198 sizeof(pop->rwlock_head));
1199 VALGRIND_REMOVE_PMEM_MAPPING(&pop->cond_head,
1200 sizeof(pop->cond_head));
1201 pop->mutex_head = NULL;
1202 pop->rwlock_head = NULL;
1203 pop->cond_head = NULL;
1204
1205 if (boot) {
1206 if ((errno = obj_runtime_init_common(pop)) != 0)
1207 goto err_boot;
1208
1209 #if VG_MEMCHECK_ENABLED
1210 if (On_memcheck) {
1211 /* mark unused part of the pool as not accessible */
1212 void *end = palloc_heap_end(&pop->heap);
1213 VALGRIND_DO_MAKE_MEM_NOACCESS(end,
1214 (char *)pop + pop->set->poolsize - (char *)end);
1215 }
1216 #endif
1217
1218 obj_pool_init();
1219
1220 if ((errno = critnib_insert(pools_ht, pop->uuid_lo, pop))) {
1221 ERR("!critnib_insert to pools_ht");
1222 goto err_critnib_insert;
1223 }
1224
1225 if ((errno = critnib_insert(pools_tree, (uint64_t)pop, pop))) {
1226 ERR("!critnib_insert to pools_tree");
1227 goto err_tree_insert;
1228 }
1229 }
1230
1231 if (obj_ctl_init_and_load(pop) != 0) {
1232 errno = EINVAL;
1233 goto err_ctl;
1234 }
1235
1236 util_mutex_init(&pop->ulog_user_buffers.lock);
1237 pop->ulog_user_buffers.map = ravl_new_sized(
1238 operation_user_buffer_range_cmp,
1239 sizeof(struct user_buffer_def));
1240 if (pop->ulog_user_buffers.map == NULL) {
1241 ERR("!ravl_new_sized");
1242 goto err_user_buffers_map;
1243 }
1244 pop->ulog_user_buffers.verify = 0;
1245
1246 /*
1247 * If possible, turn off all permissions on the pool header page.
1248 *
1249 * The prototype PMFS doesn't allow this when large pages are in
1250 * use. It is not considered an error if this fails.
1251 */
1252 RANGE_NONE(pop->addr, sizeof(struct pool_hdr), pop->is_dev_dax);
1253
1254 return 0;
1255
1256 err_user_buffers_map:
1257 util_mutex_destroy(&pop->ulog_user_buffers.lock);
1258 ctl_delete(pop->ctl);
1259 err_ctl:;
1260 void *n = critnib_remove(pools_tree, (uint64_t)pop);
1261 ASSERTne(n, NULL);
1262 err_tree_insert:
1263 critnib_remove(pools_ht, pop->uuid_lo);
1264 err_critnib_insert:
1265 obj_runtime_cleanup_common(pop);
1266 err_boot:
1267 stats_delete(pop, pop->stats);
1268 err_stat:
1269 tx_params_delete(pop->tx_params);
1270 err_tx_params:
1271
1272 return -1;
1273 }
1274
1275 /*
1276  * obj_get_nlanes -- get the number of lanes available at runtime. If the
1277  * value provided via the PMEMOBJ_NLANES environment variable is greater
1278  * than 0 and smaller than the OBJ_NLANES constant, that value is returned.
1279  * Otherwise OBJ_NLANES is returned.
1280 */
1281 static unsigned
1282 obj_get_nlanes(void)
1283 {
1284 LOG(3, NULL);
1285
1286 char *env_nlanes = os_getenv(OBJ_NLANES_ENV_VARIABLE);
1287 if (env_nlanes) {
1288 int nlanes = atoi(env_nlanes);
1289 if (nlanes <= 0) {
1290 ERR("%s variable must be a positive integer",
1291 OBJ_NLANES_ENV_VARIABLE);
1292 errno = EINVAL;
1293 goto no_valid_env;
1294 }
1295
1296 return (unsigned)(OBJ_NLANES < nlanes ? OBJ_NLANES : nlanes);
1297 }
1298
1299 no_valid_env:
1300 return OBJ_NLANES;
1301 }
1302
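/*
 * Usage sketch (added for illustration): capping the runtime lane count
 * via the environment before starting an application.
 *
 *	$ PMEMOBJ_NLANES=16 ./my_app
 *
 * Non-positive values and values above OBJ_NLANES fall back to OBJ_NLANES.
 */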
1303 /*
1304 * pmemobj_createU -- create a transactional memory pool (set)
1305 */
1306 #ifndef _WIN32
1307 static inline
1308 #endif
1309 PMEMobjpool *
1310 pmemobj_createU(const char *path, const char *layout,
1311 size_t poolsize, mode_t mode)
1312 {
1313 LOG(3, "path %s layout %s poolsize %zu mode %o",
1314 path, layout, poolsize, mode);
1315
1316 PMEMobjpool *pop;
1317 struct pool_set *set;
1318
1319 /* check length of layout */
1320 if (layout && (strlen(layout) >= PMEMOBJ_MAX_LAYOUT)) {
1321 ERR("Layout too long");
1322 errno = EINVAL;
1323 return NULL;
1324 }
1325
1326 /*
1327 	 * The number of lanes available at runtime equals the lowest value
1328 	 * reported by all remote replica hosts. In single-host mode the
1329 	 * runtime number of lanes is equal to the total number of lanes
1330 	 * available in the pool or the value provided with the PMEMOBJ_NLANES
1331 	 * environment variable, whichever is lower.
1332 */
1333 unsigned runtime_nlanes = obj_get_nlanes();
1334
1335 struct pool_attr adj_pool_attr = Obj_create_attr;
1336
1337 /* force set SDS feature */
1338 if (SDS_at_create)
1339 adj_pool_attr.features.incompat |= POOL_FEAT_SDS;
1340 else
1341 adj_pool_attr.features.incompat &= ~POOL_FEAT_SDS;
1342
1343 if (util_pool_create(&set, path, poolsize, PMEMOBJ_MIN_POOL,
1344 PMEMOBJ_MIN_PART, &adj_pool_attr, &runtime_nlanes,
1345 REPLICAS_ENABLED) != 0) {
1346 LOG(2, "cannot create pool or pool set");
1347 return NULL;
1348 }
1349
1350 ASSERT(set->nreplicas > 0);
1351
1352 /* pop is master replica from now on */
1353 pop = set->replica[0]->part[0].addr;
1354
1355 for (unsigned r = 0; r < set->nreplicas; r++) {
1356 struct pool_replica *repset = set->replica[r];
1357 PMEMobjpool *rep = repset->part[0].addr;
1358
1359 size_t rt_size = (uintptr_t)(rep + 1) - (uintptr_t)&rep->addr;
1360 VALGRIND_REMOVE_PMEM_MAPPING(&rep->addr, rt_size);
1361
1362 memset(&rep->addr, 0, rt_size);
1363
1364 rep->addr = rep;
1365 rep->replica = NULL;
1366 rep->rpp = NULL;
1367
1368 /* initialize replica runtime - is_pmem, funcs, ... */
1369 if (obj_replica_init(rep, set, r, 1 /* create */) != 0) {
1370 ERR("initialization of replica #%u failed", r);
1371 goto err;
1372 }
1373
1374 /* link replicas */
1375 if (r < set->nreplicas - 1)
1376 rep->replica = set->replica[r + 1]->part[0].addr;
1377 }
1378
1379 pop->set = set;
1380
1381 /* create pool descriptor */
1382 if (obj_descr_create(pop, layout, set->poolsize) != 0) {
1383 LOG(2, "creation of pool descriptor failed");
1384 goto err;
1385 }
1386
1387 /* initialize runtime parts - lanes, obj stores, ... */
1388 if (obj_runtime_init(pop, 0, 1 /* boot */,
1389 runtime_nlanes) != 0) {
1390 ERR("pool initialization failed");
1391 goto err;
1392 }
1393
1394 if (util_poolset_chmod(set, mode))
1395 goto err;
1396
1397 util_poolset_fdclose(set);
1398
1399 LOG(3, "pop %p", pop);
1400
1401 return pop;
1402
1403 err:
1404 LOG(4, "error clean up");
1405 int oerrno = errno;
1406 if (set->remote)
1407 obj_cleanup_remote(pop);
1408 util_poolset_close(set, DELETE_CREATED_PARTS);
1409 errno = oerrno;
1410 return NULL;
1411 }
1412
1413 #ifndef _WIN32
1414 /*
1415 * pmemobj_create -- create a transactional memory pool (set)
1416 */
1417 PMEMobjpool *
1418 pmemobj_create(const char *path, const char *layout,
1419 size_t poolsize, mode_t mode)
1420 {
1421 PMEMOBJ_API_START();
1422
1423 PMEMobjpool *pop = pmemobj_createU(path, layout, poolsize, mode);
1424
1425 PMEMOBJ_API_END();
1426 return pop;
1427 }
1428 #else
1429 /*
1430 * pmemobj_createW -- create a transactional memory pool (set)
1431 */
1432 PMEMobjpool *
1433 pmemobj_createW(const wchar_t *path, const wchar_t *layout, size_t poolsize,
1434 mode_t mode)
1435 {
1436 char *upath = util_toUTF8(path);
1437 if (upath == NULL)
1438 return NULL;
1439 char *ulayout = NULL;
1440 if (layout != NULL) {
1441 ulayout = util_toUTF8(layout);
1442 if (ulayout == NULL) {
1443 util_free_UTF8(upath);
1444 return NULL;
1445 }
1446 }
1447 PMEMobjpool *ret = pmemobj_createU(upath, ulayout, poolsize, mode);
1448
1449 util_free_UTF8(upath);
1450 util_free_UTF8(ulayout);
1451
1452 return ret;
1453 }
1454 #endif
1455
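/*
 * Usage sketch (added for illustration; error handling abbreviated):
 * creating a pool and reopening it later. "my_layout" is an arbitrary
 * application-chosen layout string; "/pmem/pool.obj" is a placeholder
 * path.
 *
 *	PMEMobjpool *pop = pmemobj_create("/pmem/pool.obj", "my_layout",
 *			PMEMOBJ_MIN_POOL, 0666);
 *	if (pop == NULL) {
 *		fprintf(stderr, "pmemobj_create: %s\n", pmemobj_errormsg());
 *		exit(1);
 *	}
 *	...
 *	pmemobj_close(pop);
 *	pop = pmemobj_open("/pmem/pool.obj", "my_layout");
 */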
1456 /*
1457 * obj_check_basic_local -- (internal) basic pool consistency check
1458 * of a local replica
1459 */
1460 static int
1461 obj_check_basic_local(PMEMobjpool *pop, size_t mapped_size)
1462 {
1463 LOG(3, "pop %p mapped_size %zu", pop, mapped_size);
1464
1465 ASSERTeq(pop->rpp, NULL);
1466
1467 int consistent = 1;
1468
1469 if (pop->run_id % 2) {
1470 ERR("invalid run_id %" PRIu64, pop->run_id);
1471 consistent = 0;
1472 }
1473
1474 if ((errno = lane_check(pop)) != 0) {
1475 LOG(2, "!lane_check");
1476 consistent = 0;
1477 }
1478
1479 /* pop->heap_size can still be 0 at this point */
1480 size_t heap_size = mapped_size - pop->heap_offset;
1481 errno = palloc_heap_check((char *)pop + pop->heap_offset,
1482 heap_size);
1483 if (errno != 0) {
1484 LOG(2, "!heap_check");
1485 consistent = 0;
1486 }
1487
1488 return consistent;
1489 }
1490
1491 /*
1492 * obj_read_remote -- read data from remote replica
1493 *
1494  * It reads 'length' bytes from address 'addr' of the remote replica
1495  * (given by 'ctx') and saves them at the local address 'dest'.
1496 */
1497 int
1498 obj_read_remote(void *ctx, uintptr_t base, void *dest, void *addr,
1499 size_t length)
1500 {
1501 LOG(3, "ctx %p base 0x%lx dest %p addr %p length %zu", ctx, base, dest,
1502 addr, length);
1503
1504 ASSERTne(ctx, NULL);
1505 ASSERT((uintptr_t)addr >= base);
1506
1507 uintptr_t offset = (uintptr_t)addr - base;
1508 if (Rpmem_read(ctx, dest, offset, length, RLANE_DEFAULT)) {
1509 ERR("!rpmem_read");
1510 return -1;
1511 }
1512
1513 return 0;
1514 }
1515
1516 /*
1517 * obj_check_basic_remote -- (internal) basic pool consistency check
1518 * of a remote replica
1519 */
1520 static int
1521 obj_check_basic_remote(PMEMobjpool *pop, size_t mapped_size)
1522 {
1523 LOG(3, "pop %p mapped_size %zu", pop, mapped_size);
1524
1525 ASSERTne(pop->rpp, NULL);
1526
1527 int consistent = 1;
1528
1529 /* read pop->run_id */
1530 if (obj_read_remote(pop->rpp, pop->remote_base, &pop->run_id,
1531 &pop->run_id, sizeof(pop->run_id))) {
1532 ERR("!obj_read_remote");
1533 return -1;
1534 }
1535
1536 if (pop->run_id % 2) {
1537 ERR("invalid run_id %" PRIu64, pop->run_id);
1538 consistent = 0;
1539 }
1540
1541 /* XXX add lane_check_remote */
1542
1543 /* pop->heap_size can still be 0 at this point */
1544 size_t heap_size = mapped_size - pop->heap_offset;
1545 if (palloc_heap_check_remote((char *)pop + pop->heap_offset,
1546 heap_size, &pop->p_ops.remote)) {
1547 LOG(2, "!heap_check_remote");
1548 consistent = 0;
1549 }
1550
1551 return consistent;
1552 }
1553
1554 /*
1555 * obj_check_basic -- (internal) basic pool consistency check
1556 *
1557 * Used to check if all the replicas are consistent prior to pool recovery.
1558 */
1559 static int
1560 obj_check_basic(PMEMobjpool *pop, size_t mapped_size)
1561 {
1562 LOG(3, "pop %p mapped_size %zu", pop, mapped_size);
1563
1564 if (pop->rpp == NULL)
1565 return obj_check_basic_local(pop, mapped_size);
1566 else
1567 return obj_check_basic_remote(pop, mapped_size);
1568 }
1569
1570 /*
1571 * obj_pool_close -- (internal) close the pool set
1572 */
1573 static void
1574 obj_pool_close(struct pool_set *set)
1575 {
1576 int oerrno = errno;
1577 util_poolset_close(set, DO_NOT_DELETE_PARTS);
1578 errno = oerrno;
1579 }
1580
1581 /*
1582 * obj_pool_open -- (internal) open the given pool
1583 */
1584 static int
1585 obj_pool_open(struct pool_set **set, const char *path, unsigned flags,
1586 unsigned *nlanes)
1587 {
1588 if (util_pool_open(set, path, PMEMOBJ_MIN_PART, &Obj_open_attr,
1589 nlanes, NULL, flags) != 0) {
1590 LOG(2, "cannot open pool or pool set");
1591 return -1;
1592 }
1593
1594 ASSERT((*set)->nreplicas > 0);
1595
1596 /* read-only mode is not supported in libpmemobj */
1597 if ((*set)->rdonly) {
1598 ERR("read-only mode is not supported");
1599 errno = EINVAL;
1600 goto err_rdonly;
1601 }
1602
1603 return 0;
1604 err_rdonly:
1605 obj_pool_close(*set);
1606 return -1;
1607 }
1608
1609 /*
1610 * obj_replicas_init -- (internal) initialize all replicas
1611 */
1612 static int
1613 obj_replicas_init(struct pool_set *set)
1614 {
1615 unsigned r;
1616 for (r = 0; r < set->nreplicas; r++) {
1617 struct pool_replica *repset = set->replica[r];
1618 PMEMobjpool *rep = repset->part[0].addr;
1619
1620 size_t rt_size = (uintptr_t)(rep + 1) - (uintptr_t)&rep->addr;
1621
1622 VALGRIND_REMOVE_PMEM_MAPPING(&rep->addr, rt_size);
1623
1624 memset(&rep->addr, 0, rt_size);
1625
1626 rep->addr = rep;
1627 rep->replica = NULL;
1628 rep->rpp = NULL;
1629
1630 /* initialize replica runtime - is_pmem, funcs, ... */
1631 if (obj_replica_init(rep, set, r, 0 /* open */) != 0) {
1632 ERR("initialization of replica #%u failed", r);
1633 goto err;
1634 }
1635
1636 /* link replicas */
1637 if (r < set->nreplicas - 1)
1638 rep->replica = set->replica[r + 1]->part[0].addr;
1639 }
1640
1641 return 0;
1642 err:
1643 for (unsigned p = 0; p < r; p++)
1644 obj_replica_fini(set->replica[p]);
1645
1646 return -1;
1647 }
1648
1649 /*
1650 * obj_replicas_fini -- (internal) deinitialize all replicas
1651 */
1652 static void
1653 obj_replicas_fini(struct pool_set *set)
1654 {
1655 int oerrno = errno;
1656 for (unsigned r = 0; r < set->nreplicas; r++)
1657 obj_replica_fini(set->replica[r]);
1658 errno = oerrno;
1659 }
1660
1661 /*
1662 * obj_replicas_check_basic -- (internal) perform basic consistency check
1663 * for all replicas
1664 */
1665 static int
1666 obj_replicas_check_basic(PMEMobjpool *pop)
1667 {
1668 PMEMobjpool *rep;
1669 for (unsigned r = 0; r < pop->set->nreplicas; r++) {
1670 rep = pop->set->replica[r]->part[0].addr;
1671 if (obj_check_basic(rep, pop->set->poolsize) == 0) {
1672 ERR("inconsistent replica #%u", r);
1673 return -1;
1674 }
1675 }
1676
1677 /* copy lanes */
1678 void *src = (void *)((uintptr_t)pop + pop->lanes_offset);
1679 size_t len = pop->nlanes * sizeof(struct lane_layout);
1680
1681 for (unsigned r = 1; r < pop->set->nreplicas; r++) {
1682 rep = pop->set->replica[r]->part[0].addr;
1683 void *dst = (void *)((uintptr_t)rep + pop->lanes_offset);
1684 if (rep->rpp == NULL) {
1685 rep->memcpy_local(dst, src, len, 0);
1686 } else {
1687 if (rep->persist_remote(rep, dst, len,
1688 RLANE_DEFAULT, 0))
1689 obj_handle_remote_persist_error(pop);
1690 }
1691 }
1692
1693 return 0;
1694 }
1695
1696 /*
1697 * obj_open_common -- open a transactional memory pool (set)
1698 *
1699 * This routine takes flags and does all the work
1700 * (flag POOL_OPEN_COW - internal calls can map a read-only pool if required).
1701 */
1702 static PMEMobjpool *
1703 obj_open_common(const char *path, const char *layout, unsigned flags, int boot)
1704 {
1705 LOG(3, "path %s layout %s flags 0x%x", path, layout, flags);
1706
1707 PMEMobjpool *pop = NULL;
1708 struct pool_set *set;
1709
1710 /*
1711 	 * The number of lanes available at runtime equals the lowest value
1712 	 * reported by all remote replica hosts. In single-host mode the
1713 	 * runtime number of lanes is equal to the total number of lanes
1714 	 * available in the pool or the value provided with the PMEMOBJ_NLANES
1715 	 * environment variable, whichever is lower.
1716 */
1717 unsigned runtime_nlanes = obj_get_nlanes();
1718 if (obj_pool_open(&set, path, flags, &runtime_nlanes))
1719 return NULL;
1720
1721 /* pop is master replica from now on */
1722 pop = set->replica[0]->part[0].addr;
1723
1724 if (obj_replicas_init(set))
1725 goto replicas_init;
1726
1727 for (unsigned r = 0; r < set->nreplicas; r++) {
1728 struct pool_replica *repset = set->replica[r];
1729 PMEMobjpool *rep = repset->part[0].addr;
1730 /* check descriptor */
1731 if (obj_descr_check(rep, layout, set->poolsize) != 0) {
1732 LOG(2, "descriptor check of replica #%u failed", r);
1733 goto err_descr_check;
1734 }
1735 }
1736
1737 pop->set = set;
1738
1739 if (boot) {
1740 /* check consistency of 'master' replica */
1741 if (obj_check_basic(pop, pop->set->poolsize) == 0) {
1742 goto err_check_basic;
1743 }
1744 }
1745
1746 if (set->nreplicas > 1) {
1747 if (obj_replicas_check_basic(pop))
1748 goto err_replicas_check_basic;
1749 }
1750
1751 /*
1752 	 * Before runtime initialization, lanes are unavailable; remote
1753 	 * persists should use RLANE_DEFAULT.
1754 */
1755 pop->lanes_desc.runtime_nlanes = 0;
1756
1757 #if VG_MEMCHECK_ENABLED
1758 pop->vg_boot = boot;
1759 #endif
1760 /* initialize runtime parts - lanes, obj stores, ... */
1761 if (obj_runtime_init(pop, 0, boot, runtime_nlanes) != 0) {
1762 ERR("pool initialization failed");
1763 goto err_runtime_init;
1764 }
1765
1766 #if VG_MEMCHECK_ENABLED
1767 if (boot)
1768 obj_vg_boot(pop);
1769 #endif
1770
1771 util_poolset_fdclose(set);
1772
1773 LOG(3, "pop %p", pop);
1774
1775 return pop;
1776
1777 err_runtime_init:
1778 err_replicas_check_basic:
1779 err_check_basic:
1780 err_descr_check:
1781 obj_replicas_fini(set);
1782 replicas_init:
1783 obj_pool_close(set);
1784 return NULL;
1785 }
1786
1787 /*
1788 * pmemobj_openU -- open a transactional memory pool
1789 */
1790 #ifndef _WIN32
1791 static inline
1792 #endif
1793 PMEMobjpool *
1794 pmemobj_openU(const char *path, const char *layout)
1795 {
1796 LOG(3, "path %s layout %s", path, layout);
1797
1798 return obj_open_common(path, layout,
1799 COW_at_open ? POOL_OPEN_COW : 0, 1);
1800 }
1801
1802 #ifndef _WIN32
1803 /*
1804 * pmemobj_open -- open a transactional memory pool
1805 */
1806 PMEMobjpool *
1807 pmemobj_open(const char *path, const char *layout)
1808 {
1809 PMEMOBJ_API_START();
1810
1811 PMEMobjpool *pop = pmemobj_openU(path, layout);
1812
1813 PMEMOBJ_API_END();
1814 return pop;
1815 }
1816 #else
1817 /*
1818 * pmemobj_openW -- open a transactional memory pool
1819 */
1820 PMEMobjpool *
1821 pmemobj_openW(const wchar_t *path, const wchar_t *layout)
1822 {
1823 char *upath = util_toUTF8(path);
1824 if (upath == NULL)
1825 return NULL;
1826
1827 char *ulayout = NULL;
1828 if (layout != NULL) {
1829 ulayout = util_toUTF8(layout);
1830 if (ulayout == NULL) {
1831 util_free_UTF8(upath);
1832 return NULL;
1833 }
1834 }
1835
1836 PMEMobjpool *ret = pmemobj_openU(upath, ulayout);
1837 util_free_UTF8(upath);
1838 util_free_UTF8(ulayout);
1839 return ret;
1840 }
1841 #endif
1842
1843 /*
1844 * obj_replicas_cleanup -- (internal) free resources allocated for replicas
1845 */
1846 static void
1847 obj_replicas_cleanup(struct pool_set *set)
1848 {
1849 LOG(3, "set %p", set);
1850
1851 for (unsigned r = 0; r < set->nreplicas; r++) {
1852 struct pool_replica *rep = set->replica[r];
1853
1854 PMEMobjpool *pop = rep->part[0].addr;
1855
1856 if (pop->rpp != NULL) {
1857 /*
1858 * remote replica will be closed in util_poolset_close
1859 */
1860 pop->rpp = NULL;
1861
1862 Free(pop->node_addr);
1863 Free(pop->pool_desc);
1864 }
1865 }
1866 }
1867
1868 /*
1869 * obj_pool_lock_cleanup -- (internal) Destroy any locks or condition
1870 * variables that were allocated at run time
1871 */
1872 static void
1873 obj_pool_lock_cleanup(PMEMobjpool *pop)
1874 {
1875 LOG(3, "pop %p", pop);
1876
1877 PMEMmutex_internal *nextm;
1878 for (PMEMmutex_internal *m = pop->mutex_head; m != NULL; m = nextm) {
1879 nextm = m->PMEMmutex_next;
1880 LOG(4, "mutex %p *mutex %p", &m->PMEMmutex_lock,
1881 m->PMEMmutex_bsd_mutex_p);
1882 os_mutex_destroy(&m->PMEMmutex_lock);
1883 m->PMEMmutex_next = NULL;
1884 m->PMEMmutex_bsd_mutex_p = NULL;
1885 }
1886 pop->mutex_head = NULL;
1887
1888 PMEMrwlock_internal *nextr;
1889 for (PMEMrwlock_internal *r = pop->rwlock_head; r != NULL; r = nextr) {
1890 nextr = r->PMEMrwlock_next;
1891 LOG(4, "rwlock %p *rwlock %p", &r->PMEMrwlock_lock,
1892 r->PMEMrwlock_bsd_rwlock_p);
1893 os_rwlock_destroy(&r->PMEMrwlock_lock);
1894 r->PMEMrwlock_next = NULL;
1895 r->PMEMrwlock_bsd_rwlock_p = NULL;
1896 }
1897 pop->rwlock_head = NULL;
1898
1899 PMEMcond_internal *nextc;
1900 for (PMEMcond_internal *c = pop->cond_head; c != NULL; c = nextc) {
1901 nextc = c->PMEMcond_next;
1902 LOG(4, "cond %p *cond %p", &c->PMEMcond_cond,
1903 c->PMEMcond_bsd_cond_p);
1904 os_cond_destroy(&c->PMEMcond_cond);
1905 c->PMEMcond_next = NULL;
1906 c->PMEMcond_bsd_cond_p = NULL;
1907 }
1908 pop->cond_head = NULL;
1909 }
1910 /*
1911 * obj_pool_cleanup -- (internal) cleanup the pool and unmap
1912 */
1913 static void
1914 obj_pool_cleanup(PMEMobjpool *pop)
1915 {
1916 LOG(3, "pop %p", pop);
1917
1918 ravl_delete(pop->ulog_user_buffers.map);
1919 util_mutex_destroy(&pop->ulog_user_buffers.lock);
1920
1921 stats_delete(pop, pop->stats);
1922 tx_params_delete(pop->tx_params);
1923 ctl_delete(pop->ctl);
1924
1925 obj_pool_lock_cleanup(pop);
1926
1927 lane_section_cleanup(pop);
1928 lane_cleanup(pop);
1929
1930 /* unmap all the replicas */
1931 obj_replicas_cleanup(pop->set);
1932 util_poolset_close(pop->set, DO_NOT_DELETE_PARTS);
1933 }
1934
1935 /*
1936 * pmemobj_close -- close a transactional memory pool
1937 */
1938 void
1939 pmemobj_close(PMEMobjpool *pop)
1940 {
1941 LOG(3, "pop %p", pop);
1942 PMEMOBJ_API_START();
1943
1944 _pobj_cache_invalidate++;
1945
1946 if (critnib_remove(pools_ht, pop->uuid_lo) != pop) {
1947 ERR("critnib_remove for pools_ht");
1948 }
1949
1950 if (critnib_remove(pools_tree, (uint64_t)pop) != pop)
1951 ERR("critnib_remove for pools_tree");
1952
1953 #ifndef _WIN32
1954
1955 if (_pobj_cached_pool.pop == pop) {
1956 _pobj_cached_pool.pop = NULL;
1957 _pobj_cached_pool.uuid_lo = 0;
1958 }
1959
1960 #else /* _WIN32 */
1961
1962 struct _pobj_pcache *pcache = os_tls_get(Cached_pool_key);
1963 if (pcache != NULL) {
1964 if (pcache->pop == pop) {
1965 pcache->pop = NULL;
1966 pcache->uuid_lo = 0;
1967 }
1968 }
1969
1970 #endif /* _WIN32 */
1971
1972 obj_pool_cleanup(pop);
1973 PMEMOBJ_API_END();
1974 }
1975
1976 /*
1977 * pmemobj_checkU -- transactional memory pool consistency check
1978 */
1979 #ifndef _WIN32
1980 static inline
1981 #endif
1982 int
1983 pmemobj_checkU(const char *path, const char *layout)
1984 {
1985 LOG(3, "path %s layout %s", path, layout);
1986
1987 PMEMobjpool *pop = obj_open_common(path, layout, POOL_OPEN_COW, 0);
1988 if (pop == NULL)
1989 return -1; /* errno set by obj_open_common() */
1990
1991 int consistent = 1;
1992
1993 /*
1994 * For replicated pools, basic consistency check is performed
1995 * in obj_open_common().
1996 */
1997 if (pop->replica == NULL)
1998 consistent = obj_check_basic(pop, pop->set->poolsize);
1999
2000 if (consistent && (errno = obj_runtime_init_common(pop)) != 0) {
2001 LOG(3, "!obj_boot");
2002 consistent = 0;
2003 }
2004
2005 if (consistent) {
2006 obj_pool_cleanup(pop);
2007 } else {
2008 stats_delete(pop, pop->stats);
2009 tx_params_delete(pop->tx_params);
2010 ctl_delete(pop->ctl);
2011
2012 /* unmap all the replicas */
2013 obj_replicas_cleanup(pop->set);
2014 util_poolset_close(pop->set, DO_NOT_DELETE_PARTS);
2015 }
2016
2017 if (consistent)
2018 LOG(4, "pool consistency check OK");
2019
2020 return consistent;
2021 }
2022
2023 #ifndef _WIN32
2024 /*
2025 * pmemobj_check -- transactional memory pool consistency check
2026 */
2027 int
2028 pmemobj_check(const char *path, const char *layout)
2029 {
2030 PMEMOBJ_API_START();
2031
2032 int ret = pmemobj_checkU(path, layout);
2033
2034 PMEMOBJ_API_END();
2035 return ret;
2036 }
2037 #else
2038 /*
2039 * pmemobj_checkW -- transactional memory pool consistency check
2040 */
2041 int
2042 pmemobj_checkW(const wchar_t *path, const wchar_t *layout)
2043 {
2044 char *upath = util_toUTF8(path);
2045 if (upath == NULL)
2046 return -1;
2047
2048 char *ulayout = NULL;
2049 if (layout != NULL) {
2050 ulayout = util_toUTF8(layout);
2051 if (ulayout == NULL) {
2052 util_free_UTF8(upath);
2053 return -1;
2054 }
2055 }
2056
2057 int ret = pmemobj_checkU(upath, ulayout);
2058
2059 util_free_UTF8(upath);
2060 util_free_UTF8(ulayout);
2061
2062 return ret;
2063 }
2064 #endif
2065
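/*
 * Usage sketch (added for illustration): pmemobj_check() returns 1 when
 * the pool is consistent, 0 when it is not, and -1 (with errno set) when
 * the check itself could not be performed.
 *
 *	int r = pmemobj_check("/pmem/pool.obj", "my_layout");
 *	if (r < 1) {
 *		// inconsistent pool, or the check could not run
 *	}
 */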
2066 /*
2067 * pmemobj_pool_by_oid -- returns the pool handle associated with the oid
2068 */
2069 PMEMobjpool *
2070 pmemobj_pool_by_oid(PMEMoid oid)
2071 {
2072 LOG(3, "oid.off 0x%016" PRIx64, oid.off);
2073
2074 /* XXX this is a temporary fix, to be fixed properly later */
2075 if (pools_ht == NULL)
2076 return NULL;
2077
2078 return critnib_get(pools_ht, oid.pool_uuid_lo);
2079 }
2080
2081 /*
2082 * pmemobj_pool_by_ptr -- returns the pool handle associated with the address
2083 */
2084 PMEMobjpool *
2085 pmemobj_pool_by_ptr(const void *addr)
2086 {
2087 LOG(3, "addr %p", addr);
2088
2089 /* fast path for transactions */
2090 PMEMobjpool *pop = tx_get_pop();
2091
2092 if ((pop != NULL) && OBJ_PTR_FROM_POOL(pop, addr))
2093 return pop;
2094
2095 /* XXX this is a temporary fix, to be fixed properly later */
2096 if (pools_tree == NULL)
2097 return NULL;
2098
2099 pop = critnib_find_le(pools_tree, (uint64_t)addr);
2100 if (pop == NULL)
2101 return NULL;
2102
2103 size_t pool_size = pop->heap_offset + pop->heap_size;
2104 if ((char *)addr >= (char *)pop + pool_size)
2105 return NULL;
2106
2107 return pop;
2108 }
2109
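/*
 * Usage sketch (added for illustration): recovering the owning pool from
 * a raw pointer, e.g. in a callback that only receives an object address.
 *
 *	PMEMobjpool *pop = pmemobj_pool_by_ptr(ptr);
 *	if (pop != NULL) {
 *		PMEMoid oid = pmemobj_oid(ptr);
 *		// oid now identifies the object within its pool
 *	}
 */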
2110 /*
2111  * pmemobj_set_user_data -- sets a volatile pointer to the user data for the
2112  * specified pool
2113 */
2114 void
2115 pmemobj_set_user_data(PMEMobjpool *pop, void *data)
2116 {
2117 LOG(3, "pop %p data %p", pop, data);
2118
2119 pop->user_data = data;
2120 }
2121
2122 /*
2123  * pmemobj_get_user_data -- gets the volatile pointer to the user data
2124  * associated with the specified pool
2125 */
2126 void *
2127 pmemobj_get_user_data(PMEMobjpool *pop)
2128 {
2129 LOG(3, "pop %p", pop);
2130
2131 return pop->user_data;
2132 }
2133
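/*
 * Usage sketch (added for illustration): attaching a volatile,
 * per-open-handle context to a pool. The pointer is not persisted, so it
 * must be set again after each pmemobj_open(). "struct app_ctx" and
 * app_ctx_new() are hypothetical.
 *
 *	struct app_ctx *ctx = app_ctx_new();
 *	pmemobj_set_user_data(pop, ctx);
 *	...
 *	struct app_ctx *c = pmemobj_get_user_data(pop);
 */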
2134 /* arguments for constructor_alloc */
2135 struct constr_args {
2136 int zero_init;
2137 pmemobj_constr constructor;
2138 void *arg;
2139 };
2140
2141 /*
2142 * constructor_alloc -- (internal) constructor for obj_alloc_construct
2143 */
2144 static int
2145 constructor_alloc(void *ctx, void *ptr, size_t usable_size, void *arg)
2146 {
2147 PMEMobjpool *pop = ctx;
2148 LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);
2149 struct pmem_ops *p_ops = &pop->p_ops;
2150
2151 ASSERTne(ptr, NULL);
2152 ASSERTne(arg, NULL);
2153
2154 struct constr_args *carg = arg;
2155
2156 if (carg->zero_init)
2157 pmemops_memset(p_ops, ptr, 0, usable_size, 0);
2158
2159 int ret = 0;
2160 if (carg->constructor)
2161 ret = carg->constructor(pop, ptr, carg->arg);
2162
2163 return ret;
2164 }
2165
2166 /*
2167 * obj_alloc_construct -- (internal) allocates a new object with constructor
2168 */
2169 static int
2170 obj_alloc_construct(PMEMobjpool *pop, PMEMoid *oidp, size_t size,
2171 type_num_t type_num, uint64_t flags,
2172 pmemobj_constr constructor, void *arg)
2173 {
2174 if (size > PMEMOBJ_MAX_ALLOC_SIZE) {
2175 ERR("requested size too large");
2176 errno = ENOMEM;
2177 return -1;
2178 }
2179
2180 struct constr_args carg;
2181
2182 carg.zero_init = flags & POBJ_FLAG_ZERO;
2183 carg.constructor = constructor;
2184 carg.arg = arg;
2185
2186 struct operation_context *ctx = pmalloc_operation_hold(pop);
2187
2188 if (oidp)
2189 operation_add_entry(ctx, &oidp->pool_uuid_lo, pop->uuid_lo,
2190 ULOG_OPERATION_SET);
2191
2192 int ret = palloc_operation(&pop->heap, 0,
2193 oidp != NULL ? &oidp->off : NULL, size,
2194 constructor_alloc, &carg, type_num, 0,
2195 CLASS_ID_FROM_FLAG(flags), ARENA_ID_FROM_FLAG(flags),
2196 ctx);
2197
2198 pmalloc_operation_release(pop);
2199
2200 return ret;
2201 }
2202
2203 /*
2204 * pmemobj_alloc -- allocates a new object
2205 */
2206 int
2207 pmemobj_alloc(PMEMobjpool *pop, PMEMoid *oidp, size_t size,
2208 uint64_t type_num, pmemobj_constr constructor, void *arg)
2209 {
2210 LOG(3, "pop %p oidp %p size %zu type_num %llx constructor %p arg %p",
2211 pop, oidp, size, (unsigned long long)type_num,
2212 constructor, arg);
2213
2214 /* log notice message if used inside a transaction */
2215 _POBJ_DEBUG_NOTICE_IN_TX();
2216
2217 if (size == 0) {
2218 ERR("allocation with size 0");
2219 errno = EINVAL;
2220 return -1;
2221 }
2222
2223 PMEMOBJ_API_START();
2224 int ret = obj_alloc_construct(pop, oidp, size, type_num,
2225 0, constructor, arg);
2226
2227 PMEMOBJ_API_END();
2228 return ret;
2229 }
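
/*
 * Usage sketch (editorial, not part of the upstream source): a
 * constructor-based allocation. The constructor runs before the object
 * becomes reachable, so it can initialize and persist the payload without a
 * torn state ever being visible after a crash. `struct rec` and the type
 * number 1 are hypothetical.
 *
 *	struct rec { uint64_t val; };
 *
 *	static int
 *	rec_construct(PMEMobjpool *pop, void *ptr, void *arg)
 *	{
 *		struct rec *r = ptr;
 *		r->val = *(uint64_t *)arg;
 *		pmemobj_persist(pop, r, sizeof(*r));
 *		return 0;
 *	}
 *
 *	uint64_t v = 42;
 *	PMEMoid oid;
 *	if (pmemobj_alloc(pop, &oid, sizeof(struct rec), 1,
 *			rec_construct, &v) != 0)
 *		abort();
 *
 * On failure -1 is returned and errno is set; a non-zero return from the
 * constructor also aborts the allocation.
 */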
2230
2231 /*
2232 * pmemobj_xalloc -- allocates with flags
2233 */
2234 int
2235 pmemobj_xalloc(PMEMobjpool *pop, PMEMoid *oidp, size_t size,
2236 uint64_t type_num, uint64_t flags,
2237 pmemobj_constr constructor, void *arg)
2238 {
2239 LOG(3, "pop %p oidp %p size %zu type_num %llx flags %llx "
2240 "constructor %p arg %p",
2241 pop, oidp, size, (unsigned long long)type_num,
2242 (unsigned long long)flags,
2243 constructor, arg);
2244
2245 /* log notice message if used inside a transaction */
2246 _POBJ_DEBUG_NOTICE_IN_TX();
2247
2248 if (size == 0) {
2249 ERR("allocation with size 0");
2250 errno = EINVAL;
2251 return -1;
2252 }
2253
2254 if (flags & ~POBJ_TX_XALLOC_VALID_FLAGS) {
2255 ERR("unknown flags 0x%" PRIx64,
2256 flags & ~POBJ_TX_XALLOC_VALID_FLAGS);
2257 errno = EINVAL;
2258 return -1;
2259 }
2260
2261 PMEMOBJ_API_START();
2262 int ret = obj_alloc_construct(pop, oidp, size, type_num,
2263 flags, constructor, arg);
2264
2265 PMEMOBJ_API_END();
2266 return ret;
2267 }
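
/*
 * Usage sketch (editorial, not part of the upstream source): requesting
 * zero-initialization through flags instead of a constructor.
 * POBJ_XALLOC_ZERO is part of the public API; the size and type number
 * below are arbitrary.
 *
 *	PMEMoid oid;
 *	if (pmemobj_xalloc(pop, &oid, 4096, 1, POBJ_XALLOC_ZERO,
 *			NULL, NULL) != 0)
 *		abort();
 */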
2268
2269 /* arguments for constructor_realloc and constructor_zrealloc_root */
2270 struct carg_realloc {
2271 void *ptr;
2272 size_t old_size;
2273 size_t new_size;
2274 int zero_init;
2275 type_num_t user_type;
2276 pmemobj_constr constructor;
2277 void *arg;
2278 };
2279
2280 /*
2281 * pmemobj_zalloc -- allocates a new zeroed object
2282 */
2283 int
2284 pmemobj_zalloc(PMEMobjpool *pop, PMEMoid *oidp, size_t size,
2285 uint64_t type_num)
2286 {
2287 LOG(3, "pop %p oidp %p size %zu type_num %llx",
2288 pop, oidp, size, (unsigned long long)type_num);
2289
2290 /* log notice message if used inside a transaction */
2291 _POBJ_DEBUG_NOTICE_IN_TX();
2292
2293 if (size == 0) {
2294 ERR("allocation with size 0");
2295 errno = EINVAL;
2296 return -1;
2297 }
2298
2299 PMEMOBJ_API_START();
2300 int ret = obj_alloc_construct(pop, oidp, size, type_num, POBJ_FLAG_ZERO,
2301 NULL, NULL);
2302
2303 PMEMOBJ_API_END();
2304 return ret;
2305 }
2306
2307 /*
2308 * obj_free -- (internal) frees an object
2309 */
2310 static void
2311 obj_free(PMEMobjpool *pop, PMEMoid *oidp)
2312 {
2313 ASSERTne(oidp, NULL);
2314
2315 struct operation_context *ctx = pmalloc_operation_hold(pop);
2316
2317 operation_add_entry(ctx, &oidp->pool_uuid_lo, 0, ULOG_OPERATION_SET);
2318
2319 palloc_operation(&pop->heap, oidp->off, &oidp->off, 0, NULL, NULL,
2320 0, 0, 0, 0, ctx);
2321
2322 pmalloc_operation_release(pop);
2323 }
2324
2325 /*
2326 * constructor_realloc -- (internal) constructor for pmemobj_realloc
2327 */
2328 static int
2329 constructor_realloc(void *ctx, void *ptr, size_t usable_size, void *arg)
2330 {
2331 PMEMobjpool *pop = ctx;
2332 LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);
2333 struct pmem_ops *p_ops = &pop->p_ops;
2334
2335 ASSERTne(ptr, NULL);
2336 ASSERTne(arg, NULL);
2337
2338 struct carg_realloc *carg = arg;
2339
2340 if (!carg->zero_init)
2341 return 0;
2342
2343 if (usable_size > carg->old_size) {
2344 size_t grow_len = usable_size - carg->old_size;
2345 void *new_data_ptr = (void *)((uintptr_t)ptr + carg->old_size);
2346
2347 pmemops_memset(p_ops, new_data_ptr, 0, grow_len, 0);
2348 }
2349
2350 return 0;
2351 }
2352
2353 /*
2354 * obj_realloc_common -- (internal) common routine for resizing
2355 * existing objects
2356 */
2357 static int
2358 obj_realloc_common(PMEMobjpool *pop,
2359 PMEMoid *oidp, size_t size, type_num_t type_num, int zero_init)
2360 {
2361 /* if OID is NULL just allocate memory */
2362 if (OBJ_OID_IS_NULL(*oidp)) {
2363 /* if size is 0, do nothing */
2364 if (size == 0)
2365 return 0;
2366
2367 return obj_alloc_construct(pop, oidp, size, type_num,
2368 POBJ_FLAG_ZERO, NULL, NULL);
2369 }
2370
2371 if (size > PMEMOBJ_MAX_ALLOC_SIZE) {
2372 ERR("requested size too large");
2373 errno = ENOMEM;
2374 return -1;
2375 }
2376
2377 /* if size is 0, just free the object */
2378 if (size == 0) {
2379 obj_free(pop, oidp);
2380 return 0;
2381 }
2382
2383 struct carg_realloc carg;
2384 carg.ptr = OBJ_OFF_TO_PTR(pop, oidp->off);
2385 carg.new_size = size;
2386 carg.old_size = pmemobj_alloc_usable_size(*oidp);
2387 carg.user_type = type_num;
2388 carg.constructor = NULL;
2389 carg.arg = NULL;
2390 carg.zero_init = zero_init;
2391
2392 struct operation_context *ctx = pmalloc_operation_hold(pop);
2393
2394 int ret = palloc_operation(&pop->heap, oidp->off, &oidp->off,
2395 size, constructor_realloc, &carg, type_num,
2396 0, 0, 0, ctx);
2397
2398 pmalloc_operation_release(pop);
2399
2400 return ret;
2401 }
2402
2403 /*
2404 * constructor_zrealloc_root -- (internal) constructor for pmemobj_root
2405 */
2406 static int
2407 constructor_zrealloc_root(void *ctx, void *ptr, size_t usable_size, void *arg)
2408 {
2409 PMEMobjpool *pop = ctx;
2410 LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);
2411
2412 ASSERTne(ptr, NULL);
2413 ASSERTne(arg, NULL);
2414
2415 VALGRIND_ADD_TO_TX(ptr, usable_size);
2416
2417 struct carg_realloc *carg = arg;
2418
2419 constructor_realloc(pop, ptr, usable_size, arg);
2420 int ret = 0;
2421 if (carg->constructor)
2422 ret = carg->constructor(pop, ptr, carg->arg);
2423
2424 VALGRIND_REMOVE_FROM_TX(ptr, usable_size);
2425
2426 return ret;
2427 }
2428
2429 /*
2430 * pmemobj_realloc -- resizes an existing object
2431 */
2432 int
2433 pmemobj_realloc(PMEMobjpool *pop, PMEMoid *oidp, size_t size,
2434 uint64_t type_num)
2435 {
2436 ASSERTne(oidp, NULL);
2437
2438 LOG(3, "pop %p oid.off 0x%016" PRIx64 " size %zu type_num %" PRIu64,
2439 pop, oidp->off, size, type_num);
2440
2441 PMEMOBJ_API_START();
2442 /* log notice message if used inside a transaction */
2443 _POBJ_DEBUG_NOTICE_IN_TX();
2444 ASSERT(OBJ_OID_IS_VALID(pop, *oidp));
2445
2446 int ret = obj_realloc_common(pop, oidp, size, (type_num_t)type_num, 0);
2447
2448 PMEMOBJ_API_END();
2449 return ret;
2450 }
2451
2452 /*
2453 * pmemobj_zrealloc -- resizes an existing object; any new space is zeroed
2454 */
2455 int
2456 pmemobj_zrealloc(PMEMobjpool *pop, PMEMoid *oidp, size_t size,
2457 uint64_t type_num)
2458 {
2459 ASSERTne(oidp, NULL);
2460
2461 LOG(3, "pop %p oid.off 0x%016" PRIx64 " size %zu type_num %" PRIu64,
2462 pop, oidp->off, size, type_num);
2463
2464 PMEMOBJ_API_START();
2465
2466 /* log notice message if used inside a transaction */
2467 _POBJ_DEBUG_NOTICE_IN_TX();
2468 ASSERT(OBJ_OID_IS_VALID(pop, *oidp));
2469
2470 int ret = obj_realloc_common(pop, oidp, size, (type_num_t)type_num, 1);
2471
2472 PMEMOBJ_API_END();
2473 return ret;
2474 }
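
/*
 * Usage sketch (editorial, not part of the upstream source): growing a
 * persistent buffer in place; existing contents are preserved and, with the
 * zeroing variant, the added range reads as zeros. `old_size` is
 * hypothetical.
 *
 *	if (pmemobj_zrealloc(pop, &oid, old_size * 2, 1) != 0)
 *		abort();
 */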
2475
2476 /* arguments for constructor_strdup */
2477 struct carg_strdup {
2478 size_t size;
2479 const char *s;
2480 };
2481
2482 /*
2483 * constructor_strdup -- (internal) constructor of pmemobj_strdup
2484 */
2485 static int
2486 constructor_strdup(PMEMobjpool *pop, void *ptr, void *arg)
2487 {
2488 LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);
2489
2490 ASSERTne(ptr, NULL);
2491 ASSERTne(arg, NULL);
2492
2493 struct carg_strdup *carg = arg;
2494
2495 /* copy string */
2496 pmemops_memcpy(&pop->p_ops, ptr, carg->s, carg->size, 0);
2497
2498 return 0;
2499 }
2500
2501 /*
2502 * pmemobj_strdup -- allocates a new object with a duplicate of the string s
2503 */
2504 int
2505 pmemobj_strdup(PMEMobjpool *pop, PMEMoid *oidp, const char *s,
2506 uint64_t type_num)
2507 {
2508 LOG(3, "pop %p oidp %p string %s type_num %" PRIu64,
2509 pop, oidp, s, type_num);
2510
2511 /* log notice message if used inside a transaction */
2512 _POBJ_DEBUG_NOTICE_IN_TX();
2513
2514 if (NULL == s) {
2515 errno = EINVAL;
2516 return -1;
2517 }
2518
2519 PMEMOBJ_API_START();
2520 struct carg_strdup carg;
2521 carg.size = (strlen(s) + 1) * sizeof(char);
2522 carg.s = s;
2523
2524 int ret = obj_alloc_construct(pop, oidp, carg.size,
2525 (type_num_t)type_num, 0, constructor_strdup, &carg);
2526
2527 PMEMOBJ_API_END();
2528 return ret;
2529 }
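
/*
 * Usage sketch (editorial, not part of the upstream source): persisting a
 * C string; on success `oid` refers to a NUL-terminated copy of the
 * hypothetical `name` inside the pool.
 *
 *	PMEMoid oid;
 *	if (pmemobj_strdup(pop, &oid, name, 1) != 0)
 *		abort();
 *	char *copy = pmemobj_direct(oid);
 */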
2530
2531 /* arguments for constructor_wcsdup */
2532 struct carg_wcsdup {
2533 size_t size;
2534 const wchar_t *s;
2535 };
2536
2537 /*
2538 * constructor_wcsdup -- (internal) constructor of pmemobj_wcsdup
2539 */
2540 static int
2541 constructor_wcsdup(PMEMobjpool *pop, void *ptr, void *arg)
2542 {
2543 LOG(3, "pop %p ptr %p arg %p", pop, ptr, arg);
2544
2545 ASSERTne(ptr, NULL);
2546 ASSERTne(arg, NULL);
2547
2548 struct carg_wcsdup *carg = arg;
2549
2550 /* copy string */
2551 pmemops_memcpy(&pop->p_ops, ptr, carg->s, carg->size, 0);
2552
2553 return 0;
2554 }
2555
2556 /*
2557 * pmemobj_wcsdup -- allocates a new object with a duplicate of the wide
2558 * character string s
2559 */
2560 int
2561 pmemobj_wcsdup(PMEMobjpool *pop, PMEMoid *oidp, const wchar_t *s,
2562 uint64_t type_num)
2563 {
2564 LOG(3, "pop %p oidp %p string %S type_num %" PRIu64,
2565 pop, oidp, s, type_num);
2566
2567 /* log notice message if used inside a transaction */
2568 _POBJ_DEBUG_NOTICE_IN_TX();
2569
2570 if (NULL == s) {
2571 errno = EINVAL;
2572 return -1;
2573 }
2574
2575 PMEMOBJ_API_START();
2576 struct carg_wcsdup carg;
2577 carg.size = (wcslen(s) + 1) * sizeof(wchar_t);
2578 carg.s = s;
2579
2580 int ret = obj_alloc_construct(pop, oidp, carg.size,
2581 (type_num_t)type_num, 0, constructor_wcsdup, &carg);
2582
2583 PMEMOBJ_API_END();
2584 return ret;
2585 }
2586
2587 /*
2588 * pmemobj_free -- frees an existing object
2589 */
2590 void
2591 pmemobj_free(PMEMoid *oidp)
2592 {
2593 ASSERTne(oidp, NULL);
2594
2595 LOG(3, "oid.off 0x%016" PRIx64, oidp->off);
2596
2597 /* log notice message if used inside a transaction */
2598 _POBJ_DEBUG_NOTICE_IN_TX();
2599
2600 if (oidp->off == 0)
2601 return;
2602
2603 PMEMOBJ_API_START();
2604 PMEMobjpool *pop = pmemobj_pool_by_oid(*oidp);
2605
2606 ASSERTne(pop, NULL);
2607 ASSERT(OBJ_OID_IS_VALID(pop, *oidp));
2608
2609 obj_free(pop, oidp);
2610 PMEMOBJ_API_END();
2611 }
2612
2613 /*
2614 * pmemobj_alloc_usable_size -- returns usable size of object
2615 */
2616 size_t
2617 pmemobj_alloc_usable_size(PMEMoid oid)
2618 {
2619 LOG(3, "oid.off 0x%016" PRIx64, oid.off);
2620
2621 if (oid.off == 0)
2622 return 0;
2623
2624 PMEMobjpool *pop = pmemobj_pool_by_oid(oid);
2625
2626 ASSERTne(pop, NULL);
2627 ASSERT(OBJ_OID_IS_VALID(pop, oid));
2628
2629 return palloc_usable_size(&pop->heap, oid.off);
2630 }
2631
2632 /*
2633 * pmemobj_memcpy_persist -- pmemobj version of memcpy
2634 */
2635 void *
2636 pmemobj_memcpy_persist(PMEMobjpool *pop, void *dest, const void *src,
2637 size_t len)
2638 {
2639 LOG(15, "pop %p dest %p src %p len %zu", pop, dest, src, len);
2640 PMEMOBJ_API_START();
2641
2642 void *ptr = pmemops_memcpy(&pop->p_ops, dest, src, len, 0);
2643
2644 PMEMOBJ_API_END();
2645 return ptr;
2646 }
2647
2648 /*
2649 * pmemobj_memset_persist -- pmemobj version of memset
2650 */
2651 void *
2652 pmemobj_memset_persist(PMEMobjpool *pop, void *dest, int c, size_t len)
2653 {
2654 LOG(15, "pop %p dest %p c 0x%02x len %zu", pop, dest, c, len);
2655 PMEMOBJ_API_START();
2656
2657 void *ptr = pmemops_memset(&pop->p_ops, dest, c, len, 0);
2658
2659 PMEMOBJ_API_END();
2660 return ptr;
2661 }
2662
2663 /*
2664 * pmemobj_memcpy -- pmemobj version of memcpy
2665 */
2666 void *
2667 pmemobj_memcpy(PMEMobjpool *pop, void *dest, const void *src, size_t len,
2668 unsigned flags)
2669 {
2670 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop, dest, src, len,
2671 flags);
2672
2673 PMEMOBJ_API_START();
2674
2675 void *ptr = pmemops_memcpy(&pop->p_ops, dest, src, len, flags);
2676
2677 PMEMOBJ_API_END();
2678 return ptr;
2679 }
2680
2681 /*
2682 * pmemobj_memmove -- pmemobj version of memmove
2683 */
2684 void *
2685 pmemobj_memmove(PMEMobjpool *pop, void *dest, const void *src, size_t len,
2686 unsigned flags)
2687 {
2688 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop, dest, src, len,
2689 flags);
2690
2691 PMEMOBJ_API_START();
2692
2693 void *ptr = pmemops_memmove(&pop->p_ops, dest, src, len, flags);
2694
2695 PMEMOBJ_API_END();
2696 return ptr;
2697 }
2698
2699 /*
2700 * pmemobj_memset -- pmemobj version of memset
2701 */
2702 void *
2703 pmemobj_memset(PMEMobjpool *pop, void *dest, int c, size_t len, unsigned flags)
2704 {
2705 LOG(15, "pop %p dest %p c 0x%02x len %zu flags 0x%x", pop, dest, c, len,
2706 flags);
2707
2708 PMEMOBJ_API_START();
2709
2710 void *ptr = pmemops_memset(&pop->p_ops, dest, c, len, flags);
2711
2712 PMEMOBJ_API_END();
2713 return ptr;
2714 }
2715
2716 /*
2717 * pmemobj_persist -- pmemobj version of pmem_persist
2718 */
2719 void
2720 pmemobj_persist(PMEMobjpool *pop, const void *addr, size_t len)
2721 {
2722 LOG(15, "pop %p addr %p len %zu", pop, addr, len);
2723
2724 pmemops_persist(&pop->p_ops, addr, len);
2725 }
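
/*
 * Usage sketch (editorial, not part of the upstream source): the
 * store-then-persist idiom for a direct, non-transactional update; the
 * write is durable only once pmemobj_persist() (or pmemobj_flush()
 * followed by pmemobj_drain()) returns. `struct rec` is hypothetical.
 *
 *	struct rec *r = pmemobj_direct(oid);
 *	r->val = 7;
 *	pmemobj_persist(pop, &r->val, sizeof(r->val));
 */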
2726
2727 /*
2728 * pmemobj_flush -- pmemobj version of pmem_flush
2729 */
2730 void
2731 pmemobj_flush(PMEMobjpool *pop, const void *addr, size_t len)
2732 {
2733 LOG(15, "pop %p addr %p len %zu", pop, addr, len);
2734
2735 pmemops_flush(&pop->p_ops, addr, len);
2736 }
2737
2738 /*
2739 * pmemobj_xpersist -- pmemobj version of pmem_persist with additional flags
2740 * argument
2741 */
2742 int
2743 pmemobj_xpersist(PMEMobjpool *pop, const void *addr, size_t len, unsigned flags)
2744 {
2745 LOG(15, "pop %p addr %p len %zu flags 0x%x", pop, addr, len, flags);
2746
2747 if (flags & ~OBJ_X_VALID_FLAGS) {
2748 errno = EINVAL;
2749 ERR("invalid flags 0x%x", flags);
2750 return -1;
2751 }
2752
2753 return pmemops_xpersist(&pop->p_ops, addr, len, flags);
2754 }
2755
2756 /*
2757 * pmemobj_xflush -- pmemobj version of pmem_flush with additional flags
2758 * argument
2759 */
2760 int
2761 pmemobj_xflush(PMEMobjpool *pop, const void *addr, size_t len, unsigned flags)
2762 {
2763 LOG(15, "pop %p addr %p len %zu flags 0x%x", pop, addr, len, flags);
2764
2765 if (flags & ~OBJ_X_VALID_FLAGS) {
2766 errno = EINVAL;
2767 ERR("invalid flags 0x%x", flags);
2768 return -1;
2769 }
2770
2771 return pmemops_xflush(&pop->p_ops, addr, len, flags);
2772 }
2773
2774 /*
2775 * pmemobj_drain -- pmemobj version of pmem_drain
2776 */
2777 void
2778 pmemobj_drain(PMEMobjpool *pop)
2779 {
2780 LOG(15, "pop %p", pop);
2781
2782 pmemops_drain(&pop->p_ops);
2783 }
2784
2785 /*
2786 * pmemobj_type_num -- returns type number of object
2787 */
2788 uint64_t
2789 pmemobj_type_num(PMEMoid oid)
2790 {
2791 LOG(3, "oid.off 0x%016" PRIx64, oid.off);
2792
2793 ASSERT(!OID_IS_NULL(oid));
2794
2795 PMEMobjpool *pop = pmemobj_pool_by_oid(oid);
2796
2797 ASSERTne(pop, NULL);
2798 ASSERT(OBJ_OID_IS_VALID(pop, oid));
2799
2800 return palloc_extra(&pop->heap, oid.off);
2801 }
2802
2803 /* arguments for the root object constructor (currently unused) */
2804 struct carg_root {
2805 size_t size;
2806 pmemobj_constr constructor;
2807 void *arg;
2808 };
2809
2810 /*
2811 * obj_alloc_root -- (internal) allocates or resizes the root object
2812 */
2813 static int
2814 obj_alloc_root(PMEMobjpool *pop, size_t size,
2815 pmemobj_constr constructor, void *arg)
2816 {
2817 LOG(3, "pop %p size %zu", pop, size);
2818
2819 struct carg_realloc carg;
2820
2821 carg.ptr = OBJ_OFF_TO_PTR(pop, pop->root_offset);
2822 carg.old_size = pop->root_size;
2823 carg.new_size = size;
2824 carg.user_type = POBJ_ROOT_TYPE_NUM;
2825 carg.constructor = constructor;
2826 carg.zero_init = 1;
2827 carg.arg = arg;
2828
2829 struct operation_context *ctx = pmalloc_operation_hold(pop);
2830
2831 operation_add_entry(ctx, &pop->root_size, size, ULOG_OPERATION_SET);
2832
2833 int ret = palloc_operation(&pop->heap, pop->root_offset,
2834 &pop->root_offset, size,
2835 constructor_zrealloc_root, &carg,
2836 POBJ_ROOT_TYPE_NUM, OBJ_INTERNAL_OBJECT_MASK,
2837 0, 0, ctx);
2838
2839 pmalloc_operation_release(pop);
2840
2841 return ret;
2842 }
2843
2844 /*
2845 * pmemobj_root_size -- returns size of the root object
2846 */
2847 size_t
2848 pmemobj_root_size(PMEMobjpool *pop)
2849 {
2850 LOG(3, "pop %p", pop);
2851
2852 if (pop->root_offset && pop->root_size)
2853 return pop->root_size;
2854 else
2855 return 0;
2856 }
2857
2858 /*
2859 * pmemobj_root_construct -- creates or resizes the root object, then returns it
2860 */
2861 PMEMoid
2862 pmemobj_root_construct(PMEMobjpool *pop, size_t size,
2863 pmemobj_constr constructor, void *arg)
2864 {
2865 LOG(3, "pop %p size %zu constructor %p args %p", pop, size, constructor,
2866 arg);
2867
2868 if (size > PMEMOBJ_MAX_ALLOC_SIZE) {
2869 ERR("requested size too large");
2870 errno = ENOMEM;
2871 return OID_NULL;
2872 }
2873
2874 if (size == 0 && pop->root_offset == 0) {
2875 ERR("requested size cannot equals zero");
2876 errno = EINVAL;
2877 return OID_NULL;
2878 }
2879
2880 PMEMOBJ_API_START();
2881
2882 PMEMoid root;
2883
2884 pmemobj_mutex_lock_nofail(pop, &pop->rootlock);
2885
2886 if (size > pop->root_size &&
2887 obj_alloc_root(pop, size, constructor, arg)) {
2888 pmemobj_mutex_unlock_nofail(pop, &pop->rootlock);
2889 LOG(2, "obj_alloc_root failed");
2890 PMEMOBJ_API_END();
2891 return OID_NULL;
2892 }
2893
2894 root.pool_uuid_lo = pop->uuid_lo;
2895 root.off = pop->root_offset;
2896
2897 pmemobj_mutex_unlock_nofail(pop, &pop->rootlock);
2898
2899 PMEMOBJ_API_END();
2900 return root;
2901 }
2902
2903 /*
2904 * pmemobj_root -- returns the root object, allocating or resizing it as needed
2905 */
2906 PMEMoid
2907 pmemobj_root(PMEMobjpool *pop, size_t size)
2908 {
2909 LOG(3, "pop %p size %zu", pop, size);
2910
2911 PMEMOBJ_API_START();
2912 PMEMoid oid = pmemobj_root_construct(pop, size, NULL, NULL);
2913 PMEMOBJ_API_END();
2914 return oid;
2915 }
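
/*
 * Usage sketch (editorial, not part of the upstream source): the root
 * object is the canonical entry point into a pool; it is created on first
 * use, zeroed, and never freed. `struct my_root` is hypothetical.
 *
 *	struct my_root { PMEMoid head; uint64_t count; };
 *
 *	PMEMoid root = pmemobj_root(pop, sizeof(struct my_root));
 *	if (OID_IS_NULL(root))
 *		abort();
 *	struct my_root *rp = pmemobj_direct(root);
 *
 * Because new root space is zero-initialized, a fresh pool starts with
 * `count == 0` and a NULL `head`.
 */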
2916
2917 /*
2918 * pmemobj_first -- returns the first object in the pool
2919 */
2920 PMEMoid
2921 pmemobj_first(PMEMobjpool *pop)
2922 {
2923 LOG(3, "pop %p", pop);
2924
2925 PMEMoid ret = {0, 0};
2926
2927 uint64_t off = palloc_first(&pop->heap);
2928 if (off != 0) {
2929 ret.off = off;
2930 ret.pool_uuid_lo = pop->uuid_lo;
2931
2932 if (palloc_flags(&pop->heap, off) & OBJ_INTERNAL_OBJECT_MASK) {
2933 return pmemobj_next(ret);
2934 }
2935 }
2936
2937 return ret;
2938 }
2939
2940 /*
2941 * pmemobj_next -- returns the next object in the pool
2942 */
2943 PMEMoid
2944 pmemobj_next(PMEMoid oid)
2945 {
2946 LOG(3, "oid.off 0x%016" PRIx64, oid.off);
2947
2948 PMEMoid curr = oid;
2949 if (curr.off == 0)
2950 return OID_NULL;
2951
2952 PMEMobjpool *pop = pmemobj_pool_by_oid(curr);
2953 ASSERTne(pop, NULL);
2954
2955 do {
2956 ASSERT(OBJ_OID_IS_VALID(pop, curr));
2957 uint64_t next_off = palloc_next(&pop->heap, curr.off);
2958
2959 if (next_off == 0)
2960 return OID_NULL;
2961
2962 /* next object exists */
2963 curr.off = next_off;
2964
2965 } while (palloc_flags(&pop->heap, curr.off) & OBJ_INTERNAL_OBJECT_MASK);
2966
2967 return curr;
2968 }
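
/*
 * Usage sketch (editorial, not part of the upstream source): walking every
 * user object in a pool; internal objects are already filtered out by
 * pmemobj_first()/pmemobj_next(). `process()` is hypothetical, and the
 * POBJ_FOREACH() macro from libpmemobj.h wraps this same loop.
 *
 *	for (PMEMoid o = pmemobj_first(pop); !OID_IS_NULL(o);
 *			o = pmemobj_next(o)) {
 *		uint64_t t = pmemobj_type_num(o);
 *		process(pmemobj_direct(o), t);
 *	}
 */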
2969
2970 /*
2971 * pmemobj_reserve -- reserves a single object
2972 */
2973 PMEMoid
2974 pmemobj_reserve(PMEMobjpool *pop, struct pobj_action *act,
2975 size_t size, uint64_t type_num)
2976 {
2977 LOG(3, "pop %p act %p size %zu type_num %llx",
2978 pop, act, size,
2979 (unsigned long long)type_num);
2980
2981 PMEMOBJ_API_START();
2982 PMEMoid oid = OID_NULL;
2983
2984 if (palloc_reserve(&pop->heap, size, NULL, NULL, type_num,
2985 0, 0, 0, act) != 0) {
2986 PMEMOBJ_API_END();
2987 return oid;
2988 }
2989
2990 oid.off = act->heap.offset;
2991 oid.pool_uuid_lo = pop->uuid_lo;
2992
2993 PMEMOBJ_API_END();
2994 return oid;
2995 }
2996
2997 /*
2998 * pmemobj_xreserve -- reserves a single object, with flags
2999 */
3000 PMEMoid
3001 pmemobj_xreserve(PMEMobjpool *pop, struct pobj_action *act,
3002 size_t size, uint64_t type_num, uint64_t flags)
3003 {
3004 LOG(3, "pop %p act %p size %zu type_num %llx flags %llx",
3005 pop, act, size,
3006 (unsigned long long)type_num, (unsigned long long)flags);
3007
3008 PMEMoid oid = OID_NULL;
3009
3010 if (flags & ~POBJ_ACTION_XRESERVE_VALID_FLAGS) {
3011 ERR("unknown flags 0x%" PRIx64,
3012 flags & ~POBJ_ACTION_XRESERVE_VALID_FLAGS);
3013 errno = EINVAL;
3014 return oid;
3015 }
3016
3017 PMEMOBJ_API_START();
3018 struct constr_args carg;
3019
3020 carg.zero_init = flags & POBJ_FLAG_ZERO;
3021 carg.constructor = NULL;
3022 carg.arg = NULL;
3023
3024 if (palloc_reserve(&pop->heap, size, constructor_alloc, &carg,
3025 type_num, 0, CLASS_ID_FROM_FLAG(flags),
3026 ARENA_ID_FROM_FLAG(flags), act) != 0) {
3027 PMEMOBJ_API_END();
3028 return oid;
3029 }
3030
3031 oid.off = act->heap.offset;
3032 oid.pool_uuid_lo = pop->uuid_lo;
3033
3034 PMEMOBJ_API_END();
3035 return oid;
3036 }
3037
3038 /*
3039 * pmemobj_set_value -- creates an action to set a value
3040 */
3041 void
3042 pmemobj_set_value(PMEMobjpool *pop, struct pobj_action *act,
3043 uint64_t *ptr, uint64_t value)
3044 {
3045 palloc_set_value(&pop->heap, act, ptr, value);
3046 }
3047
3048 /*
3049 * pmemobj_defer_free -- creates a deferred free action
3050 */
3051 void
3052 pmemobj_defer_free(PMEMobjpool *pop, PMEMoid oid, struct pobj_action *act)
3053 {
3054 ASSERT(!OID_IS_NULL(oid));
3055 palloc_defer_free(&pop->heap, oid.off, act);
3056 }
3057
3058 /*
3059 * pmemobj_publish -- publishes a collection of actions
3060 */
3061 int
3062 pmemobj_publish(PMEMobjpool *pop, struct pobj_action *actv, size_t actvcnt)
3063 {
3064 PMEMOBJ_API_START();
3065 struct operation_context *ctx = pmalloc_operation_hold(pop);
3066
3067 size_t entries_size = actvcnt * sizeof(struct ulog_entry_val);
3068
3069 if (operation_reserve(ctx, entries_size) != 0) {
3070 PMEMOBJ_API_END();
3071 return -1;
3072 }
3073
3074 palloc_publish(&pop->heap, actv, actvcnt, ctx);
3075
3076 pmalloc_operation_release(pop);
3077
3078 PMEMOBJ_API_END();
3079 return 0;
3080 }
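
/*
 * Usage sketch (editorial, not part of the upstream source): the
 * reserve/publish idiom makes a group of actions visible atomically without
 * a full transaction. `struct rec` and `rootp->head_off` (a uint64_t offset
 * stored in the pool) are hypothetical.
 *
 *	struct pobj_action act[2];
 *	PMEMoid node = pmemobj_reserve(pop, &act[0], sizeof(struct rec), 1);
 *	if (OID_IS_NULL(node))
 *		abort();
 *	... initialize and persist the new object via pmemobj_direct(node) ...
 *	pmemobj_set_value(pop, &act[1], &rootp->head_off, node.off);
 *	if (pmemobj_publish(pop, act, 2) != 0)
 *		pmemobj_cancel(pop, act, 2);
 *
 * Until pmemobj_publish() succeeds the reservation has no permanent effect,
 * and it can always be dropped with pmemobj_cancel().
 */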
3081
3082 /*
3083 * pmemobj_cancel -- cancels collection of actions
3084 */
3085 void
3086 pmemobj_cancel(PMEMobjpool *pop, struct pobj_action *actv, size_t actvcnt)
3087 {
3088 PMEMOBJ_API_START();
3089 palloc_cancel(&pop->heap, actv, actvcnt);
3090 PMEMOBJ_API_END();
3091 }
3092
3093 /*
3094 * pmemobj_defrag -- reallocates the provided PMEMoids so that their
3095 * underlying memory is efficiently arranged
3096 */
3097 int
3098 pmemobj_defrag(PMEMobjpool *pop, PMEMoid **oidv, size_t oidcnt,
3099 struct pobj_defrag_result *result)
3100 {
3101 PMEMOBJ_API_START();
3102
3103 if (result) {
3104 result->relocated = 0;
3105 result->total = 0;
3106 }
3107
3108 int ret = 0;
3109 uint64_t **objv = Malloc(sizeof(uint64_t *) * oidcnt);
3110 if (objv == NULL) {
3111 ret = -1;
3112 goto out;
3113 }
3114 size_t j = 0;
3115 for (size_t i = 0; i < oidcnt; ++i) {
3116 if (OID_IS_NULL(*oidv[i]))
3117 continue;
3118 if (oidv[i]->pool_uuid_lo != pop->uuid_lo) {
3119 ret = -1;
3120 ERR("Not all PMEMoids belong to the provided pool");
3121 goto out;
3122 }
3123 objv[j++] = &oidv[i]->off;
3124 }
3125
3126 struct operation_context *ctx = pmalloc_operation_hold(pop);
3127
3128 ret = palloc_defrag(&pop->heap, objv, j, ctx, result);
3129
3130 pmalloc_operation_release(pop);
3131
3132 out:
3133 Free(objv);
3134
3135 PMEMOBJ_API_END();
3136 return ret;
3137 }
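
/*
 * Usage sketch (editorial, not part of the upstream source): defragmenting
 * a set of objects. The caller passes pointers to its own PMEMoids so that
 * relocated objects can be patched in place; `oids` is hypothetical.
 *
 *	PMEMoid *ptrs[3] = { &oids[0], &oids[1], &oids[2] };
 *	struct pobj_defrag_result res;
 *	if (pmemobj_defrag(pop, ptrs, 3, &res) == 0)
 *		printf("relocated %zu of %zu\n", res.relocated, res.total);
 */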
3138
3139 /*
3140 * pmemobj_list_insert -- adds object to a list
3141 */
3142 int
3143 pmemobj_list_insert(PMEMobjpool *pop, size_t pe_offset, void *head,
3144 PMEMoid dest, int before, PMEMoid oid)
3145 {
3146 LOG(3, "pop %p pe_offset %zu head %p dest.off 0x%016" PRIx64
3147 " before %d oid.off 0x%016" PRIx64,
3148 pop, pe_offset, head, dest.off, before, oid.off);
3149 PMEMOBJ_API_START();
3150
3151 /* log notice message if used inside a transaction */
3152 _POBJ_DEBUG_NOTICE_IN_TX();
3153 ASSERT(OBJ_OID_IS_VALID(pop, oid));
3154 ASSERT(OBJ_OID_IS_VALID(pop, dest));
3155
3156 ASSERT(pe_offset <= pmemobj_alloc_usable_size(dest)
3157 - sizeof(struct list_entry));
3158 ASSERT(pe_offset <= pmemobj_alloc_usable_size(oid)
3159 - sizeof(struct list_entry));
3160
3161 int ret = list_insert(pop, (ssize_t)pe_offset, head, dest, before, oid);
3162
3163 PMEMOBJ_API_END();
3164 return ret;
3165 }
3166
3167 /*
3168 * pmemobj_list_insert_new -- adds new object to a list
3169 */
3170 PMEMoid
3171 pmemobj_list_insert_new(PMEMobjpool *pop, size_t pe_offset, void *head,
3172 PMEMoid dest, int before, size_t size,
3173 uint64_t type_num,
3174 pmemobj_constr constructor, void *arg)
3175 {
3176 LOG(3, "pop %p pe_offset %zu head %p dest.off 0x%016" PRIx64
3177 " before %d size %zu type_num %" PRIu64,
3178 pop, pe_offset, head, dest.off, before, size, type_num);
3179
3180 /* log notice message if used inside a transaction */
3181 _POBJ_DEBUG_NOTICE_IN_TX();
3182 ASSERT(OBJ_OID_IS_VALID(pop, dest));
3183
3184 ASSERT(pe_offset <= pmemobj_alloc_usable_size(dest)
3185 - sizeof(struct list_entry));
3186 ASSERT(pe_offset <= size - sizeof(struct list_entry));
3187
3188 if (size > PMEMOBJ_MAX_ALLOC_SIZE) {
3189 ERR("requested size too large");
3190 errno = ENOMEM;
3191 return OID_NULL;
3192 }
3193
3194 PMEMOBJ_API_START();
3195 struct constr_args carg;
3196
3197 carg.constructor = constructor;
3198 carg.arg = arg;
3199 carg.zero_init = 0;
3200
3201 PMEMoid retoid = OID_NULL;
3202 list_insert_new_user(pop, pe_offset, head, dest, before, size, type_num,
3203 constructor_alloc, &carg, &retoid);
3204
3205 PMEMOBJ_API_END();
3206 return retoid;
3207 }
3208
3209 /*
3210 * pmemobj_list_remove -- removes object from a list
3211 */
3212 int
3213 pmemobj_list_remove(PMEMobjpool *pop, size_t pe_offset, void *head,
3214 PMEMoid oid, int free)
3215 {
3216 LOG(3, "pop %p pe_offset %zu head %p oid.off 0x%016" PRIx64 " free %d",
3217 pop, pe_offset, head, oid.off, free);
3218 PMEMOBJ_API_START();
3219
3220 /* log notice message if used inside a transaction */
3221 _POBJ_DEBUG_NOTICE_IN_TX();
3222 ASSERT(OBJ_OID_IS_VALID(pop, oid));
3223
3224 ASSERT(pe_offset <= pmemobj_alloc_usable_size(oid)
3225 - sizeof(struct list_entry));
3226
3227 int ret;
3228 if (free)
3229 ret = list_remove_free_user(pop, pe_offset, head, &oid);
3230 else
3231 ret = list_remove(pop, (ssize_t)pe_offset, head, oid);
3232
3233 PMEMOBJ_API_END();
3234 return ret;
3235 }
3236
3237 /*
3238 * pmemobj_list_move -- moves object between lists
3239 */
3240 int
3241 pmemobj_list_move(PMEMobjpool *pop, size_t pe_old_offset, void *head_old,
3242 size_t pe_new_offset, void *head_new,
3243 PMEMoid dest, int before, PMEMoid oid)
3244 {
3245 LOG(3, "pop %p pe_old_offset %zu pe_new_offset %zu"
3246 " head_old %p head_new %p dest.off 0x%016" PRIx64
3247 " before %d oid.off 0x%016" PRIx64 "",
3248 pop, pe_old_offset, pe_new_offset,
3249 head_old, head_new, dest.off, before, oid.off);
3250 PMEMOBJ_API_START();
3251
3252 /* log notice message if used inside a transaction */
3253 _POBJ_DEBUG_NOTICE_IN_TX();
3254
3255 ASSERT(OBJ_OID_IS_VALID(pop, oid));
3256 ASSERT(OBJ_OID_IS_VALID(pop, dest));
3257
3258 ASSERT(pe_old_offset <= pmemobj_alloc_usable_size(oid)
3259 - sizeof(struct list_entry));
3260 ASSERT(pe_new_offset <= pmemobj_alloc_usable_size(oid)
3261 - sizeof(struct list_entry));
3262 ASSERT(pe_old_offset <= pmemobj_alloc_usable_size(dest)
3263 - sizeof(struct list_entry));
3264 ASSERT(pe_new_offset <= pmemobj_alloc_usable_size(dest)
3265 - sizeof(struct list_entry));
3266
3267 int ret = list_move(pop, pe_old_offset, head_old,
3268 pe_new_offset, head_new,
3269 dest, before, oid);
3270
3271 PMEMOBJ_API_END();
3272 return ret;
3273 }
3274
3275 /*
3276 * pmemobj_ctl_getU -- programmatically executes a read ctl query
3277 */
3278 #ifndef _WIN32
3279 static inline
3280 #endif
3281 int
3282 pmemobj_ctl_getU(PMEMobjpool *pop, const char *name, void *arg)
3283 {
3284 LOG(3, "pop %p name %s arg %p", pop, name, arg);
3285 return ctl_query(pop == NULL ? NULL : pop->ctl, pop,
3286 CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_READ, arg);
3287 }
3288
3289 /*
3290 * pmemobj_ctl_setU -- programmatically executes a write ctl query
3291 */
3292 #ifndef _WIN32
3293 static inline
3294 #endif
3295 int
3296 pmemobj_ctl_setU(PMEMobjpool *pop, const char *name, void *arg)
3297 {
3298 LOG(3, "pop %p name %s arg %p", pop, name, arg);
3299 return ctl_query(pop == NULL ? NULL : pop->ctl, pop,
3300 CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_WRITE, arg);
3301 }
3302
3303 /*
3304 * pmemobj_ctl_execU -- programmatically executes a runnable ctl query
3305 */
3306 #ifndef _WIN32
3307 static inline
3308 #endif
3309 int
3310 pmemobj_ctl_execU(PMEMobjpool *pop, const char *name, void *arg)
3311 {
3312 LOG(3, "pop %p name %s arg %p", pop, name, arg);
3313 return ctl_query(pop == NULL ? NULL : pop->ctl, pop,
3314 CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_RUNNABLE, arg);
3315 }
3316
3317 #ifndef _WIN32
3318 /*
3319 * pmemobj_ctl_get -- programmatically executes a read ctl query
3320 */
3321 int
3322 pmemobj_ctl_get(PMEMobjpool *pop, const char *name, void *arg)
3323 {
3324 return pmemobj_ctl_getU(pop, name, arg);
3325 }
3326
3327 /*
3328 * pmemobj_ctl_set -- programmatically executes a write ctl query
3329 */
3330 int
3331 pmemobj_ctl_set(PMEMobjpool *pop, const char *name, void *arg)
3332 {
3333 PMEMOBJ_API_START();
3334
3335 int ret = pmemobj_ctl_setU(pop, name, arg);
3336
3337 PMEMOBJ_API_END();
3338 return ret;
3339 }
3340
3341 /*
3342 * pmemobj_ctl_exec -- programmatically executes a runnable ctl query
3343 */
3344 int
3345 pmemobj_ctl_exec(PMEMobjpool *pop, const char *name, void *arg)
3346 {
3347 PMEMOBJ_API_START();
3348
3349 int ret = pmemobj_ctl_execU(pop, name, arg);
3350
3351 PMEMOBJ_API_END();
3352 return ret;
3353 }
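
/*
 * Usage sketch (editorial, not part of the upstream source): reading and
 * writing a ctl entry. "prefault.at_open" is an existing global entry, so a
 * NULL pool handle selects the pool-independent namespace.
 *
 *	int enabled;
 *	if (pmemobj_ctl_get(NULL, "prefault.at_open", &enabled) == 0 &&
 *			!enabled) {
 *		enabled = 1;
 *		pmemobj_ctl_set(NULL, "prefault.at_open", &enabled);
 *	}
 */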
3354 #else
3355 /*
3356 * pmemobj_ctl_getW -- programmatically executes a read ctl query
3357 */
3358 int
3359 pmemobj_ctl_getW(PMEMobjpool *pop, const wchar_t *name, void *arg)
3360 {
3361 char *uname = util_toUTF8(name);
3362 if (uname == NULL)
3363 return -1;
3364
3365 int ret = pmemobj_ctl_getU(pop, uname, arg);
3366 util_free_UTF8(uname);
3367
3368 return ret;
3369 }
3370
3371 /*
3372 * pmemobj_ctl_setW -- programmatically executes a write ctl query
3373 */
3374 int
3375 pmemobj_ctl_setW(PMEMobjpool *pop, const wchar_t *name, void *arg)
3376 {
3377 char *uname = util_toUTF8(name);
3378 if (uname == NULL)
3379 return -1;
3380
3381 int ret = pmemobj_ctl_setU(pop, uname, arg);
3382 util_free_UTF8(uname);
3383
3384 return ret;
3385 }
3386
3387 /*
3388 * pmemobj_ctl_execW -- programmatically executes a runnable ctl query
3389 */
3390 int
3391 pmemobj_ctl_execW(PMEMobjpool *pop, const wchar_t *name, void *arg)
3392 {
3393 char *uname = util_toUTF8(name);
3394 if (uname == NULL)
3395 return -1;
3396
3397 int ret = pmemobj_ctl_execU(pop, uname, arg);
3398 util_free_UTF8(uname);
3399
3400 return ret;
3401 }
3402 #endif
3403
3404 /*
3405 * _pobj_debug_notice -- logs notice message if used inside a transaction
3406 */
3407 void
3408 _pobj_debug_notice(const char *api_name, const char *file, int line)
3409 {
3410 #ifdef DEBUG
3411 if (pmemobj_tx_stage() != TX_STAGE_NONE) {
3412 if (file)
3413 LOG(4, "Notice: non-transactional API"
3414 " used inside a transaction (%s in %s:%d)",
3415 api_name, file, line);
3416 else
3417 LOG(4, "Notice: non-transactional API"
3418 " used inside a transaction (%s)", api_name);
3419 }
3420 #endif /* DEBUG */
3421 }
3422
3423 #if VG_PMEMCHECK_ENABLED
3424 /*
3425 * pobj_emit_log -- logs library and function names to pmemcheck store log
3426 */
3427 void
3428 pobj_emit_log(const char *func, int order)
3429 {
3430 util_emit_log("libpmemobj", func, order);
3431 }
3432 #endif
3433
3434 #if FAULT_INJECTION
3435 void
3436 pmemobj_inject_fault_at(enum pmem_allocation_type type, int nth,
3437 const char *at)
3438 {
3439 core_inject_fault_at(type, nth, at);
3440 }
3441
3442 int
3443 pmemobj_fault_injection_enabled(void)
3444 {
3445 return core_fault_injection_enabled();
3446 }
3447 #endif