1 // SPDX-License-Identifier: BSD-3-Clause
2 /* Copyright 2014-2020, Intel Corporation */
5 * obj.c -- transactional object store implementation
12 #include "valgrind_internal.h"
19 #include "ctl_global.h"
22 #include "heap_layout.h"
24 #include "os_thread.h"
32 * The variable from which the config is directly loaded. The string
33 * cannot contain any comments or extraneous white characters.
35 #define OBJ_CONFIG_ENV_VARIABLE "PMEMOBJ_CONF"
38 * The variable that points to a config file from which the config is loaded.
40 #define OBJ_CONFIG_FILE_ENV_VARIABLE "PMEMOBJ_CONF_FILE"
43 * The variable which overwrites a number of lanes available at runtime.
45 #define OBJ_NLANES_ENV_VARIABLE "PMEMOBJ_NLANES"
47 #define OBJ_X_VALID_FLAGS PMEMOBJ_F_RELAXED
49 static const struct pool_attr Obj_create_attr
= {
52 OBJ_FORMAT_FEAT_DEFAULT
,
53 {0}, {0}, {0}, {0}, {0}
56 static const struct pool_attr Obj_open_attr
= {
59 OBJ_FORMAT_FEAT_CHECK
,
60 {0}, {0}, {0}, {0}, {0}
/* hash table used for searching pools by UUID */
static struct critnib *pools_ht;
/* tree used for searching pools by address */
static struct critnib *pools_tree;

/*
 * Global cache-invalidation counter; bumped externally to force every
 * thread's _pobj_pcache to re-resolve its cached pool pointer.
 */
int _pobj_cache_invalidate;
70 __thread
struct _pobj_pcache _pobj_cached_pool
;
73 * pmemobj_direct -- returns the direct pointer of an object
76 pmemobj_direct(PMEMoid oid
)
78 return pmemobj_direct_inline(oid
);
#ifdef _WIN32

/*
 * XXX - this is a temporary implementation
 *
 * Seems like we could still use TLS and simply substitute "__thread" with
 * "__declspec(thread)", however it's not clear if it would work correctly
 * with Windows DLLs, so we use the os_tls_* API instead.
 * Need to verify that once we have the multi-threaded tests ported.
 */

static os_once_t Cached_pool_key_once = OS_ONCE_INIT;
static os_tls_key_t Cached_pool_key;

/*
 * _Cached_pool_key_alloc -- (internal) allocate pool cache pthread key
 *
 * The per-thread cache is freed with free() when the thread exits.
 */
static void
_Cached_pool_key_alloc(void)
{
	int pth_ret = os_tls_key_create(&Cached_pool_key, free);
	if (pth_ret)
		FATAL("!os_tls_key_create");
}

/*
 * pmemobj_direct -- returns the direct pointer of an object
 *
 * TLS-based variant: caches the last (pool, uuid) resolution per thread
 * and re-resolves when the global invalidation counter changes or a
 * different pool uuid is requested.
 */
void *
pmemobj_direct(PMEMoid oid)
{
	if (oid.off == 0 || oid.pool_uuid_lo == 0)
		return NULL;

	struct _pobj_pcache *pcache = os_tls_get(Cached_pool_key);
	if (pcache == NULL) {
		pcache = calloc(sizeof(struct _pobj_pcache), 1);
		if (pcache == NULL)
			FATAL("!pcache malloc");
		int ret = os_tls_set(Cached_pool_key, pcache);
		if (ret)
			FATAL("!os_tls_set");
	}

	if (_pobj_cache_invalidate != pcache->invalidate ||
	    pcache->uuid_lo != oid.pool_uuid_lo) {
		pcache->invalidate = _pobj_cache_invalidate;

		if ((pcache->pop = pmemobj_pool_by_oid(oid)) == NULL) {
			/* pool not found - keep the cache in a miss state */
			pcache->uuid_lo = 0;
			return NULL;
		}

		pcache->uuid_lo = oid.pool_uuid_lo;
	}

	return (void *)((uintptr_t)pcache->pop + oid.off);
}

#endif /* _WIN32 */
149 * obj_ctl_init_and_load -- (static) initializes CTL and loads configuration
150 * from env variable and file
153 obj_ctl_init_and_load(PMEMobjpool
*pop
)
155 LOG(3, "pop %p", pop
);
157 if (pop
!= NULL
&& (pop
->ctl
= ctl_new()) == NULL
) {
163 tx_ctl_register(pop
);
164 pmalloc_ctl_register(pop
);
165 stats_ctl_register(pop
);
166 debug_ctl_register(pop
);
169 char *env_config
= os_getenv(OBJ_CONFIG_ENV_VARIABLE
);
170 if (env_config
!= NULL
) {
171 if (ctl_load_config_from_string(pop
? pop
->ctl
: NULL
,
172 pop
, env_config
) != 0) {
173 LOG(2, "unable to parse config stored in %s "
174 "environment variable",
175 OBJ_CONFIG_ENV_VARIABLE
);
180 char *env_config_file
= os_getenv(OBJ_CONFIG_FILE_ENV_VARIABLE
);
181 if (env_config_file
!= NULL
&& env_config_file
[0] != '\0') {
182 if (ctl_load_config_from_file(pop
? pop
->ctl
: NULL
,
183 pop
, env_config_file
) != 0) {
184 LOG(2, "unable to parse config stored in %s "
185 "file (from %s environment variable)",
187 OBJ_CONFIG_FILE_ENV_VARIABLE
);
195 ctl_delete(pop
->ctl
);
200 * obj_pool_init -- (internal) allocate global structs holding all opened pools
202 * This is invoked on a first call to pmemobj_open() or pmemobj_create().
203 * Memory is released in library destructor.
205 * This function needs to be threadsafe.
214 if (pools_ht
== NULL
) {
217 FATAL("!critnib_new for pools_ht");
218 if (!util_bool_compare_and_swap64(&pools_ht
, NULL
, c
))
222 if (pools_tree
== NULL
) {
225 FATAL("!critnib_new for pools_tree");
226 if (!util_bool_compare_and_swap64(&pools_tree
, NULL
, c
))
232 * pmemobj_oid -- return a PMEMoid based on the virtual address
234 * If the address does not belong to any pool OID_NULL is returned.
237 pmemobj_oid(const void *addr
)
239 PMEMobjpool
*pop
= pmemobj_pool_by_ptr(addr
);
243 PMEMoid oid
= {pop
->uuid_lo
, (uintptr_t)addr
- (uintptr_t)pop
};
248 * obj_init -- initialization of obj
250 * Called by constructor.
257 COMPILE_ERROR_ON(sizeof(struct pmemobjpool
) !=
258 POOL_HDR_SIZE
+ POOL_DESC_SIZE
);
260 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_NODRAIN
!= PMEM_F_MEM_NODRAIN
);
262 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_NONTEMPORAL
!= PMEM_F_MEM_NONTEMPORAL
);
263 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_TEMPORAL
!= PMEM_F_MEM_TEMPORAL
);
265 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_WC
!= PMEM_F_MEM_WC
);
266 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_WB
!= PMEM_F_MEM_WB
);
268 COMPILE_ERROR_ON(PMEMOBJ_F_MEM_NOFLUSH
!= PMEM_F_MEM_NOFLUSH
);
271 /* XXX - temporary implementation (see above) */
272 os_once(&Cached_pool_key_once
, _Cached_pool_key_alloc
);
275 * Load global config, ignore any issues. They will be caught on the
276 * subsequent call to this function for individual pools.
278 ctl_global_register();
280 if (obj_ctl_init_and_load(NULL
))
281 FATAL("error: %s", pmemobj_errormsg());
289 * obj_fini -- cleanup of obj
291 * Called by destructor.
299 critnib_delete(pools_ht
);
301 critnib_delete(pools_tree
);
306 (void) os_tls_key_delete(Cached_pool_key
);
/*
 * obj_drain_empty -- (internal) empty function for drain on non-pmem memory
 *
 * msync-based persistence needs no store drain, so this is a no-op.
 */
static void
obj_drain_empty(void)
{
	/* do nothing */
}
/*
 * obj_msync_nofail -- (internal) pmem_msync wrapper that never fails from
 * caller's perspective
 *
 * A failed msync leaves persistence guarantees broken, so it aborts.
 */
static void
obj_msync_nofail(const void *addr, size_t size)
{
	if (pmem_msync(addr, size))
		FATAL("!pmem_msync");
}
331 * obj_nopmem_memcpy -- (internal) memcpy followed by an msync
334 obj_nopmem_memcpy(void *dest
, const void *src
, size_t len
, unsigned flags
)
336 LOG(15, "dest %p src %p len %zu flags 0x%x", dest
, src
, len
, flags
);
339 * Use pmem_memcpy instead of memcpy, because pmemobj_memcpy is supposed
340 * to guarantee that multiple of 8 byte stores to 8 byte aligned
341 * addresses are fail safe atomic. pmem_memcpy guarantees that, while
342 * libc memcpy does not.
344 pmem_memcpy(dest
, src
, len
, PMEM_F_MEM_NOFLUSH
);
345 obj_msync_nofail(dest
, len
);
350 * obj_nopmem_memmove -- (internal) memmove followed by an msync
353 obj_nopmem_memmove(void *dest
, const void *src
, size_t len
, unsigned flags
)
355 LOG(15, "dest %p src %p len %zu flags 0x%x", dest
, src
, len
, flags
);
357 /* see comment in obj_nopmem_memcpy */
358 pmem_memmove(dest
, src
, len
, PMEM_F_MEM_NOFLUSH
);
359 obj_msync_nofail(dest
, len
);
364 * obj_nopmem_memset -- (internal) memset followed by an msync
367 obj_nopmem_memset(void *dest
, int c
, size_t len
, unsigned flags
)
369 LOG(15, "dest %p c 0x%02x len %zu flags 0x%x", dest
, c
, len
, flags
);
371 /* see comment in obj_nopmem_memcpy */
372 pmem_memset(dest
, c
, len
, PMEM_F_MEM_NOFLUSH
);
373 obj_msync_nofail(dest
, len
);
378 * obj_remote_persist -- (internal) remote persist function
381 obj_remote_persist(PMEMobjpool
*pop
, const void *addr
, size_t len
,
382 unsigned lane
, unsigned flags
)
384 LOG(15, "pop %p addr %p len %zu lane %u flags %u",
385 pop
, addr
, len
, lane
, flags
);
387 ASSERTne(pop
->rpp
, NULL
);
389 uintptr_t offset
= (uintptr_t)addr
- pop
->remote_base
;
391 unsigned rpmem_flags
= 0;
392 if (flags
& PMEMOBJ_F_RELAXED
)
393 rpmem_flags
|= RPMEM_PERSIST_RELAXED
;
395 int rv
= Rpmem_persist(pop
->rpp
, offset
, len
, lane
, rpmem_flags
);
397 ERR("!rpmem_persist(rpp %p offset %zu length %zu lane %u)"
398 " FATAL ERROR (returned value %i)",
399 pop
->rpp
, offset
, len
, lane
, rv
);
407 * XXX - Consider removing obj_norep_*() wrappers to call *_local()
408 * functions directly. Alternatively, always use obj_rep_*(), even
409 * if there are no replicas. Verify the performance penalty.
413 * obj_norep_memcpy -- (internal) memcpy w/o replication
416 obj_norep_memcpy(void *ctx
, void *dest
, const void *src
, size_t len
,
419 PMEMobjpool
*pop
= ctx
;
420 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop
, dest
, src
, len
,
423 return pop
->memcpy_local(dest
, src
, len
,
424 flags
& PMEM_F_MEM_VALID_FLAGS
);
428 * obj_norep_memmove -- (internal) memmove w/o replication
431 obj_norep_memmove(void *ctx
, void *dest
, const void *src
, size_t len
,
434 PMEMobjpool
*pop
= ctx
;
435 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop
, dest
, src
, len
,
438 return pop
->memmove_local(dest
, src
, len
,
439 flags
& PMEM_F_MEM_VALID_FLAGS
);
443 * obj_norep_memset -- (internal) memset w/o replication
446 obj_norep_memset(void *ctx
, void *dest
, int c
, size_t len
, unsigned flags
)
448 PMEMobjpool
*pop
= ctx
;
449 LOG(15, "pop %p dest %p c 0x%02x len %zu flags 0x%x", pop
, dest
, c
, len
,
452 return pop
->memset_local(dest
, c
, len
, flags
& PMEM_F_MEM_VALID_FLAGS
);
456 * obj_norep_persist -- (internal) persist w/o replication
459 obj_norep_persist(void *ctx
, const void *addr
, size_t len
, unsigned flags
)
461 PMEMobjpool
*pop
= ctx
;
462 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
464 pop
->persist_local(addr
, len
);
470 * obj_norep_flush -- (internal) flush w/o replication
473 obj_norep_flush(void *ctx
, const void *addr
, size_t len
, unsigned flags
)
475 PMEMobjpool
*pop
= ctx
;
476 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
478 pop
->flush_local(addr
, len
);
484 * obj_norep_drain -- (internal) drain w/o replication
487 obj_norep_drain(void *ctx
)
489 PMEMobjpool
*pop
= ctx
;
490 LOG(15, "pop %p", pop
);
495 static void obj_pool_cleanup(PMEMobjpool
*pop
);
498 * obj_handle_remote_persist_error -- (internal) handle remote persist
502 obj_handle_remote_persist_error(PMEMobjpool
*pop
)
504 LOG(1, "pop %p", pop
);
506 ERR("error clean up...");
507 obj_pool_cleanup(pop
);
509 FATAL("Fatal error of remote persist. Aborting...");
513 * obj_rep_memcpy -- (internal) memcpy with replication
516 obj_rep_memcpy(void *ctx
, void *dest
, const void *src
, size_t len
,
519 PMEMobjpool
*pop
= ctx
;
520 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop
, dest
, src
, len
,
523 unsigned lane
= UINT_MAX
;
525 if (pop
->has_remote_replicas
)
526 lane
= lane_hold(pop
, NULL
);
528 void *ret
= pop
->memcpy_local(dest
, src
, len
, flags
);
530 PMEMobjpool
*rep
= pop
->replica
;
532 void *rdest
= (char *)rep
+ (uintptr_t)dest
- (uintptr_t)pop
;
533 if (rep
->rpp
== NULL
) {
534 rep
->memcpy_local(rdest
, src
, len
,
535 flags
& PMEM_F_MEM_VALID_FLAGS
);
537 if (rep
->persist_remote(rep
, rdest
, len
, lane
, flags
))
538 obj_handle_remote_persist_error(pop
);
543 if (pop
->has_remote_replicas
)
550 * obj_rep_memmove -- (internal) memmove with replication
553 obj_rep_memmove(void *ctx
, void *dest
, const void *src
, size_t len
,
556 PMEMobjpool
*pop
= ctx
;
557 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop
, dest
, src
, len
,
560 unsigned lane
= UINT_MAX
;
562 if (pop
->has_remote_replicas
)
563 lane
= lane_hold(pop
, NULL
);
565 void *ret
= pop
->memmove_local(dest
, src
, len
, flags
);
567 PMEMobjpool
*rep
= pop
->replica
;
569 void *rdest
= (char *)rep
+ (uintptr_t)dest
- (uintptr_t)pop
;
570 if (rep
->rpp
== NULL
) {
571 rep
->memmove_local(rdest
, src
, len
,
572 flags
& PMEM_F_MEM_VALID_FLAGS
);
574 if (rep
->persist_remote(rep
, rdest
, len
, lane
, flags
))
575 obj_handle_remote_persist_error(pop
);
580 if (pop
->has_remote_replicas
)
587 * obj_rep_memset -- (internal) memset with replication
590 obj_rep_memset(void *ctx
, void *dest
, int c
, size_t len
, unsigned flags
)
592 PMEMobjpool
*pop
= ctx
;
593 LOG(15, "pop %p dest %p c 0x%02x len %zu flags 0x%x", pop
, dest
, c
, len
,
596 unsigned lane
= UINT_MAX
;
598 if (pop
->has_remote_replicas
)
599 lane
= lane_hold(pop
, NULL
);
601 void *ret
= pop
->memset_local(dest
, c
, len
, flags
);
603 PMEMobjpool
*rep
= pop
->replica
;
605 void *rdest
= (char *)rep
+ (uintptr_t)dest
- (uintptr_t)pop
;
606 if (rep
->rpp
== NULL
) {
607 rep
->memset_local(rdest
, c
, len
,
608 flags
& PMEM_F_MEM_VALID_FLAGS
);
610 if (rep
->persist_remote(rep
, rdest
, len
, lane
, flags
))
611 obj_handle_remote_persist_error(pop
);
616 if (pop
->has_remote_replicas
)
623 * obj_rep_persist -- (internal) persist with replication
626 obj_rep_persist(void *ctx
, const void *addr
, size_t len
, unsigned flags
)
628 PMEMobjpool
*pop
= ctx
;
629 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
631 unsigned lane
= UINT_MAX
;
633 if (pop
->has_remote_replicas
)
634 lane
= lane_hold(pop
, NULL
);
636 pop
->persist_local(addr
, len
);
638 PMEMobjpool
*rep
= pop
->replica
;
640 void *raddr
= (char *)rep
+ (uintptr_t)addr
- (uintptr_t)pop
;
641 if (rep
->rpp
== NULL
) {
642 rep
->memcpy_local(raddr
, addr
, len
, 0);
644 if (rep
->persist_remote(rep
, raddr
, len
, lane
, flags
))
645 obj_handle_remote_persist_error(pop
);
650 if (pop
->has_remote_replicas
)
657 * obj_rep_flush -- (internal) flush with replication
660 obj_rep_flush(void *ctx
, const void *addr
, size_t len
, unsigned flags
)
662 PMEMobjpool
*pop
= ctx
;
663 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
665 unsigned lane
= UINT_MAX
;
667 if (pop
->has_remote_replicas
)
668 lane
= lane_hold(pop
, NULL
);
670 pop
->flush_local(addr
, len
);
672 PMEMobjpool
*rep
= pop
->replica
;
674 void *raddr
= (char *)rep
+ (uintptr_t)addr
- (uintptr_t)pop
;
675 if (rep
->rpp
== NULL
) {
676 rep
->memcpy_local(raddr
, addr
, len
,
679 if (rep
->persist_remote(rep
, raddr
, len
, lane
, flags
))
680 obj_handle_remote_persist_error(pop
);
685 if (pop
->has_remote_replicas
)
692 * obj_rep_drain -- (internal) drain with replication
695 obj_rep_drain(void *ctx
)
697 PMEMobjpool
*pop
= ctx
;
698 LOG(15, "pop %p", pop
);
702 PMEMobjpool
*rep
= pop
->replica
;
704 if (rep
->rpp
== NULL
)
#if VG_MEMCHECK_ENABLED

/*
 * Arbitrary value. When there's more undefined regions than MAX_UNDEFS, it's
 * not worth reporting everything - developer should fix the code.
 */
#define MAX_UNDEFS 1000

/*
 * obj_vg_check_no_undef -- (internal) check whether there are any undefined
 * regions in the pool
 *
 * Scans the entire mapped pool, collecting up to MAX_UNDEFS undefined
 * ranges, then reports them via VALGRIND_PRINTF and triggers a single
 * memcheck error so the run is flagged.
 */
static void
obj_vg_check_no_undef(struct pmemobjpool *pop)
{
	LOG(4, "pop %p", pop);

	struct {
		void *start, *end;
	} undefs[MAX_UNDEFS];
	int num_undefs = 0;

	VALGRIND_DO_DISABLE_ERROR_REPORTING;
	char *addr_start = pop->addr;
	char *addr_end = addr_start + pop->set->poolsize;

	while (addr_start < addr_end) {
		char *noaccess = (char *)VALGRIND_CHECK_MEM_IS_ADDRESSABLE(
					addr_start, addr_end - addr_start);
		if (noaccess == NULL)
			noaccess = addr_end;

		while (addr_start < noaccess) {
			char *undefined =
				(char *)VALGRIND_CHECK_MEM_IS_DEFINED(
					addr_start, noaccess - addr_start);

			if (undefined) {
				addr_start = undefined;

#ifdef VALGRIND_CHECK_MEM_IS_UNDEFINED
				addr_start = (char *)
					VALGRIND_CHECK_MEM_IS_UNDEFINED(
					addr_start, noaccess - addr_start);
				if (addr_start == NULL)
					addr_start = noaccess;
#else
				while (addr_start < noaccess &&
					VALGRIND_CHECK_MEM_IS_DEFINED(
							addr_start, 1))
					addr_start++;
#endif

				if (num_undefs < MAX_UNDEFS) {
					undefs[num_undefs].start = undefined;
					undefs[num_undefs].end =
						addr_start - 1;
					num_undefs++;
				}
			} else {
				addr_start = noaccess;
			}
		}

#ifdef VALGRIND_CHECK_MEM_IS_UNADDRESSABLE
		addr_start = (char *)VALGRIND_CHECK_MEM_IS_UNADDRESSABLE(
				addr_start, addr_end - addr_start);
		if (addr_start == NULL)
			addr_start = addr_end;
#else
		while (addr_start < addr_end &&
			(char *)VALGRIND_CHECK_MEM_IS_ADDRESSABLE(
					addr_start, 1) == addr_start)
			addr_start++;
#endif
	}
	VALGRIND_DO_ENABLE_ERROR_REPORTING;

	if (num_undefs) {
		/*
		 * How to resolve this error:
		 * If it's part of the free space Valgrind should be told about
		 * it by VALGRIND_DO_MAKE_MEM_NOACCESS request. If it's
		 * allocated - initialize it or use VALGRIND_DO_MAKE_MEM_DEFINED
		 * request.
		 */
		VALGRIND_PRINTF("Part of the pool is left in undefined state on"
				" boot. This is pmemobj's bug.\nUndefined"
				" regions: [pool address: %p]\n", pop);
		for (int i = 0; i < num_undefs; ++i)
			VALGRIND_PRINTF("   [%p, %p]\n", undefs[i].start,
					undefs[i].end);
		if (num_undefs == MAX_UNDEFS)
			VALGRIND_PRINTF("   ...\n");

		/* trigger one reported memcheck error */
		VALGRIND_CHECK_MEM_IS_DEFINED(undefs[0].start, 1);
	}
}

/*
 * obj_vg_boot -- (internal) notify Valgrind about pool objects
 *
 * Optional self-check, enabled with the PMEMOBJ_VG_CHECK_UNDEF env var.
 */
static void
obj_vg_boot(struct pmemobjpool *pop)
{
	LOG(4, "pop %p", pop);

	if (os_getenv("PMEMOBJ_VG_CHECK_UNDEF"))
		obj_vg_check_no_undef(pop);
}

#endif /* VG_MEMCHECK_ENABLED */
826 * obj_runtime_init_common -- (internal) runtime initialization
828 * Common routine for create/open and check.
831 obj_runtime_init_common(PMEMobjpool
*pop
)
833 LOG(3, "pop %p", pop
);
835 if ((errno
= lane_boot(pop
)) != 0) {
840 if ((errno
= lane_recover_and_section_boot(pop
)) != 0) {
841 ERR("!lane_recover_and_section_boot");
845 pop
->conversion_flags
= 0;
846 pmemops_persist(&pop
->p_ops
,
847 &pop
->conversion_flags
, sizeof(pop
->conversion_flags
));
853 * obj_runtime_cleanup_common -- (internal) runtime cleanup
855 * Common routine for create/open and check
858 obj_runtime_cleanup_common(PMEMobjpool
*pop
)
860 lane_section_cleanup(pop
);
865 * obj_descr_create -- (internal) create obj pool descriptor
868 obj_descr_create(PMEMobjpool
*pop
, const char *layout
, size_t poolsize
)
870 LOG(3, "pop %p layout %s poolsize %zu", pop
, layout
, poolsize
);
872 ASSERTeq(poolsize
% Pagesize
, 0);
874 /* opaque info lives at the beginning of mapped memory pool */
875 void *dscp
= (void *)((uintptr_t)pop
+ sizeof(struct pool_hdr
));
877 /* create the persistent part of pool's descriptor */
878 memset(dscp
, 0, OBJ_DSC_P_SIZE
);
880 strncpy(pop
->layout
, layout
, PMEMOBJ_MAX_LAYOUT
- 1);
881 struct pmem_ops
*p_ops
= &pop
->p_ops
;
883 pop
->lanes_offset
= OBJ_LANES_OFFSET
;
884 pop
->nlanes
= OBJ_NLANES
;
889 pop
->heap_offset
= pop
->lanes_offset
+
890 pop
->nlanes
* sizeof(struct lane_layout
);
891 pop
->heap_offset
= (pop
->heap_offset
+ Pagesize
- 1) & ~(Pagesize
- 1);
893 size_t heap_size
= pop
->set
->poolsize
- pop
->heap_offset
;
895 /* initialize heap prior to storing the checksum */
896 errno
= palloc_init((char *)pop
+ pop
->heap_offset
, heap_size
,
897 &pop
->heap_size
, p_ops
);
903 util_checksum(dscp
, OBJ_DSC_P_SIZE
, &pop
->checksum
, 1, 0);
906 * store the persistent part of pool's descriptor (2kB)
908 * It's safe to use PMEMOBJ_F_RELAXED flag because the entire
909 * structure is protected by checksum.
911 pmemops_xpersist(p_ops
, dscp
, OBJ_DSC_P_SIZE
, PMEMOBJ_F_RELAXED
);
913 /* initialize run_id, it will be incremented later */
915 pmemops_persist(p_ops
, &pop
->run_id
, sizeof(pop
->run_id
));
917 pop
->root_offset
= 0;
918 pmemops_persist(p_ops
, &pop
->root_offset
, sizeof(pop
->root_offset
));
920 pmemops_persist(p_ops
, &pop
->root_size
, sizeof(pop
->root_size
));
922 pop
->conversion_flags
= 0;
923 pmemops_persist(p_ops
, &pop
->conversion_flags
,
924 sizeof(pop
->conversion_flags
));
927 * It's safe to use PMEMOBJ_F_RELAXED flag because the reserved
928 * area must be entirely zeroed.
930 pmemops_memset(p_ops
, pop
->pmem_reserved
, 0,
931 sizeof(pop
->pmem_reserved
), PMEMOBJ_F_RELAXED
);
937 * obj_descr_check -- (internal) validate obj pool descriptor
940 obj_descr_check(PMEMobjpool
*pop
, const char *layout
, size_t poolsize
)
942 LOG(3, "pop %p layout %s poolsize %zu", pop
, layout
, poolsize
);
944 void *dscp
= (void *)((uintptr_t)pop
+ sizeof(struct pool_hdr
));
947 /* read remote descriptor */
948 if (obj_read_remote(pop
->rpp
, pop
->remote_base
, dscp
, dscp
,
950 ERR("!obj_read_remote");
955 if (!util_checksum(dscp
, OBJ_DSC_P_SIZE
, &pop
->checksum
, 0, 0)) {
956 ERR("invalid checksum of pool descriptor");
962 strncmp(pop
->layout
, layout
, PMEMOBJ_MAX_LAYOUT
)) {
963 ERR("wrong layout (\"%s\"), "
964 "pool created with layout \"%s\"",
965 layout
, pop
->layout
);
970 if (pop
->heap_offset
% Pagesize
) {
971 ERR("unaligned heap: off %" PRIu64
, pop
->heap_offset
);
980 * obj_replica_init_local -- (internal) initialize runtime part
981 * of the local replicas
984 obj_replica_init_local(PMEMobjpool
*rep
, int is_pmem
, size_t resvsize
)
986 LOG(3, "rep %p is_pmem %d resvsize %zu", rep
, is_pmem
, resvsize
);
989 * Use some of the memory pool area for run-time info. This
990 * run-time state is never loaded from the file, it is always
991 * created here, so no need to worry about byte-order.
993 rep
->is_pmem
= is_pmem
;
996 rep
->persist_remote
= NULL
;
999 * All replicas, except for master, are ignored as far as valgrind is
1000 * concerned. This is to save CPU time and lessen the complexity of
1003 if (!rep
->is_master_replica
)
1004 VALGRIND_ADD_TO_GLOBAL_TX_IGNORE(rep
, resvsize
);
1007 rep
->persist_local
= pmem_persist
;
1008 rep
->flush_local
= pmem_flush
;
1009 rep
->drain_local
= pmem_drain
;
1010 rep
->memcpy_local
= pmem_memcpy
;
1011 rep
->memmove_local
= pmem_memmove
;
1012 rep
->memset_local
= pmem_memset
;
1014 rep
->persist_local
= obj_msync_nofail
;
1015 rep
->flush_local
= obj_msync_nofail
;
1016 rep
->drain_local
= obj_drain_empty
;
1017 rep
->memcpy_local
= obj_nopmem_memcpy
;
1018 rep
->memmove_local
= obj_nopmem_memmove
;
1019 rep
->memset_local
= obj_nopmem_memset
;
1026 * obj_replica_init_remote -- (internal) initialize runtime part
1027 * of a remote replica
1030 obj_replica_init_remote(PMEMobjpool
*rep
, struct pool_set
*set
,
1031 unsigned repidx
, int create
)
1033 LOG(3, "rep %p set %p repidx %u", rep
, set
, repidx
);
1035 struct pool_replica
*repset
= set
->replica
[repidx
];
1037 ASSERTne(repset
->remote
->rpp
, NULL
);
1038 ASSERTne(repset
->remote
->node_addr
, NULL
);
1039 ASSERTne(repset
->remote
->pool_desc
, NULL
);
1041 rep
->node_addr
= Strdup(repset
->remote
->node_addr
);
1042 if (rep
->node_addr
== NULL
)
1044 rep
->pool_desc
= Strdup(repset
->remote
->pool_desc
);
1045 if (rep
->pool_desc
== NULL
) {
1046 Free(rep
->node_addr
);
1050 rep
->rpp
= repset
->remote
->rpp
;
1052 /* remote_base - beginning of the remote pool */
1053 rep
->remote_base
= (uintptr_t)rep
->addr
;
1056 rep
->persist_remote
= obj_remote_persist
;
1057 rep
->persist_local
= NULL
;
1058 rep
->flush_local
= NULL
;
1059 rep
->drain_local
= NULL
;
1060 rep
->memcpy_local
= NULL
;
1061 rep
->memmove_local
= NULL
;
1062 rep
->memset_local
= NULL
;
1064 rep
->p_ops
.remote
.read
= obj_read_remote
;
1065 rep
->p_ops
.remote
.ctx
= rep
->rpp
;
1066 rep
->p_ops
.remote
.base
= rep
->remote_base
;
1072 * obj_cleanup_remote -- (internal) clean up the remote pools data
1075 obj_cleanup_remote(PMEMobjpool
*pop
)
1077 LOG(3, "pop %p", pop
);
1079 for (; pop
!= NULL
; pop
= pop
->replica
) {
1080 if (pop
->rpp
!= NULL
) {
1081 Free(pop
->node_addr
);
1082 Free(pop
->pool_desc
);
1089 * obj_replica_init -- (internal) initialize runtime part of the replica
1092 obj_replica_init(PMEMobjpool
*rep
, struct pool_set
*set
, unsigned repidx
,
1095 struct pool_replica
*repset
= set
->replica
[repidx
];
1098 /* master replica */
1099 rep
->is_master_replica
= 1;
1100 rep
->has_remote_replicas
= set
->remote
;
1102 if (set
->nreplicas
> 1) {
1103 rep
->p_ops
.persist
= obj_rep_persist
;
1104 rep
->p_ops
.flush
= obj_rep_flush
;
1105 rep
->p_ops
.drain
= obj_rep_drain
;
1106 rep
->p_ops
.memcpy
= obj_rep_memcpy
;
1107 rep
->p_ops
.memmove
= obj_rep_memmove
;
1108 rep
->p_ops
.memset
= obj_rep_memset
;
1110 rep
->p_ops
.persist
= obj_norep_persist
;
1111 rep
->p_ops
.flush
= obj_norep_flush
;
1112 rep
->p_ops
.drain
= obj_norep_drain
;
1113 rep
->p_ops
.memcpy
= obj_norep_memcpy
;
1114 rep
->p_ops
.memmove
= obj_norep_memmove
;
1115 rep
->p_ops
.memset
= obj_norep_memset
;
1117 rep
->p_ops
.base
= rep
;
1119 /* non-master replicas */
1120 rep
->is_master_replica
= 0;
1121 rep
->has_remote_replicas
= 0;
1123 rep
->p_ops
.persist
= NULL
;
1124 rep
->p_ops
.flush
= NULL
;
1125 rep
->p_ops
.drain
= NULL
;
1126 rep
->p_ops
.memcpy
= NULL
;
1127 rep
->p_ops
.memmove
= NULL
;
1128 rep
->p_ops
.memset
= NULL
;
1130 rep
->p_ops
.base
= NULL
;
1133 rep
->is_dev_dax
= set
->replica
[repidx
]->part
[0].is_dev_dax
;
1137 ret
= obj_replica_init_remote(rep
, set
, repidx
, create
);
1139 ret
= obj_replica_init_local(rep
, repset
->is_pmem
,
1148 * obj_replica_fini -- (internal) deinitialize replica
1151 obj_replica_fini(struct pool_replica
*repset
)
1153 PMEMobjpool
*rep
= repset
->part
[0].addr
;
1156 obj_cleanup_remote(rep
);
1160 * obj_runtime_init -- (internal) initialize runtime part of the pool header
1163 obj_runtime_init(PMEMobjpool
*pop
, int rdonly
, int boot
, unsigned nlanes
)
1165 LOG(3, "pop %p rdonly %d boot %d", pop
, rdonly
, boot
);
1166 struct pmem_ops
*p_ops
= &pop
->p_ops
;
1168 /* run_id is made unique by incrementing the previous value */
1170 if (pop
->run_id
== 0)
1172 pmemops_persist(p_ops
, &pop
->run_id
, sizeof(pop
->run_id
));
1175 * Use some of the memory pool area for run-time info. This
1176 * run-time state is never loaded from the file, it is always
1177 * created here, so no need to worry about byte-order.
1179 pop
->rdonly
= rdonly
;
1181 pop
->uuid_lo
= pmemobj_get_uuid_lo(pop
);
1183 pop
->lanes_desc
.runtime_nlanes
= nlanes
;
1185 pop
->tx_params
= tx_params_new();
1186 if (pop
->tx_params
== NULL
)
1189 pop
->stats
= stats_new(pop
);
1190 if (pop
->stats
== NULL
)
1193 pop
->user_data
= NULL
;
1195 VALGRIND_REMOVE_PMEM_MAPPING(&pop
->mutex_head
,
1196 sizeof(pop
->mutex_head
));
1197 VALGRIND_REMOVE_PMEM_MAPPING(&pop
->rwlock_head
,
1198 sizeof(pop
->rwlock_head
));
1199 VALGRIND_REMOVE_PMEM_MAPPING(&pop
->cond_head
,
1200 sizeof(pop
->cond_head
));
1201 pop
->mutex_head
= NULL
;
1202 pop
->rwlock_head
= NULL
;
1203 pop
->cond_head
= NULL
;
1206 if ((errno
= obj_runtime_init_common(pop
)) != 0)
1209 #if VG_MEMCHECK_ENABLED
1211 /* mark unused part of the pool as not accessible */
1212 void *end
= palloc_heap_end(&pop
->heap
);
1213 VALGRIND_DO_MAKE_MEM_NOACCESS(end
,
1214 (char *)pop
+ pop
->set
->poolsize
- (char *)end
);
1220 if ((errno
= critnib_insert(pools_ht
, pop
->uuid_lo
, pop
))) {
1221 ERR("!critnib_insert to pools_ht");
1222 goto err_critnib_insert
;
1225 if ((errno
= critnib_insert(pools_tree
, (uint64_t)pop
, pop
))) {
1226 ERR("!critnib_insert to pools_tree");
1227 goto err_tree_insert
;
1231 if (obj_ctl_init_and_load(pop
) != 0) {
1236 util_mutex_init(&pop
->ulog_user_buffers
.lock
);
1237 pop
->ulog_user_buffers
.map
= ravl_new_sized(
1238 operation_user_buffer_range_cmp
,
1239 sizeof(struct user_buffer_def
));
1240 if (pop
->ulog_user_buffers
.map
== NULL
) {
1241 ERR("!ravl_new_sized");
1242 goto err_user_buffers_map
;
1244 pop
->ulog_user_buffers
.verify
= 0;
1247 * If possible, turn off all permissions on the pool header page.
1249 * The prototype PMFS doesn't allow this when large pages are in
1250 * use. It is not considered an error if this fails.
1252 RANGE_NONE(pop
->addr
, sizeof(struct pool_hdr
), pop
->is_dev_dax
);
1256 err_user_buffers_map
:
1257 util_mutex_destroy(&pop
->ulog_user_buffers
.lock
);
1258 ctl_delete(pop
->ctl
);
1260 void *n
= critnib_remove(pools_tree
, (uint64_t)pop
);
1263 critnib_remove(pools_ht
, pop
->uuid_lo
);
1265 obj_runtime_cleanup_common(pop
);
1267 stats_delete(pop
, pop
->stats
);
1269 tx_params_delete(pop
->tx_params
);
1276 * obj_get_nlanes -- get a number of lanes available at runtime. If the value
1277 * provided with the PMEMOBJ_NLANES environment variable is greater than 0 and
1278 * smaller than OBJ_NLANES constant it returns PMEMOBJ_NLANES. Otherwise it
1279 * returns OBJ_NLANES.
1282 obj_get_nlanes(void)
1286 char *env_nlanes
= os_getenv(OBJ_NLANES_ENV_VARIABLE
);
1288 int nlanes
= atoi(env_nlanes
);
1290 ERR("%s variable must be a positive integer",
1291 OBJ_NLANES_ENV_VARIABLE
);
1296 return (unsigned)(OBJ_NLANES
< nlanes
? OBJ_NLANES
: nlanes
);
1304 * pmemobj_createU -- create a transactional memory pool (set)
1310 pmemobj_createU(const char *path
, const char *layout
,
1311 size_t poolsize
, mode_t mode
)
1313 LOG(3, "path %s layout %s poolsize %zu mode %o",
1314 path
, layout
, poolsize
, mode
);
1317 struct pool_set
*set
;
1319 /* check length of layout */
1320 if (layout
&& (strlen(layout
) >= PMEMOBJ_MAX_LAYOUT
)) {
1321 ERR("Layout too long");
1327 * A number of lanes available at runtime equals the lowest value
1328 * from all reported by remote replicas hosts. In the single host mode
1329 * the runtime number of lanes is equal to the total number of lanes
1330 * available in the pool or the value provided with PMEMOBJ_NLANES
1331 * environment variable whichever is lower.
1333 unsigned runtime_nlanes
= obj_get_nlanes();
1335 struct pool_attr adj_pool_attr
= Obj_create_attr
;
1337 /* force set SDS feature */
1339 adj_pool_attr
.features
.incompat
|= POOL_FEAT_SDS
;
1341 adj_pool_attr
.features
.incompat
&= ~POOL_FEAT_SDS
;
1343 if (util_pool_create(&set
, path
, poolsize
, PMEMOBJ_MIN_POOL
,
1344 PMEMOBJ_MIN_PART
, &adj_pool_attr
, &runtime_nlanes
,
1345 REPLICAS_ENABLED
) != 0) {
1346 LOG(2, "cannot create pool or pool set");
1350 ASSERT(set
->nreplicas
> 0);
1352 /* pop is master replica from now on */
1353 pop
= set
->replica
[0]->part
[0].addr
;
1355 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1356 struct pool_replica
*repset
= set
->replica
[r
];
1357 PMEMobjpool
*rep
= repset
->part
[0].addr
;
1359 size_t rt_size
= (uintptr_t)(rep
+ 1) - (uintptr_t)&rep
->addr
;
1360 VALGRIND_REMOVE_PMEM_MAPPING(&rep
->addr
, rt_size
);
1362 memset(&rep
->addr
, 0, rt_size
);
1365 rep
->replica
= NULL
;
1368 /* initialize replica runtime - is_pmem, funcs, ... */
1369 if (obj_replica_init(rep
, set
, r
, 1 /* create */) != 0) {
1370 ERR("initialization of replica #%u failed", r
);
1375 if (r
< set
->nreplicas
- 1)
1376 rep
->replica
= set
->replica
[r
+ 1]->part
[0].addr
;
1381 /* create pool descriptor */
1382 if (obj_descr_create(pop
, layout
, set
->poolsize
) != 0) {
1383 LOG(2, "creation of pool descriptor failed");
1387 /* initialize runtime parts - lanes, obj stores, ... */
1388 if (obj_runtime_init(pop
, 0, 1 /* boot */,
1389 runtime_nlanes
) != 0) {
1390 ERR("pool initialization failed");
1394 if (util_poolset_chmod(set
, mode
))
1397 util_poolset_fdclose(set
);
1399 LOG(3, "pop %p", pop
);
1404 LOG(4, "error clean up");
1407 obj_cleanup_remote(pop
);
1408 util_poolset_close(set
, DELETE_CREATED_PARTS
);
1415 * pmemobj_create -- create a transactional memory pool (set)
1418 pmemobj_create(const char *path
, const char *layout
,
1419 size_t poolsize
, mode_t mode
)
1421 PMEMOBJ_API_START();
1423 PMEMobjpool
*pop
= pmemobj_createU(path
, layout
, poolsize
, mode
);
#ifdef _WIN32
/*
 * pmemobj_createW -- create a transactional memory pool (set)
 *
 * Wide-char Windows entry point: converts arguments to UTF-8 and
 * delegates to pmemobj_createU. Frees the conversions on all paths.
 */
PMEMobjpool *
pmemobj_createW(const wchar_t *path, const wchar_t *layout, size_t poolsize,
	mode_t mode)
{
	char *upath = util_toUTF8(path);
	if (upath == NULL)
		return NULL;

	char *ulayout = NULL;
	if (layout != NULL) {
		ulayout = util_toUTF8(layout);
		if (ulayout == NULL) {
			util_free_UTF8(upath);
			return NULL;
		}
	}
	PMEMobjpool *ret = pmemobj_createU(upath, ulayout, poolsize, mode);

	util_free_UTF8(upath);
	util_free_UTF8(ulayout);

	return ret;
}
#endif /* _WIN32 */
1457 * obj_check_basic_local -- (internal) basic pool consistency check
1458 * of a local replica
1461 obj_check_basic_local(PMEMobjpool
*pop
, size_t mapped_size
)
1463 LOG(3, "pop %p mapped_size %zu", pop
, mapped_size
);
1465 ASSERTeq(pop
->rpp
, NULL
);
1469 if (pop
->run_id
% 2) {
1470 ERR("invalid run_id %" PRIu64
, pop
->run_id
);
1474 if ((errno
= lane_check(pop
)) != 0) {
1475 LOG(2, "!lane_check");
1479 /* pop->heap_size can still be 0 at this point */
1480 size_t heap_size
= mapped_size
- pop
->heap_offset
;
1481 errno
= palloc_heap_check((char *)pop
+ pop
->heap_offset
,
1484 LOG(2, "!heap_check");
1492 * obj_read_remote -- read data from remote replica
1494 * It reads data of size 'length' from the remote replica 'pop'
1495 * from address 'addr' and saves it at address 'dest'.
1498 obj_read_remote(void *ctx
, uintptr_t base
, void *dest
, void *addr
,
1501 LOG(3, "ctx %p base 0x%lx dest %p addr %p length %zu", ctx
, base
, dest
,
1504 ASSERTne(ctx
, NULL
);
1505 ASSERT((uintptr_t)addr
>= base
);
1507 uintptr_t offset
= (uintptr_t)addr
- base
;
1508 if (Rpmem_read(ctx
, dest
, offset
, length
, RLANE_DEFAULT
)) {
1517 * obj_check_basic_remote -- (internal) basic pool consistency check
1518 * of a remote replica
1521 obj_check_basic_remote(PMEMobjpool
*pop
, size_t mapped_size
)
1523 LOG(3, "pop %p mapped_size %zu", pop
, mapped_size
);
1525 ASSERTne(pop
->rpp
, NULL
);
1529 /* read pop->run_id */
1530 if (obj_read_remote(pop
->rpp
, pop
->remote_base
, &pop
->run_id
,
1531 &pop
->run_id
, sizeof(pop
->run_id
))) {
1532 ERR("!obj_read_remote");
1536 if (pop
->run_id
% 2) {
1537 ERR("invalid run_id %" PRIu64
, pop
->run_id
);
1541 /* XXX add lane_check_remote */
1543 /* pop->heap_size can still be 0 at this point */
1544 size_t heap_size
= mapped_size
- pop
->heap_offset
;
1545 if (palloc_heap_check_remote((char *)pop
+ pop
->heap_offset
,
1546 heap_size
, &pop
->p_ops
.remote
)) {
1547 LOG(2, "!heap_check_remote");
1555 * obj_check_basic -- (internal) basic pool consistency check
1557 * Used to check if all the replicas are consistent prior to pool recovery.
1560 obj_check_basic(PMEMobjpool
*pop
, size_t mapped_size
)
1562 LOG(3, "pop %p mapped_size %zu", pop
, mapped_size
);
1564 if (pop
->rpp
== NULL
)
1565 return obj_check_basic_local(pop
, mapped_size
);
1567 return obj_check_basic_remote(pop
, mapped_size
);
1571 * obj_pool_close -- (internal) close the pool set
1574 obj_pool_close(struct pool_set
*set
)
1577 util_poolset_close(set
, DO_NOT_DELETE_PARTS
);
1582 * obj_pool_open -- (internal) open the given pool
1585 obj_pool_open(struct pool_set
**set
, const char *path
, unsigned flags
,
1588 if (util_pool_open(set
, path
, PMEMOBJ_MIN_PART
, &Obj_open_attr
,
1589 nlanes
, NULL
, flags
) != 0) {
1590 LOG(2, "cannot open pool or pool set");
1594 ASSERT((*set
)->nreplicas
> 0);
1596 /* read-only mode is not supported in libpmemobj */
1597 if ((*set
)->rdonly
) {
1598 ERR("read-only mode is not supported");
1605 obj_pool_close(*set
);
1610 * obj_replicas_init -- (internal) initialize all replicas
1613 obj_replicas_init(struct pool_set
*set
)
1616 for (r
= 0; r
< set
->nreplicas
; r
++) {
1617 struct pool_replica
*repset
= set
->replica
[r
];
1618 PMEMobjpool
*rep
= repset
->part
[0].addr
;
1620 size_t rt_size
= (uintptr_t)(rep
+ 1) - (uintptr_t)&rep
->addr
;
1622 VALGRIND_REMOVE_PMEM_MAPPING(&rep
->addr
, rt_size
);
1624 memset(&rep
->addr
, 0, rt_size
);
1627 rep
->replica
= NULL
;
1630 /* initialize replica runtime - is_pmem, funcs, ... */
1631 if (obj_replica_init(rep
, set
, r
, 0 /* open */) != 0) {
1632 ERR("initialization of replica #%u failed", r
);
1637 if (r
< set
->nreplicas
- 1)
1638 rep
->replica
= set
->replica
[r
+ 1]->part
[0].addr
;
1643 for (unsigned p
= 0; p
< r
; p
++)
1644 obj_replica_fini(set
->replica
[p
]);
1650 * obj_replicas_fini -- (internal) deinitialize all replicas
1653 obj_replicas_fini(struct pool_set
*set
)
1656 for (unsigned r
= 0; r
< set
->nreplicas
; r
++)
1657 obj_replica_fini(set
->replica
[r
]);
1662 * obj_replicas_check_basic -- (internal) perform basic consistency check
1666 obj_replicas_check_basic(PMEMobjpool
*pop
)
1669 for (unsigned r
= 0; r
< pop
->set
->nreplicas
; r
++) {
1670 rep
= pop
->set
->replica
[r
]->part
[0].addr
;
1671 if (obj_check_basic(rep
, pop
->set
->poolsize
) == 0) {
1672 ERR("inconsistent replica #%u", r
);
1678 void *src
= (void *)((uintptr_t)pop
+ pop
->lanes_offset
);
1679 size_t len
= pop
->nlanes
* sizeof(struct lane_layout
);
1681 for (unsigned r
= 1; r
< pop
->set
->nreplicas
; r
++) {
1682 rep
= pop
->set
->replica
[r
]->part
[0].addr
;
1683 void *dst
= (void *)((uintptr_t)rep
+ pop
->lanes_offset
);
1684 if (rep
->rpp
== NULL
) {
1685 rep
->memcpy_local(dst
, src
, len
, 0);
1687 if (rep
->persist_remote(rep
, dst
, len
,
1689 obj_handle_remote_persist_error(pop
);
1697 * obj_open_common -- open a transactional memory pool (set)
1699 * This routine takes flags and does all the work
1700 * (flag POOL_OPEN_COW - internal calls can map a read-only pool if required).
1702 static PMEMobjpool
*
1703 obj_open_common(const char *path
, const char *layout
, unsigned flags
, int boot
)
1705 LOG(3, "path %s layout %s flags 0x%x", path
, layout
, flags
);
1707 PMEMobjpool
*pop
= NULL
;
1708 struct pool_set
*set
;
1711 * A number of lanes available at runtime equals the lowest value
1712 * from all reported by remote replicas hosts. In the single host mode
1713 * the runtime number of lanes is equal to the total number of lanes
1714 * available in the pool or the value provided with PMEMOBJ_NLANES
1715 * environment variable whichever is lower.
1717 unsigned runtime_nlanes
= obj_get_nlanes();
1718 if (obj_pool_open(&set
, path
, flags
, &runtime_nlanes
))
1721 /* pop is master replica from now on */
1722 pop
= set
->replica
[0]->part
[0].addr
;
1724 if (obj_replicas_init(set
))
1727 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1728 struct pool_replica
*repset
= set
->replica
[r
];
1729 PMEMobjpool
*rep
= repset
->part
[0].addr
;
1730 /* check descriptor */
1731 if (obj_descr_check(rep
, layout
, set
->poolsize
) != 0) {
1732 LOG(2, "descriptor check of replica #%u failed", r
);
1733 goto err_descr_check
;
1740 /* check consistency of 'master' replica */
1741 if (obj_check_basic(pop
, pop
->set
->poolsize
) == 0) {
1742 goto err_check_basic
;
1746 if (set
->nreplicas
> 1) {
1747 if (obj_replicas_check_basic(pop
))
1748 goto err_replicas_check_basic
;
1752 * before runtime initialization lanes are unavailable, remote persists
1753 * should use RLANE_DEFAULT
1755 pop
->lanes_desc
.runtime_nlanes
= 0;
1757 #if VG_MEMCHECK_ENABLED
1758 pop
->vg_boot
= boot
;
1760 /* initialize runtime parts - lanes, obj stores, ... */
1761 if (obj_runtime_init(pop
, 0, boot
, runtime_nlanes
) != 0) {
1762 ERR("pool initialization failed");
1763 goto err_runtime_init
;
1766 #if VG_MEMCHECK_ENABLED
1771 util_poolset_fdclose(set
);
1773 LOG(3, "pop %p", pop
);
1778 err_replicas_check_basic
:
1781 obj_replicas_fini(set
);
1783 obj_pool_close(set
);
1788 * pmemobj_openU -- open a transactional memory pool
1794 pmemobj_openU(const char *path
, const char *layout
)
1796 LOG(3, "path %s layout %s", path
, layout
);
1798 return obj_open_common(path
, layout
,
1799 COW_at_open
? POOL_OPEN_COW
: 0, 1);
1804 * pmemobj_open -- open a transactional memory pool
1807 pmemobj_open(const char *path
, const char *layout
)
1809 PMEMOBJ_API_START();
1811 PMEMobjpool
*pop
= pmemobj_openU(path
, layout
);
1818 * pmemobj_openW -- open a transactional memory pool
1821 pmemobj_openW(const wchar_t *path
, const wchar_t *layout
)
1823 char *upath
= util_toUTF8(path
);
1827 char *ulayout
= NULL
;
1828 if (layout
!= NULL
) {
1829 ulayout
= util_toUTF8(layout
);
1830 if (ulayout
== NULL
) {
1831 util_free_UTF8(upath
);
1836 PMEMobjpool
*ret
= pmemobj_openU(upath
, ulayout
);
1837 util_free_UTF8(upath
);
1838 util_free_UTF8(ulayout
);
1844 * obj_replicas_cleanup -- (internal) free resources allocated for replicas
1847 obj_replicas_cleanup(struct pool_set
*set
)
1849 LOG(3, "set %p", set
);
1851 for (unsigned r
= 0; r
< set
->nreplicas
; r
++) {
1852 struct pool_replica
*rep
= set
->replica
[r
];
1854 PMEMobjpool
*pop
= rep
->part
[0].addr
;
1856 if (pop
->rpp
!= NULL
) {
1858 * remote replica will be closed in util_poolset_close
1862 Free(pop
->node_addr
);
1863 Free(pop
->pool_desc
);
1869 * obj_pool_lock_cleanup -- (internal) Destroy any locks or condition
1870 * variables that were allocated at run time
1873 obj_pool_lock_cleanup(PMEMobjpool
*pop
)
1875 LOG(3, "pop %p", pop
);
1877 PMEMmutex_internal
*nextm
;
1878 for (PMEMmutex_internal
*m
= pop
->mutex_head
; m
!= NULL
; m
= nextm
) {
1879 nextm
= m
->PMEMmutex_next
;
1880 LOG(4, "mutex %p *mutex %p", &m
->PMEMmutex_lock
,
1881 m
->PMEMmutex_bsd_mutex_p
);
1882 os_mutex_destroy(&m
->PMEMmutex_lock
);
1883 m
->PMEMmutex_next
= NULL
;
1884 m
->PMEMmutex_bsd_mutex_p
= NULL
;
1886 pop
->mutex_head
= NULL
;
1888 PMEMrwlock_internal
*nextr
;
1889 for (PMEMrwlock_internal
*r
= pop
->rwlock_head
; r
!= NULL
; r
= nextr
) {
1890 nextr
= r
->PMEMrwlock_next
;
1891 LOG(4, "rwlock %p *rwlock %p", &r
->PMEMrwlock_lock
,
1892 r
->PMEMrwlock_bsd_rwlock_p
);
1893 os_rwlock_destroy(&r
->PMEMrwlock_lock
);
1894 r
->PMEMrwlock_next
= NULL
;
1895 r
->PMEMrwlock_bsd_rwlock_p
= NULL
;
1897 pop
->rwlock_head
= NULL
;
1899 PMEMcond_internal
*nextc
;
1900 for (PMEMcond_internal
*c
= pop
->cond_head
; c
!= NULL
; c
= nextc
) {
1901 nextc
= c
->PMEMcond_next
;
1902 LOG(4, "cond %p *cond %p", &c
->PMEMcond_cond
,
1903 c
->PMEMcond_bsd_cond_p
);
1904 os_cond_destroy(&c
->PMEMcond_cond
);
1905 c
->PMEMcond_next
= NULL
;
1906 c
->PMEMcond_bsd_cond_p
= NULL
;
1908 pop
->cond_head
= NULL
;
1911 * obj_pool_cleanup -- (internal) cleanup the pool and unmap
1914 obj_pool_cleanup(PMEMobjpool
*pop
)
1916 LOG(3, "pop %p", pop
);
1918 ravl_delete(pop
->ulog_user_buffers
.map
);
1919 util_mutex_destroy(&pop
->ulog_user_buffers
.lock
);
1921 stats_delete(pop
, pop
->stats
);
1922 tx_params_delete(pop
->tx_params
);
1923 ctl_delete(pop
->ctl
);
1925 obj_pool_lock_cleanup(pop
);
1927 lane_section_cleanup(pop
);
1930 /* unmap all the replicas */
1931 obj_replicas_cleanup(pop
->set
);
1932 util_poolset_close(pop
->set
, DO_NOT_DELETE_PARTS
);
1936 * pmemobj_close -- close a transactional memory pool
1939 pmemobj_close(PMEMobjpool
*pop
)
1941 LOG(3, "pop %p", pop
);
1942 PMEMOBJ_API_START();
1944 _pobj_cache_invalidate
++;
1946 if (critnib_remove(pools_ht
, pop
->uuid_lo
) != pop
) {
1947 ERR("critnib_remove for pools_ht");
1950 if (critnib_remove(pools_tree
, (uint64_t)pop
) != pop
)
1951 ERR("critnib_remove for pools_tree");
1955 if (_pobj_cached_pool
.pop
== pop
) {
1956 _pobj_cached_pool
.pop
= NULL
;
1957 _pobj_cached_pool
.uuid_lo
= 0;
1962 struct _pobj_pcache
*pcache
= os_tls_get(Cached_pool_key
);
1963 if (pcache
!= NULL
) {
1964 if (pcache
->pop
== pop
) {
1966 pcache
->uuid_lo
= 0;
1972 obj_pool_cleanup(pop
);
1977 * pmemobj_checkU -- transactional memory pool consistency check
1983 pmemobj_checkU(const char *path
, const char *layout
)
1985 LOG(3, "path %s layout %s", path
, layout
);
1987 PMEMobjpool
*pop
= obj_open_common(path
, layout
, POOL_OPEN_COW
, 0);
1989 return -1; /* errno set by obj_open_common() */
1994 * For replicated pools, basic consistency check is performed
1995 * in obj_open_common().
1997 if (pop
->replica
== NULL
)
1998 consistent
= obj_check_basic(pop
, pop
->set
->poolsize
);
2000 if (consistent
&& (errno
= obj_runtime_init_common(pop
)) != 0) {
2001 LOG(3, "!obj_boot");
2006 obj_pool_cleanup(pop
);
2008 stats_delete(pop
, pop
->stats
);
2009 tx_params_delete(pop
->tx_params
);
2010 ctl_delete(pop
->ctl
);
2012 /* unmap all the replicas */
2013 obj_replicas_cleanup(pop
->set
);
2014 util_poolset_close(pop
->set
, DO_NOT_DELETE_PARTS
);
2018 LOG(4, "pool consistency check OK");
2025 * pmemobj_check -- transactional memory pool consistency check
2028 pmemobj_check(const char *path
, const char *layout
)
2030 PMEMOBJ_API_START();
2032 int ret
= pmemobj_checkU(path
, layout
);
2039 * pmemobj_checkW -- transactional memory pool consistency check
2042 pmemobj_checkW(const wchar_t *path
, const wchar_t *layout
)
2044 char *upath
= util_toUTF8(path
);
2048 char *ulayout
= NULL
;
2049 if (layout
!= NULL
) {
2050 ulayout
= util_toUTF8(layout
);
2051 if (ulayout
== NULL
) {
2052 util_free_UTF8(upath
);
2057 int ret
= pmemobj_checkU(upath
, ulayout
);
2059 util_free_UTF8(upath
);
2060 util_free_UTF8(ulayout
);
2067 * pmemobj_pool_by_oid -- returns the pool handle associated with the oid
2070 pmemobj_pool_by_oid(PMEMoid oid
)
2072 LOG(3, "oid.off 0x%016" PRIx64
, oid
.off
);
2074 /* XXX this is a temporary fix, to be fixed properly later */
2075 if (pools_ht
== NULL
)
2078 return critnib_get(pools_ht
, oid
.pool_uuid_lo
);
2082 * pmemobj_pool_by_ptr -- returns the pool handle associated with the address
2085 pmemobj_pool_by_ptr(const void *addr
)
2087 LOG(3, "addr %p", addr
);
2089 /* fast path for transactions */
2090 PMEMobjpool
*pop
= tx_get_pop();
2092 if ((pop
!= NULL
) && OBJ_PTR_FROM_POOL(pop
, addr
))
2095 /* XXX this is a temporary fix, to be fixed properly later */
2096 if (pools_tree
== NULL
)
2099 pop
= critnib_find_le(pools_tree
, (uint64_t)addr
);
2103 size_t pool_size
= pop
->heap_offset
+ pop
->heap_size
;
2104 if ((char *)addr
>= (char *)pop
+ pool_size
)
2111 * pmemobj_set_user_data -- sets volatile pointer to the user data for specified
2115 pmemobj_set_user_data(PMEMobjpool
*pop
, void *data
)
2117 LOG(3, "pop %p data %p", pop
, data
);
2119 pop
->user_data
= data
;
2123 * pmemobj_get_user_data -- gets volatile pointer to the user data associated
2124 * with the specified pool
2127 pmemobj_get_user_data(PMEMobjpool
*pop
)
2129 LOG(3, "pop %p", pop
);
2131 return pop
->user_data
;
2134 /* arguments for constructor_alloc */
2135 struct constr_args
{
2137 pmemobj_constr constructor
;
2142 * constructor_alloc -- (internal) constructor for obj_alloc_construct
2145 constructor_alloc(void *ctx
, void *ptr
, size_t usable_size
, void *arg
)
2147 PMEMobjpool
*pop
= ctx
;
2148 LOG(3, "pop %p ptr %p arg %p", pop
, ptr
, arg
);
2149 struct pmem_ops
*p_ops
= &pop
->p_ops
;
2151 ASSERTne(ptr
, NULL
);
2152 ASSERTne(arg
, NULL
);
2154 struct constr_args
*carg
= arg
;
2156 if (carg
->zero_init
)
2157 pmemops_memset(p_ops
, ptr
, 0, usable_size
, 0);
2160 if (carg
->constructor
)
2161 ret
= carg
->constructor(pop
, ptr
, carg
->arg
);
2167 * obj_alloc_construct -- (internal) allocates a new object with constructor
2170 obj_alloc_construct(PMEMobjpool
*pop
, PMEMoid
*oidp
, size_t size
,
2171 type_num_t type_num
, uint64_t flags
,
2172 pmemobj_constr constructor
, void *arg
)
2174 if (size
> PMEMOBJ_MAX_ALLOC_SIZE
) {
2175 ERR("requested size too large");
2180 struct constr_args carg
;
2182 carg
.zero_init
= flags
& POBJ_FLAG_ZERO
;
2183 carg
.constructor
= constructor
;
2186 struct operation_context
*ctx
= pmalloc_operation_hold(pop
);
2189 operation_add_entry(ctx
, &oidp
->pool_uuid_lo
, pop
->uuid_lo
,
2190 ULOG_OPERATION_SET
);
2192 int ret
= palloc_operation(&pop
->heap
, 0,
2193 oidp
!= NULL
? &oidp
->off
: NULL
, size
,
2194 constructor_alloc
, &carg
, type_num
, 0,
2195 CLASS_ID_FROM_FLAG(flags
), ARENA_ID_FROM_FLAG(flags
),
2198 pmalloc_operation_release(pop
);
2204 * pmemobj_alloc -- allocates a new object
2207 pmemobj_alloc(PMEMobjpool
*pop
, PMEMoid
*oidp
, size_t size
,
2208 uint64_t type_num
, pmemobj_constr constructor
, void *arg
)
2210 LOG(3, "pop %p oidp %p size %zu type_num %llx constructor %p arg %p",
2211 pop
, oidp
, size
, (unsigned long long)type_num
,
2214 /* log notice message if used inside a transaction */
2215 _POBJ_DEBUG_NOTICE_IN_TX();
2218 ERR("allocation with size 0");
2223 PMEMOBJ_API_START();
2224 int ret
= obj_alloc_construct(pop
, oidp
, size
, type_num
,
2225 0, constructor
, arg
);
2232 * pmemobj_xalloc -- allocates with flags
2235 pmemobj_xalloc(PMEMobjpool
*pop
, PMEMoid
*oidp
, size_t size
,
2236 uint64_t type_num
, uint64_t flags
,
2237 pmemobj_constr constructor
, void *arg
)
2239 LOG(3, "pop %p oidp %p size %zu type_num %llx flags %llx "
2240 "constructor %p arg %p",
2241 pop
, oidp
, size
, (unsigned long long)type_num
,
2242 (unsigned long long)flags
,
2245 /* log notice message if used inside a transaction */
2246 _POBJ_DEBUG_NOTICE_IN_TX();
2249 ERR("allocation with size 0");
2254 if (flags
& ~POBJ_TX_XALLOC_VALID_FLAGS
) {
2255 ERR("unknown flags 0x%" PRIx64
,
2256 flags
& ~POBJ_TX_XALLOC_VALID_FLAGS
);
2261 PMEMOBJ_API_START();
2262 int ret
= obj_alloc_construct(pop
, oidp
, size
, type_num
,
2263 flags
, constructor
, arg
);
2269 /* arguments for constructor_realloc and constructor_zrealloc */
2270 struct carg_realloc
{
2275 type_num_t user_type
;
2276 pmemobj_constr constructor
;
2281 * pmemobj_zalloc -- allocates a new zeroed object
2284 pmemobj_zalloc(PMEMobjpool
*pop
, PMEMoid
*oidp
, size_t size
,
2287 LOG(3, "pop %p oidp %p size %zu type_num %llx",
2288 pop
, oidp
, size
, (unsigned long long)type_num
);
2290 /* log notice message if used inside a transaction */
2291 _POBJ_DEBUG_NOTICE_IN_TX();
2294 ERR("allocation with size 0");
2299 PMEMOBJ_API_START();
2300 int ret
= obj_alloc_construct(pop
, oidp
, size
, type_num
, POBJ_FLAG_ZERO
,
2308 * obj_free -- (internal) free an object
2311 obj_free(PMEMobjpool
*pop
, PMEMoid
*oidp
)
2313 ASSERTne(oidp
, NULL
);
2315 struct operation_context
*ctx
= pmalloc_operation_hold(pop
);
2317 operation_add_entry(ctx
, &oidp
->pool_uuid_lo
, 0, ULOG_OPERATION_SET
);
2319 palloc_operation(&pop
->heap
, oidp
->off
, &oidp
->off
, 0, NULL
, NULL
,
2322 pmalloc_operation_release(pop
);
2326 * constructor_realloc -- (internal) constructor for pmemobj_realloc
2329 constructor_realloc(void *ctx
, void *ptr
, size_t usable_size
, void *arg
)
2331 PMEMobjpool
*pop
= ctx
;
2332 LOG(3, "pop %p ptr %p arg %p", pop
, ptr
, arg
);
2333 struct pmem_ops
*p_ops
= &pop
->p_ops
;
2335 ASSERTne(ptr
, NULL
);
2336 ASSERTne(arg
, NULL
);
2338 struct carg_realloc
*carg
= arg
;
2340 if (!carg
->zero_init
)
2343 if (usable_size
> carg
->old_size
) {
2344 size_t grow_len
= usable_size
- carg
->old_size
;
2345 void *new_data_ptr
= (void *)((uintptr_t)ptr
+ carg
->old_size
);
2347 pmemops_memset(p_ops
, new_data_ptr
, 0, grow_len
, 0);
2354 * obj_realloc_common -- (internal) common routine for resizing
2358 obj_realloc_common(PMEMobjpool
*pop
,
2359 PMEMoid
*oidp
, size_t size
, type_num_t type_num
, int zero_init
)
2361 /* if OID is NULL just allocate memory */
2362 if (OBJ_OID_IS_NULL(*oidp
)) {
2363 /* if size is 0 - do nothing */
2367 return obj_alloc_construct(pop
, oidp
, size
, type_num
,
2368 POBJ_FLAG_ZERO
, NULL
, NULL
);
2371 if (size
> PMEMOBJ_MAX_ALLOC_SIZE
) {
2372 ERR("requested size too large");
2377 /* if size is 0 just free */
2379 obj_free(pop
, oidp
);
2383 struct carg_realloc carg
;
2384 carg
.ptr
= OBJ_OFF_TO_PTR(pop
, oidp
->off
);
2385 carg
.new_size
= size
;
2386 carg
.old_size
= pmemobj_alloc_usable_size(*oidp
);
2387 carg
.user_type
= type_num
;
2388 carg
.constructor
= NULL
;
2390 carg
.zero_init
= zero_init
;
2392 struct operation_context
*ctx
= pmalloc_operation_hold(pop
);
2394 int ret
= palloc_operation(&pop
->heap
, oidp
->off
, &oidp
->off
,
2395 size
, constructor_realloc
, &carg
, type_num
,
2398 pmalloc_operation_release(pop
);
2404 * constructor_zrealloc_root -- (internal) constructor for pmemobj_root
2407 constructor_zrealloc_root(void *ctx
, void *ptr
, size_t usable_size
, void *arg
)
2409 PMEMobjpool
*pop
= ctx
;
2410 LOG(3, "pop %p ptr %p arg %p", pop
, ptr
, arg
);
2412 ASSERTne(ptr
, NULL
);
2413 ASSERTne(arg
, NULL
);
2415 VALGRIND_ADD_TO_TX(ptr
, usable_size
);
2417 struct carg_realloc
*carg
= arg
;
2419 constructor_realloc(pop
, ptr
, usable_size
, arg
);
2421 if (carg
->constructor
)
2422 ret
= carg
->constructor(pop
, ptr
, carg
->arg
);
2424 VALGRIND_REMOVE_FROM_TX(ptr
, usable_size
);
2430 * pmemobj_realloc -- resizes an existing object
2433 pmemobj_realloc(PMEMobjpool
*pop
, PMEMoid
*oidp
, size_t size
,
2436 ASSERTne(oidp
, NULL
);
2438 LOG(3, "pop %p oid.off 0x%016" PRIx64
" size %zu type_num %" PRIu64
,
2439 pop
, oidp
->off
, size
, type_num
);
2441 PMEMOBJ_API_START();
2442 /* log notice message if used inside a transaction */
2443 _POBJ_DEBUG_NOTICE_IN_TX();
2444 ASSERT(OBJ_OID_IS_VALID(pop
, *oidp
));
2446 int ret
= obj_realloc_common(pop
, oidp
, size
, (type_num_t
)type_num
, 0);
2453 * pmemobj_zrealloc -- resizes an existing object, any new space is zeroed.
2456 pmemobj_zrealloc(PMEMobjpool
*pop
, PMEMoid
*oidp
, size_t size
,
2459 ASSERTne(oidp
, NULL
);
2461 LOG(3, "pop %p oid.off 0x%016" PRIx64
" size %zu type_num %" PRIu64
,
2462 pop
, oidp
->off
, size
, type_num
);
2464 PMEMOBJ_API_START();
2466 /* log notice message if used inside a transaction */
2467 _POBJ_DEBUG_NOTICE_IN_TX();
2468 ASSERT(OBJ_OID_IS_VALID(pop
, *oidp
));
2470 int ret
= obj_realloc_common(pop
, oidp
, size
, (type_num_t
)type_num
, 1);
2476 /* arguments for constructor_strdup */
2477 struct carg_strdup
{
2483 * constructor_strdup -- (internal) constructor of pmemobj_strdup
2486 constructor_strdup(PMEMobjpool
*pop
, void *ptr
, void *arg
)
2488 LOG(3, "pop %p ptr %p arg %p", pop
, ptr
, arg
);
2490 ASSERTne(ptr
, NULL
);
2491 ASSERTne(arg
, NULL
);
2493 struct carg_strdup
*carg
= arg
;
2496 pmemops_memcpy(&pop
->p_ops
, ptr
, carg
->s
, carg
->size
, 0);
2502 * pmemobj_strdup -- allocates a new object with duplicate of the string s.
2505 pmemobj_strdup(PMEMobjpool
*pop
, PMEMoid
*oidp
, const char *s
,
2508 LOG(3, "pop %p oidp %p string %s type_num %" PRIu64
,
2509 pop
, oidp
, s
, type_num
);
2511 /* log notice message if used inside a transaction */
2512 _POBJ_DEBUG_NOTICE_IN_TX();
2519 PMEMOBJ_API_START();
2520 struct carg_strdup carg
;
2521 carg
.size
= (strlen(s
) + 1) * sizeof(char);
2524 int ret
= obj_alloc_construct(pop
, oidp
, carg
.size
,
2525 (type_num_t
)type_num
, 0, constructor_strdup
, &carg
);
2531 /* arguments for constructor_wcsdup */
2532 struct carg_wcsdup
{
2538 * constructor_wcsdup -- (internal) constructor of pmemobj_wcsdup
2541 constructor_wcsdup(PMEMobjpool
*pop
, void *ptr
, void *arg
)
2543 LOG(3, "pop %p ptr %p arg %p", pop
, ptr
, arg
);
2545 ASSERTne(ptr
, NULL
);
2546 ASSERTne(arg
, NULL
);
2548 struct carg_wcsdup
*carg
= arg
;
2551 pmemops_memcpy(&pop
->p_ops
, ptr
, carg
->s
, carg
->size
, 0);
2557 * pmemobj_wcsdup -- allocates a new object with duplicate of the wide character
2561 pmemobj_wcsdup(PMEMobjpool
*pop
, PMEMoid
*oidp
, const wchar_t *s
,
2564 LOG(3, "pop %p oidp %p string %S type_num %" PRIu64
,
2565 pop
, oidp
, s
, type_num
);
2567 /* log notice message if used inside a transaction */
2568 _POBJ_DEBUG_NOTICE_IN_TX();
2575 PMEMOBJ_API_START();
2576 struct carg_wcsdup carg
;
2577 carg
.size
= (wcslen(s
) + 1) * sizeof(wchar_t);
2580 int ret
= obj_alloc_construct(pop
, oidp
, carg
.size
,
2581 (type_num_t
)type_num
, 0, constructor_wcsdup
, &carg
);
2588 * pmemobj_free -- frees an existing object
2591 pmemobj_free(PMEMoid
*oidp
)
2593 ASSERTne(oidp
, NULL
);
2595 LOG(3, "oid.off 0x%016" PRIx64
, oidp
->off
);
2597 /* log notice message if used inside a transaction */
2598 _POBJ_DEBUG_NOTICE_IN_TX();
2603 PMEMOBJ_API_START();
2604 PMEMobjpool
*pop
= pmemobj_pool_by_oid(*oidp
);
2606 ASSERTne(pop
, NULL
);
2607 ASSERT(OBJ_OID_IS_VALID(pop
, *oidp
));
2609 obj_free(pop
, oidp
);
2614 * pmemobj_alloc_usable_size -- returns usable size of object
2617 pmemobj_alloc_usable_size(PMEMoid oid
)
2619 LOG(3, "oid.off 0x%016" PRIx64
, oid
.off
);
2624 PMEMobjpool
*pop
= pmemobj_pool_by_oid(oid
);
2626 ASSERTne(pop
, NULL
);
2627 ASSERT(OBJ_OID_IS_VALID(pop
, oid
));
2629 return (palloc_usable_size(&pop
->heap
, oid
.off
));
2633 * pmemobj_memcpy_persist -- pmemobj version of memcpy
2636 pmemobj_memcpy_persist(PMEMobjpool
*pop
, void *dest
, const void *src
,
2639 LOG(15, "pop %p dest %p src %p len %zu", pop
, dest
, src
, len
);
2640 PMEMOBJ_API_START();
2642 void *ptr
= pmemops_memcpy(&pop
->p_ops
, dest
, src
, len
, 0);
2649 * pmemobj_memset_persist -- pmemobj version of memset
2652 pmemobj_memset_persist(PMEMobjpool
*pop
, void *dest
, int c
, size_t len
)
2654 LOG(15, "pop %p dest %p c 0x%02x len %zu", pop
, dest
, c
, len
);
2655 PMEMOBJ_API_START();
2657 void *ptr
= pmemops_memset(&pop
->p_ops
, dest
, c
, len
, 0);
2664 * pmemobj_memcpy -- pmemobj version of memcpy
2667 pmemobj_memcpy(PMEMobjpool
*pop
, void *dest
, const void *src
, size_t len
,
2670 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop
, dest
, src
, len
,
2673 PMEMOBJ_API_START();
2675 void *ptr
= pmemops_memcpy(&pop
->p_ops
, dest
, src
, len
, flags
);
2682 * pmemobj_memmove -- pmemobj version of memmove
2685 pmemobj_memmove(PMEMobjpool
*pop
, void *dest
, const void *src
, size_t len
,
2688 LOG(15, "pop %p dest %p src %p len %zu flags 0x%x", pop
, dest
, src
, len
,
2691 PMEMOBJ_API_START();
2693 void *ptr
= pmemops_memmove(&pop
->p_ops
, dest
, src
, len
, flags
);
2700 * pmemobj_memset -- pmemobj version of memset
2703 pmemobj_memset(PMEMobjpool
*pop
, void *dest
, int c
, size_t len
, unsigned flags
)
2705 LOG(15, "pop %p dest %p c 0x%02x len %zu flags 0x%x", pop
, dest
, c
, len
,
2708 PMEMOBJ_API_START();
2710 void *ptr
= pmemops_memset(&pop
->p_ops
, dest
, c
, len
, flags
);
2717 * pmemobj_persist -- pmemobj version of pmem_persist
2720 pmemobj_persist(PMEMobjpool
*pop
, const void *addr
, size_t len
)
2722 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
2724 pmemops_persist(&pop
->p_ops
, addr
, len
);
2728 * pmemobj_flush -- pmemobj version of pmem_flush
2731 pmemobj_flush(PMEMobjpool
*pop
, const void *addr
, size_t len
)
2733 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
2735 pmemops_flush(&pop
->p_ops
, addr
, len
);
2739 * pmemobj_xpersist -- pmemobj version of pmem_persist with additional flags
2743 pmemobj_xpersist(PMEMobjpool
*pop
, const void *addr
, size_t len
, unsigned flags
)
2745 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
2747 if (flags
& ~OBJ_X_VALID_FLAGS
) {
2749 ERR("invalid flags 0x%x", flags
);
2753 return pmemops_xpersist(&pop
->p_ops
, addr
, len
, flags
);
2757 * pmemobj_xflush -- pmemobj version of pmem_flush with additional flags
2761 pmemobj_xflush(PMEMobjpool
*pop
, const void *addr
, size_t len
, unsigned flags
)
2763 LOG(15, "pop %p addr %p len %zu", pop
, addr
, len
);
2765 if (flags
& ~OBJ_X_VALID_FLAGS
) {
2767 ERR("invalid flags 0x%x", flags
);
2771 return pmemops_xflush(&pop
->p_ops
, addr
, len
, flags
);
2775 * pmemobj_drain -- pmemobj version of pmem_drain
2778 pmemobj_drain(PMEMobjpool
*pop
)
2780 LOG(15, "pop %p", pop
);
2782 pmemops_drain(&pop
->p_ops
);
2786 * pmemobj_type_num -- returns type number of object
2789 pmemobj_type_num(PMEMoid oid
)
2791 LOG(3, "oid.off 0x%016" PRIx64
, oid
.off
);
2793 ASSERT(!OID_IS_NULL(oid
));
2795 PMEMobjpool
*pop
= pmemobj_pool_by_oid(oid
);
2797 ASSERTne(pop
, NULL
);
2798 ASSERT(OBJ_OID_IS_VALID(pop
, oid
));
2800 return palloc_extra(&pop
->heap
, oid
.off
);
2803 /* arguments for constructor_alloc_root */
2806 pmemobj_constr constructor
;
2811 * obj_realloc_root -- (internal) reallocate root object
2814 obj_alloc_root(PMEMobjpool
*pop
, size_t size
,
2815 pmemobj_constr constructor
, void *arg
)
2817 LOG(3, "pop %p size %zu", pop
, size
);
2819 struct carg_realloc carg
;
2821 carg
.ptr
= OBJ_OFF_TO_PTR(pop
, pop
->root_offset
);
2822 carg
.old_size
= pop
->root_size
;
2823 carg
.new_size
= size
;
2824 carg
.user_type
= POBJ_ROOT_TYPE_NUM
;
2825 carg
.constructor
= constructor
;
2829 struct operation_context
*ctx
= pmalloc_operation_hold(pop
);
2831 operation_add_entry(ctx
, &pop
->root_size
, size
, ULOG_OPERATION_SET
);
2833 int ret
= palloc_operation(&pop
->heap
, pop
->root_offset
,
2834 &pop
->root_offset
, size
,
2835 constructor_zrealloc_root
, &carg
,
2836 POBJ_ROOT_TYPE_NUM
, OBJ_INTERNAL_OBJECT_MASK
,
2839 pmalloc_operation_release(pop
);
2845 * pmemobj_root_size -- returns size of the root object
2848 pmemobj_root_size(PMEMobjpool
*pop
)
2850 LOG(3, "pop %p", pop
);
2852 if (pop
->root_offset
&& pop
->root_size
) {
2853 return pop
->root_size
;
2859 * pmemobj_root_construct -- returns root object
2862 pmemobj_root_construct(PMEMobjpool
*pop
, size_t size
,
2863 pmemobj_constr constructor
, void *arg
)
2865 LOG(3, "pop %p size %zu constructor %p args %p", pop
, size
, constructor
,
2868 if (size
> PMEMOBJ_MAX_ALLOC_SIZE
) {
2869 ERR("requested size too large");
2874 if (size
== 0 && pop
->root_offset
== 0) {
2875 ERR("requested size cannot equals zero");
2880 PMEMOBJ_API_START();
2884 pmemobj_mutex_lock_nofail(pop
, &pop
->rootlock
);
2886 if (size
> pop
->root_size
&&
2887 obj_alloc_root(pop
, size
, constructor
, arg
)) {
2888 pmemobj_mutex_unlock_nofail(pop
, &pop
->rootlock
);
2889 LOG(2, "obj_realloc_root failed");
2894 root
.pool_uuid_lo
= pop
->uuid_lo
;
2895 root
.off
= pop
->root_offset
;
2897 pmemobj_mutex_unlock_nofail(pop
, &pop
->rootlock
);
2904 * pmemobj_root -- returns root object
2907 pmemobj_root(PMEMobjpool
*pop
, size_t size
)
2909 LOG(3, "pop %p size %zu", pop
, size
);
2911 PMEMOBJ_API_START();
2912 PMEMoid oid
= pmemobj_root_construct(pop
, size
, NULL
, NULL
);
2918 * pmemobj_first - returns first object of specified type
2921 pmemobj_first(PMEMobjpool
*pop
)
2923 LOG(3, "pop %p", pop
);
2925 PMEMoid ret
= {0, 0};
2927 uint64_t off
= palloc_first(&pop
->heap
);
2930 ret
.pool_uuid_lo
= pop
->uuid_lo
;
2932 if (palloc_flags(&pop
->heap
, off
) & OBJ_INTERNAL_OBJECT_MASK
) {
2933 return pmemobj_next(ret
);
2941 * pmemobj_next - returns next object of specified type
2944 pmemobj_next(PMEMoid oid
)
2946 LOG(3, "oid.off 0x%016" PRIx64
, oid
.off
);
2952 PMEMobjpool
*pop
= pmemobj_pool_by_oid(curr
);
2953 ASSERTne(pop
, NULL
);
2956 ASSERT(OBJ_OID_IS_VALID(pop
, curr
));
2957 uint64_t next_off
= palloc_next(&pop
->heap
, curr
.off
);
2962 /* next object exists */
2963 curr
.off
= next_off
;
2965 } while (palloc_flags(&pop
->heap
, curr
.off
) & OBJ_INTERNAL_OBJECT_MASK
);
2971 * pmemobj_reserve -- reserves a single object
2974 pmemobj_reserve(PMEMobjpool
*pop
, struct pobj_action
*act
,
2975 size_t size
, uint64_t type_num
)
2977 LOG(3, "pop %p act %p size %zu type_num %llx",
2979 (unsigned long long)type_num
);
2981 PMEMOBJ_API_START();
2982 PMEMoid oid
= OID_NULL
;
2984 if (palloc_reserve(&pop
->heap
, size
, NULL
, NULL
, type_num
,
2985 0, 0, 0, act
) != 0) {
2990 oid
.off
= act
->heap
.offset
;
2991 oid
.pool_uuid_lo
= pop
->uuid_lo
;
2998 * pmemobj_xreserve -- reserves a single object
3001 pmemobj_xreserve(PMEMobjpool
*pop
, struct pobj_action
*act
,
3002 size_t size
, uint64_t type_num
, uint64_t flags
)
3004 LOG(3, "pop %p act %p size %zu type_num %llx flags %llx",
3006 (unsigned long long)type_num
, (unsigned long long)flags
);
3008 PMEMoid oid
= OID_NULL
;
3010 if (flags
& ~POBJ_ACTION_XRESERVE_VALID_FLAGS
) {
3011 ERR("unknown flags 0x%" PRIx64
,
3012 flags
& ~POBJ_ACTION_XRESERVE_VALID_FLAGS
);
3017 PMEMOBJ_API_START();
3018 struct constr_args carg
;
3020 carg
.zero_init
= flags
& POBJ_FLAG_ZERO
;
3021 carg
.constructor
= NULL
;
3024 if (palloc_reserve(&pop
->heap
, size
, constructor_alloc
, &carg
,
3025 type_num
, 0, CLASS_ID_FROM_FLAG(flags
),
3026 ARENA_ID_FROM_FLAG(flags
), act
) != 0) {
3031 oid
.off
= act
->heap
.offset
;
3032 oid
.pool_uuid_lo
= pop
->uuid_lo
;
3039 * pmemobj_set_value -- creates an action to set a value
3042 pmemobj_set_value(PMEMobjpool
*pop
, struct pobj_action
*act
,
3043 uint64_t *ptr
, uint64_t value
)
3045 palloc_set_value(&pop
->heap
, act
, ptr
, value
);
3049 * pmemobj_defer_free -- creates a deferred free action
3052 pmemobj_defer_free(PMEMobjpool
*pop
, PMEMoid oid
, struct pobj_action
*act
)
3054 ASSERT(!OID_IS_NULL(oid
));
3055 palloc_defer_free(&pop
->heap
, oid
.off
, act
);
3059 * pmemobj_publish -- publishes a collection of actions
3062 pmemobj_publish(PMEMobjpool
*pop
, struct pobj_action
*actv
, size_t actvcnt
)
3064 PMEMOBJ_API_START();
3065 struct operation_context
*ctx
= pmalloc_operation_hold(pop
);
3067 size_t entries_size
= actvcnt
* sizeof(struct ulog_entry_val
);
3069 if (operation_reserve(ctx
, entries_size
) != 0) {
3074 palloc_publish(&pop
->heap
, actv
, actvcnt
, ctx
);
3076 pmalloc_operation_release(pop
);
3083 * pmemobj_cancel -- cancels collection of actions
3086 pmemobj_cancel(PMEMobjpool
*pop
, struct pobj_action
*actv
, size_t actvcnt
)
3088 PMEMOBJ_API_START();
3089 palloc_cancel(&pop
->heap
, actv
, actvcnt
);
3094 * pmemobj_defrag -- reallocates provided PMEMoids so that the underlying memory
3095 * is efficiently arranged.
3098 pmemobj_defrag(PMEMobjpool
*pop
, PMEMoid
**oidv
, size_t oidcnt
,
3099 struct pobj_defrag_result
*result
)
3101 PMEMOBJ_API_START();
3104 result
->relocated
= 0;
3108 uint64_t **objv
= Malloc(sizeof(uint64_t *) * oidcnt
);
3115 for (size_t i
= 0; i
< oidcnt
; ++i
) {
3116 if (OID_IS_NULL(*oidv
[i
]))
3118 if (oidv
[i
]->pool_uuid_lo
!= pop
->uuid_lo
) {
3120 ERR("Not all PMEMoids belong to the provided pool");
3123 objv
[j
++] = &oidv
[i
]->off
;
3126 struct operation_context
*ctx
= pmalloc_operation_hold(pop
);
3128 ret
= palloc_defrag(&pop
->heap
, objv
, j
, ctx
, result
);
3130 pmalloc_operation_release(pop
);
3140 * pmemobj_list_insert -- adds object to a list
3143 pmemobj_list_insert(PMEMobjpool
*pop
, size_t pe_offset
, void *head
,
3144 PMEMoid dest
, int before
, PMEMoid oid
)
3146 LOG(3, "pop %p pe_offset %zu head %p dest.off 0x%016" PRIx64
3147 " before %d oid.off 0x%016" PRIx64
,
3148 pop
, pe_offset
, head
, dest
.off
, before
, oid
.off
);
3149 PMEMOBJ_API_START();
3151 /* log notice message if used inside a transaction */
3152 _POBJ_DEBUG_NOTICE_IN_TX();
3153 ASSERT(OBJ_OID_IS_VALID(pop
, oid
));
3154 ASSERT(OBJ_OID_IS_VALID(pop
, dest
));
3156 ASSERT(pe_offset
<= pmemobj_alloc_usable_size(dest
)
3157 - sizeof(struct list_entry
));
3158 ASSERT(pe_offset
<= pmemobj_alloc_usable_size(oid
)
3159 - sizeof(struct list_entry
));
3161 int ret
= list_insert(pop
, (ssize_t
)pe_offset
, head
, dest
, before
, oid
);
3168 * pmemobj_list_insert_new -- adds new object to a list
3171 pmemobj_list_insert_new(PMEMobjpool
*pop
, size_t pe_offset
, void *head
,
3172 PMEMoid dest
, int before
, size_t size
,
3174 pmemobj_constr constructor
, void *arg
)
3176 LOG(3, "pop %p pe_offset %zu head %p dest.off 0x%016" PRIx64
3177 " before %d size %zu type_num %" PRIu64
,
3178 pop
, pe_offset
, head
, dest
.off
, before
, size
, type_num
);
3180 /* log notice message if used inside a transaction */
3181 _POBJ_DEBUG_NOTICE_IN_TX();
3182 ASSERT(OBJ_OID_IS_VALID(pop
, dest
));
3184 ASSERT(pe_offset
<= pmemobj_alloc_usable_size(dest
)
3185 - sizeof(struct list_entry
));
3186 ASSERT(pe_offset
<= size
- sizeof(struct list_entry
));
3188 if (size
> PMEMOBJ_MAX_ALLOC_SIZE
) {
3189 ERR("requested size too large");
3194 PMEMOBJ_API_START();
3195 struct constr_args carg
;
3197 carg
.constructor
= constructor
;
3201 PMEMoid retoid
= OID_NULL
;
3202 list_insert_new_user(pop
, pe_offset
, head
, dest
, before
, size
, type_num
,
3203 constructor_alloc
, &carg
, &retoid
);
3210 * pmemobj_list_remove -- removes object from a list
3213 pmemobj_list_remove(PMEMobjpool
*pop
, size_t pe_offset
, void *head
,
3214 PMEMoid oid
, int free
)
3216 LOG(3, "pop %p pe_offset %zu head %p oid.off 0x%016" PRIx64
" free %d",
3217 pop
, pe_offset
, head
, oid
.off
, free
);
3218 PMEMOBJ_API_START();
3220 /* log notice message if used inside a transaction */
3221 _POBJ_DEBUG_NOTICE_IN_TX();
3222 ASSERT(OBJ_OID_IS_VALID(pop
, oid
));
3224 ASSERT(pe_offset
<= pmemobj_alloc_usable_size(oid
)
3225 - sizeof(struct list_entry
));
3229 ret
= list_remove_free_user(pop
, pe_offset
, head
, &oid
);
3231 ret
= list_remove(pop
, (ssize_t
)pe_offset
, head
, oid
);
3238 * pmemobj_list_move -- moves object between lists
3241 pmemobj_list_move(PMEMobjpool
*pop
, size_t pe_old_offset
, void *head_old
,
3242 size_t pe_new_offset
, void *head_new
,
3243 PMEMoid dest
, int before
, PMEMoid oid
)
3245 LOG(3, "pop %p pe_old_offset %zu pe_new_offset %zu"
3246 " head_old %p head_new %p dest.off 0x%016" PRIx64
3247 " before %d oid.off 0x%016" PRIx64
"",
3248 pop
, pe_old_offset
, pe_new_offset
,
3249 head_old
, head_new
, dest
.off
, before
, oid
.off
);
3250 PMEMOBJ_API_START();
3252 /* log notice message if used inside a transaction */
3253 _POBJ_DEBUG_NOTICE_IN_TX();
3255 ASSERT(OBJ_OID_IS_VALID(pop
, oid
));
3256 ASSERT(OBJ_OID_IS_VALID(pop
, dest
));
3258 ASSERT(pe_old_offset
<= pmemobj_alloc_usable_size(oid
)
3259 - sizeof(struct list_entry
));
3260 ASSERT(pe_new_offset
<= pmemobj_alloc_usable_size(oid
)
3261 - sizeof(struct list_entry
));
3262 ASSERT(pe_old_offset
<= pmemobj_alloc_usable_size(dest
)
3263 - sizeof(struct list_entry
));
3264 ASSERT(pe_new_offset
<= pmemobj_alloc_usable_size(dest
)
3265 - sizeof(struct list_entry
));
3267 int ret
= list_move(pop
, pe_old_offset
, head_old
,
3268 pe_new_offset
, head_new
,
3276 * pmemobj_ctl_getU -- programmatically executes a read ctl query
3282 pmemobj_ctl_getU(PMEMobjpool
*pop
, const char *name
, void *arg
)
3284 LOG(3, "pop %p name %s arg %p", pop
, name
, arg
);
3285 return ctl_query(pop
== NULL
? NULL
: pop
->ctl
, pop
,
3286 CTL_QUERY_PROGRAMMATIC
, name
, CTL_QUERY_READ
, arg
);
3290 * pmemobj_ctl_setU -- programmatically executes a write ctl query
3296 pmemobj_ctl_setU(PMEMobjpool
*pop
, const char *name
, void *arg
)
3298 LOG(3, "pop %p name %s arg %p", pop
, name
, arg
);
3299 return ctl_query(pop
== NULL
? NULL
: pop
->ctl
, pop
,
3300 CTL_QUERY_PROGRAMMATIC
, name
, CTL_QUERY_WRITE
, arg
);
3304 * pmemobj_ctl_execU -- programmatically executes a runnable ctl query
3310 pmemobj_ctl_execU(PMEMobjpool
*pop
, const char *name
, void *arg
)
3312 LOG(3, "pop %p name %s arg %p", pop
, name
, arg
);
3313 return ctl_query(pop
== NULL
? NULL
: pop
->ctl
, pop
,
3314 CTL_QUERY_PROGRAMMATIC
, name
, CTL_QUERY_RUNNABLE
, arg
);
3319 * pmemobj_ctl_get -- programmatically executes a read ctl query
3322 pmemobj_ctl_get(PMEMobjpool
*pop
, const char *name
, void *arg
)
3324 return pmemobj_ctl_getU(pop
, name
, arg
);
3328 * pmemobj_ctl_set -- programmatically executes a write ctl query
3331 pmemobj_ctl_set(PMEMobjpool
*pop
, const char *name
, void *arg
)
3333 PMEMOBJ_API_START();
3335 int ret
= pmemobj_ctl_setU(pop
, name
, arg
);
3342 * pmemobj_ctl_exec -- programmatically executes a runnable ctl query
3345 pmemobj_ctl_exec(PMEMobjpool
*pop
, const char *name
, void *arg
)
3347 PMEMOBJ_API_START();
3349 int ret
= pmemobj_ctl_execU(pop
, name
, arg
);
3356 * pmemobj_ctl_getW -- programmatically executes a read ctl query
3359 pmemobj_ctl_getW(PMEMobjpool
*pop
, const wchar_t *name
, void *arg
)
3361 char *uname
= util_toUTF8(name
);
3365 int ret
= pmemobj_ctl_getU(pop
, uname
, arg
);
3366 util_free_UTF8(uname
);
3372 * pmemobj_ctl_setW -- programmatically executes a write ctl query
3375 pmemobj_ctl_setW(PMEMobjpool
*pop
, const wchar_t *name
, void *arg
)
3377 char *uname
= util_toUTF8(name
);
3381 int ret
= pmemobj_ctl_setU(pop
, uname
, arg
);
3382 util_free_UTF8(uname
);
3388 * pmemobj_ctl_execW -- programmatically executes a runnable ctl query
3391 pmemobj_ctl_execW(PMEMobjpool
*pop
, const wchar_t *name
, void *arg
)
3393 char *uname
= util_toUTF8(name
);
3397 int ret
= pmemobj_ctl_execU(pop
, uname
, arg
);
3398 util_free_UTF8(uname
);
/*
 * _pobj_debug_notice -- logs notice message if used inside a transaction
 *
 * Called by _POBJ_DEBUG_NOTICE_IN_TX() from the non-transactional API
 * entry points; only logs when a transaction is currently open.
 * 'file' may be NULL, in which case the location is omitted.
 */
void
_pobj_debug_notice(const char *api_name, const char *file, int line)
{
	/* NOTE(review): #ifdef DEBUG guard and branch shape reconstructed */
#ifdef DEBUG
	if (pmemobj_tx_stage() != TX_STAGE_NONE) {
		if (file)
			LOG(4, "Notice: non-transactional API"
				" used inside a transaction (%s in %s:%d)",
				api_name, file, line);
		else
			LOG(4, "Notice: non-transactional API"
				" used inside a transaction (%s)", api_name);
	}
#endif
}
#if VG_PMEMCHECK_ENABLED
/*
 * pobj_emit_log -- logs library and function names to pmemcheck store log
 */
void
pobj_emit_log(const char *func, int order)
{
	util_emit_log("libpmemobj", func, order);
}
#endif
3436 pmemobj_inject_fault_at(enum pmem_allocation_type type
, int nth
,
3439 core_inject_fault_at(type
, nth
, at
);
3443 pmemobj_fault_injection_enabled(void)
3445 return core_fault_injection_enabled();