]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/pmdk/src/libpmempool/sync.c
import ceph 16.2.7
[ceph.git] / ceph / src / pmdk / src / libpmempool / sync.c
diff --git a/ceph/src/pmdk/src/libpmempool/sync.c b/ceph/src/pmdk/src/libpmempool/sync.c
new file mode 100644 (file)
index 0000000..b7c0cb4
--- /dev/null
@@ -0,0 +1,1646 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2016-2020, Intel Corporation */
+
+/*
+ * sync.c -- a module for poolset synchronizing
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#include "libpmem.h"
+#include "replica.h"
+#include "out.h"
+#include "os.h"
+#include "util_pmem.h"
+#include "util.h"
+
+#ifdef USE_RPMEM
+#include "rpmem_common.h"
+#include "rpmem_ssh.h"
+#endif
+
+#define BB_DATA_STR "offset 0x%zx, length 0x%zx, nhealthy %i"
+
+/*
+ * defines 'struct bb_vec' - the vector of the 'struct bad_block' structures
+ *
+ * BB_DATA_STR (above) is the shared format fragment appended to the messages
+ * of this file's LOG() calls that print a single bad block: its offset,
+ * its length and its 'nhealthy' field (in this order).
+ */
+VEC(bb_vec, struct bad_block);
+
+/*
+ * validate_args -- (internal) sanity-check the poolset before synchronizing
+ *
+ * Runs the part-size check (against PMEMOBJ_MIN_POOL) first and the part
+ * directory check second; the second one is not run if the first one fails.
+ *
+ * Returns 0 if both checks pass. Otherwise returns -1 with errno set:
+ * a non-zero errno left by the failed check is kept, a zero one is
+ * replaced with EINVAL.
+ */
+static int
+validate_args(struct pool_set *set)
+{
+       LOG(3, "set %p", set);
+       ASSERTne(set, NULL);
+
+       /* a failure of either check hints at an incorrect poolset file */
+       int failed = 0;
+
+       if (replica_check_part_sizes(set, PMEMOBJ_MIN_POOL)) {
+               /* some part is too small (only pmemobj pools are replicated) */
+               LOG(2, "part sizes check failed");
+               failed = 1;
+       } else if (replica_check_part_dirs(set)) {
+               /* a directory of some part file is missing */
+               LOG(2, "part directories check failed");
+               failed = 1;
+       }
+
+       if (!failed)
+               return 0;
+
+       if (errno == 0)
+               errno = EINVAL;
+       return -1;
+}
+
+/*
+ * sync_copy_data -- (internal) transfer 'len' bytes from the healthy replica
+ *                   'rep_h' to the replica 'rep' which is being repaired
+ *
+ * One of three paths is taken, checked in this order:
+ * - 'rep' is remote:   the range (off, len) is sent with Rpmem_persist(),
+ *                      'src_addr' and 'dst_addr' are not used,
+ * - 'rep_h' is remote: the range (off, len) is read into 'dst_addr'
+ *                      with Rpmem_read(),
+ * - otherwise:         'len' bytes are copied from 'src_addr' to 'dst_addr'
+ *                      and persisted according to 'part->is_dev_dax'.
+ *
+ * Returns 0 on success or -1 if the remote operation reported a failure.
+ */
+static int
+sync_copy_data(void *src_addr, void *dst_addr, size_t off, size_t len,
+               struct pool_replica *rep_h,
+               struct pool_replica *rep, const struct pool_set_part *part)
+{
+       LOG(3, "src_addr %p dst_addr %p off %zu len %zu "
+               "rep_h %p rep %p part %p",
+               src_addr, dst_addr, off, len, rep_h, rep, part);
+
+       if (rep->remote) {
+               /* the destination replica is a remote one */
+               LOG(10,
+                       "copying data (offset 0x%zx length 0x%zx) to remote node -- '%s' on '%s'",
+                       off, len,
+                       rep->remote->pool_desc,
+                       rep->remote->node_addr);
+
+               if (Rpmem_persist(rep->remote->rpp, off, len, 0, 0)) {
+                       LOG(1,
+                               "copying data to remote node failed -- '%s' on '%s'",
+                               rep->remote->pool_desc,
+                               rep->remote->node_addr);
+                       return -1;
+               }
+
+               return 0;
+       }
+
+       if (rep_h->remote) {
+               /* the healthy (source) replica is a remote one */
+               LOG(10,
+                       "reading data (offset 0x%zx length 0x%zx) from remote node -- '%s' on '%s'",
+                       off, len,
+                       rep_h->remote->pool_desc,
+                       rep_h->remote->node_addr);
+
+               if (Rpmem_read(rep_h->remote->rpp, dst_addr, off, len, 0)) {
+                       LOG(1,
+                               "reading data from remote node failed -- '%s' on '%s'",
+                               rep_h->remote->pool_desc,
+                               rep_h->remote->node_addr);
+                       return -1;
+               }
+
+               return 0;
+       }
+
+       /* neither replica is remote - a plain memory copy is enough */
+       LOG(10,
+               "copying data (offset 0x%zx length 0x%zx) from local replica -- '%s'",
+               off, len, rep_h->part[0].path);
+
+       memcpy(dst_addr, src_addr, len);
+       util_persist(part->is_dev_dax, dst_addr, len);
+
+       return 0;
+}
+
+/*
+ * sync_recreate_header -- (internal) recreate the header of part 'p'
+ *                         of replica 'r'
+ *
+ * The pool attributes are taken from the header 'src_hdr' and passed
+ * to util_header_create().
+ *
+ * Returns 0 on success. If the header cannot be created,
+ * errno is set to EINVAL and -1 is returned.
+ */
+static int
+sync_recreate_header(struct pool_set *set, unsigned r, unsigned p,
+                       struct pool_hdr *src_hdr)
+{
+       LOG(3, "set %p replica %u part %u src_hdr %p", set, r, p, src_hdr);
+
+       /* convert the source header to the set of pool attributes */
+       struct pool_attr attr;
+       util_pool_hdr2attr(&attr, src_hdr);
+
+       if (util_header_create(set, r, p, &attr, 1) == 0)
+               return 0;
+
+       LOG(1, "part headers create failed for replica %u part %u",
+               r, p);
+       errno = EINVAL;
+       return -1;
+}
+
+/*
+ * sync_mark_replica_no_badblocks -- (internal) clear the HAS_BAD_BLOCKS flag
+ *                                              of the replica 'repn'
+ *
+ * Nothing is changed (and nothing is logged) if the flag is not set.
+ */
+static void
+sync_mark_replica_no_badblocks(unsigned repn,
+                               struct poolset_health_status *set_hs)
+{
+       LOG(3, "repn %u set_hs %p", repn, set_hs);
+
+       struct replica_health_status *rhs = REP_HEALTH(set_hs, repn);
+
+       if (!(rhs->flags & HAS_BAD_BLOCKS))
+               return; /* already marked as free of bad blocks */
+
+       rhs->flags &= ~HAS_BAD_BLOCKS;
+       LOG(4, "replica %u has no bad blocks now", repn);
+}
+
+/*
+ * sync_mark_part_no_badblocks -- (internal) clear the HAS_BAD_BLOCKS flag
+ *                                           of the part 'partn'
+ *                                           of the replica 'repn'
+ *
+ * Nothing is changed (and nothing is logged) if the flag is not set.
+ */
+static void
+sync_mark_part_no_badblocks(unsigned repn, unsigned partn,
+                               struct poolset_health_status *set_hs)
+{
+       LOG(3, "repn %u partn %u set_hs %p", repn, partn, set_hs);
+
+       struct replica_health_status *rhs = REP_HEALTH(set_hs, repn);
+       struct part_health_status *phs =
+                       &rhs->part[PART_HEALTHidx(rhs, partn)];
+
+       if (!(phs->flags & HAS_BAD_BLOCKS))
+               return; /* already marked as free of bad blocks */
+
+       phs->flags &= ~HAS_BAD_BLOCKS;
+       LOG(4, "replica %u part %u has no bad blocks now", repn, partn);
+}
+
+/*
+ * sync_recalc_badblocks -- (internal) convert offsets and lengths of all
+ *                          bad blocks from part-relative to absolute ones
+ *                          (relative to the beginning of the pool)
+ *
+ * Every part of every replica which has bad blocks is visited and its
+ * bad blocks are updated in place:
+ * - a bad block ending within the first 'hdrsize' bytes of the part
+ *   is left untouched,
+ * - for parts other than the first one (when headers are in use) the header
+ *   is cut off from the bad block,
+ * - the result is aligned by replica_align_badblock_offset_length()
+ *   and shifted by the offset of the part.
+ *
+ * Always returns 0.
+ */
+static int
+sync_recalc_badblocks(struct pool_set *set,
+                       struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p set_hs %p", set, set_hs);
+
+       /* header size for all headers but the first one */
+       size_t hdrsize = Mmap_align;
+       if (set->options & (OPTION_SINGLEHDR | OPTION_NOHDRS))
+               hdrsize = 0;
+
+       for (unsigned r = 0; r < set->nreplicas; ++r) {
+               struct pool_replica *rep = REP(set, r);
+               struct replica_health_status *rep_hs = set_hs->replica[r];
+
+               for (unsigned p = 0; p < rep->nparts; ++p) {
+                       struct part_health_status *phs = &rep_hs->part[p];
+
+                       /* only parts with bad blocks are of interest */
+                       if (!replica_part_has_bad_blocks(phs))
+                               continue;
+
+                       ASSERTne(phs->bbs.bb_cnt, 0);
+                       ASSERTne(phs->bbs.bbv, NULL);
+
+                       LOG(10, "Replica %u part %u HAS %u bad blocks",
+                               r, p, phs->bbs.bb_cnt);
+
+                       size_t part_off = replica_get_part_offset(set, r, p);
+
+                       for (unsigned i = 0; i < phs->bbs.bb_cnt; i++) {
+                               struct bad_block *bb = &phs->bbs.bbv[i];
+
+                               LOG(10,
+                                       "relative bad block #%i: offset %zu, length %zu",
+                                       i,
+                                       bb->offset,
+                                       bb->length);
+
+                               size_t off = bb->offset;
+                               size_t len = bb->length;
+
+                               /* the bad block ends within 'hdrsize' bytes */
+                               if (len + off <= hdrsize)
+                                       continue;
+
+                               /* parts #>0 are mapped without the header */
+                               if (p > 0 && hdrsize > 0) {
+                                       if (off < hdrsize) {
+                                               /*
+                                                * The bad block starts
+                                                * in the header: drop the
+                                                * header's share of the length
+                                                * and start from offset 0.
+                                                */
+                                               len -= hdrsize - off;
+                                               off = 0;
+                                       } else {
+                                               /*
+                                                * The bad block is entirely
+                                                * past the header: only
+                                                * the offset has to be moved.
+                                                */
+                                               off -= hdrsize;
+                                       }
+                               }
+
+                               replica_align_badblock_offset_length(&off, &len,
+                                                               set, r, p);
+
+                               bb->offset = part_off + off;
+                               bb->length = (unsigned)len;
+
+                               LOG(10,
+                                       "absolute bad block #%i: offset 0x%zx, length 0x%zx",
+                                       i,
+                                       bb->offset,
+                                       bb->length);
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * sync_badblocks_find_healthy_replica -- (internal) look for a healthy replica
+ *                                                   for each bad block
+ *
+ * This function looks for a healthy replica for each bad block. Bad blocks
+ * can overlap across replicas, so each bad block may have to be divided
+ * into smaller parts which can be fixed using different healthy replicas.
+ *
+ * Key variables:
+ * - bbv_all[] - array containing all (possibly divided) bad blocks
+ *               from all previous replicas.
+ * - bbv_aux[] - array containing all (possibly divided) bad blocks
+ *               from all previous parts of the current replica merged with
+ *               those bad blocks from bbv_all[] whose offsets are less than
+ *               or equal to the greatest bad block's offset
+ *               in the previous part.
+ *
+ * This function merges bad blocks from bbv_all[] with bad blocks
+ * from the current part and writes the outcome bad blocks to bbv_aux[].
+ * Only bad blocks with offsets less than or equal to the greatest
+ * bad block's offset in the current part will be moved from bbv_all[]
+ * to bbv_aux[]. The rest of them have to be moved at the end
+ * by sync_badblocks_move_vec().
+ *
+ * bbv_aux[] becomes new bbv_all[] and bbv_aux[] is zeroed
+ * before checking the next replica (bbv_all = bbv_aux; bbv_aux = 0).
+ *
+ * For example (all replicas have only one part):
+ * - bbv_all with rep#0: |__----___________----__|
+ * - merged with  rep#1: |____----_______----____|
+ * - gives such bbv_aux: |__11--00_______00--11__|
+ * - merged with  rep#2: |__________---__________|
+ * - gives such bbv_aux: |__112200__000__002211__| (all bad blocks can be fixed)
+ *
+ * where:
+ *   '_' stands for a healthy block (no bad block)
+ *   '-' stands for a bad block with nhealthy == NO_HEALTHY_REPLICA
+ *   'N' stands for a bad block with nhealthy == N (can be fixed using rep#N)
+ *
+ * Arguments:
+ * - phs      - health status of the current part; its bad blocks
+ *              (phs->bbs.bbv[0 .. phs->bbs.bb_cnt - 1]) are only read.
+ * - rep      - number of the current replica; it is assigned as 'nhealthy'
+ *              to the pieces of previous bad blocks which do not overlap
+ *              with bad blocks of the current part.
+ * - pbbv_all - the bbv_all[] vector; its elements are modified in place
+ *              ('nhealthy' is assigned, 'offset' and 'length' are updated
+ *              to describe the piece which has not been consumed yet).
+ * - pbbv_aux - the bbv_aux[] vector the outcome is appended to.
+ * - i_all    - (in/out) index of the next element of bbv_all[] to be fetched.
+ *              It is stepped back by one before returning if the element
+ *              fetched last still has a non-zero length, so that the same
+ *              element is fetched again for the next part.
+ *              It is not touched if bbv_all[] is empty.
+ *
+ * Returns 0 on success or -1 if appending to bbv_aux[] fails.
+ */
+static int
+sync_badblocks_find_healthy_replica(struct part_health_status *phs,
+                                       int rep,
+                                       struct bb_vec *pbbv_all,
+                                       struct bb_vec *pbbv_aux,
+                                       unsigned *i_all)
+{
+       LOG(3, "phs %p rep %i pbbv_all %p pbbv_aux %p i_all %i",
+               phs, rep, pbbv_all, pbbv_aux, *i_all);
+
+       struct bad_block bb_add;        /* the element which is being added  */
+       struct bad_block bb_new;        /* a new element */
+       struct bad_block *pbb_all;      /* current element of bbv_all[] */
+
+       unsigned long long beg_prev;
+       unsigned long long end_prev;
+       unsigned long long beg_new;
+       unsigned long long end_new;
+       size_t len_prev;
+       size_t len_new;
+
+       size_t size_all = VEC_SIZE(pbbv_all);
+
+       if (size_all == 0) {
+               /* there were no bad blocks so far, so fill up bbv_aux[] */
+               for (unsigned i = 0; i < phs->bbs.bb_cnt; i++) {
+                       bb_add = phs->bbs.bbv[i];
+
+                       if (rep > 0)
+                               /* bad block can be fixed with replica #0 */
+                               bb_add.nhealthy = 0;
+
+                       if (VEC_PUSH_BACK(pbbv_aux, bb_add))
+                               return -1;
+
+                       LOG(10,
+                               "added bad block (prev-empty): " BB_DATA_STR,
+                               bb_add.offset, bb_add.length, bb_add.nhealthy);
+               }
+       } else {
+               if (*i_all < size_all) {
+                       pbb_all = VEC_GET(pbbv_all, (*i_all)++);
+               } else {
+                       pbb_all = NULL;
+               }
+
+               for (unsigned i = 0; i < phs->bbs.bb_cnt; i++) {
+                       bb_new = phs->bbs.bbv[i];
+
+                       LOG(10,
+                               " * (%u) inserting new bad block: " BB_DATA_STR,
+                               i + 1,
+                               bb_new.offset, bb_new.length, bb_new.nhealthy);
+                       /*
+                        * pbb_all is missing or has been entirely used up,
+                        * so try to fetch the next element of bbv_all[]
+                        */
+                       if (pbb_all == NULL || pbb_all->length == 0) {
+                               if (*i_all < size_all)
+                                       pbb_all = VEC_GET(pbbv_all, (*i_all)++);
+                               else
+                                       pbb_all = NULL;
+                       }
+
+                       /* all from bbv_all ending before the bb_new begins */
+                       while (pbb_all != NULL && pbb_all->offset
+                                                       + pbb_all->length - 1
+                                                       < bb_new.offset) {
+                               if (pbb_all->nhealthy == NO_HEALTHY_REPLICA)
+                                       /* can be fixed with this replica */
+                                       pbb_all->nhealthy = rep;
+
+                               if (VEC_PUSH_BACK(pbbv_aux, *pbb_all))
+                                       return -1;
+
+                               LOG(10,
+                                       "added bad block (prev-before): "
+                                       BB_DATA_STR,
+                                       pbb_all->offset, pbb_all->length,
+                                       pbb_all->nhealthy);
+
+                               if (*i_all < size_all) {
+                                       pbb_all = VEC_GET(pbbv_all, (*i_all)++);
+                               } else {
+                                       pbb_all = NULL;
+                                       break;
+                               }
+                       }
+
+                       beg_new = bb_new.offset;
+                       len_new = bb_new.length;
+                       end_new = beg_new + len_new - 1;
+
+                       /* all pbb_all overlapping with the bb_new */
+                       while (len_new > 0 && pbb_all != NULL) {
+
+                               beg_prev = pbb_all->offset;
+                               len_prev = pbb_all->length;
+                               end_prev = beg_prev + len_prev - 1;
+
+                               /* check if new overlaps with prev */
+                               if (end_prev < beg_new || end_new < beg_prev)
+                                       break;
+
+                               /*
+                                * 1st part: non-overlapping part
+                                * of pbb_all or bb_new
+                                * (the one which begins earlier)
+                                */
+                               if (beg_prev < beg_new) {
+                                       /* non-overlapping part of pbb_all */
+                                       bb_add.offset = beg_prev;
+                                       bb_add.length = (unsigned)
+                                                       (beg_new - beg_prev);
+
+                                       if (pbb_all->nhealthy !=
+                                                       NO_HEALTHY_REPLICA) {
+                                               bb_add.nhealthy =
+                                                       pbb_all->nhealthy;
+                                       } else {
+                                               /*
+                                                * It can be fixed with
+                                                * this replica.
+                                                */
+                                               bb_add.nhealthy = rep;
+                                       }
+
+                                       if (VEC_PUSH_BACK(pbbv_aux, bb_add))
+                                               return -1;
+
+                                       LOG(10,
+                                               "added bad block (prev-only): "
+                                               BB_DATA_STR,
+                                               bb_add.offset, bb_add.length,
+                                               bb_add.nhealthy);
+
+                                       beg_prev += bb_add.length;
+                                       len_prev -= bb_add.length;
+
+                               } else if (beg_new < beg_prev) {
+                                       /* non-overlapping part of bb_new */
+                                       bb_add.offset = beg_new;
+                                       bb_add.length = (unsigned)
+                                                       (beg_prev - beg_new);
+
+                                       if (rep == 0) {
+                                               bb_add.nhealthy =
+                                                       NO_HEALTHY_REPLICA;
+                                       } else {
+                                               /*
+                                                * It can be fixed with any
+                                                * previous replica, so let's
+                                                * choose replica #0.
+                                                */
+                                               bb_add.nhealthy = 0;
+                                       }
+
+                                       if (VEC_PUSH_BACK(pbbv_aux, bb_add))
+                                               return -1;
+
+                                       LOG(10,
+                                               "added bad block (new-only): "
+                                               BB_DATA_STR,
+                                               bb_add.offset, bb_add.length,
+                                               bb_add.nhealthy);
+
+                                       beg_new += bb_add.length;
+                                       len_new -= bb_add.length;
+                               }
+
+                               /*
+                                * 2nd part: overlapping part
+                                * of pbb_all and bb_new
+                                * (as long as the shorter of the two remainders)
+                                */
+                               if (len_prev <= len_new) {
+                                       bb_add.offset = beg_prev;
+                                       bb_add.length = len_prev;
+
+                                       beg_new += len_prev;
+                                       len_new -= len_prev;
+
+                                       /* whole pbb_all was added */
+                                       len_prev = 0;
+                               } else {
+                                       bb_add.offset = beg_new;
+                                       bb_add.length = len_new;
+
+                                       beg_prev += len_new;
+                                       len_prev -= len_new;
+
+                                       /* whole bb_new was added */
+                                       len_new = 0;
+                               }
+
+                               bb_add.nhealthy = pbb_all->nhealthy;
+
+                               if (VEC_PUSH_BACK(pbbv_aux, bb_add))
+                                       return -1;
+
+                               LOG(10,
+                                       "added bad block (common): "
+                                       BB_DATA_STR,
+                                       bb_add.offset, bb_add.length,
+                                       bb_add.nhealthy);
+
+                               /* update pbb_all (keep only what is left) */
+                               pbb_all->offset = beg_prev;
+                               pbb_all->length = len_prev;
+
+                               if (len_prev == 0) {
+                                       if (*i_all < size_all)
+                                               pbb_all = VEC_GET(pbbv_all,
+                                                               (*i_all)++);
+                                       else
+                                               pbb_all = NULL;
+                               }
+                       }
+
+                       /* the rest of the bb_new */
+                       if (len_new > 0) {
+                               bb_add.offset = beg_new;
+                               bb_add.length = len_new;
+
+                               if (rep > 0)
+                                       /* it can be fixed with replica #0 */
+                                       bb_add.nhealthy = 0;
+                               else
+                                       bb_add.nhealthy = NO_HEALTHY_REPLICA;
+
+                               if (VEC_PUSH_BACK(pbbv_aux, bb_add))
+                                       return -1;
+
+                               LOG(10,
+                                       "added bad block (new-rest): "
+                                       BB_DATA_STR,
+                                       bb_add.offset, bb_add.length,
+                                       bb_add.nhealthy);
+                       }
+               }
+
+               if (pbb_all != NULL && pbb_all->length > 0 && *i_all > 0)
+                       /* this pbb_all will be used again in the next part */
+                       (*i_all)--;
+       }
+
+       return 0;
+}
+
+/*
+ * sync_badblocks_assign_healthy_replica -- (internal) assign healthy replica
+ *                                                   for each bad block
+ *
+ * Rebuilds the bad block array of the part 'phs': each of its bad blocks
+ * is replaced with the sequence of the saved bad blocks from bbv_all[]
+ * which cover it, so that each outcome bad block carries the 'nhealthy'
+ * value saved in bbv_all[].
+ *
+ * '*i_all' is the index of the current element of bbv_all[]; it is advanced
+ * while the bad blocks are being matched and has to be passed on to the call
+ * for the next part.
+ *
+ * On success the old array phs->bbs.bbv is freed and replaced with the new
+ * one (phs->bbs.bb_cnt is updated accordingly) and 0 is returned.
+ * If appending to the new array fails, the new array is deleted, 'phs'
+ * is left unchanged and -1 is returned.
+ */
+static int
+sync_badblocks_assign_healthy_replica(struct part_health_status *phs,
+                                       int rep,
+                                       struct bb_vec *pbbv_all,
+                                       unsigned *i_all)
+{
+       LOG(3, "phs %p rep %i pbbv_all %p i_all %i",
+               phs, rep, pbbv_all, *i_all);
+
+       struct bb_vec bbv_new = VEC_INITIALIZER;
+
+       size_t size_all = VEC_SIZE(pbbv_all);
+
+       /* current element of bbv_all[] */
+       struct bad_block *saved = VEC_GET(pbbv_all, *i_all);
+
+       for (unsigned i = 0; i < phs->bbs.bb_cnt; i++) {
+               /* the bad block which is being replaced */
+               struct bad_block old = phs->bbs.bbv[i];
+
+               LOG(10,
+                       "assigning old bad block: " BB_DATA_STR,
+                       old.offset, old.length, old.nhealthy);
+
+               /* move on to the first saved bad block not preceding 'old' */
+               while (saved->offset < old.offset) {
+                       /* (*i_all) has to be less than (size_all - 1) */
+                       ASSERT(*i_all < size_all - 1);
+                       ++(*i_all);
+                       saved = VEC_GET(pbbv_all, *i_all);
+               }
+
+               /* the piece of 'old' which is going to be added */
+               struct bad_block piece;
+               piece.offset = old.offset;
+
+               /* number of bytes of 'old' which are not covered yet */
+               size_t remaining = old.length;
+
+               while (remaining > 0) {
+                       LOG(10,
+                               "checking saved bad block: " BB_DATA_STR,
+                               saved->offset, saved->length,
+                               saved->nhealthy);
+
+                       ASSERTeq(saved->offset, piece.offset);
+                       ASSERT(saved->length <= remaining);
+
+                       piece.length = saved->length;
+                       piece.nhealthy = saved->nhealthy;
+
+                       if (VEC_PUSH_BACK(&bbv_new, piece)) {
+                               VEC_DELETE(&bbv_new);
+                               return -1;
+                       }
+
+                       LOG(10,
+                               "added new bad block: " BB_DATA_STR,
+                               piece.offset, piece.length, piece.nhealthy);
+
+                       piece.offset += piece.length;
+                       remaining -= piece.length;
+
+                       /* 'old' is covered - stay at this saved bad block */
+                       if (remaining == 0)
+                               break;
+
+                       /* (*i_all) has to be less than (size_all - 1) */
+                       ASSERT(*i_all < size_all - 1);
+                       ++(*i_all);
+                       saved = VEC_GET(pbbv_all, *i_all);
+               }
+       }
+
+       /* hand the new array over to the part's health status */
+       Free(phs->bbs.bbv);
+       phs->bbs.bbv = VEC_ARR(&bbv_new);
+       phs->bbs.bb_cnt = (unsigned)VEC_SIZE(&bbv_new);
+
+       LOG(10, "added %u new bad blocks", phs->bbs.bb_cnt);
+
+       return 0;
+}
+
+/*
+ * sync_badblocks_move_vec -- (internal) move bad blocks from vector pbbv_all
+ *                                       to vector pbbv_aux
+ *
+ * All elements of pbbv_all starting from the index 'i_all' are appended
+ * to pbbv_aux, except for those of zero length. If 'rep' is greater than
+ * zero, each element with no healthy replica assigned yet gets 'rep'
+ * assigned (in pbbv_all as well, since it is modified before being copied).
+ *
+ * Returns 0 on success or -1 if appending to pbbv_aux fails.
+ */
+static int
+sync_badblocks_move_vec(struct bb_vec *pbbv_all,
+                       struct bb_vec *pbbv_aux,
+                       unsigned i_all,
+                       unsigned rep)
+{
+       LOG(3, "pbbv_all %p pbbv_aux %p i_all %u rep  %u",
+               pbbv_all, pbbv_aux, i_all, rep);
+
+       size_t size_all = VEC_SIZE(pbbv_all);
+
+       for (; i_all < size_all; i_all++) {
+               struct bad_block *bb = VEC_GET(pbbv_all, i_all);
+
+               /* zero-length elements are not moved */
+               if (bb->length == 0)
+                       continue;
+
+               if (rep != 0 && bb->nhealthy == NO_HEALTHY_REPLICA) {
+                       /* it can be fixed using the last replica */
+                       bb->nhealthy = (int)rep;
+               }
+
+               if (VEC_PUSH_BACK(pbbv_aux, *bb))
+                       return -1;
+
+               LOG(10,
+                       "added bad block (prev-after): " BB_DATA_STR,
+                       bb->offset, bb->length,
+                       bb->nhealthy);
+       }
+
+       return 0;
+}
+
+/*
+ * sync_check_bad_blocks_overlap -- (internal) check if there are uncorrectable
+ *                                  bad blocks (bad blocks overlapping
+ *                                  in all replicas)
+ *
+ * The function works in three steps:
+ * 1) bad blocks of all parts of all replicas are merged, replica by replica,
+ *    into one vector (bbv_all) of possibly divided bad blocks,
+ * 2) the vector is scanned for a bad block with no healthy replica assigned
+ *    (nhealthy == NO_HEALTHY_REPLICA),
+ * 3) if there is no such bad block, the bad blocks of each part are
+ *    rewritten, so that each of them has a healthy replica assigned.
+ *
+ * Returns:
+ *    0 - no uncorrectable bad block was found,
+ *    1 - an uncorrectable bad block was found,
+ *   -1 - one of the helper functions failed.
+ *
+ * Both temporary vectors are deleted on every exit path.
+ */
+static int
+sync_check_bad_blocks_overlap(struct pool_set *set,
+                               struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p set_hs %p", set, set_hs);
+
+       struct bb_vec bbv_all = VEC_INITIALIZER;
+       struct bb_vec bbv_aux = VEC_INITIALIZER;
+
+       /* assume a failure until the merge step is completed */
+       int ret = -1;
+
+       for (unsigned r = 0; r < set->nreplicas; ++r) {
+               struct pool_replica *rep = REP(set, r);
+               struct replica_health_status *rep_hs = set_hs->replica[r];
+
+               unsigned i_all = 0;     /* index in bbv_all */
+
+               for (unsigned p = 0; p < rep->nparts; ++p) {
+                       struct part_health_status *phs = &rep_hs->part[p];
+
+                       if (!replica_part_has_bad_blocks(phs)) {
+                               /* skip parts with no bad blocks */
+                               continue;
+                       }
+
+                       ASSERTne(phs->bbs.bb_cnt, 0);
+                       ASSERTne(phs->bbs.bbv, NULL);
+
+                       LOG(10, "Replica %u part %u HAS %u bad blocks",
+                               r, p, phs->bbs.bb_cnt);
+
+                       /*
+                        * This function merges bad blocks from bbv_all
+                        * with bad blocks from the current part
+                        * and writes the outcome bad blocks to bbv_aux.
+                        * Only bad blocks with offsets less than or equal to
+                        * the greatest bad block's offset in the current part
+                        * will be moved from bbv_all to bbv_aux.
+                        * The rest of them have to be moved at the end
+                        * by sync_badblocks_move_vec() below.
+                        */
+                       if (sync_badblocks_find_healthy_replica(phs, (int)r,
+                                                       &bbv_all, &bbv_aux,
+                                                       &i_all))
+                               goto exit;
+               }
+
+               /*
+                * Move the rest of bad blocks from bbv_all to bbv_aux
+                * (for more details see the comment above).
+                * All these bad blocks can be fixed using the last replica 'r'.
+                *
+                * On failure leave through the 'exit' label (ret is still -1),
+                * so that both vectors are deleted and not leaked.
+                */
+               if (sync_badblocks_move_vec(&bbv_all, &bbv_aux, i_all, r))
+                       goto exit;
+
+               /* bbv_aux becomes a new bbv_all */
+               VEC_MOVE(&bbv_all, &bbv_aux);
+       }
+
+       ret = 0;
+
+       /* check if there is an uncorrectable bad block */
+       size_t size_all = VEC_SIZE(&bbv_all);
+       for (unsigned i = 0; i < size_all; i++) {
+               struct bad_block *pbb_all = VEC_GET(&bbv_all, i);
+               if (pbb_all->nhealthy == NO_HEALTHY_REPLICA) {
+                       ret = 1; /* this bad block cannot be fixed */
+
+                       LOG(1,
+                               "uncorrectable bad block found: offset 0x%zx, length 0x%zx",
+                               pbb_all->offset, pbb_all->length);
+
+                       goto exit;
+               }
+       }
+
+       /*
+        * All bad blocks can be fixed,
+        * so assign healthy replica for each of them.
+        */
+       for (unsigned r = 0; r < set->nreplicas; ++r) {
+               struct pool_replica *rep = REP(set, r);
+               struct replica_health_status *rep_hs = set_hs->replica[r];
+
+               if (!replica_has_bad_blocks(r, set_hs)) {
+                       /* skip replicas with no bad blocks */
+                       continue;
+               }
+
+               unsigned i_all = 0;     /* index in bbv_all */
+
+               for (unsigned p = 0; p < rep->nparts; ++p) {
+                       struct part_health_status *phs = &rep_hs->part[p];
+
+                       if (!replica_part_has_bad_blocks(phs)) {
+                               /* skip parts with no bad blocks */
+                               continue;
+                       }
+
+                       if (sync_badblocks_assign_healthy_replica(phs, (int)r,
+                                                               &bbv_all,
+                                                               &i_all))
+                               goto exit;
+               }
+       }
+
+exit:
+       VEC_DELETE(&bbv_aux);
+       VEC_DELETE(&bbv_all);
+
+       return ret;
+}
+
+/*
+ * sync_badblocks_data -- (internal) repair all recorded bad blocks
+ *
+ * Walks every part of every replica. For each bad block of a part the data
+ * is copied from the replica indicated by the block's 'nhealthy' field.
+ * After a part is processed its array of bad blocks is freed and the part
+ * is marked as having no bad blocks; each replica is marked the same way
+ * once all its parts were visited. Finally the bad block recovery files
+ * are removed.
+ *
+ * Returns 0 on success, -1 if copying data or removing the recovery files
+ * failed.
+ */
+static int
+sync_badblocks_data(struct pool_set *set, struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, set_hs %p", set, set_hs);
+
+       for (unsigned repn = 0; repn < set->nreplicas; ++repn) {
+               struct pool_replica *dst_rep = REP(set, repn);
+               struct replica_health_status *rhs = set_hs->replica[repn];
+
+               for (unsigned partn = 0; partn < dst_rep->nparts; ++partn) {
+                       struct part_health_status *phs = &rhs->part[partn];
+
+                       /* a part without bad blocks needs no repair */
+                       if (!replica_part_has_bad_blocks(phs))
+                               continue;
+
+                       ASSERTne(phs->bbs.bb_cnt, 0);
+                       ASSERTne(phs->bbs.bbv, NULL);
+
+                       const struct pool_set_part *dst_part =
+                                       &dst_rep->part[partn];
+                       size_t part_off =
+                               replica_get_part_offset(set, repn, partn);
+
+                       for (unsigned i = 0; i < phs->bbs.bb_cnt; i++) {
+                               /* offset of the block within this part */
+                               size_t bb_off =
+                                       phs->bbs.bbv[i].offset - part_off;
+                               size_t bb_len = phs->bbs.bbv[i].length;
+
+                               ASSERT(phs->bbs.bbv[i].nhealthy >= 0);
+
+                               /* the replica the data is copied from */
+                               struct pool_replica *src_rep = REP(set,
+                                       (unsigned)phs->bbs.bbv[i].nhealthy);
+
+                               size_t rep_off = part_off + bb_off;
+                               void *src = ADDR_SUM(src_rep->part[0].addr,
+                                                       rep_off);
+                               void *dst = ADDR_SUM(dst_part->addr, bb_off);
+
+                               if (sync_copy_data(src, dst, rep_off, bb_len,
+                                               src_rep, dst_rep, dst_part))
+                                       return -1;
+                       }
+
+                       /* the list of bad blocks is not needed any more */
+                       Free(phs->bbs.bbv);
+                       phs->bbs.bbv = NULL;
+
+                       /* this part is clean now */
+                       sync_mark_part_no_badblocks(repn, partn, set_hs);
+               }
+
+               /* all parts were visited, so the replica is clean as well */
+               sync_mark_replica_no_badblocks(repn, set_hs);
+       }
+
+       LOG(1, "all bad blocks have been fixed");
+
+       if (replica_remove_all_recovery_files(set_hs) != 0) {
+               LOG(1, "removing bad block recovery files failed");
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * recreate_broken_parts -- (internal) replace every broken local part with
+ *                          a newly created one
+ *
+ * Remote replicas are skipped. Each broken part is removed, then created
+ * and opened again, and finally marked as having no bad blocks.
+ *
+ * Returns 0 on success, -1 if a part could not be removed or created.
+ */
+static int
+recreate_broken_parts(struct pool_set *set,
+                       struct poolset_health_status *set_hs,
+                       int fix_bad_blocks)
+{
+       LOG(3, "set %p set_hs %p fix_bad_blocks %i",
+               set, set_hs, fix_bad_blocks);
+
+       for (unsigned repn = 0; repn < set_hs->nreplicas; ++repn) {
+               struct pool_replica *rep = set->replica[repn];
+
+               /* remote replicas are not handled here */
+               if (rep->remote)
+                       continue;
+
+               for (unsigned partn = 0;
+                               partn < set_hs->replica[repn]->nparts;
+                               ++partn) {
+                       /* unbroken parts stay untouched */
+                       if (!replica_is_part_broken(repn, partn, set_hs))
+                               continue;
+
+                       /* get rid of the broken part ... */
+                       if (replica_remove_part(set, repn, partn,
+                                               fix_bad_blocks) != 0) {
+                               LOG(2, "cannot remove part");
+                               return -1;
+                       }
+
+                       /* ... then create a new one in its place and open it */
+                       if (util_part_open(&rep->part[partn], 0,
+                                               1 /* create */) != 0) {
+                               LOG(2, "cannot open/create parts");
+                               return -1;
+                       }
+
+                       sync_mark_part_no_badblocks(repn, partn, set_hs);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * fill_struct_part_uuids -- (internal) copy uuids of the unbroken parts of
+ *                           replica 'repn' from their headers into
+ *                           the pool_set structure
+ */
+static void
+fill_struct_part_uuids(struct pool_set *set, unsigned repn,
+               struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, repn %u, set_hs %p", set, repn, set_hs);
+
+       struct pool_replica *rep = REP(set, repn);
+
+       for (unsigned hdrn = 0; hdrn < rep->nhdrs; ++hdrn) {
+               /* only headers of unbroken parts are read here */
+               if (!replica_is_part_broken(repn, hdrn, set_hs)) {
+                       struct pool_hdr *hdr = HDR(rep, hdrn);
+
+                       memcpy(rep->part[hdrn].uuid, hdr->uuid,
+                                       POOL_HDR_UUID_LEN);
+               }
+       }
+}
+
+/*
+ * is_uuid_already_used -- (internal) check whether 'uuid' equals the uuid of
+ *                         the first part of any replica preceding 'repn'
+ *
+ * Returns 1 if a match is found, 0 otherwise.
+ */
+static int
+is_uuid_already_used(uuid_t uuid, struct pool_set *set, unsigned repn)
+{
+       int used = 0;
+
+       /* stop scanning as soon as the first match is found */
+       for (unsigned prev = 0; prev < repn && !used; ++prev)
+               used = (uuidcmp(uuid, PART(REP(set, prev), 0)->uuid) == 0);
+
+       return used;
+}
+
+/*
+ * fill_struct_broken_part_uuids -- (internal) set uuids of the broken parts
+ *                                  of replica 'repn' in pool_set structure
+ *
+ * Only parts reported as broken by replica_is_part_broken() are touched.
+ * If the poolset was transformed (according to 'flags'), every broken part
+ * gets a newly generated uuid. Otherwise the uuid is taken from the first
+ * applicable source, checked in this order:
+ *   1) next_part_uuid stored in the header of the previous part,
+ *   2) prev_part_uuid stored in the header of the next part,
+ *   3) (part 0 only) next_repl_uuid stored in the previous replica,
+ *   4) (part 0 only) prev_repl_uuid stored in the next replica,
+ *   5) a newly generated uuid.
+ * Sources 1 and 2 are not used when OPTION_SINGLEHDR is set. Sources 3 and 4
+ * fail if the uuid found is already assigned to the first part of an earlier
+ * replica (see is_uuid_already_used()).
+ *
+ * NOTE(review): 'p - 1' and 'repn - 1' wrap around for index 0, and 'p + 1'
+ * and 'repn + 1' may exceed the last index; presumably
+ * replica_is_part_broken() and the HDRP/HDRN/REPP/REPN macros normalize such
+ * indices - confirm against replica.h before changing this code.
+ *
+ * Returns 0 on success, -1 with errno set to EINVAL on failure.
+ */
+static int
+fill_struct_broken_part_uuids(struct pool_set *set, unsigned repn,
+               struct poolset_health_status *set_hs, unsigned flags)
+{
+       LOG(3, "set %p, repn %u, set_hs %p, flags %u", set, repn, set_hs,
+                       flags);
+       struct pool_replica *rep = REP(set, repn);
+       struct pool_hdr *hdrp;
+       for (unsigned p = 0; p < rep->nhdrs; ++p) {
+               /* skip unbroken parts */
+               if (!replica_is_part_broken(repn, p, set_hs))
+                       continue;
+
+               /* check if part was damaged or was added by transform */
+               if (replica_is_poolset_transformed(flags)) {
+                       /* generate new uuid for this part */
+                       if (util_uuid_generate(rep->part[p].uuid) < 0) {
+                               ERR("cannot generate pool set part UUID");
+                               errno = EINVAL;
+                               return -1;
+                       }
+                       continue;
+               }
+
+               if (!replica_is_part_broken(repn, p - 1, set_hs) &&
+                               !(set->options & OPTION_SINGLEHDR)) {
+                       /* try to get part uuid from the previous part */
+                       hdrp = HDRP(rep, p);
+                       memcpy(rep->part[p].uuid, hdrp->next_part_uuid,
+                                       POOL_HDR_UUID_LEN);
+               } else if (!replica_is_part_broken(repn, p + 1, set_hs) &&
+                               !(set->options & OPTION_SINGLEHDR)) {
+                       /* try to get part uuid from the next part */
+                       hdrp = HDRN(rep, p);
+                       memcpy(rep->part[p].uuid, hdrp->prev_part_uuid,
+                                       POOL_HDR_UUID_LEN);
+               } else if (p == 0 &&
+                       !replica_is_part_broken(repn - 1, 0, set_hs)) {
+                       /* try to get part uuid from the previous replica */
+                       hdrp = HDR(REPP(set, repn), 0);
+                       if (is_uuid_already_used(hdrp->next_repl_uuid, set,
+                                       repn)) {
+                               ERR(
+                                       "repeated uuid - some replicas were created with a different poolset file");
+                               errno = EINVAL;
+                               return -1;
+                       }
+                       memcpy(rep->part[p].uuid, hdrp->next_repl_uuid,
+                                               POOL_HDR_UUID_LEN);
+               } else if (p == 0 &&
+                       !replica_is_part_broken(repn + 1, 0, set_hs)) {
+                       /* try to get part uuid from the next replica */
+                       hdrp = HDR(REPN(set, repn), 0);
+                       if (is_uuid_already_used(hdrp->prev_repl_uuid, set,
+                                       repn)) {
+                               ERR(
+                                       "repeated uuid - some replicas were created with a different poolset file");
+                               errno = EINVAL;
+                               return -1;
+                       }
+                       memcpy(rep->part[p].uuid, hdrp->prev_repl_uuid,
+                                               POOL_HDR_UUID_LEN);
+               } else {
+                       /* no source was applicable - generate a new uuid */
+                       if (util_uuid_generate(rep->part[p].uuid) < 0) {
+                               ERR("cannot generate pool set part UUID");
+                               errno = EINVAL;
+                               return -1;
+                       }
+               }
+       }
+       return 0;
+}
+
+/*
+ * fill_struct_uuids -- (internal) gather in the pool_set structure all the
+ *                      uuids needed for further altering of the headers
+ *
+ * The poolset uuid is read from the first header of 'src_replica'.
+ * Uuids of unbroken parts are collected for all replicas in the first pass;
+ * uuids of broken parts are set in the second pass.
+ *
+ * Returns 0 on success, -1 if the uuid of a broken part cannot be set.
+ */
+static int
+fill_struct_uuids(struct pool_set *set, unsigned src_replica,
+               struct poolset_health_status *set_hs, unsigned flags)
+{
+       LOG(3, "set %p, src_replica %u, set_hs %p, flags %u", set, src_replica,
+                       set_hs, flags);
+
+       /* poolset uuid comes from the first header of the source replica */
+       struct pool_hdr *hdr0 = HDR(REP(set, src_replica), 0);
+       memcpy(set->uuid, hdr0->poolset_uuid, POOL_HDR_UUID_LEN);
+
+       unsigned repn;
+
+       /* first pass: uuids of the unbroken parts */
+       for (repn = 0; repn < set->nreplicas; ++repn)
+               fill_struct_part_uuids(set, repn, set_hs);
+
+       /* second pass: uuids of the broken parts */
+       for (repn = 0; repn < set->nreplicas; ++repn) {
+               int rc = fill_struct_broken_part_uuids(set, repn, set_hs,
+                                                       flags);
+               if (rc != 0)
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * create_headers_for_broken_parts -- (internal) recreate headers of the parts
+ *                                    that are broken or have a corrupted
+ *                                    header
+ *
+ * Only replicas that are broken or have bad blocks are examined. The first
+ * header of 'src_replica' is passed to sync_recreate_header() as the source.
+ *
+ * Returns 0 on success, -1 if recreating any header failed.
+ */
+static int
+create_headers_for_broken_parts(struct pool_set *set, unsigned src_replica,
+               struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, src_replica %u, set_hs %p", set, src_replica, set_hs);
+
+       struct pool_hdr *hdr_src = HDR(REP(set, src_replica), 0);
+
+       for (unsigned repn = 0; repn < set_hs->nreplicas; ++repn) {
+               /* is there anything to do in this replica? */
+               int rep_affected = replica_is_replica_broken(repn, set_hs) ||
+                               replica_has_bad_blocks(repn, set_hs);
+               if (!rep_affected)
+                       continue;
+
+               for (unsigned hdrn = 0; hdrn < set_hs->replica[repn]->nhdrs;
+                               hdrn++) {
+                       /* does this part need a new header? */
+                       int recreate =
+                               replica_is_part_broken(repn, hdrn, set_hs) ||
+                               replica_part_has_corrupted_header(repn, hdrn,
+                                                               set_hs);
+
+                       if (recreate &&
+                           sync_recreate_header(set, repn, hdrn, hdr_src))
+                               return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * copy_data_to_broken_parts -- (internal) fill the recreated parts with data
+ *                              read from the healthy replica
+ *
+ * Healthy replicas are skipped. In a consistent replica only the broken
+ * parts are filled; in an inconsistent one all parts are. The amount of
+ * copied data never exceeds the pool size stored in the set.
+ *
+ * Returns 0 on success, -1 if copying the data failed.
+ */
+static int
+copy_data_to_broken_parts(struct pool_set *set, unsigned healthy_replica,
+               unsigned flags, struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, healthy_replica %u, flags %u, set_hs %p", set,
+                       healthy_replica, flags, set_hs);
+
+       /* no data is copied beyond this offset */
+       size_t pool_end = set->poolsize;
+
+       for (unsigned repn = 0; repn < set_hs->nreplicas; ++repn) {
+               /* nothing to copy to a healthy replica */
+               if (replica_is_replica_healthy(repn, set_hs))
+                       continue;
+
+               struct pool_replica *dst_rep = REP(set, repn);
+               struct pool_replica *src_rep = REP(set, healthy_replica);
+
+               for (unsigned partn = 0; partn < dst_rep->nparts; ++partn) {
+                       /*
+                        * in a consistent replica only the broken parts
+                        * have to be filled
+                        */
+                       if (!replica_is_part_broken(repn, partn, set_hs) &&
+                           replica_is_replica_consistent(repn, set_hs))
+                               continue;
+
+                       const struct pool_set_part *dst_part =
+                                       &dst_rep->part[partn];
+
+                       size_t data_off =
+                               replica_get_part_data_offset(set, repn, partn);
+                       size_t data_len =
+                               replica_get_part_data_len(set, repn, partn);
+
+                       /* the whole part lies beyond the end of the pool */
+                       if (data_off >= pool_end)
+                               continue;
+
+                       /* trim the length to the end of the pool */
+                       if (data_off + data_len > pool_end || dst_rep->remote)
+                               data_len = pool_end - data_off;
+
+                       /* the first part is mapped together with its header */
+                       size_t dst_off = 0;
+                       if (partn == 0)
+                               dst_off = POOL_HDR_SIZE;
+
+                       void *src = ADDR_SUM(src_rep->part[0].addr, data_off);
+                       void *dst = ADDR_SUM(dst_part->addr, dst_off);
+
+                       if (sync_copy_data(src, dst, data_off, data_len,
+                                               src_rep, dst_rep, dst_part))
+                               return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * grant_created_parts_perm -- (internal) set permissions of all the part
+ *                            files created in place of the broken ones
+ *
+ * The permissions are taken from the first part of the source replica
+ * 'src_repn'. If that replica is remote, or its first part cannot be
+ * stat'ed, the default (read and write for user and group) is used.
+ * Only parts marked as 'created' that belong to local, broken replicas
+ * are modified.
+ *
+ * Returns 0 on success, -1 with errno set to EPERM if the permissions of
+ * any created part cannot be changed.
+ */
+static int
+grant_created_parts_perm(struct pool_set *set, unsigned src_repn,
+               struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, src_repn %u, set_hs %p", set, src_repn, set_hs);
+
+       /* choose the default permissions */
+       mode_t def_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+
+       /* get permissions of the first part of the source replica */
+       mode_t src_mode;
+       os_stat_t sb;
+       if (REP(set, src_repn)->remote) {
+               src_mode = def_mode;
+       } else if (os_stat(PART(REP(set, src_repn), 0)->path, &sb) != 0) {
+               /* '%u' requires an unsigned argument, hence '0U' */
+               ERR("cannot check file permissions of %s (replica %u, part %u)",
+                               PART(REP(set, src_repn), 0)->path, src_repn, 0U);
+               src_mode = def_mode;
+       } else {
+               src_mode = sb.st_mode;
+       }
+
+       /* set permissions to all recreated parts */
+       for (unsigned r = 0; r < set_hs->nreplicas; ++r) {
+               /* skip unbroken replicas */
+               if (!replica_is_replica_broken(r, set_hs))
+                       continue;
+
+               /* remote replicas have no local part files */
+               if (set->replica[r]->remote)
+                       continue;
+
+               for (unsigned p = 0; p < set_hs->replica[r]->nparts; p++) {
+                       /* skip parts which were not created */
+                       if (!PART(REP(set, r), p)->created)
+                               continue;
+
+                       LOG(4, "setting permissions for part %u, replica %u",
+                                       p, r);
+
+                       /* set rights to those of existing part files */
+                       if (os_chmod(PART(REP(set, r), p)->path, src_mode)) {
+                               ERR(
+                                       "cannot set permission rights for created parts: replica %u, part %u",
+                                       r, p);
+                               errno = EPERM;
+                               return -1;
+                       }
+               }
+       }
+       return 0;
+}
+
+/*
+ * update_parts_linkage -- (internal) set uuids linking recreated parts within
+ *                         a replica
+ *
+ * For every header of replica 'repn' the prev_part_uuid and next_part_uuid
+ * fields are set from the part uuids kept in the pool_set structure, and
+ * the matching next_part_uuid / prev_part_uuid fields of the previous and
+ * the next header are set to the uuid of the current part. The checksum of
+ * each modified header is recalculated and the headers are persisted.
+ *
+ * 'set_hs' is only logged here. The function always returns 0.
+ *
+ * NOTE(review): HDRP/HDRN and PARTP/PARTN presumably wrap around at the ends
+ * of the replica, so with one or two headers the three header pointers may
+ * refer to the same header; the order of the updates below may matter in
+ * that case - confirm before reordering anything.
+ */
+static int
+update_parts_linkage(struct pool_set *set, unsigned repn,
+               struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, repn %u, set_hs %p", set, repn, set_hs);
+       struct pool_replica *rep = REP(set, repn);
+       for (unsigned p = 0; p < rep->nhdrs; ++p) {
+               struct pool_hdr *hdrp = HDR(rep, p);
+               struct pool_hdr *prev_hdrp = HDRP(rep, p);
+               struct pool_hdr *next_hdrp = HDRN(rep, p);
+
+               /* set uuids in the current part */
+               memcpy(hdrp->prev_part_uuid, PARTP(rep, p)->uuid,
+                               POOL_HDR_UUID_LEN);
+               memcpy(hdrp->next_part_uuid, PARTN(rep, p)->uuid,
+                               POOL_HDR_UUID_LEN);
+               util_checksum(hdrp, sizeof(*hdrp), &hdrp->checksum,
+                       1, POOL_HDR_CSUM_END_OFF(hdrp));
+
+               /* set uuids in the previous part */
+               memcpy(prev_hdrp->next_part_uuid, PART(rep, p)->uuid,
+                               POOL_HDR_UUID_LEN);
+               util_checksum(prev_hdrp, sizeof(*prev_hdrp),
+                       &prev_hdrp->checksum, 1,
+                       POOL_HDR_CSUM_END_OFF(prev_hdrp));
+
+               /* set uuids in the next part */
+               memcpy(next_hdrp->prev_part_uuid, PART(rep, p)->uuid,
+                               POOL_HDR_UUID_LEN);
+               util_checksum(next_hdrp, sizeof(*next_hdrp),
+                       &next_hdrp->checksum, 1,
+                       POOL_HDR_CSUM_END_OFF(next_hdrp));
+
+               /* persist all three headers modified above */
+               util_persist(PART(rep, p)->is_dev_dax, hdrp, sizeof(*hdrp));
+               util_persist(PARTP(rep, p)->is_dev_dax, prev_hdrp,
+                               sizeof(*prev_hdrp));
+               util_persist(PARTN(rep, p)->is_dev_dax, next_hdrp,
+                               sizeof(*next_hdrp));
+
+       }
+       return 0;
+}
+
+/*
+ * update_replicas_linkage -- (internal) refresh the uuids linking replica
+ *                            'repn' with its neighbours
+ *
+ * Headers are rewritten in three groups, in this order: all headers of the
+ * replica itself (both prev_repl_uuid and next_repl_uuid), all headers of
+ * the previous replica (next_repl_uuid) and all headers of the next replica
+ * (prev_repl_uuid). Each header gets a new checksum and is persisted.
+ *
+ * Always returns 0.
+ */
+static int
+update_replicas_linkage(struct pool_set *set, unsigned repn)
+{
+       LOG(3, "set %p, repn %u", set, repn);
+
+       struct pool_replica *rep = REP(set, repn);
+       struct pool_replica *prev_r = REPP(set, repn);
+       struct pool_replica *next_r = REPN(set, repn);
+
+       ASSERT(rep->nparts > 0);
+       ASSERT(prev_r->nparts > 0);
+       ASSERT(next_r->nparts > 0);
+
+       unsigned hdrn;
+
+       /* 1. this replica: point to both neighbours */
+       for (hdrn = 0; hdrn < rep->nhdrs; ++hdrn) {
+               struct pool_hdr *hdr = HDR(rep, hdrn);
+
+               memcpy(hdr->prev_repl_uuid, PART(prev_r, 0)->uuid,
+                               POOL_HDR_UUID_LEN);
+               memcpy(hdr->next_repl_uuid, PART(next_r, 0)->uuid,
+                               POOL_HDR_UUID_LEN);
+               util_checksum(hdr, sizeof(*hdr), &hdr->checksum,
+                       1, POOL_HDR_CSUM_END_OFF(hdr));
+
+               /* make the header persistent */
+               util_persist(PART(rep, hdrn)->is_dev_dax, hdr, sizeof(*hdr));
+       }
+
+       /* 2. previous replica: its 'next' link points to this replica */
+       for (hdrn = 0; hdrn < prev_r->nhdrs; ++hdrn) {
+               struct pool_hdr *hdr = HDR(prev_r, hdrn);
+
+               memcpy(hdr->next_repl_uuid, PART(rep, 0)->uuid,
+                               POOL_HDR_UUID_LEN);
+               util_checksum(hdr, sizeof(*hdr), &hdr->checksum,
+                       1, POOL_HDR_CSUM_END_OFF(hdr));
+
+               /* make the header persistent */
+               util_persist(PART(prev_r, hdrn)->is_dev_dax, hdr,
+                               sizeof(*hdr));
+       }
+
+       /* 3. next replica: its 'prev' link points to this replica */
+       for (hdrn = 0; hdrn < next_r->nhdrs; ++hdrn) {
+               struct pool_hdr *hdr = HDR(next_r, hdrn);
+
+               memcpy(hdr->prev_repl_uuid, PART(rep, 0)->uuid,
+                               POOL_HDR_UUID_LEN);
+               util_checksum(hdr, sizeof(*hdr), &hdr->checksum,
+                       1, POOL_HDR_CSUM_END_OFF(hdr));
+
+               /* make the header persistent */
+               util_persist(PART(next_r, hdrn)->is_dev_dax, hdr,
+                               sizeof(*hdr));
+       }
+
+       return 0;
+}
+
+/*
+ * update_poolset_uuids -- (internal) write the poolset uuid kept in the set
+ *                         structure to every header of replica 'repn'
+ *
+ * 'set_hs' is only logged. Always returns 0.
+ */
+static int
+update_poolset_uuids(struct pool_set *set, unsigned repn,
+               struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, repn %u, set_hs %p", set, repn, set_hs);
+
+       struct pool_replica *rep = REP(set, repn);
+       unsigned hdrn = 0;
+
+       while (hdrn < rep->nhdrs) {
+               struct pool_hdr *hdr = HDR(rep, hdrn);
+
+               /* new uuid first, then the checksum covering it */
+               memcpy(hdr->poolset_uuid, set->uuid, POOL_HDR_UUID_LEN);
+               util_checksum(hdr, sizeof(*hdr), &hdr->checksum,
+                       1, POOL_HDR_CSUM_END_OFF(hdr));
+
+               /* make the header persistent */
+               util_persist(PART(rep, hdrn)->is_dev_dax, hdr, sizeof(*hdr));
+
+               ++hdrn;
+       }
+       return 0;
+}
+
+/*
+ * update_remote_headers -- (internal) update headers of the remote replicas
+ *                          that already existed
+ *
+ * Local replicas and replicas whose first part is marked as 'created'
+ * are skipped.
+ *
+ * Returns 0 on success, -1 if updating any remote header failed.
+ */
+static int
+update_remote_headers(struct pool_set *set)
+{
+       LOG(3, "set %p", set);
+
+       for (unsigned repn = 0; repn < set->nreplicas; ++repn) {
+               /* local replica */
+               if (REP(set, repn)->remote == NULL)
+                       continue;
+
+               /* replica which has just been created */
+               if (PART(REP(set, repn), 0)->created == 1)
+                       continue;
+
+               if (util_update_remote_header(set, repn) != 0) {
+                       LOG(1,
+                           "updating header of a remote replica no. %u failed",
+                           repn);
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * update_uuids -- (internal) set all uuids that might have changed or be unset
+ *                 after recreating parts
+ *
+ * For every replica: the linkage between parts is rebuilt if the replica is
+ * not healthy, then the linkage between replicas and the poolset uuid are
+ * updated. Finally the headers of the existing remote replicas are updated.
+ *
+ * Returns 0 on success, -1 as soon as any of the steps fails.
+ */
+static int
+update_uuids(struct pool_set *set, struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, set_hs %p", set, set_hs);
+       for (unsigned r = 0; r < set->nreplicas; ++r) {
+               /* parts are relinked only in replicas that are not healthy */
+               if (!replica_is_replica_healthy(r, set_hs) &&
+                   update_parts_linkage(set, r, set_hs))
+                       return -1;
+
+               /* do not ignore results of the int-returning helpers */
+               if (update_replicas_linkage(set, r))
+                       return -1;
+
+               if (update_poolset_uuids(set, r, set_hs))
+                       return -1;
+       }
+
+       if (update_remote_headers(set))
+               return -1;
+
+       return 0;
+}
+
+/*
+ * remove_remote -- (internal) remove remote pool
+ *
+ * With USE_RPMEM defined, the remote command is executed over ssh with
+ * the "--remove <pool_set> --force" arguments and the result of closing
+ * the ssh connection is returned; -1 is returned if parsing the target,
+ * executing or monitoring the command failed.
+ * Without USE_RPMEM the function reports a fatal error.
+ */
+static int
+remove_remote(const char *target, const char *pool_set)
+{
+       LOG(3, "target %s, pool_set %s", target, pool_set);
+#ifdef USE_RPMEM
+       struct rpmem_target_info *info = rpmem_target_parse(target);
+       if (info == NULL)
+               return -1;
+
+       int ret = -1;
+       struct rpmem_ssh *ssh = rpmem_ssh_exec(info, "--remove",
+                       pool_set, "--force", NULL);
+       if (ssh != NULL) {
+               if (rpmem_ssh_monitor(ssh, 0) == 0) {
+                       /* the result of the command is the close status */
+                       ret = rpmem_ssh_close(ssh);
+               } else {
+                       /* monitoring failed - close and report an error */
+                       rpmem_ssh_close(ssh);
+               }
+       }
+
+       rpmem_target_free(info);
+       return ret;
+#else
+       FATAL("remote replication not supported");
+       return -1;
+#endif
+}
+
+/*
+ * open_remote_replicas -- (internal) open all healthy remote replicas
+ *
+ * Local replicas and remote replicas that are not healthy are skipped.
+ *
+ * Returns 0 on success or the non-zero result of the failed open.
+ */
+static int
+open_remote_replicas(struct pool_set *set,
+       struct poolset_health_status *set_hs)
+{
+       LOG(3, "set %p, set_hs %p", set, set_hs);
+
+       for (unsigned repn = 0; repn < set->nreplicas; repn++) {
+               struct pool_replica *rep = set->replica[repn];
+
+               /* only healthy remote replicas are opened here */
+               if (!rep->remote || !replica_is_replica_healthy(repn, set_hs))
+                       continue;
+
+               unsigned nlanes = REMOTE_NLANES;
+               int rc = util_poolset_remote_replica_open(set, repn,
+                               set->poolsize, 0, &nlanes);
+               if (rc != 0) {
+                       LOG(1, "Opening '%s' on '%s' failed",
+                                       rep->remote->pool_desc,
+                                       rep->remote->node_addr);
+                       return rc;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * create_remote_replicas -- (internal) recreate all remote replicas that are
+ *                           not healthy
+ *
+ * Unless the poolset was transformed (according to 'flags'), the remote
+ * pool is removed first; a failure of the removal is ignored. Then the
+ * replica is opened in the create mode.
+ *
+ * Returns 0 on success or the non-zero result of the failed creation.
+ */
+static int
+create_remote_replicas(struct pool_set *set,
+       struct poolset_health_status *set_hs, unsigned flags)
+{
+       LOG(3, "set %p, set_hs %p", set, set_hs);
+
+       for (unsigned repn = 0; repn < set->nreplicas; repn++) {
+               struct pool_replica *rep = set->replica[repn];
+
+               /* local and healthy replicas need no recreation */
+               if (!rep->remote || replica_is_replica_healthy(repn, set_hs))
+                       continue;
+
+               if (!replica_is_poolset_transformed(flags)) {
+                       /* best effort - the result is deliberately ignored */
+                       remove_remote(rep->remote->node_addr,
+                                       rep->remote->pool_desc);
+               }
+
+               unsigned nlanes = REMOTE_NLANES;
+               int rc = util_poolset_remote_replica_open(set, repn,
+                               set->poolsize, 1, &nlanes);
+               if (rc != 0) {
+                       LOG(1, "Creating '%s' on '%s' failed",
+                                       rep->remote->pool_desc,
+                                       rep->remote->node_addr);
+                       return rc;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * replica_sync -- synchronize data across replicas within a poolset
+ *
+ * set   - the poolset to be synchronized
+ * s_hs  - health status of the poolset if it is already known, or NULL;
+ *         in the latter case the arguments are validated and the health
+ *         status is examined here and freed before returning
+ * flags - sync flags; examined with is_dry_run() and fix_bad_blocks()
+ *         and passed on to the helpers
+ *
+ * Returns 0 on success (also when the examined poolset turns out to be
+ * healthy or when the dry-run mode finishes), -1 on failure.
+ */
+int
+replica_sync(struct pool_set *set, struct poolset_health_status *s_hs,
+               unsigned flags)
+{
+       LOG(3, "set %p, flags %u", set, flags);
+       int ret = 0;
+       struct poolset_health_status *set_hs = NULL;
+
+       /* check if we already know the poolset health status */
+       if (s_hs == NULL) {
+               /* validate poolset before checking its health */
+               if (validate_args(set))
+                       return -1;
+
+               /* examine poolset's health */
+               if (replica_check_poolset_health(set, &set_hs,
+                                               1 /* called from sync */,
+                                               flags)) {
+                       LOG(1, "poolset health check failed");
+                       return -1;
+               }
+
+               /* check if poolset is broken; if not, nothing to do */
+               if (replica_is_poolset_healthy(set_hs)) {
+                       LOG(1, "poolset is healthy");
+                       goto out;
+               }
+       } else {
+               set_hs = s_hs;
+       }
+
+       /*
+        * Find a replica with healthy header; it will be the source of
+        * the pool size. A fully healthy replica is preferred, a replica
+        * with just a healthy header is the fallback.
+        */
+       unsigned healthy_replica = replica_find_healthy_replica(set_hs);
+       unsigned healthy_header = healthy_replica;
+       if (healthy_header == UNDEF_REPLICA) {
+               healthy_header = replica_find_replica_healthy_header(set_hs);
+               if (healthy_header == UNDEF_REPLICA) {
+                       ERR("no healthy replica found");
+                       errno = EINVAL;
+                       ret = -1;
+                       goto out;
+               }
+       }
+
+       /* in dry-run mode we can stop here */
+       if (is_dry_run(flags)) {
+               LOG(1, "Sync in dry-run mode finished successfully");
+               goto out;
+       }
+
+       /* recreate broken parts */
+       if (recreate_broken_parts(set, set_hs, fix_bad_blocks(flags))) {
+               ERR("recreating broken parts failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* open all part files */
+       if (replica_open_poolset_part_files(set)) {
+               ERR("opening poolset part files failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* map all replicas */
+       if (util_poolset_open(set)) {
+               ERR("opening poolset failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* this is required for opening remote pools */
+       set->poolsize = set_hs->replica[healthy_header]->pool_size;
+       LOG(3, "setting the pool size (%zu) from replica #%u",
+               set->poolsize, healthy_header);
+
+       /* open all remote replicas */
+       if (open_remote_replicas(set, set_hs)) {
+               ERR("opening remote replicas failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* recalculate offset and length of bad blocks */
+       if (sync_recalc_badblocks(set, set_hs)) {
+               /* distinct from the message of sync_badblocks_data() below */
+               LOG(1, "recalculating bad blocks failed");
+               ret = -1;
+               goto out;
+       }
+
+       /*
+        * Check if there are uncorrectable bad blocks
+        * (bad blocks overlapping in all replicas).
+        */
+       int status = sync_check_bad_blocks_overlap(set, set_hs);
+       if (status == -1) {
+               LOG(1, "checking bad blocks failed");
+               ret = -1;
+               goto out;
+       }
+
+       if (status == 1) {
+               ERR(
+                       "a part of the pool has uncorrectable errors in all replicas");
+               errno = EINVAL;
+               ret = -1;
+               goto out;
+       }
+
+       LOG(3, "bad blocks do not overlap");
+
+       /* sync data in bad blocks */
+       if (sync_badblocks_data(set, set_hs)) {
+               LOG(1, "syncing bad blocks data failed");
+               ret = -1;
+               goto out;
+       }
+
+       /*
+        * Find one good replica; it will be the source of data.
+        * The search is repeated, because the bad blocks have been fixed
+        * in the meantime.
+        */
+       healthy_replica = replica_find_healthy_replica(set_hs);
+       if (healthy_replica == UNDEF_REPLICA) {
+               ERR("no healthy replica found");
+               errno = EINVAL;
+               ret = -1;
+               goto out;
+       }
+
+       /* update uuid fields in the set structure with part headers */
+       if (fill_struct_uuids(set, healthy_replica, set_hs, flags)) {
+               ERR("gathering uuids failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* create headers for broken parts */
+       if (create_headers_for_broken_parts(set, healthy_replica, set_hs)) {
+               ERR("creating headers for broken parts failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* create all remote replicas */
+       if (create_remote_replicas(set, set_hs, flags)) {
+               ERR("creating remote replicas failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* check and copy data if possible */
+       if (copy_data_to_broken_parts(set, healthy_replica,
+                       flags, set_hs)) {
+               ERR("copying data to broken parts failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* update uuids of replicas and parts */
+       if (update_uuids(set, set_hs)) {
+               ERR("updating uuids failed");
+               ret = -1;
+               goto out;
+       }
+
+       /* grant permissions to all created parts */
+       if (grant_created_parts_perm(set, healthy_replica, set_hs)) {
+               ERR("granting permissions to created parts failed");
+               ret = -1;
+       }
+
+out:
+       /* free the health status only if it was allocated here */
+       if (s_hs == NULL)
+               replica_free_poolset_health_status(set_hs);
+       return ret;
+}