--- /dev/null
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2016-2019, Intel Corporation */
+
+/*
+ * transform.c -- a module for poolset transforming
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <dirent.h>
+#include <assert.h>
+
+#include "replica.h"
+#include "out.h"
+#include "file.h"
+#include "os.h"
+#include "libpmem.h"
+#include "util_pmem.h"
+
+/*
+ * poolset_compare_status - a helping structure for gathering corresponding
+ * replica numbers when comparing poolsets
+ *
+ * One instance is allocated per poolset; the trailing flexible array holds
+ * one entry for each replica of that poolset.
+ */
+struct poolset_compare_status
+{
+ /* number of entries in replica[]; copied from the poolset's nreplicas */
+ unsigned nreplicas;
+ /* poolset option bits recorded for the comparison (OPTION_SINGLEHDR) */
+ unsigned flags;
+ /* index of the counterpart in the other poolset, or UNDEF_REPLICA */
+ unsigned replica[];
+};
+
+/*
+ * type of transform operation to be done
+ *
+ * The value is chosen by identify_transform_operation() and dispatched on
+ * in replica_transform().
+ */
+enum transform_op {
+ /* the requested transformation is not possible or there is nothing to do */
+ NOT_TRANSFORMABLE,
+ /* the output poolset has replicas without a counterpart in the input */
+ ADD_REPLICAS,
+ /* the input poolset has replicas without a counterpart in the output */
+ RM_REPLICAS,
+ /* only the SINGLEHDR option differs and it is set in the input poolset */
+ ADD_HDRS,
+ /* only the SINGLEHDR option differs and it is set in the output poolset */
+ RM_HDRS,
+};
+
+/*
+ * check_if_part_used_once -- (internal) check if the part is used only once in
+ *                            the rest of the poolset
+ *
+ * The part (repn, partn) is compared, by file inode, with every part that
+ * follows it in the same replica and with all parts of the subsequent local
+ * replicas; remote replicas are skipped.  If the path of the examined part
+ * cannot be resolved, its unresolved path is used instead.  For the parts it
+ * is compared with, the same fallback is applied only when resolving failed
+ * with ENOENT; any other failure is an error.
+ *
+ * Returns 0 if no other part refers to the same file, and -1 if a duplicate
+ * was found (errno set to EINVAL) or an error occurred.
+ */
+static int
+check_if_part_used_once(struct pool_set *set, unsigned repn, unsigned partn)
+{
+	LOG(3, "set %p, repn %u, partn %u", set, repn, partn);
+	struct pool_replica *rep = REP(set, repn);
+	char *path = util_part_realpath(PART(rep, partn)->path);
+	if (path == NULL) {
+		LOG(1, "cannot get absolute path for %s, replica %u, part %u",
+				PART(rep, partn)->path, repn, partn);
+		errno = 0;
+		path = strdup(PART(rep, partn)->path);
+		if (path == NULL) {
+			ERR("!strdup");
+			return -1;
+		}
+	}
+	int ret = 0;
+	for (unsigned r = repn; r < set->nreplicas; ++r) {
+		struct pool_replica *repr = set->replica[r];
+		/* skip remote replicas */
+		if (repr->remote != NULL)
+			continue;
+
+		/* avoid superfluous comparisons */
+		unsigned i = (r == repn) ? partn + 1 : 0;
+		for (unsigned p = i; p < repr->nparts; ++p) {
+			char *pathp = util_part_realpath(PART(repr, p)->path);
+			if (pathp == NULL) {
+				if (errno != ENOENT) {
+					ERR("realpath failed for %s, errno %d",
+						PART(repr, p)->path, errno);
+					ret = -1;
+					goto out;
+				}
+				/* report the part that could not be resolved */
+				LOG(1, "cannot get absolute path for %s,"
+						" replica %u, part %u",
+						PART(repr, p)->path, r, p);
+				pathp = strdup(PART(repr, p)->path);
+				if (pathp == NULL) {
+					/* do not compare against a NULL path */
+					ERR("!strdup");
+					ret = -1;
+					goto out;
+				}
+				errno = 0;
+			}
+			int result = util_compare_file_inodes(path, pathp);
+			if (result == 0) {
+				/* same file used multiple times */
+				ERR("some part file's path is"
+						" used multiple times");
+				ret = -1;
+				errno = EINVAL;
+				free(pathp);
+				goto out;
+			} else if (result < 0) {
+				ERR("comparing file inodes failed for %s and"
+						" %s", path, pathp);
+				ret = -1;
+				free(pathp);
+				goto out;
+			}
+			free(pathp);
+		}
+	}
+out:
+	free(path);
+	return ret;
+}
+
+/*
+ * check_if_remote_replica_used_once -- (internal) check if remote replica is
+ *                                      used only once in the rest of the
+ *                                      poolset
+ *
+ * Two remote replicas are treated as the same one when both their node
+ * address and their pool descriptor strings are equal.  Only the replicas
+ * following repn are examined.  Returns 0 when no duplicate is found,
+ * otherwise -1 with errno set to EINVAL.
+ */
+static int
+check_if_remote_replica_used_once(struct pool_set *set, unsigned repn)
+{
+	LOG(3, "set %p, repn %u", set, repn);
+	struct remote_replica *rep = REP(set, repn)->remote;
+	ASSERTne(rep, NULL);
+	for (unsigned r = repn + 1; r < set->nreplicas; ++r) {
+		struct remote_replica *other = REP(set, r)->remote;
+
+		/* local replicas cannot collide with a remote one */
+		if (other == NULL)
+			continue;
+
+		/* XXX: add comparing resolved addresses of the nodes */
+		if (strcmp(rep->node_addr, other->node_addr) != 0)
+			continue;
+
+		if (strcmp(rep->pool_desc, other->pool_desc) != 0)
+			continue;
+
+		ERR("remote replica %u is used multiple times", repn);
+		errno = EINVAL;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * check_paths -- (internal) check if directories for part files exist
+ *                and if paths for part files do not repeat in the poolset
+ *
+ * Remote replicas are only checked for being used once.  For each part of
+ * a local replica the directory check is done first and the uniqueness check
+ * second.  Returns 0 if all checks pass, -1 on the first failing check.
+ */
+static int
+check_paths(struct pool_set *set)
+{
+	LOG(3, "set %p", set);
+	for (unsigned r = 0; r < set->nreplicas; ++r) {
+		struct pool_replica *rep = set->replica[r];
+
+		if (rep->remote != NULL) {
+			if (check_if_remote_replica_used_once(set, r))
+				return -1;
+			continue;
+		}
+
+		for (unsigned p = 0; p < rep->nparts; ++p) {
+			if (replica_check_local_part_dir(set, r, p) ||
+			    check_if_part_used_once(set, r, p))
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * validate_args -- (internal) check whether passed arguments are valid
+ *
+ * Rejects directory-based source poolsets, then verifies the part sizes and
+ * part paths of the target poolset and finally makes sure the target poolset
+ * is not smaller than the pool size read from replica 0 of the source.
+ * Returns 0 when everything is valid, -1 otherwise.
+ */
+static int
+validate_args(struct pool_set *set_in, struct pool_set *set_out)
+{
+	LOG(3, "set_in %p, set_out %p", set_in, set_out);
+
+	if (set_in->directory_based) {
+		ERR("transform of directory poolsets is not supported");
+		errno = EINVAL;
+		return -1;
+	}
+
+	/*
+	 * every part of the target poolset has to be large enough
+	 * (now replication works only for pmemobj pools)
+	 */
+	if (replica_check_part_sizes(set_out, PMEMOBJ_MIN_POOL)) {
+		ERR("part sizes check failed");
+		return -1;
+	}
+
+	/*
+	 * directories of all part files have to exist and no part file
+	 * may reoccur in the target poolset
+	 */
+	if (check_paths(set_out))
+		return -1;
+
+	/*
+	 * the target poolset structure has to have enough capacity to
+	 * accommodate the effective size of the source poolset
+	 */
+	ssize_t src_size = replica_get_pool_size(set_in, 0);
+	if (src_size < 0) {
+		ERR("getting pool size from master replica failed");
+		return -1;
+	}
+
+	if ((size_t)src_size > set_out->poolsize) {
+		ERR("target poolset is too small");
+		errno = EINVAL;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * create_poolset_compare_status -- (internal) create structure for gathering
+ *                                  status of poolset comparison
+ *
+ * Allocates a zeroed status structure with one entry per replica of the
+ * poolset and marks every entry as having no counterpart.  On success the
+ * new structure is stored in *set_sp and 0 is returned; on allocation
+ * failure -1 is returned and *set_sp is not modified.
+ */
+static int
+create_poolset_compare_status(struct pool_set *set,
+		struct poolset_compare_status **set_sp)
+{
+	LOG(3, "set %p, set_sp %p", set, set_sp);
+	unsigned nreps = set->nreplicas;
+	struct poolset_compare_status *status =
+		Zalloc(sizeof(*status) + nreps * sizeof(status->replica[0]));
+	if (status == NULL) {
+		ERR("!Zalloc for poolset status");
+		return -1;
+	}
+
+	status->nreplicas = nreps;
+	for (unsigned r = 0; r < nreps; ++r)
+		status->replica[r] = UNDEF_REPLICA;
+
+	*set_sp = status;
+	return 0;
+}
+
+/*
+ * compare_parts -- (internal) check if two parts can be considered the same
+ *
+ * Parts are the same when both their path strings and their file sizes are
+ * equal.  Returns 0 for the same parts and 1 for different ones.
+ */
+static int
+compare_parts(struct pool_set_part *p1, struct pool_set_part *p2)
+{
+	LOG(3, "p1 %p, p2 %p", p1, p2);
+	LOG(4, "p1->path: %s, p1->filesize: %lu", p1->path, p1->filesize);
+	LOG(4, "p2->path: %s, p2->filesize: %lu", p2->path, p2->filesize);
+
+	if (strcmp(p1->path, p2->path) != 0)
+		return 1;
+
+	return p1->filesize != p2->filesize;
+}
+
+/*
+ * compare_replicas -- (internal) check if two replicas are different
+ *
+ * Local replicas are equal when they have the same number of parts and
+ * all corresponding parts compare equal.  Remote replicas are equal when
+ * their node addresses and pool descriptors match.  A local and a remote
+ * replica always differ.  Returns 0 for equal replicas, 1 otherwise.
+ */
+static int
+compare_replicas(struct pool_replica *r1, struct pool_replica *r2)
+{
+	LOG(3, "r1 %p, r2 %p", r1, r2);
+	LOG(4, "r1->nparts: %u, r2->nparts: %u", r1->nparts, r2->nparts);
+
+	int r1_is_local = (r1->remote == NULL);
+	int r2_is_local = (r2->remote == NULL);
+
+	/* a remote and a local replicas */
+	if (r1_is_local != r2_is_local)
+		return 1;
+
+	/* both replicas are remote */
+	if (!r1_is_local) {
+		if (strcmp(r1->remote->node_addr, r2->remote->node_addr) != 0)
+			return 1;
+		return strcmp(r1->remote->pool_desc,
+				r2->remote->pool_desc) != 0;
+	}
+
+	/* both replicas are local */
+	if (r1->nparts != r2->nparts)
+		return 1;
+
+	for (unsigned p = 0; p < r1->nparts; ++p) {
+		if (compare_parts(&r1->part[p], &r2->part[p]))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * check_compare_poolsets_status -- (internal) find different replicas between
+ *                                  two poolsets; for each replica which has
+ *                                  a counterpart in the other poolset store
+ *                                  the other replica's number in a helping
+ *                                  structure
+ *
+ * Every input replica is compared with every output replica.  A replica may
+ * have at most one counterpart; if a second match is found for either side,
+ * -1 is returned with errno set to EINVAL.  Returns 0 otherwise.
+ */
+static int
+check_compare_poolsets_status(struct pool_set *set_in,
+		struct pool_set *set_out,
+		struct poolset_compare_status *set_in_s,
+		struct poolset_compare_status *set_out_s)
+{
+	LOG(3, "set_in %p, set_out %p, set_in_s %p, set_out_s %p", set_in,
+			set_out, set_in_s, set_out_s);
+	for (unsigned ri = 0; ri < set_in->nreplicas; ++ri) {
+		struct pool_replica *rep_in = REP(set_in, ri);
+		for (unsigned ro = 0; ro < set_out->nreplicas; ++ro) {
+			struct pool_replica *rep_out = REP(set_out, ro);
+			LOG(1, "comparing rep_in %u with rep_out %u", ri, ro);
+
+			/* only matching replicas are recorded */
+			if (compare_replicas(rep_in, rep_out) != 0)
+				continue;
+
+			int in_matched =
+				(set_in_s->replica[ri] != UNDEF_REPLICA);
+			int out_matched =
+				(set_out_s->replica[ro] != UNDEF_REPLICA);
+			if (in_matched || out_matched) {
+				/* there are more than one counterparts */
+				ERR("there are more then one corresponding"
+						" replicas; cannot transform");
+				errno = EINVAL;
+				return -1;
+			}
+
+			/* remember the pairing on both sides */
+			set_out_s->replica[ro] = ri;
+			set_in_s->replica[ri] = ro;
+		}
+	}
+	return 0;
+}
+
+/*
+ * check_compare_poolsets_options -- (internal) check poolset options
+ *
+ * Records the SINGLEHDR option of each poolset in the flags of its compare
+ * status structure and rejects poolsets using the NOHDRS option.  Returns 0
+ * on success, or -1 with errno set to EINVAL if either poolset has NOHDRS.
+ */
+static int
+check_compare_poolsets_options(struct pool_set *set_in,
+		struct pool_set *set_out,
+		struct poolset_compare_status *set_in_s,
+		struct poolset_compare_status *set_out_s)
+{
+	/* propagate the SINGLEHDR option to the status structures */
+	if (set_in->options & OPTION_SINGLEHDR)
+		set_in_s->flags |= OPTION_SINGLEHDR;
+
+	if (set_out->options & OPTION_SINGLEHDR)
+		set_out_s->flags |= OPTION_SINGLEHDR;
+
+	int in_nohdrs = (set_in->options & OPTION_NOHDRS) != 0;
+	int out_nohdrs = (set_out->options & OPTION_NOHDRS) != 0;
+	if (!in_nohdrs && !out_nohdrs)
+		return 0;
+
+	errno = EINVAL;
+	ERR(
+	"the NOHDRS poolset option is not supported in local poolset files");
+	return -1;
+}
+
+/*
+ * compare_poolsets -- (internal) compare two poolsets; for each replica which
+ *                     has a counterpart in the other poolset store the other
+ *                     replica's number in a helping structure
+ *
+ * On success 0 is returned and *set_in_s and *set_out_s point to newly
+ * allocated status structures which the caller has to release with Free().
+ * On failure -1 is returned, everything allocated here is released and the
+ * corresponding output pointers are reset to NULL, so that the caller is
+ * never left with a pointer to freed memory.
+ */
+static int
+compare_poolsets(struct pool_set *set_in, struct pool_set *set_out,
+		struct poolset_compare_status **set_in_s,
+		struct poolset_compare_status **set_out_s)
+{
+	LOG(3, "set_in %p, set_out %p, set_in_s %p, set_out_s %p", set_in,
+			set_out, set_in_s, set_out_s);
+	if (create_poolset_compare_status(set_in, set_in_s))
+		return -1;
+
+	if (create_poolset_compare_status(set_out, set_out_s))
+		goto err_free_in;
+
+	if (check_compare_poolsets_status(set_in, set_out, *set_in_s,
+			*set_out_s))
+		goto err_free_out;
+
+	if (check_compare_poolsets_options(set_in, set_out, *set_in_s,
+			*set_out_s))
+		goto err_free_out;
+
+	return 0;
+
+err_free_out:
+	Free(*set_out_s);
+	*set_out_s = NULL;
+err_free_in:
+	Free(*set_in_s);
+	*set_in_s = NULL;
+	return -1;
+}
+
+/*
+ * replica_counterpart -- (internal) returns index of a counterpart replica
+ *
+ * The value is read from the compare status structure; it is UNDEF_REPLICA
+ * for a replica that has no counterpart recorded.
+ */
+static unsigned
+replica_counterpart(unsigned repn,
+		struct poolset_compare_status *set_s)
+{
+	unsigned counterpart = set_s->replica[repn];
+
+	return counterpart;
+}
+
+/*
+ * identify_transform_operation -- (internal) check if poolsets can be
+ *                                 transformed one into the other and
+ *                                 determine which operation is required;
+ *                                 also gather info about replicas' health
+ *
+ * For each input replica that has a counterpart, the pool size stored in the
+ * input health status is copied to the counterpart's health status.  Output
+ * replicas without a counterpart are marked as broken, i.e. to be added.
+ *
+ * Returns the operation to perform, or NOT_TRANSFORMABLE when no replica
+ * would be kept, when replicas would be added and removed at once, when
+ * the poolsets are equal, or when a replica change is combined with a change
+ * of the SINGLEHDR option.
+ */
+static enum transform_op
+identify_transform_operation(struct poolset_compare_status *set_in_s,
+		struct poolset_compare_status *set_out_s,
+		struct poolset_health_status *set_in_hs,
+		struct poolset_health_status *set_out_hs)
+{
+	LOG(3, "set_in_s %p, set_out_s %p", set_in_s, set_out_s);
+
+	int keeps_any = 0;
+	int removes = 0;
+	int adds = 0;
+
+	/* check if there are replicas to be removed */
+	for (unsigned r = 0; r < set_in_s->nreplicas; ++r) {
+		unsigned c = replica_counterpart(r, set_in_s);
+		if (c == UNDEF_REPLICA) {
+			LOG(2, "replica %u has no counterpart", r);
+			removes = 1;
+			continue;
+		}
+
+		LOG(2, "replica %u has a counterpart %u", r, c);
+		keeps_any = 1;
+		REP_HEALTH(set_out_hs, c)->pool_size =
+			REP_HEALTH(set_in_hs, r)->pool_size;
+	}
+
+	/* make sure we have at least one replica to keep */
+	if (!keeps_any) {
+		ERR("there must be at least one replica left");
+		return NOT_TRANSFORMABLE;
+	}
+
+	/* check if there are replicas to be added */
+	for (unsigned r = 0; r < set_out_s->nreplicas; ++r) {
+		if (replica_counterpart(r, set_out_s) != UNDEF_REPLICA)
+			continue;
+
+		LOG(2, "Replica %u from output set has no counterpart",
+				r);
+		if (removes) {
+			ERR(
+		"adding and removing replicas at the same time is not allowed");
+			return NOT_TRANSFORMABLE;
+		}
+
+		REP_HEALTH(set_out_hs, r)->flags |= IS_BROKEN;
+		adds = 1;
+	}
+
+	unsigned in_singlehdr = set_in_s->flags & OPTION_SINGLEHDR;
+	unsigned out_singlehdr = set_out_s->flags & OPTION_SINGLEHDR;
+	int replicas_change = (removes || adds);
+	int singlehdr_changes = (in_singlehdr != out_singlehdr);
+
+	/* check if there is anything to do */
+	if (!replicas_change && !singlehdr_changes) {
+		ERR("both poolsets are equal");
+		return NOT_TRANSFORMABLE;
+	}
+
+	/* allow changing the SINGLEHDR option only as the sole operation */
+	if (replicas_change && singlehdr_changes) {
+		ERR(
+		"cannot add/remove replicas and change the SINGLEHDR option at the same time");
+		return NOT_TRANSFORMABLE;
+	}
+
+	if (removes)
+		return RM_REPLICAS;
+
+	if (adds)
+		return ADD_REPLICAS;
+
+	/* only the SINGLEHDR option differs at this point */
+	if (out_singlehdr)
+		return RM_HDRS;
+
+	if (in_singlehdr)
+		return ADD_HDRS;
+
+	ASSERT(0);
+	return NOT_TRANSFORMABLE;
+}
+
+/*
+ * do_added_parts_exist -- (internal) check if any part of the replicas that are
+ *                         to be added (marked as broken) already exists
+ *
+ * Only local replicas marked as broken are examined.  Returns 1 if an
+ * existing part file that is not a Device DAX is found, 0 if none is found
+ * and -1 if checking for a file's existence fails.  The value of errno is
+ * restored after each check that does not end the function.
+ */
+static int
+do_added_parts_exist(struct pool_set *set,
+		struct poolset_health_status *set_hs)
+{
+	for (unsigned r = 0; r < set->nreplicas; ++r) {
+		/* skip unbroken (i.e. not being added) replicas */
+		if (!replica_is_replica_broken(r, set_hs))
+			continue;
+
+		struct pool_replica *rep = REP(set, r);
+
+		/* remote replicas have no local part files to look for */
+		if (rep->remote)
+			continue;
+
+		for (unsigned p = 0; p < rep->nparts; ++p) {
+			struct pool_set_part *part = &rep->part[p];
+
+			/* check if part file exists */
+			int saved_errno = errno;
+			int exists = util_file_exists(part->path);
+			if (exists < 0)
+				return -1;
+
+			if (exists && !part->is_dev_dax) {
+				LOG(1, "part file %s exists",
+						part->path);
+				return 1;
+			}
+			errno = saved_errno;
+		}
+	}
+	return 0;
+}
+
+/*
+ * delete_replicas -- (internal) delete replicas which do not have their
+ *                    counterpart set in the helping status structure
+ *
+ * Each such replica is closed with the DELETE_ALL_PARTS option, using the
+ * local or the remote close routine as appropriate.  Returns 0 on success,
+ * -1 as soon as closing a replica fails.
+ */
+static int
+delete_replicas(struct pool_set *set, struct poolset_compare_status *set_s)
+{
+	LOG(3, "set %p, set_s %p", set, set_s);
+	for (unsigned r = 0; r < set->nreplicas; ++r) {
+		/* replicas with a counterpart are kept */
+		if (replica_counterpart(r, set_s) != UNDEF_REPLICA)
+			continue;
+
+		struct pool_replica *rep = REP(set, r);
+		int failed;
+		if (rep->remote)
+			failed = util_replica_close_remote(rep, r,
+					DELETE_ALL_PARTS);
+		else
+			failed = util_replica_close_local(rep, r,
+					DELETE_ALL_PARTS);
+
+		if (failed)
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * copy_replica_data_fw -- (internal) copy data between replicas of two
+ *                         poolsets, starting from the beginning of the
+ *                         second part
+ *
+ * The amount of data is the pool size of the source replica less one pool
+ * header and less the data length of the first part; if the pool size cannot
+ * be read, the poolsize of the source poolset is used.  The data is copied
+ * and persisted in POOL_HDR_SIZE-sized chunks, from the lowest address
+ * upwards; only whole chunks are copied.
+ */
+static void
+copy_replica_data_fw(struct pool_set *set_dst, struct pool_set *set_src,
+		unsigned repn)
+{
+	LOG(3, "set_in %p, set_out %p, repn %u", set_src, set_dst, repn);
+	ssize_t pool_size = replica_get_pool_size(set_src, repn);
+	if (pool_size < 0) {
+		LOG(1, "getting pool size from replica %u failed", repn);
+		pool_size = (ssize_t)set_src->poolsize;
+	}
+
+	size_t len = (size_t)pool_size - POOL_HDR_SIZE -
+		replica_get_part_data_len(set_src, repn, 0);
+	size_t nchunks = len / POOL_HDR_SIZE;
+
+	/* both cursors start at the beginning of the second part */
+	void *from = PART(REP(set_src, repn), 1)->addr;
+	void *to = PART(REP(set_dst, repn), 1)->addr;
+
+	for (size_t chunk = 0; chunk < nchunks; ++chunk) {
+		pmem_memcpy_persist(to, from, POOL_HDR_SIZE);
+		from = ADDR_SUM(from, POOL_HDR_SIZE);
+		to = ADDR_SUM(to, POOL_HDR_SIZE);
+	}
+}
+
+/*
+ * copy_replica_data_bw -- (internal) copy data between replicas of two
+ *                         poolsets, starting from the end of the pool
+ *
+ * The amount of data is computed the same way as for the forward copy.
+ * Both cursors start len bytes past the beginning of the second part and
+ * are moved back by POOL_HDR_SIZE before each chunk is copied and
+ * persisted, so the chunks are processed from the highest address
+ * downwards; only whole chunks are copied.
+ */
+static void
+copy_replica_data_bw(struct pool_set *set_dst, struct pool_set *set_src,
+		unsigned repn)
+{
+	LOG(3, "set_in %p, set_out %p, repn %u", set_src, set_dst, repn);
+	ssize_t pool_size = replica_get_pool_size(set_src, repn);
+	if (pool_size < 0) {
+		LOG(1, "getting pool size from replica %u failed", repn);
+		pool_size = (ssize_t)set_src->poolsize;
+	}
+
+	size_t len = (size_t)pool_size - POOL_HDR_SIZE -
+		replica_get_part_data_len(set_src, repn, 0);
+	size_t nchunks = len / POOL_HDR_SIZE;
+
+	/* both cursors start at the end of the data to be copied */
+	void *from = ADDR_SUM(PART(REP(set_src, repn), 1)->addr, len);
+	void *to = ADDR_SUM(PART(REP(set_dst, repn), 1)->addr, len);
+
+	for (size_t chunk = 0; chunk < nchunks; ++chunk) {
+		from = ADDR_SUM(from, -(ssize_t)POOL_HDR_SIZE);
+		to = ADDR_SUM(to, -(ssize_t)POOL_HDR_SIZE);
+		pmem_memcpy_persist(to, from, POOL_HDR_SIZE);
+	}
+}
+
+/*
+ * create_missing_headers -- (internal) create headers for all parts but the
+ *                           first one
+ *
+ * The attributes of each new header are taken from the first header of the
+ * replica, with the SINGLEHDR incompat feature cleared.  Returns 0 on
+ * success, or -1 with errno set to EINVAL if creating a header fails.
+ */
+static int
+create_missing_headers(struct pool_set *set, unsigned repn)
+{
+	LOG(3, "set %p, repn %u", set, repn);
+	struct pool_hdr *first_hdr = HDR(REP(set, repn), 0);
+	unsigned nhdrs = set->replica[repn]->nhdrs;
+
+	for (unsigned p = 1; p < nhdrs; ++p) {
+		struct pool_attr attr;
+		util_pool_hdr2attr(&attr, first_hdr);
+		attr.features.incompat &= (uint32_t)(~POOL_FEAT_SINGLEHDR);
+
+		if (util_header_create(set, repn, p, &attr, 1) == 0)
+			continue;
+
+		LOG(1, "part headers create failed for"
+				" replica %u part %u", repn, p);
+		errno = EINVAL;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * update_replica_header -- (internal) update field values in the first header
+ *                          in the replica
+ *
+ * When the poolset uses the SINGLEHDR option, the matching incompat feature
+ * is set and both part links of the header are pointed at the header's own
+ * uuid; otherwise the feature is cleared.  The checksum is then recomputed
+ * and the header is persisted.
+ */
+static void
+update_replica_header(struct pool_set *set, unsigned repn)
+{
+	LOG(3, "set %p, repn %u", set, repn);
+	struct pool_replica *rep = REP(set, repn);
+	struct pool_hdr *hdr = (struct pool_hdr *)PART(rep, 0)->hdr;
+
+	if (!(set->options & OPTION_SINGLEHDR)) {
+		hdr->features.incompat &= (uint32_t)(~POOL_FEAT_SINGLEHDR);
+	} else {
+		hdr->features.incompat |= POOL_FEAT_SINGLEHDR;
+		/* the only header is linked to itself in both directions */
+		memcpy(hdr->next_part_uuid, hdr->uuid, POOL_HDR_UUID_LEN);
+		memcpy(hdr->prev_part_uuid, hdr->uuid, POOL_HDR_UUID_LEN);
+	}
+
+	util_checksum(hdr, sizeof(*hdr), &hdr->checksum, 1,
+			POOL_HDR_CSUM_END_OFF(hdr));
+	util_persist_auto(rep->is_pmem, hdr, sizeof(*hdr));
+}
+
+/*
+ * fill_replica_struct_uuids -- (internal) gather all uuids required for the
+ *                             replica in the helper structure
+ *
+ * The uuid of the first part is copied from the first header; a new uuid
+ * is generated for every following header.  Returns 0 on success, or -1
+ * with errno set to EINVAL if generating a uuid fails.
+ */
+static int
+fill_replica_struct_uuids(struct pool_set *set, unsigned repn)
+{
+	LOG(3, "set %p, repn %u", set, repn);
+	struct pool_replica *rep = REP(set, repn);
+
+	/* the first part keeps the uuid already stored in its header */
+	memcpy(PART(rep, 0)->uuid, HDR(rep, 0)->uuid, POOL_HDR_UUID_LEN);
+
+	for (unsigned p = 1; p < rep->nhdrs; ++p) {
+		if (util_uuid_generate(rep->part[p].uuid) >= 0)
+			continue;
+
+		ERR("cannot generate part UUID");
+		errno = EINVAL;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * update_uuids -- (internal) update uuids in all headers in the replica
+ *
+ * Each header gets the uuids of its next and previous parts.  Headers other
+ * than the first one also receive the replica links and the poolset uuid
+ * from the first header.  Every header is then checksummed and persisted.
+ */
+static void
+update_uuids(struct pool_set *set, unsigned repn)
+{
+	LOG(3, "set %p, repn %u", set, repn);
+	struct pool_replica *rep = REP(set, repn);
+	struct pool_hdr *first = HDR(rep, 0);
+
+	for (unsigned p = 0; p < rep->nhdrs; ++p) {
+		struct pool_hdr *cur = HDR(rep, p);
+
+		memcpy(cur->next_part_uuid, PARTN(rep, p)->uuid,
+				POOL_HDR_UUID_LEN);
+		memcpy(cur->prev_part_uuid, PARTP(rep, p)->uuid,
+				POOL_HDR_UUID_LEN);
+
+		/* Avoid calling memcpy() on identical regions */
+		if (p > 0) {
+			memcpy(cur->next_repl_uuid, first->next_repl_uuid,
+					POOL_HDR_UUID_LEN);
+			memcpy(cur->prev_repl_uuid, first->prev_repl_uuid,
+					POOL_HDR_UUID_LEN);
+			memcpy(cur->poolset_uuid, first->poolset_uuid,
+					POOL_HDR_UUID_LEN);
+		}
+
+		util_checksum(cur, sizeof(*cur), &cur->checksum, 1,
+				POOL_HDR_CSUM_END_OFF(cur));
+		util_persist(PART(rep, p)->is_dev_dax, cur, sizeof(*cur));
+	}
+}
+
+/*
+ * copy_part_fds -- (internal) copy poolset part file descriptors between
+ *                  two poolsets
+ *
+ * Both poolsets are expected to have the same number of replicas and the
+ * same number of parts in each replica (asserted).  The descriptors are
+ * copied by value for all replicas, so both structures refer to the same
+ * open files afterwards.
+ */
+static void
+copy_part_fds(struct pool_set *set_dst, struct pool_set *set_src)
+{
+	ASSERTeq(set_src->nreplicas, set_dst->nreplicas);
+	for (unsigned r = 0; r < set_dst->nreplicas; ++r) {
+		struct pool_replica *rep_dst = REP(set_dst, r);
+		struct pool_replica *rep_src = REP(set_src, r);
+
+		ASSERTeq(rep_src->nparts, rep_dst->nparts);
+		for (unsigned p = 0; p < rep_dst->nparts; ++p)
+			PART(rep_dst, p)->fd = PART(rep_src, p)->fd;
+	}
+}
+
+/*
+ * remove_hdrs_replica -- (internal) remove headers from the replica
+ *
+ * The part files of replica repn are opened through set_in and their
+ * descriptors are shared with set_out.  The replica is then mapped through
+ * both poolset structures, the data is copied forward from the input mapping
+ * to the output mapping (only if the replica has more than one part) and the
+ * first header is updated according to the options of set_out.
+ *
+ * Returns 0 on success, or -1 if opening the part files or creating either
+ * of the mappings fails.
+ */
+static int
+remove_hdrs_replica(struct pool_set *set_in, struct pool_set *set_out,
+ unsigned repn)
+{
+ LOG(3, "set %p, repn %u", set_in, repn);
+ int ret = 0;
+
+ /* open all part files of the input replica */
+ if (replica_open_replica_part_files(set_in, repn)) {
+ LOG(1, "opening replica %u, part files failed", repn);
+ ret = -1;
+ goto out;
+ }
+
+ /* share part file descriptors between poolset structures */
+ copy_part_fds(set_out, set_in);
+
+ /* map the whole input replica */
+ if (util_replica_open(set_in, repn, MAP_SHARED)) {
+ LOG(1, "opening input replica failed: replica %u", repn);
+ ret = -1;
+ goto out_close;
+ }
+
+ /* map the whole output replica */
+ if (util_replica_open(set_out, repn, MAP_SHARED)) {
+ LOG(1, "opening output replica failed: replica %u", repn);
+ ret = -1;
+ goto out_unmap_in;
+ }
+
+ /* move data between the two mappings of the replica */
+ if (REP(set_in, repn)->nparts > 1)
+ copy_replica_data_fw(set_out, set_in, repn);
+
+ /* make changes to the first part's header */
+ update_replica_header(set_out, repn);
+
+ /*
+ * the success path falls through the labels below, so the resources are
+ * released in the reverse order of their acquisition in every case
+ */
+ util_replica_close(set_out, repn);
+out_unmap_in:
+ util_replica_close(set_in, repn);
+out_close:
+ /* the descriptors are shared with set_out; close them via set_in only */
+ util_replica_fdclose(REP(set_in, repn));
+out:
+ return ret;
+}
+
+/*
+ * add_hdrs_replica -- (internal) add lacking headers to the replica
+ *
+ * The part files of replica repn are opened through set_in and their
+ * descriptors are shared with set_out.  The replica is mapped through both
+ * poolset structures, uuids for the new headers are generated, the data is
+ * copied backward from the input mapping to the output mapping (only if the
+ * replica has more than one part), the missing headers are created and
+ * finally the first header and the uuids in all headers are updated.
+ *
+ * when the operation fails and returns -1, the replica remains untouched
+ */
+static int
+add_hdrs_replica(struct pool_set *set_in, struct pool_set *set_out,
+ unsigned repn)
+{
+ LOG(3, "set %p, repn %u", set_in, repn);
+ int ret = 0;
+
+ /* open all part files of the input replica */
+ if (replica_open_replica_part_files(set_in, repn)) {
+ LOG(1, "opening replica %u, part files failed", repn);
+ ret = -1;
+ goto out;
+ }
+
+ /* share part file descriptors between poolset structures */
+ copy_part_fds(set_out, set_in);
+
+ /* map the whole input replica */
+ if (util_replica_open(set_in, repn, MAP_SHARED)) {
+ LOG(1, "opening input replica failed: replica %u", repn);
+ ret = -1;
+ goto out_close;
+ }
+
+ /* map the whole output replica */
+ if (util_replica_open(set_out, repn, MAP_SHARED)) {
+ LOG(1, "opening output replica failed: replica %u", repn);
+ ret = -1;
+ goto out_unmap_in;
+ }
+
+ /* generate new uuids for lacking headers */
+ if (fill_replica_struct_uuids(set_out, repn)) {
+ LOG(1, "generating lacking uuids for parts failed: replica %u",
+ repn);
+ ret = -1;
+ goto out_unmap_out;
+ }
+
+ /* copy data between the two mappings of the replica */
+ if (REP(set_in, repn)->nparts > 1)
+ copy_replica_data_bw(set_out, set_in, repn);
+
+ /* create the missing headers */
+ if (create_missing_headers(set_out, repn)) {
+ LOG(1, "creating lacking headers failed: replica %u", repn);
+ /*
+ * copy the data back, so we could fall back to the original
+ * state
+ */
+ if (REP(set_in, repn)->nparts > 1)
+ copy_replica_data_fw(set_in, set_out, repn);
+ ret = -1;
+ goto out_unmap_out;
+ }
+
+ /* make changes to the first part's header */
+ update_replica_header(set_out, repn);
+
+ /* store new uuids in all headers and update linkage in the replica */
+ update_uuids(set_out, repn);
+
+ /*
+ * the success path falls through the labels below, so the resources are
+ * released in the reverse order of their acquisition in every case
+ */
+out_unmap_out:
+ util_replica_close(set_out, repn);
+out_unmap_in:
+ util_replica_close(set_in, repn);
+out_close:
+ /* the descriptors are shared with set_out; close them via set_in only */
+ util_replica_fdclose(REP(set_in, repn));
+out:
+ return ret;
+}
+
+/*
+ * remove_hdrs -- (internal) transform a poolset without the SINGLEHDR option
+ *                (with headers) into a poolset with the SINGLEHDR option
+ *                (without headers)
+ *
+ * Replicas are processed in ascending order.  If processing a replica
+ * fails, all replicas processed before it are marked as broken in the input
+ * health status and -1 is returned.  Returns 0 if all replicas succeed.
+ */
+static int
+remove_hdrs(struct pool_set *set_in, struct pool_set *set_out,
+		struct poolset_health_status *set_in_hs, unsigned flags)
+{
+	LOG(3, "set_in %p, set_out %p, set_in_hs %p, flags %u",
+			set_in, set_out, set_in_hs, flags);
+	for (unsigned r = 0; r < set_in->nreplicas; ++r) {
+		if (remove_hdrs_replica(set_in, set_out, r) == 0)
+			continue;
+
+		LOG(1, "removing headers from replica %u failed", r);
+		/* mark all previous replicas (r - 1 down to 0) as damaged */
+		for (unsigned done = r; done-- > 0; )
+			REP_HEALTH(set_in_hs, done)->flags |= IS_BROKEN;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * add_hdrs -- (internal) transform a poolset with the SINGLEHDR option (without
+ *             headers) into a poolset without the SINGLEHDR option (with
+ *             headers)
+ *
+ * Replicas are processed in ascending order.  If processing a replica
+ * fails, all replicas processed before it are marked as broken in the input
+ * health status and -1 is returned.  Returns 0 if all replicas succeed.
+ */
+static int
+add_hdrs(struct pool_set *set_in, struct pool_set *set_out,
+		struct poolset_health_status *set_in_hs,
+		unsigned flags)
+{
+	LOG(3, "set_in %p, set_out %p, set_in_hs %p, flags %u",
+			set_in, set_out, set_in_hs, flags);
+	for (unsigned r = 0; r < set_in->nreplicas; ++r) {
+		if (add_hdrs_replica(set_in, set_out, r) == 0)
+			continue;
+
+		LOG(1, "adding headers to replica %u failed", r);
+		/* mark all previous replicas (r - 1 down to 0) as damaged */
+		for (unsigned done = r; done-- > 0; )
+			REP_HEALTH(set_in_hs, done)->flags |= IS_BROKEN;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * replica_transform -- transforming one poolset into another
+ *
+ * set_in describes the existing poolset and set_out the requested one.
+ * The steps are:
+ *  1. validate the arguments and check that the source poolset is healthy,
+ *  2. compare both poolsets and identify the operation to perform,
+ *  3. for RM_HDRS/ADD_HDRS convert the replicas (skipped in a dry run); on
+ *     failure try to fall back to the input poolset by syncing it,
+ *  4. for ADD_REPLICAS make sure none of the part files to be added exists,
+ *  5. sync the output poolset and, for RM_REPLICAS, delete the replicas
+ *     that have no counterpart (deleting is skipped in a dry run).
+ *
+ * Returns 0 on success and -1 on failure.  A failure to check whether the
+ * part files to be added exist is reported separately from the case of such
+ * a file existing and does not overwrite errno with EINVAL.
+ */
+int
+replica_transform(struct pool_set *set_in, struct pool_set *set_out,
+		unsigned flags)
+{
+	LOG(3, "set_in %p, set_out %p", set_in, set_out);
+
+	int ret = 0;
+	/* validate user arguments */
+	if (validate_args(set_in, set_out))
+		return -1;
+
+	/* check if the source poolset is healthy */
+	struct poolset_health_status *set_in_hs = NULL;
+	if (replica_check_poolset_health(set_in, &set_in_hs,
+			0 /* called from transform */, flags)) {
+		ERR("source poolset health check failed");
+		return -1;
+	}
+
+	if (!replica_is_poolset_healthy(set_in_hs)) {
+		ERR("source poolset is broken");
+		ret = -1;
+		errno = EINVAL;
+		goto free_hs_in;
+	}
+
+	/* copy value of the ignore_sds flag from the input poolset */
+	set_out->ignore_sds = set_in->ignore_sds;
+
+	struct poolset_health_status *set_out_hs = NULL;
+	if (replica_create_poolset_health_status(set_out, &set_out_hs)) {
+		ERR("creating poolset health status failed");
+		ret = -1;
+		goto free_hs_in;
+	}
+
+	/* check if the poolsets are transformable */
+	struct poolset_compare_status *set_in_cs = NULL;
+	struct poolset_compare_status *set_out_cs = NULL;
+	if (compare_poolsets(set_in, set_out, &set_in_cs, &set_out_cs)) {
+		ERR("comparing poolsets failed");
+		ret = -1;
+		/* compare_poolsets() releases its allocations on failure */
+		goto free_hs_out;
+	}
+
+	enum transform_op operation = identify_transform_operation(set_in_cs,
+			set_out_cs, set_in_hs, set_out_hs);
+
+	if (operation == NOT_TRANSFORMABLE) {
+		LOG(1, "poolsets are not transformable");
+		ret = -1;
+		errno = EINVAL;
+		goto free_cs;
+	}
+
+	if (operation == RM_HDRS) {
+		if (!is_dry_run(flags) &&
+				remove_hdrs(set_in, set_out, set_in_hs,
+						flags)) {
+			ERR("removing headers failed; falling back to the "
+					"input poolset");
+			if (replica_sync(set_in, set_in_hs,
+					flags | IS_TRANSFORMED)) {
+				LOG(1, "falling back to the input poolset "
+						"failed");
+			} else {
+				LOG(1, "falling back to the input poolset "
+						"succeeded");
+			}
+			ret = -1;
+		}
+		goto free_cs;
+	}
+
+	if (operation == ADD_HDRS) {
+		if (!is_dry_run(flags) &&
+				add_hdrs(set_in, set_out, set_in_hs, flags)) {
+			ERR("adding headers failed; falling back to the "
+					"input poolset");
+			if (replica_sync(set_in, set_in_hs,
+					flags | IS_TRANSFORMED)) {
+				LOG(1, "falling back to the input poolset "
+						"failed");
+			} else {
+				LOG(1, "falling back to the input poolset "
+						"succeeded");
+			}
+			ret = -1;
+		}
+		goto free_cs;
+	}
+
+	if (operation == ADD_REPLICAS) {
+		/*
+		 * check if any of the parts that are to be added already exists
+		 */
+		int exist = do_added_parts_exist(set_out, set_out_hs);
+		if (exist < 0) {
+			/* keep the errno describing the actual failure */
+			int oerrno = errno;
+			ERR("checking if parts being added exist failed");
+			errno = oerrno;
+			ret = -1;
+			goto free_cs;
+		}
+
+		if (exist) {
+			ERR("some parts being added already exist");
+			ret = -1;
+			errno = EINVAL;
+			goto free_cs;
+		}
+	}
+
+	/* signal that sync is called by transform */
+	if (replica_sync(set_out, set_out_hs, flags | IS_TRANSFORMED)) {
+		ret = -1;
+		goto free_cs;
+	}
+
+	if (operation == RM_REPLICAS) {
+		if (!is_dry_run(flags) && delete_replicas(set_in, set_in_cs))
+			ret = -1;
+	}
+
+free_cs:
+	Free(set_in_cs);
+	Free(set_out_cs);
+free_hs_out:
+	replica_free_poolset_health_status(set_out_hs);
+free_hs_in:
+	replica_free_poolset_health_status(set_in_hs);
+	return ret;
+}