return -EINVAL;
}
-static int skip_name_map(void **p, void *end)
-{
- int len;
- ceph_decode_32_safe(p, end, len ,bad);
- while (len--) {
- int strlen;
- *p += sizeof(u32);
- ceph_decode_32_safe(p, end, strlen, bad);
- *p += strlen;
-}
- return 0;
-bad:
- return -EINVAL;
-}
-
static void crush_finalize(struct crush_map *c)
{
__s32 b;
void **p = &pbyval;
void *start = pbyval;
u32 magic;
- u32 num_name_maps;
dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
}
}
- /* ignore trailing name maps. */
- for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
- err = skip_name_map(p, end);
- if (err < 0)
- goto done;
- }
+ ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
+ ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+ ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
/* tunables */
ceph_decode_need(p, end, 3*sizeof(u32), done);
return 0;
}
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds) and primary_temp (explicit primary setting)
- */
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
- struct rb_root *root)
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_mapping *pg = NULL;
- int c;
+ int ret;
- dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
- while (*p) {
- parent = *p;
- pg = rb_entry(parent, struct ceph_pg_mapping, node);
- c = ceph_pg_compare(&new->pgid, &pg->pgid);
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
+ ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
+ if (ret)
+ return ret;
+
+ if (lhs->shard < rhs->shard)
+ return -1;
+ if (lhs->shard > rhs->shard)
+ return 1;
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
return 0;
}
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
- struct ceph_pg pgid)
+static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
{
- struct rb_node *n = root->rb_node;
struct ceph_pg_mapping *pg;
- int c;
- while (n) {
- pg = rb_entry(n, struct ceph_pg_mapping, node);
- c = ceph_pg_compare(&pgid, &pg->pgid);
- if (c < 0) {
- n = n->rb_left;
- } else if (c > 0) {
- n = n->rb_right;
- } else {
- dout("__lookup_pg_mapping %lld.%x got %p\n",
- pgid.pool, pgid.seed, pg);
- return pg;
- }
- }
- return NULL;
+ pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO);
+ if (!pg)
+ return NULL;
+
+ RB_CLEAR_NODE(&pg->node);
+ return pg;
}
-static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
+static void free_pg_mapping(struct ceph_pg_mapping *pg)
{
- struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);
+ WARN_ON(!RB_EMPTY_NODE(&pg->node));
- if (pg) {
- dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
- pg);
- rb_erase(&pg->node, root);
- kfree(pg);
- return 0;
- }
- dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
- return -ENOENT;
+ kfree(pg);
}
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds) and primary_temp (explicit primary setting)
+ */
+DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
+ RB_BYPTR, const struct ceph_pg *, node)
+
/*
* rbtree of pg pool info
*/
*p += len;
}
+ /*
+ * last_force_op_resend_preluminous, will be overridden if the
+ * map was encoded with RESEND_ON_SPLIT
+ */
if (ev >= 15)
pi->last_force_request_resend = ceph_decode_32(p);
else
pi->last_force_request_resend = 0;
+ if (ev >= 16)
+ *p += 4; /* skip min_read_recency_for_promote */
+
+ if (ev >= 17)
+ *p += 8; /* skip expected_num_objects */
+
+ if (ev >= 19)
+ *p += 4; /* skip cache_target_dirty_high_ratio_micro */
+
+ if (ev >= 20)
+ *p += 4; /* skip min_write_recency_for_promote */
+
+ if (ev >= 21)
+ *p += 1; /* skip use_gmt_hitset */
+
+ if (ev >= 22)
+ *p += 1; /* skip fast_read */
+
+ if (ev >= 23) {
+ *p += 4; /* skip hit_set_grade_decay_rate */
+ *p += 4; /* skip hit_set_search_last_n */
+ }
+
+ if (ev >= 24) {
+ /* skip opts */
+ *p += 1 + 1; /* versions */
+ len = ceph_decode_32(p);
+ *p += len;
+ }
+
+ if (ev >= 25)
+ pi->last_force_request_resend = ceph_decode_32(p);
+
/* ignore the rest */
*p = pool_end;
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_temp),
struct ceph_pg_mapping, node);
- rb_erase(&pg->node, &map->pg_temp);
- kfree(pg);
+ erase_pg_mapping(&map->pg_temp, pg);
+ free_pg_mapping(pg);
}
while (!RB_EMPTY_ROOT(&map->primary_temp)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->primary_temp),
struct ceph_pg_mapping, node);
- rb_erase(&pg->node, &map->primary_temp);
- kfree(pg);
+ erase_pg_mapping(&map->primary_temp, pg);
+ free_pg_mapping(pg);
}
while (!RB_EMPTY_ROOT(&map->pg_pools)) {
struct ceph_pg_pool_info *pi =
return __decode_pools(p, end, map, true);
}
-static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
- bool incremental)
+typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
+
+static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
+ decode_mapping_fn_t fn, bool incremental)
{
u32 n;
+ WARN_ON(!incremental && !fn);
+
ceph_decode_32_safe(p, end, n, e_inval);
while (n--) {
+ struct ceph_pg_mapping *pg;
struct ceph_pg pgid;
- u32 len, i;
int ret;
ret = ceph_decode_pgid(p, end, &pgid);
if (ret)
return ret;
- ceph_decode_32_safe(p, end, len, e_inval);
-
- ret = __remove_pg_mapping(&map->pg_temp, pgid);
- BUG_ON(!incremental && ret != -ENOENT);
-
- if (!incremental || len > 0) {
- struct ceph_pg_mapping *pg;
-
- ceph_decode_need(p, end, len*sizeof(u32), e_inval);
-
- if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
- return -EINVAL;
-
- pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
- if (!pg)
- return -ENOMEM;
+ pg = lookup_pg_mapping(mapping_root, &pgid);
+ if (pg) {
+ WARN_ON(!incremental);
+ erase_pg_mapping(mapping_root, pg);
+ free_pg_mapping(pg);
+ }
- pg->pgid = pgid;
- pg->pg_temp.len = len;
- for (i = 0; i < len; i++)
- pg->pg_temp.osds[i] = ceph_decode_32(p);
+ if (fn) {
+ pg = fn(p, end, incremental);
+ if (IS_ERR(pg))
+ return PTR_ERR(pg);
- ret = __insert_pg_mapping(pg, &map->pg_temp);
- if (ret) {
- kfree(pg);
- return ret;
+ if (pg) {
+ pg->pgid = pgid; /* struct */
+ insert_pg_mapping(mapping_root, pg);
}
}
}
return -EINVAL;
}
+static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
+ bool incremental)
+{
+ struct ceph_pg_mapping *pg;
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len == 0 && incremental)
+ return NULL; /* new_pg_temp: [] to remove */
+ if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
+ return ERR_PTR(-EINVAL);
+
+ ceph_decode_need(p, end, len * sizeof(u32), e_inval);
+ pg = alloc_pg_mapping(len * sizeof(u32));
+ if (!pg)
+ return ERR_PTR(-ENOMEM);
+
+ pg->pg_temp.len = len;
+ for (i = 0; i < len; i++)
+ pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+ return pg;
+
+e_inval:
+ return ERR_PTR(-EINVAL);
+}
+
static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
- return __decode_pg_temp(p, end, map, false);
+ return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
+ false);
}
static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
- return __decode_pg_temp(p, end, map, true);
+ return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
+ true);
}
-static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
- bool incremental)
+static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
+ bool incremental)
{
- u32 n;
-
- ceph_decode_32_safe(p, end, n, e_inval);
- while (n--) {
- struct ceph_pg pgid;
- u32 osd;
- int ret;
-
- ret = ceph_decode_pgid(p, end, &pgid);
- if (ret)
- return ret;
-
- ceph_decode_32_safe(p, end, osd, e_inval);
-
- ret = __remove_pg_mapping(&map->primary_temp, pgid);
- BUG_ON(!incremental && ret != -ENOENT);
-
- if (!incremental || osd != (u32)-1) {
- struct ceph_pg_mapping *pg;
-
- pg = kzalloc(sizeof(*pg), GFP_NOFS);
- if (!pg)
- return -ENOMEM;
+ struct ceph_pg_mapping *pg;
+ u32 osd;
- pg->pgid = pgid;
- pg->primary_temp.osd = osd;
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ if (osd == (u32)-1 && incremental)
+ return NULL; /* new_primary_temp: -1 to remove */
- ret = __insert_pg_mapping(pg, &map->primary_temp);
- if (ret) {
- kfree(pg);
- return ret;
- }
- }
- }
+ pg = alloc_pg_mapping(0);
+ if (!pg)
+ return ERR_PTR(-ENOMEM);
- return 0;
+ pg->primary_temp.osd = osd;
+ return pg;
e_inval:
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
{
- return __decode_primary_temp(p, end, map, false);
+ return decode_pg_mapping(p, end, &map->primary_temp,
+ __decode_primary_temp, false);
}
static int decode_new_primary_temp(void **p, void *end,
struct ceph_osdmap *map)
{
- return __decode_primary_temp(p, end, map, true);
+ return decode_pg_mapping(p, end, &map->primary_temp,
+ __decode_primary_temp, true);
}
u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
void ceph_oloc_copy(struct ceph_object_locator *dest,
const struct ceph_object_locator *src)
{
- WARN_ON(!ceph_oloc_empty(dest));
- WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+ ceph_oloc_destroy(dest);
dest->pool = src->pool;
if (src->pool_ns)
dest->pool_ns = ceph_get_string(src->pool_ns);
+ else
+ dest->pool_ns = NULL;
}
EXPORT_SYMBOL(ceph_oloc_copy);
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
- WARN_ON(!ceph_oid_empty(dest));
+ ceph_oid_destroy(dest);
if (src->name != src->inline_name) {
/* very rare, see ceph_object_id definition */
dest->name = kmalloc(src->name_len + 1,
GFP_NOIO | __GFP_NOFAIL);
+ } else {
+ dest->name = dest->inline_name;
}
-
memcpy(dest->name, src->name, src->name_len + 1);
dest->name_len = src->name_len;
}
dest->primary = src->primary;
}
-static bool is_split(const struct ceph_pg *pgid,
- u32 old_pg_num,
- u32 new_pg_num)
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+ u32 new_pg_num)
{
int old_bits = calc_bits_of(old_pg_num);
int old_mask = (1 << old_bits) - 1;
!osds_equal(old_up, new_up) ||
old_size != new_size ||
old_min_size != new_min_size ||
- is_split(pgid, old_pg_num, new_pg_num) ||
+ ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
old_sort_bitwise != new_sort_bitwise;
}
* Should only be called with target_oid and target_oloc (as opposed to
* base_oid and base_oloc), since tiering isn't taken into account.
*/
-int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_id *oid,
- struct ceph_object_locator *oloc,
- struct ceph_pg *raw_pgid)
+int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
{
- struct ceph_pg_pool_info *pi;
-
- pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
- if (!pi)
- return -ENOENT;
+ WARN_ON(pi->id != oloc->pool);
if (!oloc->pool_ns) {
raw_pgid->pool = oloc->pool;
}
return 0;
}
+
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
+ if (!pi)
+ return -ENOENT;
+
+ return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
+}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
/*
ceph_osds_init(temp);
/* pg_temp? */
- pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ pg = lookup_pg_mapping(&osdmap->pg_temp, &pgid);
if (pg) {
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
}
/* primary_temp? */
- pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+ pg = lookup_pg_mapping(&osdmap->primary_temp, &pgid);
if (pg)
temp->primary = pg->primary_temp.osd;
}
* resend a request.
*/
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_osds *up,
struct ceph_osds *acting)
{
- struct ceph_pg_pool_info *pi;
u32 pps;
- pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
- if (!pi) {
- ceph_osds_init(up);
- ceph_osds_init(acting);
- goto out;
- }
+ WARN_ON(pi->id != raw_pgid->pool);
pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
raw_to_up_osds(osdmap, pi, up);
if (acting->primary == -1)
acting->primary = up->primary;
}
-out:
WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_spg *spgid)
+{
+ struct ceph_pg pgid;
+ struct ceph_osds up, acting;
+ int i;
+
+ WARN_ON(pi->id != raw_pgid->pool);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
+
+ if (ceph_can_shift_osds(pi)) {
+ spgid->pgid = pgid; /* struct */
+ spgid->shard = CEPH_SPG_NOSHARD;
+ return true;
+ }
+
+ ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
+ for (i = 0; i < acting.size; i++) {
+ if (acting.osds[i] == acting.primary) {
+ spgid->pgid = pgid; /* struct */
+ spgid->shard = i;
+ return true;
+ }
+ }
+
+ return false;
+}
+
/*
* Return acting primary for given PG, or -1 if none.
*/
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid)
{
+ struct ceph_pg_pool_info *pi;
struct ceph_osds up, acting;
- ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+ pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+ if (!pi)
+ return -1;
+
+ ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
return acting.primary;
}
EXPORT_SYMBOL(ceph_pg_to_acting_primary);