]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/filestore/HashIndex.cc
import ceph 14.2.5
[ceph.git] / ceph / src / os / filestore / HashIndex.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/compat.h"
16 #include "include/types.h"
17 #include "include/buffer.h"
18 #include "osd/osd_types.h"
19 #include <errno.h>
20
21 #include "HashIndex.h"
22
23 #include "common/errno.h"
24 #include "common/debug.h"
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_filestore
27
28 const string HashIndex::SUBDIR_ATTR = "contents";
29 const string HashIndex::SETTINGS_ATTR = "settings";
30 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
31
32 /// hex digit to integer value
33 int hex_to_int(char c)
34 {
35 if (c >= '0' && c <= '9')
36 return c - '0';
37 if (c >= 'A' && c <= 'F')
38 return c - 'A' + 10;
39 ceph_abort();
40 }
41
42 /// int value to hex digit
43 char int_to_hex(int v)
44 {
45 ceph_assert(v < 16);
46 if (v < 10)
47 return '0' + v;
48 return 'A' + v - 10;
49 }
50
51 /// reverse bits in a nibble (0..15)
52 int reverse_nibble_bits(int in)
53 {
54 ceph_assert(in < 16);
55 return
56 ((in & 8) >> 3) |
57 ((in & 4) >> 1) |
58 ((in & 2) << 1) |
59 ((in & 1) << 3);
60 }
61
62 /// reverse nibble bits in a hex digit
63 char reverse_hexdigit_bits(char c)
64 {
65 return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
66 }
67
68 /// reverse nibble bits in a hex string
69 string reverse_hexdigit_bits_string(string s)
70 {
71 for (unsigned i=0; i<s.size(); ++i)
72 s[i] = reverse_hexdigit_bits(s[i]);
73 return s;
74 }
75
76 /// compare hex digit (as length 1 string) bitwise
77 bool cmp_hexdigit_bitwise(const string& l, const string& r)
78 {
79 ceph_assert(l.length() == 1 && r.length() == 1);
80 int lv = hex_to_int(l[0]);
81 int rv = hex_to_int(r[0]);
82 ceph_assert(lv < 16);
83 ceph_assert(rv < 16);
84 return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
85 }
86
87 /// compare hex digit string bitwise
88 bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
89 {
90 string ll = reverse_hexdigit_bits_string(l);
91 string rr = reverse_hexdigit_bits_string(r);
92 return ll < rr;
93 }
94
95 int HashIndex::cleanup() {
96 bufferlist bl;
97 int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
98 if (r < 0) {
99 // No in progress operations!
100 return 0;
101 }
102 auto i = bl.cbegin();
103 InProgressOp in_progress(i);
104 subdir_info_s info;
105 r = get_info(in_progress.path, &info);
106 if (r == -ENOENT) {
107 return end_split_or_merge(in_progress.path);
108 } else if (r < 0) {
109 return r;
110 }
111
112 if (in_progress.is_split())
113 return complete_split(in_progress.path, info);
114 else if (in_progress.is_merge())
115 return complete_merge(in_progress.path, info);
116 else if (in_progress.is_col_split()) {
117 for (vector<string>::iterator i = in_progress.path.begin();
118 i != in_progress.path.end();
119 ++i) {
120 vector<string> path(in_progress.path.begin(), i);
121 int r = reset_attr(path);
122 if (r < 0)
123 return r;
124 }
125 return 0;
126 }
127 else
128 return -EINVAL;
129 }
130
131 int HashIndex::reset_attr(
132 const vector<string> &path)
133 {
134 int exists = 0;
135 int r = path_exists(path, &exists);
136 if (r < 0)
137 return r;
138 if (!exists)
139 return 0;
140 map<string, ghobject_t> objects;
141 vector<string> subdirs;
142 r = list_objects(path, 0, 0, &objects);
143 if (r < 0)
144 return r;
145 r = list_subdirs(path, &subdirs);
146 if (r < 0)
147 return r;
148
149 subdir_info_s info;
150 info.hash_level = path.size();
151 info.objs = objects.size();
152 info.subdirs = subdirs.size();
153 return set_info(path, info);
154 }
155
156 int HashIndex::col_split_level(
157 HashIndex &from,
158 HashIndex &to,
159 const vector<string> &path,
160 uint32_t inbits,
161 uint32_t match,
162 unsigned *mkdirred)
163 {
164 /* For each subdir, move, recurse, or ignore based on comparing the low order
165 * bits of the hash represented by the subdir path with inbits, match passed
166 * in.
167 */
168 vector<string> subdirs;
169 int r = from.list_subdirs(path, &subdirs);
170 if (r < 0)
171 return r;
172 map<string, ghobject_t> objects;
173 r = from.list_objects(path, 0, 0, &objects);
174 if (r < 0)
175 return r;
176
177 set<string> to_move;
178 for (vector<string>::iterator i = subdirs.begin();
179 i != subdirs.end();
180 ++i) {
181 uint32_t bits = 0;
182 uint32_t hash = 0;
183 vector<string> sub_path(path.begin(), path.end());
184 sub_path.push_back(*i);
185 path_to_hobject_hash_prefix(sub_path, &bits, &hash);
186 if (bits < inbits) {
187 if (hobject_t::match_hash(hash, bits, match)) {
188 r = col_split_level(
189 from,
190 to,
191 sub_path,
192 inbits,
193 match,
194 mkdirred);
195 if (r < 0)
196 return r;
197 if (*mkdirred > path.size())
198 *mkdirred = path.size();
199 } // else, skip, doesn't need to be moved or recursed into
200 } else {
201 if (hobject_t::match_hash(hash, inbits, match)) {
202 to_move.insert(*i);
203 }
204 } // else, skip, doesn't need to be moved or recursed into
205 }
206
207 /* Then, do the same for each object */
208 map<string, ghobject_t> objs_to_move;
209 for (map<string, ghobject_t>::iterator i = objects.begin();
210 i != objects.end();
211 ++i) {
212 if (i->second.match(inbits, match)) {
213 objs_to_move.insert(*i);
214 }
215 }
216
217 if (objs_to_move.empty() && to_move.empty())
218 return 0;
219
220 // Make parent directories as needed
221 while (*mkdirred < path.size()) {
222 ++*mkdirred;
223 int exists = 0;
224 vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
225 r = to.path_exists(creating_path, &exists);
226 if (r < 0)
227 return r;
228 if (exists)
229 continue;
230 subdir_info_s info;
231 info.objs = 0;
232 info.subdirs = 0;
233 info.hash_level = creating_path.size();
234 if (*mkdirred < path.size() - 1)
235 info.subdirs = 1;
236 r = to.start_col_split(creating_path);
237 if (r < 0)
238 return r;
239 r = to.create_path(creating_path);
240 if (r < 0)
241 return r;
242 r = to.set_info(creating_path, info);
243 if (r < 0)
244 return r;
245 r = to.end_split_or_merge(creating_path);
246 if (r < 0)
247 return r;
248 }
249
250 subdir_info_s from_info;
251 subdir_info_s to_info;
252 r = from.get_info(path, &from_info);
253 if (r < 0)
254 return r;
255 r = to.get_info(path, &to_info);
256 if (r < 0)
257 return r;
258
259 from.start_col_split(path);
260 to.start_col_split(path);
261
262 // Do subdir moves
263 for (set<string>::iterator i = to_move.begin();
264 i != to_move.end();
265 ++i) {
266 from_info.subdirs--;
267 to_info.subdirs++;
268 r = move_subdir(from, to, path, *i);
269 if (r < 0)
270 return r;
271 }
272
273 for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
274 i != objs_to_move.end();
275 ++i) {
276 from_info.objs--;
277 to_info.objs++;
278 r = move_object(from, to, path, *i);
279 if (r < 0)
280 return r;
281 }
282
283
284 r = to.set_info(path, to_info);
285 if (r < 0)
286 return r;
287 r = from.set_info(path, from_info);
288 if (r < 0)
289 return r;
290 from.end_split_or_merge(path);
291 to.end_split_or_merge(path);
292 return 0;
293 }
294
295 int HashIndex::_merge(
296 uint32_t bits,
297 CollectionIndex* dest) {
298 dout(20) << __func__ << " bits " << bits << dendl;
299 ceph_assert(collection_version() == dest->collection_version());
300
301 vector<string> emptypath;
302
303 // pre-split to common/target level so that any shared prefix DIR_?
304 // directories already exist at the destination. Since each
305 // directory is a nibble (4 bits),
306 unsigned shared = bits / 4;
307 dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
308 if (shared) {
309 split_dirs(emptypath, shared);
310 ((HashIndex*)dest)->split_dirs(emptypath, shared);
311 }
312
313 // now merge the contents
314 _merge_dirs(*this, *(HashIndex*)dest, emptypath);
315
316 return 0;
317 }
318
319 int HashIndex::_merge_dirs(
320 HashIndex& from,
321 HashIndex& to,
322 const vector<string>& path)
323 {
324 dout(20) << __func__ << " path " << path << dendl;
325 int r;
326
327 vector<string> src_subs, dst_subs;
328 r = from.list_subdirs(path, &src_subs);
329 if (r < 0) {
330 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
331 << " r " << r << " from "
332 << "from.list_subdirs"
333 << dendl;
334 return r;
335 }
336 r = to.list_subdirs(path, &dst_subs);
337 if (r < 0) {
338 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
339 << " r " << r << " from "
340 << "to.list_subdirs"
341 << dendl;
342 return r;
343 }
344
345 for (auto& i : src_subs) {
346 if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
347 // move it
348 r = move_subdir(from, to, path, i);
349 if (r < 0) {
350 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
351 << " r " << r << " from "
352 << "move_subdir(...,"
353 << path << "," << i << ")"
354 << dendl;
355 return r;
356 }
357 } else {
358 // common, recurse!
359 vector<string> nested = path;
360 nested.push_back(i);
361 r = _merge_dirs(from, to, nested);
362 if (r < 0) {
363 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
364 << " r " << r << " from "
365 << "rec _merge_dirs"
366 << dendl;
367 return r;
368 }
369
370 // now remove it
371 r = remove_path(nested);
372 if (r < 0) {
373 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
374 << " r " << r << " from "
375 << "remove_path "
376 << nested
377 << dendl;
378 return r;
379 }
380 }
381 }
382
383 // objects
384 map<string, ghobject_t> objects;
385 r = from.list_objects(path, 0, 0, &objects);
386 if (r < 0) {
387 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
388 << " r " << r << " from "
389 << "from.list_objects"
390 << dendl;
391 return r;
392 }
393
394 for (auto& i : objects) {
395 r = move_object(from, to, path, i);
396 if (r < 0) {
397 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
398 << " r " << r << " from "
399 << "move_object(...,"
400 << path << "," << i << ")"
401 << dendl;
402 return r;
403 }
404 }
405
406 return 0;
407 }
408
409
410 int HashIndex::_split(
411 uint32_t match,
412 uint32_t bits,
413 CollectionIndex* dest) {
414 ceph_assert(collection_version() == dest->collection_version());
415 unsigned mkdirred = 0;
416
417 return col_split_level(
418 *this,
419 *static_cast<HashIndex*>(dest),
420 vector<string>(),
421 bits,
422 match,
423 &mkdirred);
424 }
425
426 int HashIndex::split_dirs(const vector<string> &path, int target_level) {
427 dout(20) << __func__ << " " << path << " target level: "
428 << target_level << dendl;
429 subdir_info_s info;
430 int r = get_info(path, &info);
431 if (r < 0) {
432 dout(10) << "error looking up info for " << path << ": "
433 << cpp_strerror(r) << dendl;
434 return r;
435 }
436
437 if (must_split(info, target_level)) {
438 dout(1) << __func__ << " " << path << " has " << info.objs
439 << " objects, " << info.hash_level
440 << " level, starting split in pg " << coll() << "." << dendl;
441 r = initiate_split(path, info);
442 if (r < 0) {
443 dout(10) << "error initiating split on " << path << ": "
444 << cpp_strerror(r) << dendl;
445 return r;
446 }
447
448 r = complete_split(path, info);
449 dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
450 << dendl;
451 if (r < 0) {
452 dout(10) << "error completing split on " << path << ": "
453 << cpp_strerror(r) << dendl;
454 return r;
455 }
456 }
457
458 vector<string> subdirs;
459 r = list_subdirs(path, &subdirs);
460 if (r < 0) {
461 dout(10) << "error listing subdirs of " << path << ": "
462 << cpp_strerror(r) << dendl;
463 return r;
464 }
465 for (vector<string>::const_iterator it = subdirs.begin();
466 it != subdirs.end(); ++it) {
467 vector<string> subdir_path(path);
468 subdir_path.push_back(*it);
469 r = split_dirs(subdir_path, target_level);
470 if (r < 0) {
471 return r;
472 }
473 }
474
475 return r;
476 }
477
478 int HashIndex::apply_layout_settings(int target_level) {
479 vector<string> path;
480 dout(10) << __func__ << " split multiple = " << split_multiplier
481 << " merge threshold = " << merge_threshold
482 << " split rand factor = " << cct->_conf->filestore_split_rand_factor
483 << " target level = " << target_level
484 << dendl;
485 int r = write_settings();
486 if (r < 0)
487 return r;
488 return split_dirs(path, target_level);
489 }
490
491 int HashIndex::_init() {
492 subdir_info_s info;
493 vector<string> path;
494 int r = set_info(path, info);
495 if (r < 0)
496 return r;
497 return write_settings();
498 }
499
500 int HashIndex::write_settings() {
501 if (cct->_conf->filestore_split_rand_factor > 0) {
502 settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
503 } else {
504 settings.split_rand_factor = 0;
505 }
506 vector<string> path;
507 bufferlist bl;
508 settings.encode(bl);
509 return add_attr_path(path, SETTINGS_ATTR, bl);
510 }
511
512 int HashIndex::read_settings() {
513 vector<string> path;
514 bufferlist bl;
515 int r = get_attr_path(path, SETTINGS_ATTR, bl);
516 if (r == -ENODATA)
517 return 0;
518 if (r < 0) {
519 derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
520 return r;
521 }
522 auto it = bl.cbegin();
523 settings.decode(it);
524 dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
525 return 0;
526 }
527
528 /* LFNIndex virtual method implementations */
529 int HashIndex::_created(const vector<string> &path,
530 const ghobject_t &oid,
531 const string &mangled_name) {
532 subdir_info_s info;
533 int r;
534 r = get_info(path, &info);
535 if (r < 0)
536 return r;
537 info.objs++;
538 r = set_info(path, info);
539 if (r < 0)
540 return r;
541
542 if (must_split(info)) {
543 dout(1) << __func__ << " " << path << " has " << info.objs
544 << " objects, starting split in pg " << coll() << "." << dendl;
545 int r = initiate_split(path, info);
546 if (r < 0)
547 return r;
548 r = complete_split(path, info);
549 dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
550 << dendl;
551 return r;
552 } else {
553 return 0;
554 }
555 }
556
557 int HashIndex::_remove(const vector<string> &path,
558 const ghobject_t &oid,
559 const string &mangled_name) {
560 int r;
561 r = remove_object(path, oid);
562 if (r < 0)
563 return r;
564 subdir_info_s info;
565 r = get_info(path, &info);
566 if (r < 0)
567 return r;
568 info.objs--;
569 r = set_info(path, info);
570 if (r < 0)
571 return r;
572 if (must_merge(info)) {
573 r = initiate_merge(path, info);
574 if (r < 0)
575 return r;
576 return complete_merge(path, info);
577 } else {
578 return 0;
579 }
580 }
581
582 int HashIndex::_lookup(const ghobject_t &oid,
583 vector<string> *path,
584 string *mangled_name,
585 int *hardlink) {
586 vector<string> path_comp;
587 get_path_components(oid, &path_comp);
588 vector<string>::iterator next = path_comp.begin();
589 int exists;
590 while (1) {
591 int r = path_exists(*path, &exists);
592 if (r < 0)
593 return r;
594 if (!exists) {
595 if (path->empty())
596 return -ENOENT;
597 path->pop_back();
598 break;
599 }
600 if (next == path_comp.end())
601 break;
602 path->push_back(*(next++));
603 }
604 return get_mangled_name(*path, oid, mangled_name, hardlink);
605 }
606
607 int HashIndex::_collection_list_partial(const ghobject_t &start,
608 const ghobject_t &end,
609 int max_count,
610 vector<ghobject_t> *ls,
611 ghobject_t *next) {
612 vector<string> path;
613 ghobject_t _next;
614 if (!next)
615 next = &_next;
616 *next = start;
617 dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
618 return list_by_hash(path, end, max_count, next, ls);
619 }
620
621 int HashIndex::prep_delete() {
622 return recursive_remove(vector<string>());
623 }
624
625 int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
626 int ret;
627 vector<string> path;
628 subdir_info_s root_info;
629 // Make sure there is neither objects nor sub-folders
630 // in this collection
631 ret = get_info(path, &root_info);
632 if (ret < 0)
633 return ret;
634
635 // Do the folder splitting first
636 ret = pre_split_folder(pg_num, expected_num_objs);
637 if (ret < 0)
638 return ret;
639 // Initialize the folder info starting from root
640 return init_split_folder(path, 0);
641 }
642
643 int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
644 {
645 // If folder merging is enabled (by setting the threshold positive),
646 // no need to split
647 if (merge_threshold > 0)
648 return 0;
649 const coll_t c = coll();
650 // Do not split if the expected number of objects in this collection is zero (by default)
651 if (expected_num_objs == 0)
652 return 0;
653
654 // Calculate the number of leaf folders (which actually store files)
655 // need to be created
656 const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
657 uint64_t leavies = expected_num_objs / objs_per_folder ;
658 // No need to split
659 if (leavies == 0 || expected_num_objs == objs_per_folder)
660 return 0;
661
662 spg_t spgid;
663 if (!c.is_pg_prefix(&spgid))
664 return -EINVAL;
665 const ps_t ps = spgid.pgid.ps();
666
667 // the most significant bits of pg_num
668 const int pg_num_bits = calc_num_bits(pg_num - 1);
669 ps_t tmp_id = ps;
670 // calculate the number of levels we only create one sub folder
671 int num = pg_num_bits / 4;
672 // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
673 // so that splitting starts at level 3
674 if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
675 --num;
676 }
677
678 int ret;
679 // Start with creation that only has one subfolder
680 vector<string> paths;
681 int dump_num = num;
682 while (num-- > 0) {
683 ps_t v = tmp_id & 0x0000000f;
684 paths.push_back(to_hex(v));
685 ret = create_path(paths);
686 if (ret < 0 && ret != -EEXIST)
687 return ret;
688 tmp_id = tmp_id >> 4;
689 }
690
691 // Starting from here, we can split by creating multiple subfolders
692 const int left_bits = pg_num_bits - dump_num * 4;
693 // this variable denotes how many bits (for this level) that can be
694 // used for sub folder splitting
695 int split_bits = 4 - left_bits;
696 // the below logic is inspired by rados.h#ceph_stable_mod,
697 // it basically determines how many sub-folders should we
698 // create for splitting
699 ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
700 if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
701 ++split_bits;
702 }
703 const uint32_t subs = (1 << split_bits);
704 // Calculate how many levels we create starting from here
705 int level = 0;
706 int level_limit = MAX_HASH_LEVEL - dump_num - 1;
707 uint64_t actual_leaves = subs;
708 while (actual_leaves < leavies && level < level_limit) {
709 ++level;
710 actual_leaves <<= 4;
711 }
712 for (uint32_t i = 0; i < subs; ++i) {
713 ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
714 int v = tmp_id | (i << ((4 - split_bits) % 4));
715 paths.push_back(to_hex(v));
716 ret = create_path(paths);
717 if (ret < 0 && ret != -EEXIST)
718 return ret;
719 ret = recursive_create_path(paths, level);
720 if (ret < 0)
721 return ret;
722 paths.pop_back();
723 }
724 return 0;
725 }
726
727 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
728 {
729 // Get the number of sub directories for the current path
730 vector<string> subdirs;
731 int ret = list_subdirs(path, &subdirs);
732 if (ret < 0)
733 return ret;
734 subdir_info_s info;
735 info.subdirs = subdirs.size();
736 info.hash_level = hash_level;
737 ret = set_info(path, info);
738 if (ret < 0)
739 return ret;
740 ret = fsync_dir(path);
741 if (ret < 0)
742 return ret;
743
744 // Do the same for subdirs
745 vector<string>::const_iterator iter;
746 for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
747 path.push_back(*iter);
748 ret = init_split_folder(path, hash_level + 1);
749 if (ret < 0)
750 return ret;
751 path.pop_back();
752 }
753 return 0;
754 }
755
756 int HashIndex::recursive_create_path(vector<string>& path, int level)
757 {
758 if (level == 0)
759 return 0;
760 for (int i = 0; i < 16; ++i) {
761 path.push_back(to_hex(i));
762 int ret = create_path(path);
763 if (ret < 0 && ret != -EEXIST)
764 return ret;
765 ret = recursive_create_path(path, level - 1);
766 if (ret < 0)
767 return ret;
768 path.pop_back();
769 }
770 return 0;
771 }
772
773 int HashIndex::recursive_remove(const vector<string> &path) {
774 return _recursive_remove(path, true);
775 }
776
777 int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
778 vector<string> subdirs;
779 dout(20) << __func__ << " path=" << path << dendl;
780 int r = list_subdirs(path, &subdirs);
781 if (r < 0)
782 return r;
783 map<string, ghobject_t> objects;
784 r = list_objects(path, 0, 0, &objects);
785 if (r < 0)
786 return r;
787 if (!objects.empty())
788 return -ENOTEMPTY;
789 vector<string> subdir(path);
790 for (vector<string>::iterator i = subdirs.begin();
791 i != subdirs.end();
792 ++i) {
793 subdir.push_back(*i);
794 r = _recursive_remove(subdir, false);
795 if (r < 0)
796 return r;
797 subdir.pop_back();
798 }
799 if (top)
800 return 0;
801 else
802 return remove_path(path);
803 }
804
805 int HashIndex::start_col_split(const vector<string> &path) {
806 bufferlist bl;
807 InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
808 op_tag.encode(bl);
809 int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
810 if (r < 0)
811 return r;
812 return fsync_dir(vector<string>());
813 }
814
815 int HashIndex::start_split(const vector<string> &path) {
816 bufferlist bl;
817 InProgressOp op_tag(InProgressOp::SPLIT, path);
818 op_tag.encode(bl);
819 int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
820 if (r < 0)
821 return r;
822 return fsync_dir(vector<string>());
823 }
824
825 int HashIndex::start_merge(const vector<string> &path) {
826 bufferlist bl;
827 InProgressOp op_tag(InProgressOp::MERGE, path);
828 op_tag.encode(bl);
829 int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
830 if (r < 0)
831 return r;
832 return fsync_dir(vector<string>());
833 }
834
835 int HashIndex::end_split_or_merge(const vector<string> &path) {
836 return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
837 }
838
839 int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
840 bufferlist buf;
841 int r = get_attr_path(path, SUBDIR_ATTR, buf);
842 if (r < 0)
843 return r;
844 auto bufiter = buf.cbegin();
845 info->decode(bufiter);
846 ceph_assert(path.size() == (unsigned)info->hash_level);
847 return 0;
848 }
849
850 int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
851 bufferlist buf;
852 ceph_assert(path.size() == (unsigned)info.hash_level);
853 info.encode(buf);
854 return add_attr_path(path, SUBDIR_ATTR, buf);
855 }
856
857 bool HashIndex::must_merge(const subdir_info_s &info) {
858 return (info.hash_level > 0 &&
859 merge_threshold > 0 &&
860 info.objs < (unsigned)merge_threshold &&
861 info.subdirs == 0);
862 }
863
864 bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
865 // target_level is used for ceph-objectstore-tool to split dirs offline.
866 // if it is set (defalult is 0) and current hash level < target_level,
867 // this dir would be split no matters how many objects it has.
868 return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
869 ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
870 (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
871 }
872
873 int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
874 return start_merge(path);
875 }
876
877 int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
878 vector<string> dst = path;
879 dst.pop_back();
880 subdir_info_s dstinfo;
881 int r, exists;
882 r = path_exists(path, &exists);
883 if (r < 0)
884 return r;
885 r = get_info(dst, &dstinfo);
886 if (r < 0)
887 return r;
888 if (exists) {
889 r = move_objects(path, dst);
890 if (r < 0)
891 return r;
892 r = reset_attr(dst);
893 if (r < 0)
894 return r;
895 r = remove_path(path);
896 if (r < 0)
897 return r;
898 }
899 if (must_merge(dstinfo)) {
900 r = initiate_merge(dst, dstinfo);
901 if (r < 0)
902 return r;
903 r = fsync_dir(dst);
904 if (r < 0)
905 return r;
906 return complete_merge(dst, dstinfo);
907 }
908 r = fsync_dir(dst);
909 if (r < 0)
910 return r;
911 return end_split_or_merge(path);
912 }
913
914 int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
915 return start_split(path);
916 }
917
918 int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
919 int level = info.hash_level;
920 map<string, ghobject_t> objects;
921 vector<string> dst = path;
922 int r;
923 dst.push_back("");
924 r = list_objects(path, 0, 0, &objects);
925 if (r < 0)
926 return r;
927 vector<string> subdirs_vec;
928 r = list_subdirs(path, &subdirs_vec);
929 if (r < 0)
930 return r;
931 set<string> subdirs;
932 subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
933 map<string, map<string, ghobject_t> > mapped;
934 map<string, ghobject_t> moved;
935 int num_moved = 0;
936 for (map<string, ghobject_t>::iterator i = objects.begin();
937 i != objects.end();
938 ++i) {
939 vector<string> new_path;
940 get_path_components(i->second, &new_path);
941 mapped[new_path[level]][i->first] = i->second;
942 }
943 for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
944 i != mapped.end();
945 ) {
946 dst[level] = i->first;
947 /* If the info already exists, it must be correct,
948 * we may be picking up a partially finished split */
949 subdir_info_s temp;
950 // subdir has already been fully copied
951 if (subdirs.count(i->first) && !get_info(dst, &temp)) {
952 for (map<string, ghobject_t>::iterator j = i->second.begin();
953 j != i->second.end();
954 ++j) {
955 moved[j->first] = j->second;
956 num_moved++;
957 objects.erase(j->first);
958 }
959 ++i;
960 continue;
961 }
962
963 subdir_info_s info_new;
964 info_new.objs = i->second.size();
965 info_new.subdirs = 0;
966 info_new.hash_level = level + 1;
967 if (must_merge(info_new) && !subdirs.count(i->first)) {
968 mapped.erase(i++);
969 continue;
970 }
971
972 // Subdir doesn't yet exist
973 if (!subdirs.count(i->first)) {
974 info.subdirs += 1;
975 r = create_path(dst);
976 if (r < 0)
977 return r;
978 } // else subdir has been created but only partially copied
979
980 for (map<string, ghobject_t>::iterator j = i->second.begin();
981 j != i->second.end();
982 ++j) {
983 moved[j->first] = j->second;
984 num_moved++;
985 objects.erase(j->first);
986 r = link_object(path, dst, j->second, j->first);
987 // May be a partially finished split
988 if (r < 0 && r != -EEXIST) {
989 return r;
990 }
991 }
992
993 r = fsync_dir(dst);
994 if (r < 0)
995 return r;
996
997 // Presence of info must imply that all objects have been copied
998 r = set_info(dst, info_new);
999 if (r < 0)
1000 return r;
1001
1002 r = fsync_dir(dst);
1003 if (r < 0)
1004 return r;
1005
1006 ++i;
1007 }
1008 r = remove_objects(path, moved, &objects);
1009 if (r < 0)
1010 return r;
1011 info.objs = objects.size();
1012 r = reset_attr(path);
1013 if (r < 0)
1014 return r;
1015 r = fsync_dir(path);
1016 if (r < 0)
1017 return r;
1018 return end_split_or_merge(path);
1019 }
1020
1021 void HashIndex::get_path_components(const ghobject_t &oid,
1022 vector<string> *path) {
1023 char buf[MAX_HASH_LEVEL + 1];
1024 snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
1025
1026 // Path components are the hex characters of oid.hobj.hash, least
1027 // significant first
1028 for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1029 path->push_back(string(&buf[i], 1));
1030 }
1031 }
1032
1033 string HashIndex::get_hash_str(uint32_t hash) {
1034 char buf[MAX_HASH_LEVEL + 1];
1035 snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
1036 string retval;
1037 for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1038 retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
1039 }
1040 return retval;
1041 }
1042
1043 string HashIndex::get_path_str(const ghobject_t &oid) {
1044 ceph_assert(!oid.is_max());
1045 return get_hash_str(oid.hobj.get_hash());
1046 }
1047
1048 uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
1049 while (prefix.size() < sizeof(uint32_t) * 2) {
1050 prefix.push_back('0');
1051 }
1052 uint32_t hash;
1053 sscanf(prefix.c_str(), "%x", &hash);
1054 // nibble reverse
1055 hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
1056 hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
1057 hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
1058 return hash;
1059 }
1060
1061 int HashIndex::get_path_contents_by_hash_bitwise(
1062 const vector<string> &path,
1063 const ghobject_t *next_object,
1064 set<string, CmpHexdigitStringBitwise> *hash_prefixes,
1065 set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
1066 {
1067 map<string, ghobject_t> rev_objects;
1068 int r;
1069 r = list_objects(path, 0, 0, &rev_objects);
1070 if (r < 0)
1071 return r;
1072 // bitwise sort
1073 for (map<string, ghobject_t>::iterator i = rev_objects.begin();
1074 i != rev_objects.end();
1075 ++i) {
1076 if (next_object && i->second < *next_object)
1077 continue;
1078 string hash_prefix = get_path_str(i->second);
1079 hash_prefixes->insert(hash_prefix);
1080 objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
1081 }
1082 vector<string> subdirs;
1083 r = list_subdirs(path, &subdirs);
1084 if (r < 0)
1085 return r;
1086
1087 // sort subdirs bitwise (by reversing hex digit nibbles)
1088 std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
1089
1090 // Local to this function, we will convert the prefix strings
1091 // (previously simply the reversed hex digits) to also have each
1092 // digit's nibbles reversed. This will make the strings sort
1093 // bitwise.
1094 string cur_prefix;
1095 for (vector<string>::const_iterator i = path.begin();
1096 i != path.end();
1097 ++i) {
1098 cur_prefix.append(reverse_hexdigit_bits_string(*i));
1099 }
1100 string next_object_string;
1101 if (next_object)
1102 next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
1103 for (vector<string>::iterator i = subdirs.begin();
1104 i != subdirs.end();
1105 ++i) {
1106 string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
1107 if (next_object) {
1108 if (next_object->is_max())
1109 continue;
1110 if (candidate < next_object_string.substr(0, candidate.size()))
1111 continue;
1112 }
1113 // re-reverse the hex digit nibbles for the caller
1114 hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
1115 }
1116 return 0;
1117 }
1118
1119 int HashIndex::list_by_hash(const vector<string> &path,
1120 const ghobject_t &end,
1121 int max_count,
1122 ghobject_t *next,
1123 vector<ghobject_t> *out)
1124 {
1125 ceph_assert(out);
1126 return list_by_hash_bitwise(path, end, max_count, next, out);
1127 }
1128
1129 int HashIndex::list_by_hash_bitwise(
1130 const vector<string> &path,
1131 const ghobject_t& end,
1132 int max_count,
1133 ghobject_t *next,
1134 vector<ghobject_t> *out)
1135 {
1136 vector<string> next_path = path;
1137 next_path.push_back("");
1138 set<string, CmpHexdigitStringBitwise> hash_prefixes;
1139 set<pair<string, ghobject_t>, CmpPairBitwise> objects;
1140 int r = get_path_contents_by_hash_bitwise(path,
1141 next,
1142 &hash_prefixes,
1143 &objects);
1144 if (r < 0)
1145 return r;
1146 for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
1147 i != hash_prefixes.end();
1148 ++i) {
1149 dout(20) << __func__ << " prefix " << *i << dendl;
1150 set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
1151 make_pair(*i, ghobject_t()));
1152 if (j == objects.end() || j->first != *i) {
1153 *(next_path.rbegin()) = *(i->rbegin());
1154 ghobject_t next_recurse;
1155 if (next)
1156 next_recurse = *next;
1157 r = list_by_hash_bitwise(next_path,
1158 end,
1159 max_count,
1160 &next_recurse,
1161 out);
1162
1163 if (r < 0)
1164 return r;
1165 if (!next_recurse.is_max()) {
1166 if (next)
1167 *next = next_recurse;
1168 return 0;
1169 }
1170 } else {
1171 while (j != objects.end() && j->first == *i) {
1172 if (max_count > 0 && out->size() == (unsigned)max_count) {
1173 if (next)
1174 *next = j->second;
1175 return 0;
1176 }
1177 if (j->second >= end) {
1178 if (next)
1179 *next = j->second;
1180 return 0;
1181 }
1182 if (!next || j->second >= *next) {
1183 dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
1184 out->push_back(j->second);
1185 }
1186 ++j;
1187 }
1188 }
1189 }
1190 if (next)
1191 *next = ghobject_t::get_max();
1192 return 0;
1193 }
1194
1195