]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/filestore/HashIndex.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / os / filestore / HashIndex.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/compat.h"
16 #include "include/types.h"
17 #include "include/buffer.h"
18 #include "osd/osd_types.h"
19 #include <errno.h>
20
21 #include "HashIndex.h"
22
23 #include "common/errno.h"
24 #include "common/debug.h"
25 #define dout_context cct
26 #define dout_subsys ceph_subsys_filestore
27
28 using std::map;
29 using std::pair;
30 using std::set;
31 using std::string;
32 using std::vector;
33
34 using ceph::bufferptr;
35 using ceph::bufferlist;
36
37 const string HashIndex::SUBDIR_ATTR = "contents";
38 const string HashIndex::SETTINGS_ATTR = "settings";
39 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
40
41 /// hex digit to integer value
42 int hex_to_int(char c)
43 {
44 if (c >= '0' && c <= '9')
45 return c - '0';
46 if (c >= 'A' && c <= 'F')
47 return c - 'A' + 10;
48 ceph_abort();
49 }
50
51 /// int value to hex digit
52 char int_to_hex(int v)
53 {
54 ceph_assert(v < 16);
55 if (v < 10)
56 return '0' + v;
57 return 'A' + v - 10;
58 }
59
60 /// reverse bits in a nibble (0..15)
61 int reverse_nibble_bits(int in)
62 {
63 ceph_assert(in < 16);
64 return
65 ((in & 8) >> 3) |
66 ((in & 4) >> 1) |
67 ((in & 2) << 1) |
68 ((in & 1) << 3);
69 }
70
71 /// reverse nibble bits in a hex digit
72 char reverse_hexdigit_bits(char c)
73 {
74 return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
75 }
76
77 /// reverse nibble bits in a hex string
78 string reverse_hexdigit_bits_string(string s)
79 {
80 for (unsigned i=0; i<s.size(); ++i)
81 s[i] = reverse_hexdigit_bits(s[i]);
82 return s;
83 }
84
85 /// compare hex digit (as length 1 string) bitwise
86 bool cmp_hexdigit_bitwise(const string& l, const string& r)
87 {
88 ceph_assert(l.length() == 1 && r.length() == 1);
89 int lv = hex_to_int(l[0]);
90 int rv = hex_to_int(r[0]);
91 ceph_assert(lv < 16);
92 ceph_assert(rv < 16);
93 return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
94 }
95
96 /// compare hex digit string bitwise
97 bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
98 {
99 string ll = reverse_hexdigit_bits_string(l);
100 string rr = reverse_hexdigit_bits_string(r);
101 return ll < rr;
102 }
103
104 int HashIndex::cleanup() {
105 bufferlist bl;
106 int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
107 if (r < 0) {
108 // No in progress operations!
109 return 0;
110 }
111 auto i = bl.cbegin();
112 InProgressOp in_progress(i);
113 subdir_info_s info;
114 r = get_info(in_progress.path, &info);
115 if (r == -ENOENT) {
116 return end_split_or_merge(in_progress.path);
117 } else if (r < 0) {
118 return r;
119 }
120
121 if (in_progress.is_split())
122 return complete_split(in_progress.path, info);
123 else if (in_progress.is_merge())
124 return complete_merge(in_progress.path, info);
125 else if (in_progress.is_col_split()) {
126 for (vector<string>::iterator i = in_progress.path.begin();
127 i != in_progress.path.end();
128 ++i) {
129 vector<string> path(in_progress.path.begin(), i);
130 int r = reset_attr(path);
131 if (r < 0)
132 return r;
133 }
134 return 0;
135 }
136 else
137 return -EINVAL;
138 }
139
140 int HashIndex::reset_attr(
141 const vector<string> &path)
142 {
143 int exists = 0;
144 int r = path_exists(path, &exists);
145 if (r < 0)
146 return r;
147 if (!exists)
148 return 0;
149 map<string, ghobject_t> objects;
150 vector<string> subdirs;
151 r = list_objects(path, 0, 0, &objects);
152 if (r < 0)
153 return r;
154 r = list_subdirs(path, &subdirs);
155 if (r < 0)
156 return r;
157
158 subdir_info_s info;
159 info.hash_level = path.size();
160 info.objs = objects.size();
161 info.subdirs = subdirs.size();
162 return set_info(path, info);
163 }
164
165 int HashIndex::col_split_level(
166 HashIndex &from,
167 HashIndex &to,
168 const vector<string> &path,
169 uint32_t inbits,
170 uint32_t match,
171 unsigned *mkdirred)
172 {
173 /* For each subdir, move, recurse, or ignore based on comparing the low order
174 * bits of the hash represented by the subdir path with inbits, match passed
175 * in.
176 */
177 vector<string> subdirs;
178 int r = from.list_subdirs(path, &subdirs);
179 if (r < 0)
180 return r;
181 map<string, ghobject_t> objects;
182 r = from.list_objects(path, 0, 0, &objects);
183 if (r < 0)
184 return r;
185
186 set<string> to_move;
187 for (vector<string>::iterator i = subdirs.begin();
188 i != subdirs.end();
189 ++i) {
190 uint32_t bits = 0;
191 uint32_t hash = 0;
192 vector<string> sub_path(path.begin(), path.end());
193 sub_path.push_back(*i);
194 path_to_hobject_hash_prefix(sub_path, &bits, &hash);
195 if (bits < inbits) {
196 if (hobject_t::match_hash(hash, bits, match)) {
197 r = col_split_level(
198 from,
199 to,
200 sub_path,
201 inbits,
202 match,
203 mkdirred);
204 if (r < 0)
205 return r;
206 if (*mkdirred > path.size())
207 *mkdirred = path.size();
208 } // else, skip, doesn't need to be moved or recursed into
209 } else {
210 if (hobject_t::match_hash(hash, inbits, match)) {
211 to_move.insert(*i);
212 }
213 } // else, skip, doesn't need to be moved or recursed into
214 }
215
216 /* Then, do the same for each object */
217 map<string, ghobject_t> objs_to_move;
218 for (map<string, ghobject_t>::iterator i = objects.begin();
219 i != objects.end();
220 ++i) {
221 if (i->second.match(inbits, match)) {
222 objs_to_move.insert(*i);
223 }
224 }
225
226 if (objs_to_move.empty() && to_move.empty())
227 return 0;
228
229 // Make parent directories as needed
230 while (*mkdirred < path.size()) {
231 ++*mkdirred;
232 int exists = 0;
233 vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
234 r = to.path_exists(creating_path, &exists);
235 if (r < 0)
236 return r;
237 if (exists)
238 continue;
239 subdir_info_s info;
240 info.objs = 0;
241 info.subdirs = 0;
242 info.hash_level = creating_path.size();
243 if (*mkdirred < path.size() - 1)
244 info.subdirs = 1;
245 r = to.start_col_split(creating_path);
246 if (r < 0)
247 return r;
248 r = to.create_path(creating_path);
249 if (r < 0)
250 return r;
251 r = to.set_info(creating_path, info);
252 if (r < 0)
253 return r;
254 r = to.end_split_or_merge(creating_path);
255 if (r < 0)
256 return r;
257 }
258
259 subdir_info_s from_info;
260 subdir_info_s to_info;
261 r = from.get_info(path, &from_info);
262 if (r < 0)
263 return r;
264 r = to.get_info(path, &to_info);
265 if (r < 0)
266 return r;
267
268 from.start_col_split(path);
269 to.start_col_split(path);
270
271 // Do subdir moves
272 for (set<string>::iterator i = to_move.begin();
273 i != to_move.end();
274 ++i) {
275 from_info.subdirs--;
276 to_info.subdirs++;
277 r = move_subdir(from, to, path, *i);
278 if (r < 0)
279 return r;
280 }
281
282 for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
283 i != objs_to_move.end();
284 ++i) {
285 from_info.objs--;
286 to_info.objs++;
287 r = move_object(from, to, path, *i);
288 if (r < 0)
289 return r;
290 }
291
292
293 r = to.set_info(path, to_info);
294 if (r < 0)
295 return r;
296 r = from.set_info(path, from_info);
297 if (r < 0)
298 return r;
299 from.end_split_or_merge(path);
300 to.end_split_or_merge(path);
301 return 0;
302 }
303
304 int HashIndex::_merge(
305 uint32_t bits,
306 CollectionIndex* dest) {
307 dout(20) << __func__ << " bits " << bits << dendl;
308 ceph_assert(collection_version() == dest->collection_version());
309
310 vector<string> emptypath;
311
312 // pre-split to common/target level so that any shared prefix DIR_?
313 // directories already exist at the destination. Since each
314 // directory is a nibble (4 bits),
315 unsigned shared = bits / 4;
316 dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
317 if (shared) {
318 split_dirs(emptypath, shared);
319 ((HashIndex*)dest)->split_dirs(emptypath, shared);
320 }
321
322 // now merge the contents
323 _merge_dirs(*this, *(HashIndex*)dest, emptypath);
324
325 return 0;
326 }
327
328 int HashIndex::_merge_dirs(
329 HashIndex& from,
330 HashIndex& to,
331 const vector<string>& path)
332 {
333 dout(20) << __func__ << " path " << path << dendl;
334 int r;
335
336 vector<string> src_subs, dst_subs;
337 r = from.list_subdirs(path, &src_subs);
338 if (r < 0) {
339 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
340 << " r " << r << " from "
341 << "from.list_subdirs"
342 << dendl;
343 return r;
344 }
345 r = to.list_subdirs(path, &dst_subs);
346 if (r < 0) {
347 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
348 << " r " << r << " from "
349 << "to.list_subdirs"
350 << dendl;
351 return r;
352 }
353
354 for (auto& i : src_subs) {
355 if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
356 // move it
357 r = move_subdir(from, to, path, i);
358 if (r < 0) {
359 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
360 << " r " << r << " from "
361 << "move_subdir(...,"
362 << path << "," << i << ")"
363 << dendl;
364 return r;
365 }
366 } else {
367 // common, recurse!
368 vector<string> nested = path;
369 nested.push_back(i);
370 r = _merge_dirs(from, to, nested);
371 if (r < 0) {
372 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
373 << " r " << r << " from "
374 << "rec _merge_dirs"
375 << dendl;
376 return r;
377 }
378
379 // now remove it
380 r = remove_path(nested);
381 if (r < 0) {
382 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
383 << " r " << r << " from "
384 << "remove_path "
385 << nested
386 << dendl;
387 return r;
388 }
389 }
390 }
391
392 // objects
393 map<string, ghobject_t> objects;
394 r = from.list_objects(path, 0, 0, &objects);
395 if (r < 0) {
396 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
397 << " r " << r << " from "
398 << "from.list_objects"
399 << dendl;
400 return r;
401 }
402
403 for (auto& i : objects) {
404 r = move_object(from, to, path, i);
405 if (r < 0) {
406 lgeneric_subdout(g_ceph_context,filestore,20) << __func__
407 << " r " << r << " from "
408 << "move_object(...,"
409 << path << "," << i << ")"
410 << dendl;
411 return r;
412 }
413 }
414
415 return 0;
416 }
417
418
419 int HashIndex::_split(
420 uint32_t match,
421 uint32_t bits,
422 CollectionIndex* dest) {
423 ceph_assert(collection_version() == dest->collection_version());
424 unsigned mkdirred = 0;
425
426 return col_split_level(
427 *this,
428 *static_cast<HashIndex*>(dest),
429 vector<string>(),
430 bits,
431 match,
432 &mkdirred);
433 }
434
435 int HashIndex::split_dirs(const vector<string> &path, int target_level) {
436 dout(20) << __func__ << " " << path << " target level: "
437 << target_level << dendl;
438 subdir_info_s info;
439 int r = get_info(path, &info);
440 if (r < 0) {
441 dout(10) << "error looking up info for " << path << ": "
442 << cpp_strerror(r) << dendl;
443 return r;
444 }
445
446 if (must_split(info, target_level)) {
447 dout(1) << __func__ << " " << path << " has " << info.objs
448 << " objects, " << info.hash_level
449 << " level, starting split in pg " << coll() << "." << dendl;
450 r = initiate_split(path, info);
451 if (r < 0) {
452 dout(10) << "error initiating split on " << path << ": "
453 << cpp_strerror(r) << dendl;
454 return r;
455 }
456
457 r = complete_split(path, info);
458 dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
459 << dendl;
460 if (r < 0) {
461 dout(10) << "error completing split on " << path << ": "
462 << cpp_strerror(r) << dendl;
463 return r;
464 }
465 }
466
467 vector<string> subdirs;
468 r = list_subdirs(path, &subdirs);
469 if (r < 0) {
470 dout(10) << "error listing subdirs of " << path << ": "
471 << cpp_strerror(r) << dendl;
472 return r;
473 }
474 for (vector<string>::const_iterator it = subdirs.begin();
475 it != subdirs.end(); ++it) {
476 vector<string> subdir_path(path);
477 subdir_path.push_back(*it);
478 r = split_dirs(subdir_path, target_level);
479 if (r < 0) {
480 return r;
481 }
482 }
483
484 return r;
485 }
486
487 int HashIndex::apply_layout_settings(int target_level) {
488 vector<string> path;
489 dout(10) << __func__ << " split multiple = " << split_multiplier
490 << " merge threshold = " << merge_threshold
491 << " split rand factor = " << cct->_conf->filestore_split_rand_factor
492 << " target level = " << target_level
493 << dendl;
494 int r = write_settings();
495 if (r < 0)
496 return r;
497 return split_dirs(path, target_level);
498 }
499
500 int HashIndex::_init() {
501 subdir_info_s info;
502 vector<string> path;
503 int r = set_info(path, info);
504 if (r < 0)
505 return r;
506 return write_settings();
507 }
508
509 int HashIndex::write_settings() {
510 if (cct->_conf->filestore_split_rand_factor > 0) {
511 settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
512 } else {
513 settings.split_rand_factor = 0;
514 }
515 vector<string> path;
516 bufferlist bl;
517 settings.encode(bl);
518 return add_attr_path(path, SETTINGS_ATTR, bl);
519 }
520
521 int HashIndex::read_settings() {
522 vector<string> path;
523 bufferlist bl;
524 int r = get_attr_path(path, SETTINGS_ATTR, bl);
525 if (r == -ENODATA)
526 return 0;
527 if (r < 0) {
528 derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
529 return r;
530 }
531 auto it = bl.cbegin();
532 settings.decode(it);
533 dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
534 return 0;
535 }
536
537 /* LFNIndex virtual method implementations */
538 int HashIndex::_created(const vector<string> &path,
539 const ghobject_t &oid,
540 const string &mangled_name) {
541 subdir_info_s info;
542 int r;
543 r = get_info(path, &info);
544 if (r < 0)
545 return r;
546 info.objs++;
547 r = set_info(path, info);
548 if (r < 0)
549 return r;
550
551 if (must_split(info)) {
552 dout(1) << __func__ << " " << path << " has " << info.objs
553 << " objects, starting split in pg " << coll() << "." << dendl;
554 int r = initiate_split(path, info);
555 if (r < 0) {
556 derr << __func__ << " error starting split " << path << " in pg "
557 << coll() << ": " << cpp_strerror(r) << dendl;
558 ceph_assert(!cct->_conf->filestore_fail_eio);
559 } else {
560 r = complete_split(path, info);
561 if (r < 0) {
562 derr << __func__ << " error completing split " << path << " in pg "
563 << coll() << ": " << cpp_strerror(r) << dendl;
564 ceph_assert(!cct->_conf->filestore_fail_eio);
565 }
566 dout(1) << __func__ << " " << path << " split completed in pg " << coll()
567 << "." << dendl;
568 }
569 }
570
571 return 0;
572 }
573
574 int HashIndex::_remove(const vector<string> &path,
575 const ghobject_t &oid,
576 const string &mangled_name) {
577 int r;
578 r = remove_object(path, oid);
579 if (r < 0)
580 return r;
581 subdir_info_s info;
582 r = get_info(path, &info);
583 if (r < 0)
584 return r;
585 info.objs--;
586 r = set_info(path, info);
587 if (r < 0)
588 return r;
589
590 if (must_merge(info)) {
591 dout(1) << __func__ << " " << path << " has " << info.objs
592 << " objects, starting merge in pg " << coll() << "." << dendl;
593 r = initiate_merge(path, info);
594 if (r < 0) {
595 derr << __func__ << " error starting merge " << path << " in pg "
596 << coll() << ": " << cpp_strerror(r) << dendl;
597 ceph_assert(!cct->_conf->filestore_fail_eio);
598 } else {
599 r = complete_merge(path, info);
600 if (r < 0) {
601 derr << __func__ << " error completing merge " << path << " in pg "
602 << coll() << ": " << cpp_strerror(r) << dendl;
603 ceph_assert(!cct->_conf->filestore_fail_eio);
604 }
605 dout(1) << __func__ << " " << path << " merge completed in pg " << coll()
606 << "." << dendl;
607 }
608 }
609
610 return 0;
611 }
612
613 int HashIndex::_lookup(const ghobject_t &oid,
614 vector<string> *path,
615 string *mangled_name,
616 int *hardlink) {
617 vector<string> path_comp;
618 get_path_components(oid, &path_comp);
619 vector<string>::iterator next = path_comp.begin();
620 int exists;
621 while (1) {
622 int r = path_exists(*path, &exists);
623 if (r < 0)
624 return r;
625 if (!exists) {
626 if (path->empty())
627 return -ENOENT;
628 path->pop_back();
629 break;
630 }
631 if (next == path_comp.end())
632 break;
633 path->push_back(*(next++));
634 }
635 return get_mangled_name(*path, oid, mangled_name, hardlink);
636 }
637
638 int HashIndex::_collection_list_partial(const ghobject_t &start,
639 const ghobject_t &end,
640 int max_count,
641 vector<ghobject_t> *ls,
642 ghobject_t *next) {
643 vector<string> path;
644 ghobject_t _next;
645 if (!next)
646 next = &_next;
647 *next = start;
648 dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
649 return list_by_hash(path, end, max_count, next, ls);
650 }
651
652 int HashIndex::prep_delete() {
653 return recursive_remove(vector<string>());
654 }
655
656 int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
657 int ret;
658 vector<string> path;
659 subdir_info_s root_info;
660 // Make sure there is neither objects nor sub-folders
661 // in this collection
662 ret = get_info(path, &root_info);
663 if (ret < 0)
664 return ret;
665
666 // Do the folder splitting first
667 ret = pre_split_folder(pg_num, expected_num_objs);
668 if (ret < 0)
669 return ret;
670 // Initialize the folder info starting from root
671 return init_split_folder(path, 0);
672 }
673
674 int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
675 {
676 // If folder merging is enabled (by setting the threshold positive),
677 // no need to split
678 if (merge_threshold > 0)
679 return 0;
680 const coll_t c = coll();
681 // Do not split if the expected number of objects in this collection is zero (by default)
682 if (expected_num_objs == 0)
683 return 0;
684
685 // Calculate the number of leaf folders (which actually store files)
686 // need to be created
687 const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
688 uint64_t leavies = expected_num_objs / objs_per_folder ;
689 // No need to split
690 if (leavies == 0 || expected_num_objs == objs_per_folder)
691 return 0;
692
693 spg_t spgid;
694 if (!c.is_pg_prefix(&spgid))
695 return -EINVAL;
696 const ps_t ps = spgid.pgid.ps();
697
698 // the most significant bits of pg_num
699 const int pg_num_bits = calc_num_bits(pg_num - 1);
700 ps_t tmp_id = ps;
701 // calculate the number of levels we only create one sub folder
702 int num = pg_num_bits / 4;
703 // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
704 // so that splitting starts at level 3
705 if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
706 --num;
707 }
708
709 int ret;
710 // Start with creation that only has one subfolder
711 vector<string> paths;
712 int dump_num = num;
713 while (num-- > 0) {
714 ps_t v = tmp_id & 0x0000000f;
715 paths.push_back(to_hex(v));
716 ret = create_path(paths);
717 if (ret < 0 && ret != -EEXIST)
718 return ret;
719 tmp_id = tmp_id >> 4;
720 }
721
722 // Starting from here, we can split by creating multiple subfolders
723 const int left_bits = pg_num_bits - dump_num * 4;
724 // this variable denotes how many bits (for this level) that can be
725 // used for sub folder splitting
726 int split_bits = 4 - left_bits;
727 // the below logic is inspired by rados.h#ceph_stable_mod,
728 // it basically determines how many sub-folders should we
729 // create for splitting
730 ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
731 if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
732 ++split_bits;
733 }
734 const uint32_t subs = (1 << split_bits);
735 // Calculate how many levels we create starting from here
736 int level = 0;
737 int level_limit = MAX_HASH_LEVEL - dump_num - 1;
738 uint64_t actual_leaves = subs;
739 while (actual_leaves < leavies && level < level_limit) {
740 ++level;
741 actual_leaves <<= 4;
742 }
743 for (uint32_t i = 0; i < subs; ++i) {
744 ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
745 int v = tmp_id | (i << ((4 - split_bits) % 4));
746 paths.push_back(to_hex(v));
747 ret = create_path(paths);
748 if (ret < 0 && ret != -EEXIST)
749 return ret;
750 ret = recursive_create_path(paths, level);
751 if (ret < 0)
752 return ret;
753 paths.pop_back();
754 }
755 return 0;
756 }
757
758 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
759 {
760 // Get the number of sub directories for the current path
761 vector<string> subdirs;
762 int ret = list_subdirs(path, &subdirs);
763 if (ret < 0)
764 return ret;
765 subdir_info_s info;
766 info.subdirs = subdirs.size();
767 info.hash_level = hash_level;
768 ret = set_info(path, info);
769 if (ret < 0)
770 return ret;
771 ret = fsync_dir(path);
772 if (ret < 0)
773 return ret;
774
775 // Do the same for subdirs
776 vector<string>::const_iterator iter;
777 for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
778 path.push_back(*iter);
779 ret = init_split_folder(path, hash_level + 1);
780 if (ret < 0)
781 return ret;
782 path.pop_back();
783 }
784 return 0;
785 }
786
787 int HashIndex::recursive_create_path(vector<string>& path, int level)
788 {
789 if (level == 0)
790 return 0;
791 for (int i = 0; i < 16; ++i) {
792 path.push_back(to_hex(i));
793 int ret = create_path(path);
794 if (ret < 0 && ret != -EEXIST)
795 return ret;
796 ret = recursive_create_path(path, level - 1);
797 if (ret < 0)
798 return ret;
799 path.pop_back();
800 }
801 return 0;
802 }
803
804 int HashIndex::recursive_remove(const vector<string> &path) {
805 return _recursive_remove(path, true);
806 }
807
808 int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
809 vector<string> subdirs;
810 dout(20) << __func__ << " path=" << path << dendl;
811 int r = list_subdirs(path, &subdirs);
812 if (r < 0)
813 return r;
814 map<string, ghobject_t> objects;
815 r = list_objects(path, 0, 0, &objects);
816 if (r < 0)
817 return r;
818 if (!objects.empty())
819 return -ENOTEMPTY;
820 vector<string> subdir(path);
821 for (vector<string>::iterator i = subdirs.begin();
822 i != subdirs.end();
823 ++i) {
824 subdir.push_back(*i);
825 r = _recursive_remove(subdir, false);
826 if (r < 0)
827 return r;
828 subdir.pop_back();
829 }
830 if (top)
831 return 0;
832 else
833 return remove_path(path);
834 }
835
836 int HashIndex::start_col_split(const vector<string> &path) {
837 bufferlist bl;
838 InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
839 op_tag.encode(bl);
840 int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
841 if (r < 0)
842 return r;
843 return fsync_dir(vector<string>());
844 }
845
846 int HashIndex::start_split(const vector<string> &path) {
847 bufferlist bl;
848 InProgressOp op_tag(InProgressOp::SPLIT, path);
849 op_tag.encode(bl);
850 int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
851 if (r < 0)
852 return r;
853 return fsync_dir(vector<string>());
854 }
855
856 int HashIndex::start_merge(const vector<string> &path) {
857 bufferlist bl;
858 InProgressOp op_tag(InProgressOp::MERGE, path);
859 op_tag.encode(bl);
860 int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
861 if (r < 0)
862 return r;
863 return fsync_dir(vector<string>());
864 }
865
866 int HashIndex::end_split_or_merge(const vector<string> &path) {
867 return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
868 }
869
870 int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
871 bufferlist buf;
872 int r = get_attr_path(path, SUBDIR_ATTR, buf);
873 if (r < 0)
874 return r;
875 auto bufiter = buf.cbegin();
876 info->decode(bufiter);
877 ceph_assert(path.size() == (unsigned)info->hash_level);
878 return 0;
879 }
880
881 int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
882 bufferlist buf;
883 ceph_assert(path.size() == (unsigned)info.hash_level);
884 info.encode(buf);
885 return add_attr_path(path, SUBDIR_ATTR, buf);
886 }
887
888 bool HashIndex::must_merge(const subdir_info_s &info) {
889 return (info.hash_level > 0 &&
890 merge_threshold > 0 &&
891 info.objs < (unsigned)merge_threshold &&
892 info.subdirs == 0);
893 }
894
895 bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
896 // target_level is used for ceph-objectstore-tool to split dirs offline.
897 // if it is set (defalult is 0) and current hash level < target_level,
898 // this dir would be split no matters how many objects it has.
899 return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
900 ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
901 (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
902 }
903
904 int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
905 return start_merge(path);
906 }
907
908 int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
909 vector<string> dst = path;
910 dst.pop_back();
911 subdir_info_s dstinfo;
912 int r, exists;
913 r = path_exists(path, &exists);
914 if (r < 0)
915 return r;
916 r = get_info(dst, &dstinfo);
917 if (r < 0)
918 return r;
919 if (exists) {
920 r = move_objects(path, dst);
921 if (r < 0)
922 return r;
923 r = reset_attr(dst);
924 if (r < 0)
925 return r;
926 r = remove_path(path);
927 if (r < 0)
928 return r;
929 }
930 if (must_merge(dstinfo)) {
931 r = initiate_merge(dst, dstinfo);
932 if (r < 0)
933 return r;
934 r = fsync_dir(dst);
935 if (r < 0)
936 return r;
937 return complete_merge(dst, dstinfo);
938 }
939 r = fsync_dir(dst);
940 if (r < 0)
941 return r;
942 return end_split_or_merge(path);
943 }
944
945 int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
946 return start_split(path);
947 }
948
949 int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
950 int level = info.hash_level;
951 map<string, ghobject_t> objects;
952 vector<string> dst = path;
953 int r;
954 dst.push_back("");
955 r = list_objects(path, 0, 0, &objects);
956 if (r < 0)
957 return r;
958 vector<string> subdirs_vec;
959 r = list_subdirs(path, &subdirs_vec);
960 if (r < 0)
961 return r;
962 set<string> subdirs;
963 subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
964 map<string, map<string, ghobject_t> > mapped;
965 map<string, ghobject_t> moved;
966 int num_moved = 0;
967 for (map<string, ghobject_t>::iterator i = objects.begin();
968 i != objects.end();
969 ++i) {
970 vector<string> new_path;
971 get_path_components(i->second, &new_path);
972 mapped[new_path[level]][i->first] = i->second;
973 }
974 for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
975 i != mapped.end();
976 ) {
977 dst[level] = i->first;
978 /* If the info already exists, it must be correct,
979 * we may be picking up a partially finished split */
980 subdir_info_s temp;
981 // subdir has already been fully copied
982 if (subdirs.count(i->first) && !get_info(dst, &temp)) {
983 for (map<string, ghobject_t>::iterator j = i->second.begin();
984 j != i->second.end();
985 ++j) {
986 moved[j->first] = j->second;
987 num_moved++;
988 objects.erase(j->first);
989 }
990 ++i;
991 continue;
992 }
993
994 subdir_info_s info_new;
995 info_new.objs = i->second.size();
996 info_new.subdirs = 0;
997 info_new.hash_level = level + 1;
998 if (must_merge(info_new) && !subdirs.count(i->first)) {
999 mapped.erase(i++);
1000 continue;
1001 }
1002
1003 // Subdir doesn't yet exist
1004 if (!subdirs.count(i->first)) {
1005 info.subdirs += 1;
1006 r = create_path(dst);
1007 if (r < 0)
1008 return r;
1009 } // else subdir has been created but only partially copied
1010
1011 for (map<string, ghobject_t>::iterator j = i->second.begin();
1012 j != i->second.end();
1013 ++j) {
1014 moved[j->first] = j->second;
1015 num_moved++;
1016 objects.erase(j->first);
1017 r = link_object(path, dst, j->second, j->first);
1018 // May be a partially finished split
1019 if (r < 0 && r != -EEXIST) {
1020 return r;
1021 }
1022 }
1023
1024 r = fsync_dir(dst);
1025 if (r < 0)
1026 return r;
1027
1028 // Presence of info must imply that all objects have been copied
1029 r = set_info(dst, info_new);
1030 if (r < 0)
1031 return r;
1032
1033 r = fsync_dir(dst);
1034 if (r < 0)
1035 return r;
1036
1037 ++i;
1038 }
1039 r = remove_objects(path, moved, &objects);
1040 if (r < 0)
1041 return r;
1042 info.objs = objects.size();
1043 r = reset_attr(path);
1044 if (r < 0)
1045 return r;
1046 r = fsync_dir(path);
1047 if (r < 0)
1048 return r;
1049 return end_split_or_merge(path);
1050 }
1051
1052 void HashIndex::get_path_components(const ghobject_t &oid,
1053 vector<string> *path) {
1054 char buf[MAX_HASH_LEVEL + 1];
1055 snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
1056
1057 // Path components are the hex characters of oid.hobj.hash, least
1058 // significant first
1059 for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1060 path->push_back(string(&buf[i], 1));
1061 }
1062 }
1063
1064 string HashIndex::get_hash_str(uint32_t hash) {
1065 char buf[MAX_HASH_LEVEL + 1];
1066 snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
1067 string retval;
1068 for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1069 retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
1070 }
1071 return retval;
1072 }
1073
1074 string HashIndex::get_path_str(const ghobject_t &oid) {
1075 ceph_assert(!oid.is_max());
1076 return get_hash_str(oid.hobj.get_hash());
1077 }
1078
1079 uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
1080 while (prefix.size() < sizeof(uint32_t) * 2) {
1081 prefix.push_back('0');
1082 }
1083 uint32_t hash;
1084 sscanf(prefix.c_str(), "%x", &hash);
1085 // nibble reverse
1086 hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
1087 hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
1088 hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
1089 return hash;
1090 }
1091
1092 int HashIndex::get_path_contents_by_hash_bitwise(
1093 const vector<string> &path,
1094 const ghobject_t *next_object,
1095 set<string, CmpHexdigitStringBitwise> *hash_prefixes,
1096 set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
1097 {
1098 map<string, ghobject_t> rev_objects;
1099 int r;
1100 r = list_objects(path, 0, 0, &rev_objects);
1101 if (r < 0)
1102 return r;
1103 // bitwise sort
1104 for (map<string, ghobject_t>::iterator i = rev_objects.begin();
1105 i != rev_objects.end();
1106 ++i) {
1107 if (next_object && i->second < *next_object)
1108 continue;
1109 string hash_prefix = get_path_str(i->second);
1110 hash_prefixes->insert(hash_prefix);
1111 objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
1112 }
1113 vector<string> subdirs;
1114 r = list_subdirs(path, &subdirs);
1115 if (r < 0)
1116 return r;
1117
1118 // sort subdirs bitwise (by reversing hex digit nibbles)
1119 std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
1120
1121 // Local to this function, we will convert the prefix strings
1122 // (previously simply the reversed hex digits) to also have each
1123 // digit's nibbles reversed. This will make the strings sort
1124 // bitwise.
1125 string cur_prefix;
1126 for (vector<string>::const_iterator i = path.begin();
1127 i != path.end();
1128 ++i) {
1129 cur_prefix.append(reverse_hexdigit_bits_string(*i));
1130 }
1131 string next_object_string;
1132 if (next_object)
1133 next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
1134 for (vector<string>::iterator i = subdirs.begin();
1135 i != subdirs.end();
1136 ++i) {
1137 string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
1138 if (next_object) {
1139 if (next_object->is_max())
1140 continue;
1141 if (candidate < next_object_string.substr(0, candidate.size()))
1142 continue;
1143 }
1144 // re-reverse the hex digit nibbles for the caller
1145 hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
1146 }
1147 return 0;
1148 }
1149
1150 int HashIndex::list_by_hash(const vector<string> &path,
1151 const ghobject_t &end,
1152 int max_count,
1153 ghobject_t *next,
1154 vector<ghobject_t> *out)
1155 {
1156 ceph_assert(out);
1157 return list_by_hash_bitwise(path, end, max_count, next, out);
1158 }
1159
1160 int HashIndex::list_by_hash_bitwise(
1161 const vector<string> &path,
1162 const ghobject_t& end,
1163 int max_count,
1164 ghobject_t *next,
1165 vector<ghobject_t> *out)
1166 {
1167 vector<string> next_path = path;
1168 next_path.push_back("");
1169 set<string, CmpHexdigitStringBitwise> hash_prefixes;
1170 set<pair<string, ghobject_t>, CmpPairBitwise> objects;
1171 int r = get_path_contents_by_hash_bitwise(path,
1172 next,
1173 &hash_prefixes,
1174 &objects);
1175 if (r < 0)
1176 return r;
1177 for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
1178 i != hash_prefixes.end();
1179 ++i) {
1180 dout(20) << __func__ << " prefix " << *i << dendl;
1181 set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
1182 make_pair(*i, ghobject_t()));
1183 if (j == objects.end() || j->first != *i) {
1184 *(next_path.rbegin()) = *(i->rbegin());
1185 ghobject_t next_recurse;
1186 if (next)
1187 next_recurse = *next;
1188 r = list_by_hash_bitwise(next_path,
1189 end,
1190 max_count,
1191 &next_recurse,
1192 out);
1193
1194 if (r < 0)
1195 return r;
1196 if (!next_recurse.is_max()) {
1197 if (next)
1198 *next = next_recurse;
1199 return 0;
1200 }
1201 } else {
1202 while (j != objects.end() && j->first == *i) {
1203 if (max_count > 0 && out->size() == (unsigned)max_count) {
1204 if (next)
1205 *next = j->second;
1206 return 0;
1207 }
1208 if (j->second >= end) {
1209 if (next)
1210 *next = j->second;
1211 return 0;
1212 }
1213 if (!next || j->second >= *next) {
1214 dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
1215 out->push_back(j->second);
1216 }
1217 ++j;
1218 }
1219 }
1220 }
1221 if (next)
1222 *next = ghobject_t::get_max();
1223 return 0;
1224 }
1225
1226