]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
c07f9fc5 | 15 | #include "include/compat.h" |
7c673cae FG |
16 | #include "include/types.h" |
17 | #include "include/buffer.h" | |
18 | #include "osd/osd_types.h" | |
19 | #include <errno.h> | |
20 | ||
21 | #include "HashIndex.h" | |
22 | ||
23 | #include "common/errno.h" | |
24 | #include "common/debug.h" | |
25 | #define dout_context cct | |
26 | #define dout_subsys ceph_subsys_filestore | |
27 | ||
28 | const string HashIndex::SUBDIR_ATTR = "contents"; | |
224ce89b | 29 | const string HashIndex::SETTINGS_ATTR = "settings"; |
7c673cae FG |
30 | const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op"; |
31 | ||
32 | /// hex digit to integer value | |
33 | int hex_to_int(char c) | |
34 | { | |
35 | if (c >= '0' && c <= '9') | |
36 | return c - '0'; | |
37 | if (c >= 'A' && c <= 'F') | |
38 | return c - 'A' + 10; | |
39 | ceph_abort(); | |
40 | } | |
41 | ||
42 | /// int value to hex digit | |
43 | char int_to_hex(int v) | |
44 | { | |
45 | assert(v < 16); | |
46 | if (v < 10) | |
47 | return '0' + v; | |
48 | return 'A' + v - 10; | |
49 | } | |
50 | ||
51 | /// reverse bits in a nibble (0..15) | |
52 | int reverse_nibble_bits(int in) | |
53 | { | |
54 | assert(in < 16); | |
55 | return | |
56 | ((in & 8) >> 3) | | |
57 | ((in & 4) >> 1) | | |
58 | ((in & 2) << 1) | | |
59 | ((in & 1) << 3); | |
60 | } | |
61 | ||
62 | /// reverse nibble bits in a hex digit | |
63 | char reverse_hexdigit_bits(char c) | |
64 | { | |
65 | return int_to_hex(reverse_nibble_bits(hex_to_int(c))); | |
66 | } | |
67 | ||
68 | /// reverse nibble bits in a hex string | |
69 | string reverse_hexdigit_bits_string(string s) | |
70 | { | |
71 | for (unsigned i=0; i<s.size(); ++i) | |
72 | s[i] = reverse_hexdigit_bits(s[i]); | |
73 | return s; | |
74 | } | |
75 | ||
76 | /// compare hex digit (as length 1 string) bitwise | |
77 | bool cmp_hexdigit_bitwise(const string& l, const string& r) | |
78 | { | |
79 | assert(l.length() == 1 && r.length() == 1); | |
80 | int lv = hex_to_int(l[0]); | |
81 | int rv = hex_to_int(r[0]); | |
82 | assert(lv < 16); | |
83 | assert(rv < 16); | |
84 | return reverse_nibble_bits(lv) < reverse_nibble_bits(rv); | |
85 | } | |
86 | ||
87 | /// compare hex digit string bitwise | |
88 | bool cmp_hexdigit_string_bitwise(const string& l, const string& r) | |
89 | { | |
90 | string ll = reverse_hexdigit_bits_string(l); | |
91 | string rr = reverse_hexdigit_bits_string(r); | |
92 | return ll < rr; | |
93 | } | |
94 | ||
95 | int HashIndex::cleanup() { | |
96 | bufferlist bl; | |
97 | int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
98 | if (r < 0) { | |
99 | // No in progress operations! | |
100 | return 0; | |
101 | } | |
102 | bufferlist::iterator i = bl.begin(); | |
103 | InProgressOp in_progress(i); | |
104 | subdir_info_s info; | |
105 | r = get_info(in_progress.path, &info); | |
106 | if (r == -ENOENT) { | |
107 | return end_split_or_merge(in_progress.path); | |
108 | } else if (r < 0) { | |
109 | return r; | |
110 | } | |
111 | ||
112 | if (in_progress.is_split()) | |
113 | return complete_split(in_progress.path, info); | |
114 | else if (in_progress.is_merge()) | |
115 | return complete_merge(in_progress.path, info); | |
116 | else if (in_progress.is_col_split()) { | |
117 | for (vector<string>::iterator i = in_progress.path.begin(); | |
118 | i != in_progress.path.end(); | |
119 | ++i) { | |
120 | vector<string> path(in_progress.path.begin(), i); | |
121 | int r = reset_attr(path); | |
122 | if (r < 0) | |
123 | return r; | |
124 | } | |
125 | return 0; | |
126 | } | |
127 | else | |
128 | return -EINVAL; | |
129 | } | |
130 | ||
131 | int HashIndex::reset_attr( | |
132 | const vector<string> &path) | |
133 | { | |
134 | int exists = 0; | |
135 | int r = path_exists(path, &exists); | |
136 | if (r < 0) | |
137 | return r; | |
138 | if (!exists) | |
139 | return 0; | |
140 | map<string, ghobject_t> objects; | |
141 | vector<string> subdirs; | |
142 | r = list_objects(path, 0, 0, &objects); | |
143 | if (r < 0) | |
144 | return r; | |
145 | r = list_subdirs(path, &subdirs); | |
146 | if (r < 0) | |
147 | return r; | |
148 | ||
149 | subdir_info_s info; | |
150 | info.hash_level = path.size(); | |
151 | info.objs = objects.size(); | |
152 | info.subdirs = subdirs.size(); | |
153 | return set_info(path, info); | |
154 | } | |
155 | ||
156 | int HashIndex::col_split_level( | |
157 | HashIndex &from, | |
158 | HashIndex &to, | |
159 | const vector<string> &path, | |
160 | uint32_t inbits, | |
161 | uint32_t match, | |
162 | unsigned *mkdirred) | |
163 | { | |
164 | /* For each subdir, move, recurse, or ignore based on comparing the low order | |
165 | * bits of the hash represented by the subdir path with inbits, match passed | |
166 | * in. | |
167 | */ | |
168 | vector<string> subdirs; | |
169 | int r = from.list_subdirs(path, &subdirs); | |
170 | if (r < 0) | |
171 | return r; | |
172 | map<string, ghobject_t> objects; | |
173 | r = from.list_objects(path, 0, 0, &objects); | |
174 | if (r < 0) | |
175 | return r; | |
176 | ||
177 | set<string> to_move; | |
178 | for (vector<string>::iterator i = subdirs.begin(); | |
179 | i != subdirs.end(); | |
180 | ++i) { | |
181 | uint32_t bits = 0; | |
182 | uint32_t hash = 0; | |
183 | vector<string> sub_path(path.begin(), path.end()); | |
184 | sub_path.push_back(*i); | |
185 | path_to_hobject_hash_prefix(sub_path, &bits, &hash); | |
186 | if (bits < inbits) { | |
187 | if (hobject_t::match_hash(hash, bits, match)) { | |
188 | r = col_split_level( | |
189 | from, | |
190 | to, | |
191 | sub_path, | |
192 | inbits, | |
193 | match, | |
194 | mkdirred); | |
195 | if (r < 0) | |
196 | return r; | |
197 | if (*mkdirred > path.size()) | |
198 | *mkdirred = path.size(); | |
199 | } // else, skip, doesn't need to be moved or recursed into | |
200 | } else { | |
201 | if (hobject_t::match_hash(hash, inbits, match)) { | |
202 | to_move.insert(*i); | |
203 | } | |
204 | } // else, skip, doesn't need to be moved or recursed into | |
205 | } | |
206 | ||
207 | /* Then, do the same for each object */ | |
208 | map<string, ghobject_t> objs_to_move; | |
209 | for (map<string, ghobject_t>::iterator i = objects.begin(); | |
210 | i != objects.end(); | |
211 | ++i) { | |
212 | if (i->second.match(inbits, match)) { | |
213 | objs_to_move.insert(*i); | |
214 | } | |
215 | } | |
216 | ||
217 | if (objs_to_move.empty() && to_move.empty()) | |
218 | return 0; | |
219 | ||
220 | // Make parent directories as needed | |
221 | while (*mkdirred < path.size()) { | |
222 | ++*mkdirred; | |
223 | int exists = 0; | |
224 | vector<string> creating_path(path.begin(), path.begin()+*mkdirred); | |
225 | r = to.path_exists(creating_path, &exists); | |
226 | if (r < 0) | |
227 | return r; | |
228 | if (exists) | |
229 | continue; | |
230 | subdir_info_s info; | |
231 | info.objs = 0; | |
232 | info.subdirs = 0; | |
233 | info.hash_level = creating_path.size(); | |
234 | if (*mkdirred < path.size() - 1) | |
235 | info.subdirs = 1; | |
236 | r = to.start_col_split(creating_path); | |
237 | if (r < 0) | |
238 | return r; | |
239 | r = to.create_path(creating_path); | |
240 | if (r < 0) | |
241 | return r; | |
242 | r = to.set_info(creating_path, info); | |
243 | if (r < 0) | |
244 | return r; | |
245 | r = to.end_split_or_merge(creating_path); | |
246 | if (r < 0) | |
247 | return r; | |
248 | } | |
249 | ||
250 | subdir_info_s from_info; | |
251 | subdir_info_s to_info; | |
252 | r = from.get_info(path, &from_info); | |
253 | if (r < 0) | |
254 | return r; | |
255 | r = to.get_info(path, &to_info); | |
256 | if (r < 0) | |
257 | return r; | |
258 | ||
259 | from.start_col_split(path); | |
260 | to.start_col_split(path); | |
261 | ||
262 | // Do subdir moves | |
263 | for (set<string>::iterator i = to_move.begin(); | |
264 | i != to_move.end(); | |
265 | ++i) { | |
266 | from_info.subdirs--; | |
267 | to_info.subdirs++; | |
268 | r = move_subdir(from, to, path, *i); | |
269 | if (r < 0) | |
270 | return r; | |
271 | } | |
272 | ||
273 | for (map<string, ghobject_t>::iterator i = objs_to_move.begin(); | |
274 | i != objs_to_move.end(); | |
275 | ++i) { | |
276 | from_info.objs--; | |
277 | to_info.objs++; | |
278 | r = move_object(from, to, path, *i); | |
279 | if (r < 0) | |
280 | return r; | |
281 | } | |
282 | ||
283 | ||
284 | r = to.set_info(path, to_info); | |
285 | if (r < 0) | |
286 | return r; | |
287 | r = from.set_info(path, from_info); | |
288 | if (r < 0) | |
289 | return r; | |
290 | from.end_split_or_merge(path); | |
291 | to.end_split_or_merge(path); | |
292 | return 0; | |
293 | } | |
294 | ||
295 | int HashIndex::_split( | |
296 | uint32_t match, | |
297 | uint32_t bits, | |
298 | CollectionIndex* dest) { | |
299 | assert(collection_version() == dest->collection_version()); | |
300 | unsigned mkdirred = 0; | |
301 | return col_split_level( | |
302 | *this, | |
303 | *static_cast<HashIndex*>(dest), | |
304 | vector<string>(), | |
305 | bits, | |
306 | match, | |
307 | &mkdirred); | |
308 | } | |
309 | ||
1adf2230 AA |
310 | int HashIndex::split_dirs(const vector<string> &path, int target_level) { |
311 | dout(20) << __func__ << " " << path << " target level: " | |
312 | << target_level << dendl; | |
7c673cae FG |
313 | subdir_info_s info; |
314 | int r = get_info(path, &info); | |
315 | if (r < 0) { | |
316 | dout(10) << "error looking up info for " << path << ": " | |
317 | << cpp_strerror(r) << dendl; | |
318 | return r; | |
319 | } | |
320 | ||
1adf2230 | 321 | if (must_split(info, target_level)) { |
7c673cae | 322 | dout(1) << __func__ << " " << path << " has " << info.objs |
1adf2230 AA |
323 | << " objects, " << info.hash_level |
324 | << " level, starting split in pg " << coll() << "." << dendl; | |
7c673cae FG |
325 | r = initiate_split(path, info); |
326 | if (r < 0) { | |
327 | dout(10) << "error initiating split on " << path << ": " | |
328 | << cpp_strerror(r) << dendl; | |
329 | return r; | |
330 | } | |
331 | ||
332 | r = complete_split(path, info); | |
1adf2230 | 333 | dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "." |
7c673cae FG |
334 | << dendl; |
335 | if (r < 0) { | |
336 | dout(10) << "error completing split on " << path << ": " | |
337 | << cpp_strerror(r) << dendl; | |
338 | return r; | |
339 | } | |
340 | } | |
341 | ||
342 | vector<string> subdirs; | |
343 | r = list_subdirs(path, &subdirs); | |
344 | if (r < 0) { | |
345 | dout(10) << "error listing subdirs of " << path << ": " | |
346 | << cpp_strerror(r) << dendl; | |
347 | return r; | |
348 | } | |
349 | for (vector<string>::const_iterator it = subdirs.begin(); | |
350 | it != subdirs.end(); ++it) { | |
351 | vector<string> subdir_path(path); | |
352 | subdir_path.push_back(*it); | |
1adf2230 | 353 | r = split_dirs(subdir_path, target_level); |
7c673cae FG |
354 | if (r < 0) { |
355 | return r; | |
356 | } | |
357 | } | |
358 | ||
359 | return r; | |
360 | } | |
361 | ||
1adf2230 | 362 | int HashIndex::apply_layout_settings(int target_level) { |
7c673cae FG |
363 | vector<string> path; |
364 | dout(10) << __func__ << " split multiple = " << split_multiplier | |
224ce89b WB |
365 | << " merge threshold = " << merge_threshold |
366 | << " split rand factor = " << cct->_conf->filestore_split_rand_factor | |
1adf2230 | 367 | << " target level = " << target_level |
224ce89b WB |
368 | << dendl; |
369 | int r = write_settings(); | |
370 | if (r < 0) | |
371 | return r; | |
1adf2230 | 372 | return split_dirs(path, target_level); |
7c673cae FG |
373 | } |
374 | ||
375 | int HashIndex::_init() { | |
376 | subdir_info_s info; | |
377 | vector<string> path; | |
224ce89b WB |
378 | int r = set_info(path, info); |
379 | if (r < 0) | |
380 | return r; | |
381 | return write_settings(); | |
382 | } | |
383 | ||
384 | int HashIndex::write_settings() { | |
385 | if (cct->_conf->filestore_split_rand_factor > 0) { | |
386 | settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor; | |
387 | } else { | |
388 | settings.split_rand_factor = 0; | |
389 | } | |
390 | vector<string> path; | |
391 | bufferlist bl; | |
392 | settings.encode(bl); | |
393 | return add_attr_path(path, SETTINGS_ATTR, bl); | |
394 | } | |
395 | ||
396 | int HashIndex::read_settings() { | |
397 | vector<string> path; | |
398 | bufferlist bl; | |
399 | int r = get_attr_path(path, SETTINGS_ATTR, bl); | |
400 | if (r == -ENODATA) | |
401 | return 0; | |
402 | if (r < 0) { | |
403 | derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl; | |
404 | return r; | |
405 | } | |
406 | bufferlist::iterator it = bl.begin(); | |
407 | settings.decode(it); | |
408 | dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl; | |
409 | return 0; | |
7c673cae FG |
410 | } |
411 | ||
412 | /* LFNIndex virtual method implementations */ | |
413 | int HashIndex::_created(const vector<string> &path, | |
414 | const ghobject_t &oid, | |
415 | const string &mangled_name) { | |
416 | subdir_info_s info; | |
417 | int r; | |
418 | r = get_info(path, &info); | |
419 | if (r < 0) | |
420 | return r; | |
421 | info.objs++; | |
422 | r = set_info(path, info); | |
423 | if (r < 0) | |
424 | return r; | |
425 | ||
426 | if (must_split(info)) { | |
427 | dout(1) << __func__ << " " << path << " has " << info.objs | |
1adf2230 | 428 | << " objects, starting split in pg " << coll() << "." << dendl; |
7c673cae FG |
429 | int r = initiate_split(path, info); |
430 | if (r < 0) | |
431 | return r; | |
432 | r = complete_split(path, info); | |
1adf2230 | 433 | dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "." |
7c673cae FG |
434 | << dendl; |
435 | return r; | |
436 | } else { | |
437 | return 0; | |
438 | } | |
439 | } | |
440 | ||
441 | int HashIndex::_remove(const vector<string> &path, | |
442 | const ghobject_t &oid, | |
443 | const string &mangled_name) { | |
444 | int r; | |
445 | r = remove_object(path, oid); | |
446 | if (r < 0) | |
447 | return r; | |
448 | subdir_info_s info; | |
449 | r = get_info(path, &info); | |
450 | if (r < 0) | |
451 | return r; | |
452 | info.objs--; | |
453 | r = set_info(path, info); | |
454 | if (r < 0) | |
455 | return r; | |
456 | if (must_merge(info)) { | |
457 | r = initiate_merge(path, info); | |
458 | if (r < 0) | |
459 | return r; | |
460 | return complete_merge(path, info); | |
461 | } else { | |
462 | return 0; | |
463 | } | |
464 | } | |
465 | ||
466 | int HashIndex::_lookup(const ghobject_t &oid, | |
467 | vector<string> *path, | |
468 | string *mangled_name, | |
469 | int *hardlink) { | |
470 | vector<string> path_comp; | |
471 | get_path_components(oid, &path_comp); | |
472 | vector<string>::iterator next = path_comp.begin(); | |
473 | int exists; | |
474 | while (1) { | |
475 | int r = path_exists(*path, &exists); | |
476 | if (r < 0) | |
477 | return r; | |
478 | if (!exists) { | |
479 | if (path->empty()) | |
480 | return -ENOENT; | |
481 | path->pop_back(); | |
482 | break; | |
483 | } | |
484 | if (next == path_comp.end()) | |
485 | break; | |
486 | path->push_back(*(next++)); | |
487 | } | |
488 | return get_mangled_name(*path, oid, mangled_name, hardlink); | |
489 | } | |
490 | ||
491 | int HashIndex::_collection_list_partial(const ghobject_t &start, | |
492 | const ghobject_t &end, | |
493 | int max_count, | |
494 | vector<ghobject_t> *ls, | |
495 | ghobject_t *next) { | |
496 | vector<string> path; | |
497 | ghobject_t _next; | |
498 | if (!next) | |
499 | next = &_next; | |
500 | *next = start; | |
501 | dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl; | |
502 | return list_by_hash(path, end, max_count, next, ls); | |
503 | } | |
504 | ||
505 | int HashIndex::prep_delete() { | |
506 | return recursive_remove(vector<string>()); | |
507 | } | |
508 | ||
509 | int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) { | |
510 | int ret; | |
511 | vector<string> path; | |
512 | subdir_info_s root_info; | |
513 | // Make sure there is neither objects nor sub-folders | |
514 | // in this collection | |
515 | ret = get_info(path, &root_info); | |
516 | if (ret < 0) | |
517 | return ret; | |
518 | ||
519 | // Do the folder splitting first | |
520 | ret = pre_split_folder(pg_num, expected_num_objs); | |
521 | if (ret < 0) | |
522 | return ret; | |
523 | // Initialize the folder info starting from root | |
524 | return init_split_folder(path, 0); | |
525 | } | |
526 | ||
527 | int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs) | |
528 | { | |
529 | // If folder merging is enabled (by setting the threshold positive), | |
530 | // no need to split | |
531 | if (merge_threshold > 0) | |
532 | return 0; | |
533 | const coll_t c = coll(); | |
534 | // Do not split if the expected number of objects in this collection is zero (by default) | |
535 | if (expected_num_objs == 0) | |
536 | return 0; | |
537 | ||
538 | // Calculate the number of leaf folders (which actually store files) | |
539 | // need to be created | |
224ce89b | 540 | const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16; |
7c673cae FG |
541 | uint64_t leavies = expected_num_objs / objs_per_folder ; |
542 | // No need to split | |
543 | if (leavies == 0 || expected_num_objs == objs_per_folder) | |
544 | return 0; | |
545 | ||
546 | spg_t spgid; | |
547 | if (!c.is_pg_prefix(&spgid)) | |
548 | return -EINVAL; | |
549 | const ps_t ps = spgid.pgid.ps(); | |
550 | ||
551 | // the most significant bits of pg_num | |
552 | const int pg_num_bits = calc_num_bits(pg_num - 1); | |
553 | ps_t tmp_id = ps; | |
554 | // calculate the number of levels we only create one sub folder | |
555 | int num = pg_num_bits / 4; | |
556 | // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111, | |
557 | // so that splitting starts at level 3 | |
558 | if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) { | |
559 | --num; | |
560 | } | |
561 | ||
562 | int ret; | |
563 | // Start with creation that only has one subfolder | |
564 | vector<string> paths; | |
565 | int dump_num = num; | |
566 | while (num-- > 0) { | |
567 | ps_t v = tmp_id & 0x0000000f; | |
568 | paths.push_back(to_hex(v)); | |
569 | ret = create_path(paths); | |
570 | if (ret < 0 && ret != -EEXIST) | |
571 | return ret; | |
572 | tmp_id = tmp_id >> 4; | |
573 | } | |
574 | ||
575 | // Starting from here, we can split by creating multiple subfolders | |
576 | const int left_bits = pg_num_bits - dump_num * 4; | |
577 | // this variable denotes how many bits (for this level) that can be | |
578 | // used for sub folder splitting | |
579 | int split_bits = 4 - left_bits; | |
580 | // the below logic is inspired by rados.h#ceph_stable_mod, | |
581 | // it basically determines how many sub-folders should we | |
582 | // create for splitting | |
583 | assert(pg_num_bits > 0); // otherwise BAD_SHIFT | |
584 | if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) { | |
585 | ++split_bits; | |
586 | } | |
587 | const uint32_t subs = (1 << split_bits); | |
588 | // Calculate how many levels we create starting from here | |
589 | int level = 0; | |
590 | leavies /= subs; | |
591 | while (leavies > 1) { | |
592 | ++level; | |
593 | leavies = leavies >> 4; | |
594 | } | |
595 | for (uint32_t i = 0; i < subs; ++i) { | |
596 | assert(split_bits <= 4); // otherwise BAD_SHIFT | |
597 | int v = tmp_id | (i << ((4 - split_bits) % 4)); | |
598 | paths.push_back(to_hex(v)); | |
599 | ret = create_path(paths); | |
600 | if (ret < 0 && ret != -EEXIST) | |
601 | return ret; | |
602 | ret = recursive_create_path(paths, level); | |
603 | if (ret < 0) | |
604 | return ret; | |
605 | paths.pop_back(); | |
606 | } | |
607 | return 0; | |
608 | } | |
609 | ||
610 | int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level) | |
611 | { | |
612 | // Get the number of sub directories for the current path | |
613 | vector<string> subdirs; | |
614 | int ret = list_subdirs(path, &subdirs); | |
615 | if (ret < 0) | |
616 | return ret; | |
617 | subdir_info_s info; | |
618 | info.subdirs = subdirs.size(); | |
619 | info.hash_level = hash_level; | |
620 | ret = set_info(path, info); | |
621 | if (ret < 0) | |
622 | return ret; | |
623 | ret = fsync_dir(path); | |
624 | if (ret < 0) | |
625 | return ret; | |
626 | ||
627 | // Do the same for subdirs | |
628 | vector<string>::const_iterator iter; | |
629 | for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) { | |
630 | path.push_back(*iter); | |
631 | ret = init_split_folder(path, hash_level + 1); | |
632 | if (ret < 0) | |
633 | return ret; | |
634 | path.pop_back(); | |
635 | } | |
636 | return 0; | |
637 | } | |
638 | ||
639 | int HashIndex::recursive_create_path(vector<string>& path, int level) | |
640 | { | |
641 | if (level == 0) | |
642 | return 0; | |
643 | for (int i = 0; i < 16; ++i) { | |
644 | path.push_back(to_hex(i)); | |
645 | int ret = create_path(path); | |
646 | if (ret < 0 && ret != -EEXIST) | |
647 | return ret; | |
648 | ret = recursive_create_path(path, level - 1); | |
649 | if (ret < 0) | |
650 | return ret; | |
651 | path.pop_back(); | |
652 | } | |
653 | return 0; | |
654 | } | |
655 | ||
656 | int HashIndex::recursive_remove(const vector<string> &path) { | |
657 | return _recursive_remove(path, true); | |
658 | } | |
659 | ||
660 | int HashIndex::_recursive_remove(const vector<string> &path, bool top) { | |
661 | vector<string> subdirs; | |
662 | dout(20) << __func__ << " path=" << path << dendl; | |
663 | int r = list_subdirs(path, &subdirs); | |
664 | if (r < 0) | |
665 | return r; | |
666 | map<string, ghobject_t> objects; | |
667 | r = list_objects(path, 0, 0, &objects); | |
668 | if (r < 0) | |
669 | return r; | |
670 | if (!objects.empty()) | |
671 | return -ENOTEMPTY; | |
672 | vector<string> subdir(path); | |
673 | for (vector<string>::iterator i = subdirs.begin(); | |
674 | i != subdirs.end(); | |
675 | ++i) { | |
676 | subdir.push_back(*i); | |
677 | r = _recursive_remove(subdir, false); | |
678 | if (r < 0) | |
679 | return r; | |
680 | subdir.pop_back(); | |
681 | } | |
682 | if (top) | |
683 | return 0; | |
684 | else | |
685 | return remove_path(path); | |
686 | } | |
687 | ||
688 | int HashIndex::start_col_split(const vector<string> &path) { | |
689 | bufferlist bl; | |
690 | InProgressOp op_tag(InProgressOp::COL_SPLIT, path); | |
691 | op_tag.encode(bl); | |
692 | int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
693 | if (r < 0) | |
694 | return r; | |
695 | return fsync_dir(vector<string>()); | |
696 | } | |
697 | ||
698 | int HashIndex::start_split(const vector<string> &path) { | |
699 | bufferlist bl; | |
700 | InProgressOp op_tag(InProgressOp::SPLIT, path); | |
701 | op_tag.encode(bl); | |
702 | int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
703 | if (r < 0) | |
704 | return r; | |
705 | return fsync_dir(vector<string>()); | |
706 | } | |
707 | ||
708 | int HashIndex::start_merge(const vector<string> &path) { | |
709 | bufferlist bl; | |
710 | InProgressOp op_tag(InProgressOp::MERGE, path); | |
711 | op_tag.encode(bl); | |
712 | int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
713 | if (r < 0) | |
714 | return r; | |
715 | return fsync_dir(vector<string>()); | |
716 | } | |
717 | ||
718 | int HashIndex::end_split_or_merge(const vector<string> &path) { | |
719 | return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG); | |
720 | } | |
721 | ||
722 | int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) { | |
723 | bufferlist buf; | |
724 | int r = get_attr_path(path, SUBDIR_ATTR, buf); | |
725 | if (r < 0) | |
726 | return r; | |
727 | bufferlist::iterator bufiter = buf.begin(); | |
728 | info->decode(bufiter); | |
729 | assert(path.size() == (unsigned)info->hash_level); | |
730 | return 0; | |
731 | } | |
732 | ||
733 | int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) { | |
734 | bufferlist buf; | |
735 | assert(path.size() == (unsigned)info.hash_level); | |
736 | info.encode(buf); | |
737 | return add_attr_path(path, SUBDIR_ATTR, buf); | |
738 | } | |
739 | ||
740 | bool HashIndex::must_merge(const subdir_info_s &info) { | |
741 | return (info.hash_level > 0 && | |
742 | merge_threshold > 0 && | |
743 | info.objs < (unsigned)merge_threshold && | |
744 | info.subdirs == 0); | |
745 | } | |
746 | ||
1adf2230 AA |
747 | bool HashIndex::must_split(const subdir_info_s &info, int target_level) { |
748 | // target_level is used for ceph-objectstore-tool to split dirs offline. | |
749 | // if it is set (defalult is 0) and current hash level < target_level, | |
750 | // this dir would be split no matters how many objects it has. | |
7c673cae | 751 | return (info.hash_level < (unsigned)MAX_HASH_LEVEL && |
1adf2230 AA |
752 | ((target_level > 0 && info.hash_level < (unsigned)target_level) || |
753 | (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16)))); | |
7c673cae FG |
754 | } |
755 | ||
756 | int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) { | |
757 | return start_merge(path); | |
758 | } | |
759 | ||
760 | int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) { | |
761 | vector<string> dst = path; | |
762 | dst.pop_back(); | |
763 | subdir_info_s dstinfo; | |
764 | int r, exists; | |
765 | r = path_exists(path, &exists); | |
766 | if (r < 0) | |
767 | return r; | |
768 | r = get_info(dst, &dstinfo); | |
769 | if (r < 0) | |
770 | return r; | |
771 | if (exists) { | |
772 | r = move_objects(path, dst); | |
773 | if (r < 0) | |
774 | return r; | |
775 | r = reset_attr(dst); | |
776 | if (r < 0) | |
777 | return r; | |
778 | r = remove_path(path); | |
779 | if (r < 0) | |
780 | return r; | |
781 | } | |
782 | if (must_merge(dstinfo)) { | |
783 | r = initiate_merge(dst, dstinfo); | |
784 | if (r < 0) | |
785 | return r; | |
786 | r = fsync_dir(dst); | |
787 | if (r < 0) | |
788 | return r; | |
789 | return complete_merge(dst, dstinfo); | |
790 | } | |
791 | r = fsync_dir(dst); | |
792 | if (r < 0) | |
793 | return r; | |
794 | return end_split_or_merge(path); | |
795 | } | |
796 | ||
797 | int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) { | |
798 | return start_split(path); | |
799 | } | |
800 | ||
801 | int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) { | |
802 | int level = info.hash_level; | |
803 | map<string, ghobject_t> objects; | |
804 | vector<string> dst = path; | |
805 | int r; | |
806 | dst.push_back(""); | |
807 | r = list_objects(path, 0, 0, &objects); | |
808 | if (r < 0) | |
809 | return r; | |
810 | vector<string> subdirs_vec; | |
811 | r = list_subdirs(path, &subdirs_vec); | |
812 | if (r < 0) | |
813 | return r; | |
814 | set<string> subdirs; | |
815 | subdirs.insert(subdirs_vec.begin(), subdirs_vec.end()); | |
816 | map<string, map<string, ghobject_t> > mapped; | |
817 | map<string, ghobject_t> moved; | |
818 | int num_moved = 0; | |
819 | for (map<string, ghobject_t>::iterator i = objects.begin(); | |
820 | i != objects.end(); | |
821 | ++i) { | |
822 | vector<string> new_path; | |
823 | get_path_components(i->second, &new_path); | |
824 | mapped[new_path[level]][i->first] = i->second; | |
825 | } | |
826 | for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin(); | |
827 | i != mapped.end(); | |
828 | ) { | |
829 | dst[level] = i->first; | |
830 | /* If the info already exists, it must be correct, | |
831 | * we may be picking up a partially finished split */ | |
832 | subdir_info_s temp; | |
833 | // subdir has already been fully copied | |
834 | if (subdirs.count(i->first) && !get_info(dst, &temp)) { | |
835 | for (map<string, ghobject_t>::iterator j = i->second.begin(); | |
836 | j != i->second.end(); | |
837 | ++j) { | |
838 | moved[j->first] = j->second; | |
839 | num_moved++; | |
840 | objects.erase(j->first); | |
841 | } | |
842 | ++i; | |
843 | continue; | |
844 | } | |
845 | ||
846 | subdir_info_s info_new; | |
847 | info_new.objs = i->second.size(); | |
848 | info_new.subdirs = 0; | |
849 | info_new.hash_level = level + 1; | |
850 | if (must_merge(info_new) && !subdirs.count(i->first)) { | |
851 | mapped.erase(i++); | |
852 | continue; | |
853 | } | |
854 | ||
855 | // Subdir doesn't yet exist | |
856 | if (!subdirs.count(i->first)) { | |
857 | info.subdirs += 1; | |
858 | r = create_path(dst); | |
859 | if (r < 0) | |
860 | return r; | |
861 | } // else subdir has been created but only partially copied | |
862 | ||
863 | for (map<string, ghobject_t>::iterator j = i->second.begin(); | |
864 | j != i->second.end(); | |
865 | ++j) { | |
866 | moved[j->first] = j->second; | |
867 | num_moved++; | |
868 | objects.erase(j->first); | |
869 | r = link_object(path, dst, j->second, j->first); | |
870 | // May be a partially finished split | |
871 | if (r < 0 && r != -EEXIST) { | |
872 | return r; | |
873 | } | |
874 | } | |
875 | ||
876 | r = fsync_dir(dst); | |
877 | if (r < 0) | |
878 | return r; | |
879 | ||
880 | // Presence of info must imply that all objects have been copied | |
881 | r = set_info(dst, info_new); | |
882 | if (r < 0) | |
883 | return r; | |
884 | ||
885 | r = fsync_dir(dst); | |
886 | if (r < 0) | |
887 | return r; | |
888 | ||
889 | ++i; | |
890 | } | |
891 | r = remove_objects(path, moved, &objects); | |
892 | if (r < 0) | |
893 | return r; | |
894 | info.objs = objects.size(); | |
895 | r = reset_attr(path); | |
896 | if (r < 0) | |
897 | return r; | |
898 | r = fsync_dir(path); | |
899 | if (r < 0) | |
900 | return r; | |
901 | return end_split_or_merge(path); | |
902 | } | |
903 | ||
904 | void HashIndex::get_path_components(const ghobject_t &oid, | |
905 | vector<string> *path) { | |
906 | char buf[MAX_HASH_LEVEL + 1]; | |
907 | snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key()); | |
908 | ||
909 | // Path components are the hex characters of oid.hobj.hash, least | |
910 | // significant first | |
911 | for (int i = 0; i < MAX_HASH_LEVEL; ++i) { | |
912 | path->push_back(string(&buf[i], 1)); | |
913 | } | |
914 | } | |
915 | ||
916 | string HashIndex::get_hash_str(uint32_t hash) { | |
917 | char buf[MAX_HASH_LEVEL + 1]; | |
918 | snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash); | |
919 | string retval; | |
920 | for (int i = 0; i < MAX_HASH_LEVEL; ++i) { | |
921 | retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]); | |
922 | } | |
923 | return retval; | |
924 | } | |
925 | ||
926 | string HashIndex::get_path_str(const ghobject_t &oid) { | |
927 | assert(!oid.is_max()); | |
928 | return get_hash_str(oid.hobj.get_hash()); | |
929 | } | |
930 | ||
931 | uint32_t HashIndex::hash_prefix_to_hash(string prefix) { | |
932 | while (prefix.size() < sizeof(uint32_t) * 2) { | |
933 | prefix.push_back('0'); | |
934 | } | |
935 | uint32_t hash; | |
936 | sscanf(prefix.c_str(), "%x", &hash); | |
937 | // nibble reverse | |
938 | hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4); | |
939 | hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8); | |
940 | hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16); | |
941 | return hash; | |
942 | } | |
943 | ||
944 | int HashIndex::get_path_contents_by_hash_bitwise( | |
945 | const vector<string> &path, | |
946 | const ghobject_t *next_object, | |
947 | set<string, CmpHexdigitStringBitwise> *hash_prefixes, | |
948 | set<pair<string, ghobject_t>, CmpPairBitwise> *objects) | |
949 | { | |
950 | map<string, ghobject_t> rev_objects; | |
951 | int r; | |
952 | r = list_objects(path, 0, 0, &rev_objects); | |
953 | if (r < 0) | |
954 | return r; | |
955 | // bitwise sort | |
956 | for (map<string, ghobject_t>::iterator i = rev_objects.begin(); | |
957 | i != rev_objects.end(); | |
958 | ++i) { | |
959 | if (next_object && i->second < *next_object) | |
960 | continue; | |
961 | string hash_prefix = get_path_str(i->second); | |
962 | hash_prefixes->insert(hash_prefix); | |
963 | objects->insert(pair<string, ghobject_t>(hash_prefix, i->second)); | |
964 | } | |
965 | vector<string> subdirs; | |
966 | r = list_subdirs(path, &subdirs); | |
967 | if (r < 0) | |
968 | return r; | |
969 | ||
970 | // sort subdirs bitwise (by reversing hex digit nibbles) | |
971 | std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise); | |
972 | ||
973 | // Local to this function, we will convert the prefix strings | |
974 | // (previously simply the reversed hex digits) to also have each | |
975 | // digit's nibbles reversed. This will make the strings sort | |
976 | // bitwise. | |
977 | string cur_prefix; | |
978 | for (vector<string>::const_iterator i = path.begin(); | |
979 | i != path.end(); | |
980 | ++i) { | |
981 | cur_prefix.append(reverse_hexdigit_bits_string(*i)); | |
982 | } | |
983 | string next_object_string; | |
984 | if (next_object) | |
985 | next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object)); | |
986 | for (vector<string>::iterator i = subdirs.begin(); | |
987 | i != subdirs.end(); | |
988 | ++i) { | |
989 | string candidate = cur_prefix + reverse_hexdigit_bits_string(*i); | |
990 | if (next_object) { | |
991 | if (next_object->is_max()) | |
992 | continue; | |
993 | if (candidate < next_object_string.substr(0, candidate.size())) | |
994 | continue; | |
995 | } | |
996 | // re-reverse the hex digit nibbles for the caller | |
997 | hash_prefixes->insert(reverse_hexdigit_bits_string(candidate)); | |
998 | } | |
999 | return 0; | |
1000 | } | |
1001 | ||
1002 | int HashIndex::list_by_hash(const vector<string> &path, | |
1003 | const ghobject_t &end, | |
1004 | int max_count, | |
1005 | ghobject_t *next, | |
1006 | vector<ghobject_t> *out) | |
1007 | { | |
1008 | assert(out); | |
1009 | return list_by_hash_bitwise(path, end, max_count, next, out); | |
1010 | } | |
1011 | ||
1012 | int HashIndex::list_by_hash_bitwise( | |
1013 | const vector<string> &path, | |
1014 | const ghobject_t& end, | |
1015 | int max_count, | |
1016 | ghobject_t *next, | |
1017 | vector<ghobject_t> *out) | |
1018 | { | |
1019 | vector<string> next_path = path; | |
1020 | next_path.push_back(""); | |
1021 | set<string, CmpHexdigitStringBitwise> hash_prefixes; | |
1022 | set<pair<string, ghobject_t>, CmpPairBitwise> objects; | |
1023 | int r = get_path_contents_by_hash_bitwise(path, | |
1024 | next, | |
1025 | &hash_prefixes, | |
1026 | &objects); | |
1027 | if (r < 0) | |
1028 | return r; | |
1029 | for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin(); | |
1030 | i != hash_prefixes.end(); | |
1031 | ++i) { | |
1032 | dout(20) << __func__ << " prefix " << *i << dendl; | |
1033 | set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound( | |
1034 | make_pair(*i, ghobject_t())); | |
1035 | if (j == objects.end() || j->first != *i) { | |
1036 | *(next_path.rbegin()) = *(i->rbegin()); | |
1037 | ghobject_t next_recurse; | |
1038 | if (next) | |
1039 | next_recurse = *next; | |
1040 | r = list_by_hash_bitwise(next_path, | |
1041 | end, | |
1042 | max_count, | |
1043 | &next_recurse, | |
1044 | out); | |
1045 | ||
1046 | if (r < 0) | |
1047 | return r; | |
1048 | if (!next_recurse.is_max()) { | |
1049 | if (next) | |
1050 | *next = next_recurse; | |
1051 | return 0; | |
1052 | } | |
1053 | } else { | |
1054 | while (j != objects.end() && j->first == *i) { | |
1055 | if (max_count > 0 && out->size() == (unsigned)max_count) { | |
1056 | if (next) | |
1057 | *next = j->second; | |
1058 | return 0; | |
1059 | } | |
1060 | if (j->second >= end) { | |
1061 | if (next) | |
1062 | *next = j->second; | |
1063 | return 0; | |
1064 | } | |
1065 | if (!next || j->second >= *next) { | |
1066 | dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl; | |
1067 | out->push_back(j->second); | |
1068 | } | |
1069 | ++j; | |
1070 | } | |
1071 | } | |
1072 | } | |
1073 | if (next) | |
1074 | *next = ghobject_t::get_max(); | |
1075 | return 0; | |
1076 | } | |
1077 | ||
1078 |