]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "include/types.h" | |
16 | #include "include/buffer.h" | |
17 | #include "osd/osd_types.h" | |
18 | #include <errno.h> | |
19 | ||
20 | #include "HashIndex.h" | |
21 | ||
22 | #include "common/errno.h" | |
23 | #include "common/debug.h" | |
24 | #define dout_context cct | |
25 | #define dout_subsys ceph_subsys_filestore | |
26 | ||
27 | const string HashIndex::SUBDIR_ATTR = "contents"; | |
224ce89b | 28 | const string HashIndex::SETTINGS_ATTR = "settings"; |
7c673cae FG |
29 | const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op"; |
30 | ||
31 | /// hex digit to integer value | |
32 | int hex_to_int(char c) | |
33 | { | |
34 | if (c >= '0' && c <= '9') | |
35 | return c - '0'; | |
36 | if (c >= 'A' && c <= 'F') | |
37 | return c - 'A' + 10; | |
38 | ceph_abort(); | |
39 | } | |
40 | ||
/// Convert a nibble value (0..15) to its upper-case hex digit.
///
/// The value must lie in [0, 16).  Negative values are rejected as well:
/// with only an upper-bound check they would silently produce a character
/// below '0', which is not a hex digit.
char int_to_hex(int v)
{
  assert(v >= 0 && v < 16);
  if (v < 10)
    return '0' + v;
  return 'A' + v - 10;
}
49 | ||
/// Mirror the four low-order bits of `in`: bit 0 <-> bit 3, bit 1 <-> bit 2.
/// Only those four bits contribute to the result.
int reverse_nibble_bits(int in)
{
  assert(in < 16);
  int mirrored = 0;
  for (int bit = 0; bit < 4; ++bit) {
    if (in & (1 << bit)) {
      // bit position b moves to position 3 - b
      mirrored |= 1 << (3 - bit);
    }
  }
  return mirrored;
}
60 | ||
61 | /// reverse nibble bits in a hex digit | |
62 | char reverse_hexdigit_bits(char c) | |
63 | { | |
64 | return int_to_hex(reverse_nibble_bits(hex_to_int(c))); | |
65 | } | |
66 | ||
67 | /// reverse nibble bits in a hex string | |
68 | string reverse_hexdigit_bits_string(string s) | |
69 | { | |
70 | for (unsigned i=0; i<s.size(); ++i) | |
71 | s[i] = reverse_hexdigit_bits(s[i]); | |
72 | return s; | |
73 | } | |
74 | ||
75 | /// compare hex digit (as length 1 string) bitwise | |
76 | bool cmp_hexdigit_bitwise(const string& l, const string& r) | |
77 | { | |
78 | assert(l.length() == 1 && r.length() == 1); | |
79 | int lv = hex_to_int(l[0]); | |
80 | int rv = hex_to_int(r[0]); | |
81 | assert(lv < 16); | |
82 | assert(rv < 16); | |
83 | return reverse_nibble_bits(lv) < reverse_nibble_bits(rv); | |
84 | } | |
85 | ||
86 | /// compare hex digit string bitwise | |
87 | bool cmp_hexdigit_string_bitwise(const string& l, const string& r) | |
88 | { | |
89 | string ll = reverse_hexdigit_bits_string(l); | |
90 | string rr = reverse_hexdigit_bits_string(r); | |
91 | return ll < rr; | |
92 | } | |
93 | ||
94 | int HashIndex::cleanup() { | |
95 | bufferlist bl; | |
96 | int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
97 | if (r < 0) { | |
98 | // No in progress operations! | |
99 | return 0; | |
100 | } | |
101 | bufferlist::iterator i = bl.begin(); | |
102 | InProgressOp in_progress(i); | |
103 | subdir_info_s info; | |
104 | r = get_info(in_progress.path, &info); | |
105 | if (r == -ENOENT) { | |
106 | return end_split_or_merge(in_progress.path); | |
107 | } else if (r < 0) { | |
108 | return r; | |
109 | } | |
110 | ||
111 | if (in_progress.is_split()) | |
112 | return complete_split(in_progress.path, info); | |
113 | else if (in_progress.is_merge()) | |
114 | return complete_merge(in_progress.path, info); | |
115 | else if (in_progress.is_col_split()) { | |
116 | for (vector<string>::iterator i = in_progress.path.begin(); | |
117 | i != in_progress.path.end(); | |
118 | ++i) { | |
119 | vector<string> path(in_progress.path.begin(), i); | |
120 | int r = reset_attr(path); | |
121 | if (r < 0) | |
122 | return r; | |
123 | } | |
124 | return 0; | |
125 | } | |
126 | else | |
127 | return -EINVAL; | |
128 | } | |
129 | ||
130 | int HashIndex::reset_attr( | |
131 | const vector<string> &path) | |
132 | { | |
133 | int exists = 0; | |
134 | int r = path_exists(path, &exists); | |
135 | if (r < 0) | |
136 | return r; | |
137 | if (!exists) | |
138 | return 0; | |
139 | map<string, ghobject_t> objects; | |
140 | vector<string> subdirs; | |
141 | r = list_objects(path, 0, 0, &objects); | |
142 | if (r < 0) | |
143 | return r; | |
144 | r = list_subdirs(path, &subdirs); | |
145 | if (r < 0) | |
146 | return r; | |
147 | ||
148 | subdir_info_s info; | |
149 | info.hash_level = path.size(); | |
150 | info.objs = objects.size(); | |
151 | info.subdirs = subdirs.size(); | |
152 | return set_info(path, info); | |
153 | } | |
154 | ||
155 | int HashIndex::col_split_level( | |
156 | HashIndex &from, | |
157 | HashIndex &to, | |
158 | const vector<string> &path, | |
159 | uint32_t inbits, | |
160 | uint32_t match, | |
161 | unsigned *mkdirred) | |
162 | { | |
163 | /* For each subdir, move, recurse, or ignore based on comparing the low order | |
164 | * bits of the hash represented by the subdir path with inbits, match passed | |
165 | * in. | |
166 | */ | |
167 | vector<string> subdirs; | |
168 | int r = from.list_subdirs(path, &subdirs); | |
169 | if (r < 0) | |
170 | return r; | |
171 | map<string, ghobject_t> objects; | |
172 | r = from.list_objects(path, 0, 0, &objects); | |
173 | if (r < 0) | |
174 | return r; | |
175 | ||
176 | set<string> to_move; | |
177 | for (vector<string>::iterator i = subdirs.begin(); | |
178 | i != subdirs.end(); | |
179 | ++i) { | |
180 | uint32_t bits = 0; | |
181 | uint32_t hash = 0; | |
182 | vector<string> sub_path(path.begin(), path.end()); | |
183 | sub_path.push_back(*i); | |
184 | path_to_hobject_hash_prefix(sub_path, &bits, &hash); | |
185 | if (bits < inbits) { | |
186 | if (hobject_t::match_hash(hash, bits, match)) { | |
187 | r = col_split_level( | |
188 | from, | |
189 | to, | |
190 | sub_path, | |
191 | inbits, | |
192 | match, | |
193 | mkdirred); | |
194 | if (r < 0) | |
195 | return r; | |
196 | if (*mkdirred > path.size()) | |
197 | *mkdirred = path.size(); | |
198 | } // else, skip, doesn't need to be moved or recursed into | |
199 | } else { | |
200 | if (hobject_t::match_hash(hash, inbits, match)) { | |
201 | to_move.insert(*i); | |
202 | } | |
203 | } // else, skip, doesn't need to be moved or recursed into | |
204 | } | |
205 | ||
206 | /* Then, do the same for each object */ | |
207 | map<string, ghobject_t> objs_to_move; | |
208 | for (map<string, ghobject_t>::iterator i = objects.begin(); | |
209 | i != objects.end(); | |
210 | ++i) { | |
211 | if (i->second.match(inbits, match)) { | |
212 | objs_to_move.insert(*i); | |
213 | } | |
214 | } | |
215 | ||
216 | if (objs_to_move.empty() && to_move.empty()) | |
217 | return 0; | |
218 | ||
219 | // Make parent directories as needed | |
220 | while (*mkdirred < path.size()) { | |
221 | ++*mkdirred; | |
222 | int exists = 0; | |
223 | vector<string> creating_path(path.begin(), path.begin()+*mkdirred); | |
224 | r = to.path_exists(creating_path, &exists); | |
225 | if (r < 0) | |
226 | return r; | |
227 | if (exists) | |
228 | continue; | |
229 | subdir_info_s info; | |
230 | info.objs = 0; | |
231 | info.subdirs = 0; | |
232 | info.hash_level = creating_path.size(); | |
233 | if (*mkdirred < path.size() - 1) | |
234 | info.subdirs = 1; | |
235 | r = to.start_col_split(creating_path); | |
236 | if (r < 0) | |
237 | return r; | |
238 | r = to.create_path(creating_path); | |
239 | if (r < 0) | |
240 | return r; | |
241 | r = to.set_info(creating_path, info); | |
242 | if (r < 0) | |
243 | return r; | |
244 | r = to.end_split_or_merge(creating_path); | |
245 | if (r < 0) | |
246 | return r; | |
247 | } | |
248 | ||
249 | subdir_info_s from_info; | |
250 | subdir_info_s to_info; | |
251 | r = from.get_info(path, &from_info); | |
252 | if (r < 0) | |
253 | return r; | |
254 | r = to.get_info(path, &to_info); | |
255 | if (r < 0) | |
256 | return r; | |
257 | ||
258 | from.start_col_split(path); | |
259 | to.start_col_split(path); | |
260 | ||
261 | // Do subdir moves | |
262 | for (set<string>::iterator i = to_move.begin(); | |
263 | i != to_move.end(); | |
264 | ++i) { | |
265 | from_info.subdirs--; | |
266 | to_info.subdirs++; | |
267 | r = move_subdir(from, to, path, *i); | |
268 | if (r < 0) | |
269 | return r; | |
270 | } | |
271 | ||
272 | for (map<string, ghobject_t>::iterator i = objs_to_move.begin(); | |
273 | i != objs_to_move.end(); | |
274 | ++i) { | |
275 | from_info.objs--; | |
276 | to_info.objs++; | |
277 | r = move_object(from, to, path, *i); | |
278 | if (r < 0) | |
279 | return r; | |
280 | } | |
281 | ||
282 | ||
283 | r = to.set_info(path, to_info); | |
284 | if (r < 0) | |
285 | return r; | |
286 | r = from.set_info(path, from_info); | |
287 | if (r < 0) | |
288 | return r; | |
289 | from.end_split_or_merge(path); | |
290 | to.end_split_or_merge(path); | |
291 | return 0; | |
292 | } | |
293 | ||
294 | int HashIndex::_split( | |
295 | uint32_t match, | |
296 | uint32_t bits, | |
297 | CollectionIndex* dest) { | |
298 | assert(collection_version() == dest->collection_version()); | |
299 | unsigned mkdirred = 0; | |
300 | return col_split_level( | |
301 | *this, | |
302 | *static_cast<HashIndex*>(dest), | |
303 | vector<string>(), | |
304 | bits, | |
305 | match, | |
306 | &mkdirred); | |
307 | } | |
308 | ||
309 | int HashIndex::split_dirs(const vector<string> &path) { | |
310 | dout(20) << __func__ << " " << path << dendl; | |
311 | subdir_info_s info; | |
312 | int r = get_info(path, &info); | |
313 | if (r < 0) { | |
314 | dout(10) << "error looking up info for " << path << ": " | |
315 | << cpp_strerror(r) << dendl; | |
316 | return r; | |
317 | } | |
318 | ||
319 | if (must_split(info)) { | |
320 | dout(1) << __func__ << " " << path << " has " << info.objs | |
321 | << " objects, starting split." << dendl; | |
322 | r = initiate_split(path, info); | |
323 | if (r < 0) { | |
324 | dout(10) << "error initiating split on " << path << ": " | |
325 | << cpp_strerror(r) << dendl; | |
326 | return r; | |
327 | } | |
328 | ||
329 | r = complete_split(path, info); | |
330 | dout(1) << __func__ << " " << path << " split completed." | |
331 | << dendl; | |
332 | if (r < 0) { | |
333 | dout(10) << "error completing split on " << path << ": " | |
334 | << cpp_strerror(r) << dendl; | |
335 | return r; | |
336 | } | |
337 | } | |
338 | ||
339 | vector<string> subdirs; | |
340 | r = list_subdirs(path, &subdirs); | |
341 | if (r < 0) { | |
342 | dout(10) << "error listing subdirs of " << path << ": " | |
343 | << cpp_strerror(r) << dendl; | |
344 | return r; | |
345 | } | |
346 | for (vector<string>::const_iterator it = subdirs.begin(); | |
347 | it != subdirs.end(); ++it) { | |
348 | vector<string> subdir_path(path); | |
349 | subdir_path.push_back(*it); | |
350 | r = split_dirs(subdir_path); | |
351 | if (r < 0) { | |
352 | return r; | |
353 | } | |
354 | } | |
355 | ||
356 | return r; | |
357 | } | |
358 | ||
359 | int HashIndex::apply_layout_settings() { | |
360 | vector<string> path; | |
361 | dout(10) << __func__ << " split multiple = " << split_multiplier | |
224ce89b WB |
362 | << " merge threshold = " << merge_threshold |
363 | << " split rand factor = " << cct->_conf->filestore_split_rand_factor | |
364 | << dendl; | |
365 | int r = write_settings(); | |
366 | if (r < 0) | |
367 | return r; | |
7c673cae FG |
368 | return split_dirs(path); |
369 | } | |
370 | ||
371 | int HashIndex::_init() { | |
372 | subdir_info_s info; | |
373 | vector<string> path; | |
224ce89b WB |
374 | int r = set_info(path, info); |
375 | if (r < 0) | |
376 | return r; | |
377 | return write_settings(); | |
378 | } | |
379 | ||
380 | int HashIndex::write_settings() { | |
381 | if (cct->_conf->filestore_split_rand_factor > 0) { | |
382 | settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor; | |
383 | } else { | |
384 | settings.split_rand_factor = 0; | |
385 | } | |
386 | vector<string> path; | |
387 | bufferlist bl; | |
388 | settings.encode(bl); | |
389 | return add_attr_path(path, SETTINGS_ATTR, bl); | |
390 | } | |
391 | ||
392 | int HashIndex::read_settings() { | |
393 | vector<string> path; | |
394 | bufferlist bl; | |
395 | int r = get_attr_path(path, SETTINGS_ATTR, bl); | |
396 | if (r == -ENODATA) | |
397 | return 0; | |
398 | if (r < 0) { | |
399 | derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl; | |
400 | return r; | |
401 | } | |
402 | bufferlist::iterator it = bl.begin(); | |
403 | settings.decode(it); | |
404 | dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl; | |
405 | return 0; | |
7c673cae FG |
406 | } |
407 | ||
408 | /* LFNIndex virtual method implementations */ | |
409 | int HashIndex::_created(const vector<string> &path, | |
410 | const ghobject_t &oid, | |
411 | const string &mangled_name) { | |
412 | subdir_info_s info; | |
413 | int r; | |
414 | r = get_info(path, &info); | |
415 | if (r < 0) | |
416 | return r; | |
417 | info.objs++; | |
418 | r = set_info(path, info); | |
419 | if (r < 0) | |
420 | return r; | |
421 | ||
422 | if (must_split(info)) { | |
423 | dout(1) << __func__ << " " << path << " has " << info.objs | |
424 | << " objects, starting split." << dendl; | |
425 | int r = initiate_split(path, info); | |
426 | if (r < 0) | |
427 | return r; | |
428 | r = complete_split(path, info); | |
429 | dout(1) << __func__ << " " << path << " split completed." | |
430 | << dendl; | |
431 | return r; | |
432 | } else { | |
433 | return 0; | |
434 | } | |
435 | } | |
436 | ||
437 | int HashIndex::_remove(const vector<string> &path, | |
438 | const ghobject_t &oid, | |
439 | const string &mangled_name) { | |
440 | int r; | |
441 | r = remove_object(path, oid); | |
442 | if (r < 0) | |
443 | return r; | |
444 | subdir_info_s info; | |
445 | r = get_info(path, &info); | |
446 | if (r < 0) | |
447 | return r; | |
448 | info.objs--; | |
449 | r = set_info(path, info); | |
450 | if (r < 0) | |
451 | return r; | |
452 | if (must_merge(info)) { | |
453 | r = initiate_merge(path, info); | |
454 | if (r < 0) | |
455 | return r; | |
456 | return complete_merge(path, info); | |
457 | } else { | |
458 | return 0; | |
459 | } | |
460 | } | |
461 | ||
462 | int HashIndex::_lookup(const ghobject_t &oid, | |
463 | vector<string> *path, | |
464 | string *mangled_name, | |
465 | int *hardlink) { | |
466 | vector<string> path_comp; | |
467 | get_path_components(oid, &path_comp); | |
468 | vector<string>::iterator next = path_comp.begin(); | |
469 | int exists; | |
470 | while (1) { | |
471 | int r = path_exists(*path, &exists); | |
472 | if (r < 0) | |
473 | return r; | |
474 | if (!exists) { | |
475 | if (path->empty()) | |
476 | return -ENOENT; | |
477 | path->pop_back(); | |
478 | break; | |
479 | } | |
480 | if (next == path_comp.end()) | |
481 | break; | |
482 | path->push_back(*(next++)); | |
483 | } | |
484 | return get_mangled_name(*path, oid, mangled_name, hardlink); | |
485 | } | |
486 | ||
487 | int HashIndex::_collection_list_partial(const ghobject_t &start, | |
488 | const ghobject_t &end, | |
489 | int max_count, | |
490 | vector<ghobject_t> *ls, | |
491 | ghobject_t *next) { | |
492 | vector<string> path; | |
493 | ghobject_t _next; | |
494 | if (!next) | |
495 | next = &_next; | |
496 | *next = start; | |
497 | dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl; | |
498 | return list_by_hash(path, end, max_count, next, ls); | |
499 | } | |
500 | ||
501 | int HashIndex::prep_delete() { | |
502 | return recursive_remove(vector<string>()); | |
503 | } | |
504 | ||
505 | int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) { | |
506 | int ret; | |
507 | vector<string> path; | |
508 | subdir_info_s root_info; | |
509 | // Make sure there is neither objects nor sub-folders | |
510 | // in this collection | |
511 | ret = get_info(path, &root_info); | |
512 | if (ret < 0) | |
513 | return ret; | |
514 | ||
515 | // Do the folder splitting first | |
516 | ret = pre_split_folder(pg_num, expected_num_objs); | |
517 | if (ret < 0) | |
518 | return ret; | |
519 | // Initialize the folder info starting from root | |
520 | return init_split_folder(path, 0); | |
521 | } | |
522 | ||
523 | int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs) | |
524 | { | |
525 | // If folder merging is enabled (by setting the threshold positive), | |
526 | // no need to split | |
527 | if (merge_threshold > 0) | |
528 | return 0; | |
529 | const coll_t c = coll(); | |
530 | // Do not split if the expected number of objects in this collection is zero (by default) | |
531 | if (expected_num_objs == 0) | |
532 | return 0; | |
533 | ||
534 | // Calculate the number of leaf folders (which actually store files) | |
535 | // need to be created | |
224ce89b | 536 | const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16; |
7c673cae FG |
537 | uint64_t leavies = expected_num_objs / objs_per_folder ; |
538 | // No need to split | |
539 | if (leavies == 0 || expected_num_objs == objs_per_folder) | |
540 | return 0; | |
541 | ||
542 | spg_t spgid; | |
543 | if (!c.is_pg_prefix(&spgid)) | |
544 | return -EINVAL; | |
545 | const ps_t ps = spgid.pgid.ps(); | |
546 | ||
547 | // the most significant bits of pg_num | |
548 | const int pg_num_bits = calc_num_bits(pg_num - 1); | |
549 | ps_t tmp_id = ps; | |
550 | // calculate the number of levels we only create one sub folder | |
551 | int num = pg_num_bits / 4; | |
552 | // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111, | |
553 | // so that splitting starts at level 3 | |
554 | if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) { | |
555 | --num; | |
556 | } | |
557 | ||
558 | int ret; | |
559 | // Start with creation that only has one subfolder | |
560 | vector<string> paths; | |
561 | int dump_num = num; | |
562 | while (num-- > 0) { | |
563 | ps_t v = tmp_id & 0x0000000f; | |
564 | paths.push_back(to_hex(v)); | |
565 | ret = create_path(paths); | |
566 | if (ret < 0 && ret != -EEXIST) | |
567 | return ret; | |
568 | tmp_id = tmp_id >> 4; | |
569 | } | |
570 | ||
571 | // Starting from here, we can split by creating multiple subfolders | |
572 | const int left_bits = pg_num_bits - dump_num * 4; | |
573 | // this variable denotes how many bits (for this level) that can be | |
574 | // used for sub folder splitting | |
575 | int split_bits = 4 - left_bits; | |
576 | // the below logic is inspired by rados.h#ceph_stable_mod, | |
577 | // it basically determines how many sub-folders should we | |
578 | // create for splitting | |
579 | assert(pg_num_bits > 0); // otherwise BAD_SHIFT | |
580 | if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) { | |
581 | ++split_bits; | |
582 | } | |
583 | const uint32_t subs = (1 << split_bits); | |
584 | // Calculate how many levels we create starting from here | |
585 | int level = 0; | |
586 | leavies /= subs; | |
587 | while (leavies > 1) { | |
588 | ++level; | |
589 | leavies = leavies >> 4; | |
590 | } | |
591 | for (uint32_t i = 0; i < subs; ++i) { | |
592 | assert(split_bits <= 4); // otherwise BAD_SHIFT | |
593 | int v = tmp_id | (i << ((4 - split_bits) % 4)); | |
594 | paths.push_back(to_hex(v)); | |
595 | ret = create_path(paths); | |
596 | if (ret < 0 && ret != -EEXIST) | |
597 | return ret; | |
598 | ret = recursive_create_path(paths, level); | |
599 | if (ret < 0) | |
600 | return ret; | |
601 | paths.pop_back(); | |
602 | } | |
603 | return 0; | |
604 | } | |
605 | ||
606 | int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level) | |
607 | { | |
608 | // Get the number of sub directories for the current path | |
609 | vector<string> subdirs; | |
610 | int ret = list_subdirs(path, &subdirs); | |
611 | if (ret < 0) | |
612 | return ret; | |
613 | subdir_info_s info; | |
614 | info.subdirs = subdirs.size(); | |
615 | info.hash_level = hash_level; | |
616 | ret = set_info(path, info); | |
617 | if (ret < 0) | |
618 | return ret; | |
619 | ret = fsync_dir(path); | |
620 | if (ret < 0) | |
621 | return ret; | |
622 | ||
623 | // Do the same for subdirs | |
624 | vector<string>::const_iterator iter; | |
625 | for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) { | |
626 | path.push_back(*iter); | |
627 | ret = init_split_folder(path, hash_level + 1); | |
628 | if (ret < 0) | |
629 | return ret; | |
630 | path.pop_back(); | |
631 | } | |
632 | return 0; | |
633 | } | |
634 | ||
635 | int HashIndex::recursive_create_path(vector<string>& path, int level) | |
636 | { | |
637 | if (level == 0) | |
638 | return 0; | |
639 | for (int i = 0; i < 16; ++i) { | |
640 | path.push_back(to_hex(i)); | |
641 | int ret = create_path(path); | |
642 | if (ret < 0 && ret != -EEXIST) | |
643 | return ret; | |
644 | ret = recursive_create_path(path, level - 1); | |
645 | if (ret < 0) | |
646 | return ret; | |
647 | path.pop_back(); | |
648 | } | |
649 | return 0; | |
650 | } | |
651 | ||
652 | int HashIndex::recursive_remove(const vector<string> &path) { | |
653 | return _recursive_remove(path, true); | |
654 | } | |
655 | ||
656 | int HashIndex::_recursive_remove(const vector<string> &path, bool top) { | |
657 | vector<string> subdirs; | |
658 | dout(20) << __func__ << " path=" << path << dendl; | |
659 | int r = list_subdirs(path, &subdirs); | |
660 | if (r < 0) | |
661 | return r; | |
662 | map<string, ghobject_t> objects; | |
663 | r = list_objects(path, 0, 0, &objects); | |
664 | if (r < 0) | |
665 | return r; | |
666 | if (!objects.empty()) | |
667 | return -ENOTEMPTY; | |
668 | vector<string> subdir(path); | |
669 | for (vector<string>::iterator i = subdirs.begin(); | |
670 | i != subdirs.end(); | |
671 | ++i) { | |
672 | subdir.push_back(*i); | |
673 | r = _recursive_remove(subdir, false); | |
674 | if (r < 0) | |
675 | return r; | |
676 | subdir.pop_back(); | |
677 | } | |
678 | if (top) | |
679 | return 0; | |
680 | else | |
681 | return remove_path(path); | |
682 | } | |
683 | ||
684 | int HashIndex::start_col_split(const vector<string> &path) { | |
685 | bufferlist bl; | |
686 | InProgressOp op_tag(InProgressOp::COL_SPLIT, path); | |
687 | op_tag.encode(bl); | |
688 | int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
689 | if (r < 0) | |
690 | return r; | |
691 | return fsync_dir(vector<string>()); | |
692 | } | |
693 | ||
694 | int HashIndex::start_split(const vector<string> &path) { | |
695 | bufferlist bl; | |
696 | InProgressOp op_tag(InProgressOp::SPLIT, path); | |
697 | op_tag.encode(bl); | |
698 | int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
699 | if (r < 0) | |
700 | return r; | |
701 | return fsync_dir(vector<string>()); | |
702 | } | |
703 | ||
704 | int HashIndex::start_merge(const vector<string> &path) { | |
705 | bufferlist bl; | |
706 | InProgressOp op_tag(InProgressOp::MERGE, path); | |
707 | op_tag.encode(bl); | |
708 | int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl); | |
709 | if (r < 0) | |
710 | return r; | |
711 | return fsync_dir(vector<string>()); | |
712 | } | |
713 | ||
714 | int HashIndex::end_split_or_merge(const vector<string> &path) { | |
715 | return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG); | |
716 | } | |
717 | ||
718 | int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) { | |
719 | bufferlist buf; | |
720 | int r = get_attr_path(path, SUBDIR_ATTR, buf); | |
721 | if (r < 0) | |
722 | return r; | |
723 | bufferlist::iterator bufiter = buf.begin(); | |
724 | info->decode(bufiter); | |
725 | assert(path.size() == (unsigned)info->hash_level); | |
726 | return 0; | |
727 | } | |
728 | ||
729 | int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) { | |
730 | bufferlist buf; | |
731 | assert(path.size() == (unsigned)info.hash_level); | |
732 | info.encode(buf); | |
733 | return add_attr_path(path, SUBDIR_ATTR, buf); | |
734 | } | |
735 | ||
736 | bool HashIndex::must_merge(const subdir_info_s &info) { | |
737 | return (info.hash_level > 0 && | |
738 | merge_threshold > 0 && | |
739 | info.objs < (unsigned)merge_threshold && | |
740 | info.subdirs == 0); | |
741 | } | |
742 | ||
743 | bool HashIndex::must_split(const subdir_info_s &info) { | |
744 | return (info.hash_level < (unsigned)MAX_HASH_LEVEL && | |
224ce89b | 745 | info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16)); |
7c673cae FG |
746 | |
747 | } | |
748 | ||
749 | int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) { | |
750 | return start_merge(path); | |
751 | } | |
752 | ||
753 | int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) { | |
754 | vector<string> dst = path; | |
755 | dst.pop_back(); | |
756 | subdir_info_s dstinfo; | |
757 | int r, exists; | |
758 | r = path_exists(path, &exists); | |
759 | if (r < 0) | |
760 | return r; | |
761 | r = get_info(dst, &dstinfo); | |
762 | if (r < 0) | |
763 | return r; | |
764 | if (exists) { | |
765 | r = move_objects(path, dst); | |
766 | if (r < 0) | |
767 | return r; | |
768 | r = reset_attr(dst); | |
769 | if (r < 0) | |
770 | return r; | |
771 | r = remove_path(path); | |
772 | if (r < 0) | |
773 | return r; | |
774 | } | |
775 | if (must_merge(dstinfo)) { | |
776 | r = initiate_merge(dst, dstinfo); | |
777 | if (r < 0) | |
778 | return r; | |
779 | r = fsync_dir(dst); | |
780 | if (r < 0) | |
781 | return r; | |
782 | return complete_merge(dst, dstinfo); | |
783 | } | |
784 | r = fsync_dir(dst); | |
785 | if (r < 0) | |
786 | return r; | |
787 | return end_split_or_merge(path); | |
788 | } | |
789 | ||
790 | int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) { | |
791 | return start_split(path); | |
792 | } | |
793 | ||
794 | int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) { | |
795 | int level = info.hash_level; | |
796 | map<string, ghobject_t> objects; | |
797 | vector<string> dst = path; | |
798 | int r; | |
799 | dst.push_back(""); | |
800 | r = list_objects(path, 0, 0, &objects); | |
801 | if (r < 0) | |
802 | return r; | |
803 | vector<string> subdirs_vec; | |
804 | r = list_subdirs(path, &subdirs_vec); | |
805 | if (r < 0) | |
806 | return r; | |
807 | set<string> subdirs; | |
808 | subdirs.insert(subdirs_vec.begin(), subdirs_vec.end()); | |
809 | map<string, map<string, ghobject_t> > mapped; | |
810 | map<string, ghobject_t> moved; | |
811 | int num_moved = 0; | |
812 | for (map<string, ghobject_t>::iterator i = objects.begin(); | |
813 | i != objects.end(); | |
814 | ++i) { | |
815 | vector<string> new_path; | |
816 | get_path_components(i->second, &new_path); | |
817 | mapped[new_path[level]][i->first] = i->second; | |
818 | } | |
819 | for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin(); | |
820 | i != mapped.end(); | |
821 | ) { | |
822 | dst[level] = i->first; | |
823 | /* If the info already exists, it must be correct, | |
824 | * we may be picking up a partially finished split */ | |
825 | subdir_info_s temp; | |
826 | // subdir has already been fully copied | |
827 | if (subdirs.count(i->first) && !get_info(dst, &temp)) { | |
828 | for (map<string, ghobject_t>::iterator j = i->second.begin(); | |
829 | j != i->second.end(); | |
830 | ++j) { | |
831 | moved[j->first] = j->second; | |
832 | num_moved++; | |
833 | objects.erase(j->first); | |
834 | } | |
835 | ++i; | |
836 | continue; | |
837 | } | |
838 | ||
839 | subdir_info_s info_new; | |
840 | info_new.objs = i->second.size(); | |
841 | info_new.subdirs = 0; | |
842 | info_new.hash_level = level + 1; | |
843 | if (must_merge(info_new) && !subdirs.count(i->first)) { | |
844 | mapped.erase(i++); | |
845 | continue; | |
846 | } | |
847 | ||
848 | // Subdir doesn't yet exist | |
849 | if (!subdirs.count(i->first)) { | |
850 | info.subdirs += 1; | |
851 | r = create_path(dst); | |
852 | if (r < 0) | |
853 | return r; | |
854 | } // else subdir has been created but only partially copied | |
855 | ||
856 | for (map<string, ghobject_t>::iterator j = i->second.begin(); | |
857 | j != i->second.end(); | |
858 | ++j) { | |
859 | moved[j->first] = j->second; | |
860 | num_moved++; | |
861 | objects.erase(j->first); | |
862 | r = link_object(path, dst, j->second, j->first); | |
863 | // May be a partially finished split | |
864 | if (r < 0 && r != -EEXIST) { | |
865 | return r; | |
866 | } | |
867 | } | |
868 | ||
869 | r = fsync_dir(dst); | |
870 | if (r < 0) | |
871 | return r; | |
872 | ||
873 | // Presence of info must imply that all objects have been copied | |
874 | r = set_info(dst, info_new); | |
875 | if (r < 0) | |
876 | return r; | |
877 | ||
878 | r = fsync_dir(dst); | |
879 | if (r < 0) | |
880 | return r; | |
881 | ||
882 | ++i; | |
883 | } | |
884 | r = remove_objects(path, moved, &objects); | |
885 | if (r < 0) | |
886 | return r; | |
887 | info.objs = objects.size(); | |
888 | r = reset_attr(path); | |
889 | if (r < 0) | |
890 | return r; | |
891 | r = fsync_dir(path); | |
892 | if (r < 0) | |
893 | return r; | |
894 | return end_split_or_merge(path); | |
895 | } | |
896 | ||
897 | void HashIndex::get_path_components(const ghobject_t &oid, | |
898 | vector<string> *path) { | |
899 | char buf[MAX_HASH_LEVEL + 1]; | |
900 | snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key()); | |
901 | ||
902 | // Path components are the hex characters of oid.hobj.hash, least | |
903 | // significant first | |
904 | for (int i = 0; i < MAX_HASH_LEVEL; ++i) { | |
905 | path->push_back(string(&buf[i], 1)); | |
906 | } | |
907 | } | |
908 | ||
909 | string HashIndex::get_hash_str(uint32_t hash) { | |
910 | char buf[MAX_HASH_LEVEL + 1]; | |
911 | snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash); | |
912 | string retval; | |
913 | for (int i = 0; i < MAX_HASH_LEVEL; ++i) { | |
914 | retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]); | |
915 | } | |
916 | return retval; | |
917 | } | |
918 | ||
919 | string HashIndex::get_path_str(const ghobject_t &oid) { | |
920 | assert(!oid.is_max()); | |
921 | return get_hash_str(oid.hobj.get_hash()); | |
922 | } | |
923 | ||
924 | uint32_t HashIndex::hash_prefix_to_hash(string prefix) { | |
925 | while (prefix.size() < sizeof(uint32_t) * 2) { | |
926 | prefix.push_back('0'); | |
927 | } | |
928 | uint32_t hash; | |
929 | sscanf(prefix.c_str(), "%x", &hash); | |
930 | // nibble reverse | |
931 | hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4); | |
932 | hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8); | |
933 | hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16); | |
934 | return hash; | |
935 | } | |
936 | ||
937 | int HashIndex::get_path_contents_by_hash_bitwise( | |
938 | const vector<string> &path, | |
939 | const ghobject_t *next_object, | |
940 | set<string, CmpHexdigitStringBitwise> *hash_prefixes, | |
941 | set<pair<string, ghobject_t>, CmpPairBitwise> *objects) | |
942 | { | |
943 | map<string, ghobject_t> rev_objects; | |
944 | int r; | |
945 | r = list_objects(path, 0, 0, &rev_objects); | |
946 | if (r < 0) | |
947 | return r; | |
948 | // bitwise sort | |
949 | for (map<string, ghobject_t>::iterator i = rev_objects.begin(); | |
950 | i != rev_objects.end(); | |
951 | ++i) { | |
952 | if (next_object && i->second < *next_object) | |
953 | continue; | |
954 | string hash_prefix = get_path_str(i->second); | |
955 | hash_prefixes->insert(hash_prefix); | |
956 | objects->insert(pair<string, ghobject_t>(hash_prefix, i->second)); | |
957 | } | |
958 | vector<string> subdirs; | |
959 | r = list_subdirs(path, &subdirs); | |
960 | if (r < 0) | |
961 | return r; | |
962 | ||
963 | // sort subdirs bitwise (by reversing hex digit nibbles) | |
964 | std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise); | |
965 | ||
966 | // Local to this function, we will convert the prefix strings | |
967 | // (previously simply the reversed hex digits) to also have each | |
968 | // digit's nibbles reversed. This will make the strings sort | |
969 | // bitwise. | |
970 | string cur_prefix; | |
971 | for (vector<string>::const_iterator i = path.begin(); | |
972 | i != path.end(); | |
973 | ++i) { | |
974 | cur_prefix.append(reverse_hexdigit_bits_string(*i)); | |
975 | } | |
976 | string next_object_string; | |
977 | if (next_object) | |
978 | next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object)); | |
979 | for (vector<string>::iterator i = subdirs.begin(); | |
980 | i != subdirs.end(); | |
981 | ++i) { | |
982 | string candidate = cur_prefix + reverse_hexdigit_bits_string(*i); | |
983 | if (next_object) { | |
984 | if (next_object->is_max()) | |
985 | continue; | |
986 | if (candidate < next_object_string.substr(0, candidate.size())) | |
987 | continue; | |
988 | } | |
989 | // re-reverse the hex digit nibbles for the caller | |
990 | hash_prefixes->insert(reverse_hexdigit_bits_string(candidate)); | |
991 | } | |
992 | return 0; | |
993 | } | |
994 | ||
995 | int HashIndex::list_by_hash(const vector<string> &path, | |
996 | const ghobject_t &end, | |
997 | int max_count, | |
998 | ghobject_t *next, | |
999 | vector<ghobject_t> *out) | |
1000 | { | |
1001 | assert(out); | |
1002 | return list_by_hash_bitwise(path, end, max_count, next, out); | |
1003 | } | |
1004 | ||
1005 | int HashIndex::list_by_hash_bitwise( | |
1006 | const vector<string> &path, | |
1007 | const ghobject_t& end, | |
1008 | int max_count, | |
1009 | ghobject_t *next, | |
1010 | vector<ghobject_t> *out) | |
1011 | { | |
1012 | vector<string> next_path = path; | |
1013 | next_path.push_back(""); | |
1014 | set<string, CmpHexdigitStringBitwise> hash_prefixes; | |
1015 | set<pair<string, ghobject_t>, CmpPairBitwise> objects; | |
1016 | int r = get_path_contents_by_hash_bitwise(path, | |
1017 | next, | |
1018 | &hash_prefixes, | |
1019 | &objects); | |
1020 | if (r < 0) | |
1021 | return r; | |
1022 | for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin(); | |
1023 | i != hash_prefixes.end(); | |
1024 | ++i) { | |
1025 | dout(20) << __func__ << " prefix " << *i << dendl; | |
1026 | set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound( | |
1027 | make_pair(*i, ghobject_t())); | |
1028 | if (j == objects.end() || j->first != *i) { | |
1029 | *(next_path.rbegin()) = *(i->rbegin()); | |
1030 | ghobject_t next_recurse; | |
1031 | if (next) | |
1032 | next_recurse = *next; | |
1033 | r = list_by_hash_bitwise(next_path, | |
1034 | end, | |
1035 | max_count, | |
1036 | &next_recurse, | |
1037 | out); | |
1038 | ||
1039 | if (r < 0) | |
1040 | return r; | |
1041 | if (!next_recurse.is_max()) { | |
1042 | if (next) | |
1043 | *next = next_recurse; | |
1044 | return 0; | |
1045 | } | |
1046 | } else { | |
1047 | while (j != objects.end() && j->first == *i) { | |
1048 | if (max_count > 0 && out->size() == (unsigned)max_count) { | |
1049 | if (next) | |
1050 | *next = j->second; | |
1051 | return 0; | |
1052 | } | |
1053 | if (j->second >= end) { | |
1054 | if (next) | |
1055 | *next = j->second; | |
1056 | return 0; | |
1057 | } | |
1058 | if (!next || j->second >= *next) { | |
1059 | dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl; | |
1060 | out->push_back(j->second); | |
1061 | } | |
1062 | ++j; | |
1063 | } | |
1064 | } | |
1065 | } | |
1066 | if (next) | |
1067 | *next = ghobject_t::get_max(); | |
1068 | return 0; | |
1069 | } | |
1070 | ||
1071 |