ceph/src/os/filestore/HashIndex.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include "include/types.h"
  16 #include "include/buffer.h"
  17 #include "osd/osd_types.h"
  18 #include <errno.h>
  19
  20 #include "HashIndex.h"
  21
  22 #include "common/errno.h"
  23 #include "common/debug.h"
  24 #define dout_context cct
  25 #define dout_subsys ceph_subsys_filestore
  26
  27 const string HashIndex::SUBDIR_ATTR = "contents";
  28 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
  29
  30 /// hex digit to integer value
  31 int hex_to_int(char c)
  32 {
  33   if (c >= '0' && c <= '9')
  34     return c - '0';
  35   if (c >= 'A' && c <= 'F')
  36     return c - 'A' + 10;
  37   ceph_abort();
  38 }
  39
  40 /// int value to hex digit
  41 char int_to_hex(int v)
  42 {
  43   assert(v < 16);
  44   if (v < 10)
  45     return '0' + v;
  46   return 'A' + v - 10;
  47 }
  48
  49 /// reverse bits in a nibble (0..15)
  50 int reverse_nibble_bits(int in)
  51 {
  52   assert(in < 16);
  53   return
  54     ((in & 8) >> 3) |
  55     ((in & 4) >> 1) |
  56     ((in & 2) << 1) |
  57     ((in & 1) << 3);
  58 }
  59
  60 /// reverse nibble bits in a hex digit
  61 char reverse_hexdigit_bits(char c)
  62 {
  63   return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
  64 }
  65
  66 /// reverse nibble bits in a hex string
  67 string reverse_hexdigit_bits_string(string s)
  68 {
  69   for (unsigned i=0; i<s.size(); ++i)
  70     s[i] = reverse_hexdigit_bits(s[i]);
  71   return s;
  72 }
  73
  74 /// compare hex digit (as length 1 string) bitwise
  75 bool cmp_hexdigit_bitwise(const string& l, const string& r)
  76 {
  77   assert(l.length() == 1 && r.length() == 1);
  78   int lv = hex_to_int(l[0]);
  79   int rv = hex_to_int(r[0]);
  80   assert(lv < 16);
  81   assert(rv < 16);
  82   return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
  83 }
  84
  85 /// compare hex digit string bitwise
  86 bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
  87 {
  88   string ll = reverse_hexdigit_bits_string(l);
  89   string rr = reverse_hexdigit_bits_string(r);
  90   return ll < rr;
  91 }
  92
  93 int HashIndex::cleanup() {
  94   bufferlist bl;
  95   int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
  96   if (r < 0) {
  97     // No in progress operations!
  98     return 0;
  99   }
 100   bufferlist::iterator i = bl.begin();
 101   InProgressOp in_progress(i);
 102   subdir_info_s info;
 103   r = get_info(in_progress.path, &info);
 104   if (r == -ENOENT) {
 105     return end_split_or_merge(in_progress.path);
 106   } else if (r < 0) {
 107     return r;
 108   }
 109
 110   if (in_progress.is_split())
 111     return complete_split(in_progress.path, info);
 112   else if (in_progress.is_merge())
 113     return complete_merge(in_progress.path, info);
 114   else if (in_progress.is_col_split()) {
 115     for (vector<string>::iterator i = in_progress.path.begin();
 116          i != in_progress.path.end();
 117          ++i) {
 118       vector<string> path(in_progress.path.begin(), i);
 119       int r = reset_attr(path);
 120       if (r < 0)
 121         return r;
 122     }
 123     return 0;
 124   }
 125   else
 126     return -EINVAL;
 127 }
 128
 129 int HashIndex::reset_attr(
 130   const vector<string> &path)
 131 {
 132   int exists = 0;
 133   int r = path_exists(path, &exists);
 134   if (r < 0)
 135     return r;
 136   if (!exists)
 137     return 0;
 138   map<string, ghobject_t> objects;
 139   vector<string> subdirs;
 140   r = list_objects(path, 0, 0, &objects);
 141   if (r < 0)
 142     return r;
 143   r = list_subdirs(path, &subdirs);
 144   if (r < 0)
 145     return r;
 146
 147   subdir_info_s info;
 148   info.hash_level = path.size();
 149   info.objs = objects.size();
 150   info.subdirs = subdirs.size();
 151   return set_info(path, info);
 152 }
 153
 154 int HashIndex::col_split_level(
 155   HashIndex &from,
 156   HashIndex &to,
 157   const vector<string> &path,
 158   uint32_t inbits,
 159   uint32_t match,
 160   unsigned *mkdirred)
 161 {
 162   /* For each subdir, move, recurse, or ignore based on comparing the low order
 163    * bits of the hash represented by the subdir path with inbits, match passed
 164    * in.
 165    */
 166   vector<string> subdirs;
 167   int r = from.list_subdirs(path, &subdirs);
 168   if (r < 0)
 169     return r;
 170   map<string, ghobject_t> objects;
 171   r = from.list_objects(path, 0, 0, &objects);
 172   if (r < 0)
 173     return r;
 174
 175   set<string> to_move;
 176   for (vector<string>::iterator i = subdirs.begin();
 177        i != subdirs.end();
 178        ++i) {
 179     uint32_t bits = 0;
 180     uint32_t hash = 0;
 181     vector<string> sub_path(path.begin(), path.end());
 182     sub_path.push_back(*i);
 183     path_to_hobject_hash_prefix(sub_path, &bits, &hash);
 184     if (bits < inbits) {
 185       if (hobject_t::match_hash(hash, bits, match)) {
 186         r = col_split_level(
 187           from,
 188           to,
 189           sub_path,
 190           inbits,
 191           match,
 192           mkdirred);
 193         if (r < 0)
 194           return r;
 195         if (*mkdirred > path.size())
 196           *mkdirred = path.size();
 197       } // else, skip, doesn't need to be moved or recursed into
 198     } else {
 199       if (hobject_t::match_hash(hash, inbits, match)) {
 200         to_move.insert(*i);
 201       }
 202     } // else, skip, doesn't need to be moved or recursed into
 203   }
 204
 205   /* Then, do the same for each object */
 206   map<string, ghobject_t> objs_to_move;
 207   for (map<string, ghobject_t>::iterator i = objects.begin();
 208        i != objects.end();
 209        ++i) {
 210     if (i->second.match(inbits, match)) {
 211       objs_to_move.insert(*i);
 212     }
 213   }
 214
 215   if (objs_to_move.empty() && to_move.empty())
 216     return 0;
 217
 218   // Make parent directories as needed
 219   while (*mkdirred < path.size()) {
 220     ++*mkdirred;
 221     int exists = 0;
 222     vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
 223     r = to.path_exists(creating_path, &exists);
 224     if (r < 0)
 225       return r;
 226     if (exists)
 227       continue;
 228     subdir_info_s info;
 229     info.objs = 0;
 230     info.subdirs = 0;
 231     info.hash_level = creating_path.size();
 232     if (*mkdirred < path.size() - 1)
 233       info.subdirs = 1;
 234     r = to.start_col_split(creating_path);
 235     if (r < 0)
 236       return r;
 237     r = to.create_path(creating_path);
 238     if (r < 0)
 239       return r;
 240     r = to.set_info(creating_path, info);
 241     if (r < 0)
 242       return r;
 243     r = to.end_split_or_merge(creating_path);
 244     if (r < 0)
 245       return r;
 246   }
 247
 248   subdir_info_s from_info;
 249   subdir_info_s to_info;
 250   r = from.get_info(path, &from_info);
 251   if (r < 0)
 252     return r;
 253   r = to.get_info(path, &to_info);
 254   if (r < 0)
 255     return r;
 256
 257   from.start_col_split(path);
 258   to.start_col_split(path);
 259
 260   // Do subdir moves
 261   for (set<string>::iterator i = to_move.begin();
 262        i != to_move.end();
 263        ++i) {
 264     from_info.subdirs--;
 265     to_info.subdirs++;
 266     r = move_subdir(from, to, path, *i);
 267     if (r < 0)
 268       return r;
 269   }
 270
 271   for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
 272        i != objs_to_move.end();
 273        ++i) {
 274     from_info.objs--;
 275     to_info.objs++;
 276     r = move_object(from, to, path, *i);
 277     if (r < 0)
 278       return r;
 279   }
 280
 281
 282   r = to.set_info(path, to_info);
 283   if (r < 0)
 284     return r;
 285   r = from.set_info(path, from_info);
 286   if (r < 0)
 287     return r;
 288   from.end_split_or_merge(path);
 289   to.end_split_or_merge(path);
 290   return 0;
 291 }
 292
 293 int HashIndex::_split(
 294   uint32_t match,
 295   uint32_t bits,
 296   CollectionIndex* dest) {
 297   assert(collection_version() == dest->collection_version());
 298   unsigned mkdirred = 0;
 299   return col_split_level(
 300     *this,
 301     *static_cast<HashIndex*>(dest),
 302     vector<string>(),
 303     bits,
 304     match,
 305     &mkdirred);
 306 }
 307
 308 int HashIndex::split_dirs(const vector<string> &path) {
 309   dout(20) << __func__ << " " << path << dendl;
 310   subdir_info_s info;
 311   int r = get_info(path, &info);
 312   if (r < 0) {
 313     dout(10) << "error looking up info for " << path << ": "
 314              << cpp_strerror(r) << dendl;
 315     return r;
 316   }
 317
 318   if (must_split(info)) {
 319     dout(1) << __func__ << " " << path << " has " << info.objs
 320             << " objects, starting split." << dendl;
 321     r = initiate_split(path, info);
 322     if (r < 0) {
 323       dout(10) << "error initiating split on " << path << ": "
 324                << cpp_strerror(r) << dendl;
 325       return r;
 326     }
 327
 328     r = complete_split(path, info);
 329     dout(1) << __func__ << " " << path << " split completed."
 330             << dendl;
 331     if (r < 0) {
 332       dout(10) << "error completing split on " << path << ": "
 333                << cpp_strerror(r) << dendl;
 334       return r;
 335     }
 336   }
 337
 338   vector<string> subdirs;
 339   r = list_subdirs(path, &subdirs);
 340   if (r < 0) {
 341     dout(10) << "error listing subdirs of " << path << ": "
 342              << cpp_strerror(r) << dendl;
 343     return r;
 344   }
 345   for (vector<string>::const_iterator it = subdirs.begin();
 346        it != subdirs.end(); ++it) {
 347     vector<string> subdir_path(path);
 348     subdir_path.push_back(*it);
 349     r = split_dirs(subdir_path);
 350     if (r < 0) {
 351       return r;
 352     }
 353   }
 354
 355   return r;
 356 }
 357
 358 int HashIndex::apply_layout_settings() {
 359   vector<string> path;
 360   dout(10) << __func__ << " split multiple = " << split_multiplier
 361            << " merge threshold = " << merge_threshold << dendl;
 362   return split_dirs(path);
 363 }
 364
 365 int HashIndex::_init() {
 366   subdir_info_s info;
 367   vector<string> path;
 368   return set_info(path, info);
 369 }
 370
 371 /* LFNIndex virtual method implementations */
 372 int HashIndex::_created(const vector<string> &path,
 373                         const ghobject_t &oid,
 374                         const string &mangled_name) {
 375   subdir_info_s info;
 376   int r;
 377   r = get_info(path, &info);
 378   if (r < 0)
 379     return r;
 380   info.objs++;
 381   r = set_info(path, info);
 382   if (r < 0)
 383     return r;
 384
 385   if (must_split(info)) {
 386     dout(1) << __func__ << " " << path << " has " << info.objs
 387             << " objects, starting split." << dendl;
 388     int r = initiate_split(path, info);
 389     if (r < 0)
 390       return r;
 391     r = complete_split(path, info);
 392     dout(1) << __func__ << " " << path << " split completed."
 393             << dendl;
 394     return r;
 395   } else {
 396     return 0;
 397   }
 398 }
 399
 400 int HashIndex::_remove(const vector<string> &path,
 401                        const ghobject_t &oid,
 402                        const string &mangled_name) {
 403   int r;
 404   r = remove_object(path, oid);
 405   if (r < 0)
 406     return r;
 407   subdir_info_s info;
 408   r = get_info(path, &info);
 409   if (r < 0)
 410     return r;
 411   info.objs--;
 412   r = set_info(path, info);
 413   if (r < 0)
 414     return r;
 415   if (must_merge(info)) {
 416     r = initiate_merge(path, info);
 417     if (r < 0)
 418       return r;
 419     return complete_merge(path, info);
 420   } else {
 421     return 0;
 422   }
 423 }
 424
 425 int HashIndex::_lookup(const ghobject_t &oid,
 426                        vector<string> *path,
 427                        string *mangled_name,
 428                        int *hardlink) {
 429   vector<string> path_comp;
 430   get_path_components(oid, &path_comp);
 431   vector<string>::iterator next = path_comp.begin();
 432   int exists;
 433   while (1) {
 434     int r = path_exists(*path, &exists);
 435     if (r < 0)
 436       return r;
 437     if (!exists) {
 438       if (path->empty())
 439         return -ENOENT;
 440       path->pop_back();
 441       break;
 442     }
 443     if (next == path_comp.end())
 444       break;
 445     path->push_back(*(next++));
 446   }
 447   return get_mangled_name(*path, oid, mangled_name, hardlink);
 448 }
 449
 450 int HashIndex::_collection_list_partial(const ghobject_t &start,
 451                                         const ghobject_t &end,
 452                                         int max_count,
 453                                         vector<ghobject_t> *ls,
 454                                         ghobject_t *next) {
 455   vector<string> path;
 456   ghobject_t _next;
 457   if (!next)
 458     next = &_next;
 459   *next = start;
 460   dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
 461   return list_by_hash(path, end, max_count, next, ls);
 462 }
 463
 464 int HashIndex::prep_delete() {
 465   return recursive_remove(vector<string>());
 466 }
 467
 468 int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
 469   int ret;
 470   vector<string> path;
 471   subdir_info_s root_info;
 472   // Make sure there is neither objects nor sub-folders
 473   // in this collection
 474   ret = get_info(path, &root_info);
 475   if (ret < 0)
 476     return ret;
 477
 478   // Do the folder splitting first
 479   ret = pre_split_folder(pg_num, expected_num_objs);
 480   if (ret < 0)
 481     return ret;
 482   // Initialize the folder info starting from root
 483   return init_split_folder(path, 0);
 484 }
 485
 486 int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
 487 {
 488   // If folder merging is enabled (by setting the threshold positive),
 489   // no need to split
 490   if (merge_threshold > 0)
 491     return 0;
 492   const coll_t c = coll();
 493   // Do not split if the expected number of objects in this collection is zero (by default)
 494   if (expected_num_objs == 0)
 495     return 0;
 496
 497   // Calculate the number of leaf folders (which actually store files)
 498   // need to be created
 499   const uint64_t objs_per_folder = (uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier * 16;
 500   uint64_t leavies = expected_num_objs / objs_per_folder ;
 501   // No need to split
 502   if (leavies == 0 || expected_num_objs == objs_per_folder)
 503     return 0;
 504
 505   spg_t spgid;
 506   if (!c.is_pg_prefix(&spgid))
 507     return -EINVAL;
 508   const ps_t ps = spgid.pgid.ps();
 509
 510   // the most significant bits of pg_num
 511   const int pg_num_bits = calc_num_bits(pg_num - 1);
 512   ps_t tmp_id = ps;
 513   // calculate the number of levels we only create one sub folder
 514   int num = pg_num_bits / 4;
 515   // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
 516   // so that splitting starts at level 3
 517   if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
 518     --num;
 519   }
 520
 521   int ret;
 522   // Start with creation that only has one subfolder
 523   vector<string> paths;
 524   int dump_num = num;
 525   while (num-- > 0) {
 526     ps_t v = tmp_id & 0x0000000f;
 527     paths.push_back(to_hex(v));
 528     ret = create_path(paths);
 529     if (ret < 0 && ret != -EEXIST)
 530       return ret;
 531     tmp_id = tmp_id >> 4;
 532   }
 533
 534   // Starting from here, we can split by creating multiple subfolders
 535   const int left_bits = pg_num_bits - dump_num * 4;
 536   // this variable denotes how many bits (for this level) that can be
 537   // used for sub folder splitting
 538   int split_bits = 4 - left_bits;
 539   // the below logic is inspired by rados.h#ceph_stable_mod,
 540   // it basically determines how many sub-folders should we
 541   // create for splitting
 542   assert(pg_num_bits > 0); // otherwise BAD_SHIFT
 543   if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
 544     ++split_bits;
 545   }
 546   const uint32_t subs = (1 << split_bits);
 547   // Calculate how many levels we create starting from here
 548   int level  = 0;
 549   leavies /= subs;
 550   while (leavies > 1) {
 551     ++level;
 552     leavies = leavies >> 4;
 553   }
 554   for (uint32_t i = 0; i < subs; ++i) {
 555     assert(split_bits <= 4); // otherwise BAD_SHIFT
 556     int v = tmp_id | (i << ((4 - split_bits) % 4));
 557     paths.push_back(to_hex(v));
 558     ret = create_path(paths);
 559     if (ret < 0 && ret != -EEXIST)
 560       return ret;
 561     ret = recursive_create_path(paths, level);
 562     if (ret < 0)
 563       return ret;
 564     paths.pop_back();
 565   }
 566   return 0;
 567 }
 568
 569 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
 570 {
 571   // Get the number of sub directories for the current path
 572   vector<string> subdirs;
 573   int ret = list_subdirs(path, &subdirs);
 574   if (ret < 0)
 575     return ret;
 576   subdir_info_s info;
 577   info.subdirs = subdirs.size();
 578   info.hash_level = hash_level;
 579   ret = set_info(path, info);
 580   if (ret < 0)
 581     return ret;
 582   ret = fsync_dir(path);
 583   if (ret < 0)
 584     return ret;
 585
 586   // Do the same for subdirs
 587   vector<string>::const_iterator iter;
 588   for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
 589     path.push_back(*iter);
 590     ret = init_split_folder(path, hash_level + 1);
 591     if (ret < 0)
 592       return ret;
 593     path.pop_back();
 594   }
 595   return 0;
 596 }
 597
 598 int HashIndex::recursive_create_path(vector<string>& path, int level)
 599 {
 600   if (level == 0)
 601     return 0;
 602   for (int i = 0; i < 16; ++i) {
 603     path.push_back(to_hex(i));
 604     int ret = create_path(path);
 605     if (ret < 0 && ret != -EEXIST)
 606       return ret;
 607     ret = recursive_create_path(path, level - 1);
 608     if (ret < 0)
 609       return ret;
 610     path.pop_back();
 611   }
 612   return 0;
 613 }
 614
 615 int HashIndex::recursive_remove(const vector<string> &path) {
 616   return _recursive_remove(path, true);
 617 }
 618
 619 int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
 620   vector<string> subdirs;
 621   dout(20) << __func__ << " path=" << path << dendl;
 622   int r = list_subdirs(path, &subdirs);
 623   if (r < 0)
 624     return r;
 625   map<string, ghobject_t> objects;
 626   r = list_objects(path, 0, 0, &objects);
 627   if (r < 0)
 628     return r;
 629   if (!objects.empty())
 630     return -ENOTEMPTY;
 631   vector<string> subdir(path);
 632   for (vector<string>::iterator i = subdirs.begin();
 633        i != subdirs.end();
 634        ++i) {
 635     subdir.push_back(*i);
 636     r = _recursive_remove(subdir, false);
 637     if (r < 0)
 638       return r;
 639     subdir.pop_back();
 640   }
 641   if (top)
 642     return 0;
 643   else
 644     return remove_path(path);
 645 }
 646
 647 int HashIndex::start_col_split(const vector<string> &path) {
 648   bufferlist bl;
 649   InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
 650   op_tag.encode(bl);
 651   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 652   if (r < 0)
 653     return r;
 654   return fsync_dir(vector<string>());
 655 }
 656
 657 int HashIndex::start_split(const vector<string> &path) {
 658   bufferlist bl;
 659   InProgressOp op_tag(InProgressOp::SPLIT, path);
 660   op_tag.encode(bl);
 661   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 662   if (r < 0)
 663     return r;
 664   return fsync_dir(vector<string>());
 665 }
 666
 667 int HashIndex::start_merge(const vector<string> &path) {
 668   bufferlist bl;
 669   InProgressOp op_tag(InProgressOp::MERGE, path);
 670   op_tag.encode(bl);
 671   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 672   if (r < 0)
 673     return r;
 674   return fsync_dir(vector<string>());
 675 }
 676
 677 int HashIndex::end_split_or_merge(const vector<string> &path) {
 678   return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
 679 }
 680
 681 int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
 682   bufferlist buf;
 683   int r = get_attr_path(path, SUBDIR_ATTR, buf);
 684   if (r < 0)
 685     return r;
 686   bufferlist::iterator bufiter = buf.begin();
 687   info->decode(bufiter);
 688   assert(path.size() == (unsigned)info->hash_level);
 689   return 0;
 690 }
 691
 692 int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
 693   bufferlist buf;
 694   assert(path.size() == (unsigned)info.hash_level);
 695   info.encode(buf);
 696   return add_attr_path(path, SUBDIR_ATTR, buf);
 697 }
 698
 699 bool HashIndex::must_merge(const subdir_info_s &info) {
 700   return (info.hash_level > 0 &&
 701           merge_threshold > 0 &&
 702           info.objs < (unsigned)merge_threshold &&
 703           info.subdirs == 0);
 704 }
 705
 706 bool HashIndex::must_split(const subdir_info_s &info) {
 707   return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
 708           info.objs > ((unsigned)(abs(merge_threshold)) * 16 * split_multiplier));
 709
 710 }
 711
 712 int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
 713   return start_merge(path);
 714 }
 715
 716 int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
 717   vector<string> dst = path;
 718   dst.pop_back();
 719   subdir_info_s dstinfo;
 720   int r, exists;
 721   r = path_exists(path, &exists);
 722   if (r < 0)
 723     return r;
 724   r = get_info(dst, &dstinfo);
 725   if (r < 0)
 726     return r;
 727   if (exists) {
 728     r = move_objects(path, dst);
 729     if (r < 0)
 730       return r;
 731     r = reset_attr(dst);
 732     if (r < 0)
 733       return r;
 734     r = remove_path(path);
 735     if (r < 0)
 736       return r;
 737   }
 738   if (must_merge(dstinfo)) {
 739     r = initiate_merge(dst, dstinfo);
 740     if (r < 0)
 741       return r;
 742     r = fsync_dir(dst);
 743     if (r < 0)
 744       return r;
 745     return complete_merge(dst, dstinfo);
 746   }
 747   r = fsync_dir(dst);
 748   if (r < 0)
 749     return r;
 750   return end_split_or_merge(path);
 751 }
 752
 753 int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
 754   return start_split(path);
 755 }
 756
 757 int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
 758   int level = info.hash_level;
 759   map<string, ghobject_t> objects;
 760   vector<string> dst = path;
 761   int r;
 762   dst.push_back("");
 763   r = list_objects(path, 0, 0, &objects);
 764   if (r < 0)
 765     return r;
 766   vector<string> subdirs_vec;
 767   r = list_subdirs(path, &subdirs_vec);
 768   if (r < 0)
 769     return r;
 770   set<string> subdirs;
 771   subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
 772   map<string, map<string, ghobject_t> > mapped;
 773   map<string, ghobject_t> moved;
 774   int num_moved = 0;
 775   for (map<string, ghobject_t>::iterator i = objects.begin();
 776        i != objects.end();
 777        ++i) {
 778     vector<string> new_path;
 779     get_path_components(i->second, &new_path);
 780     mapped[new_path[level]][i->first] = i->second;
 781   }
 782   for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
 783        i != mapped.end();
 784        ) {
 785     dst[level] = i->first;
 786     /* If the info already exists, it must be correct,
 787      * we may be picking up a partially finished split */
 788     subdir_info_s temp;
 789     // subdir has already been fully copied
 790     if (subdirs.count(i->first) && !get_info(dst, &temp)) {
 791       for (map<string, ghobject_t>::iterator j = i->second.begin();
 792            j != i->second.end();
 793            ++j) {
 794         moved[j->first] = j->second;
 795         num_moved++;
 796         objects.erase(j->first);
 797       }
 798       ++i;
 799       continue;
 800     }
 801
 802     subdir_info_s info_new;
 803     info_new.objs = i->second.size();
 804     info_new.subdirs = 0;
 805     info_new.hash_level = level + 1;
 806     if (must_merge(info_new) && !subdirs.count(i->first)) {
 807       mapped.erase(i++);
 808       continue;
 809     }
 810
 811     // Subdir doesn't yet exist
 812     if (!subdirs.count(i->first)) {
 813       info.subdirs += 1;
 814       r = create_path(dst);
 815       if (r < 0)
 816         return r;
 817     } // else subdir has been created but only partially copied
 818
 819     for (map<string, ghobject_t>::iterator j = i->second.begin();
 820          j != i->second.end();
 821          ++j) {
 822       moved[j->first] = j->second;
 823       num_moved++;
 824       objects.erase(j->first);
 825       r = link_object(path, dst, j->second, j->first);
 826       // May be a partially finished split
 827       if (r < 0 && r != -EEXIST) {
 828         return r;
 829       }
 830     }
 831
 832     r = fsync_dir(dst);
 833     if (r < 0)
 834       return r;
 835
 836     // Presence of info must imply that all objects have been copied
 837     r = set_info(dst, info_new);
 838     if (r < 0)
 839       return r;
 840
 841     r = fsync_dir(dst);
 842     if (r < 0)
 843       return r;
 844
 845     ++i;
 846   }
 847   r = remove_objects(path, moved, &objects);
 848   if (r < 0)
 849     return r;
 850   info.objs = objects.size();
 851   r = reset_attr(path);
 852   if (r < 0)
 853     return r;
 854   r = fsync_dir(path);
 855   if (r < 0)
 856     return r;
 857   return end_split_or_merge(path);
 858 }
 859
 860 void HashIndex::get_path_components(const ghobject_t &oid,
 861                                     vector<string> *path) {
 862   char buf[MAX_HASH_LEVEL + 1];
 863   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
 864
 865   // Path components are the hex characters of oid.hobj.hash, least
 866   // significant first
 867   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
 868     path->push_back(string(&buf[i], 1));
 869   }
 870 }
 871
 872 string HashIndex::get_hash_str(uint32_t hash) {
 873   char buf[MAX_HASH_LEVEL + 1];
 874   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
 875   string retval;
 876   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
 877     retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
 878   }
 879   return retval;
 880 }
 881
 882 string HashIndex::get_path_str(const ghobject_t &oid) {
 883   assert(!oid.is_max());
 884   return get_hash_str(oid.hobj.get_hash());
 885 }
 886
 887 uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
 888   while (prefix.size() < sizeof(uint32_t) * 2) {
 889     prefix.push_back('0');
 890   }
 891   uint32_t hash;
 892   sscanf(prefix.c_str(), "%x", &hash);
 893   // nibble reverse
 894   hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
 895   hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
 896   hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
 897   return hash;
 898 }
 899
 900 int HashIndex::get_path_contents_by_hash_bitwise(
 901   const vector<string> &path,
 902   const ghobject_t *next_object,
 903   set<string, CmpHexdigitStringBitwise> *hash_prefixes,
 904   set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
 905 {
 906   map<string, ghobject_t> rev_objects;
 907   int r;
 908   r = list_objects(path, 0, 0, &rev_objects);
 909   if (r < 0)
 910     return r;
 911   // bitwise sort
 912   for (map<string, ghobject_t>::iterator i = rev_objects.begin();
 913        i != rev_objects.end();
 914        ++i) {
 915     if (next_object && i->second < *next_object)
 916       continue;
 917     string hash_prefix = get_path_str(i->second);
 918     hash_prefixes->insert(hash_prefix);
 919     objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
 920   }
 921   vector<string> subdirs;
 922   r = list_subdirs(path, &subdirs);
 923   if (r < 0)
 924     return r;
 925
 926   // sort subdirs bitwise (by reversing hex digit nibbles)
 927   std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
 928
 929   // Local to this function, we will convert the prefix strings
 930   // (previously simply the reversed hex digits) to also have each
 931   // digit's nibbles reversed.  This will make the strings sort
 932   // bitwise.
 933   string cur_prefix;
 934   for (vector<string>::const_iterator i = path.begin();
 935        i != path.end();
 936        ++i) {
 937     cur_prefix.append(reverse_hexdigit_bits_string(*i));
 938   }
 939   string next_object_string;
 940   if (next_object)
 941     next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
 942   for (vector<string>::iterator i = subdirs.begin();
 943        i != subdirs.end();
 944        ++i) {
 945     string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
 946     if (next_object) {
 947       if (next_object->is_max())
 948         continue;
 949       if (candidate < next_object_string.substr(0, candidate.size()))
 950         continue;
 951     }
 952     // re-reverse the hex digit nibbles for the caller
 953     hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
 954   }
 955   return 0;
 956 }
 957
 958 int HashIndex::list_by_hash(const vector<string> &path,
 959                             const ghobject_t &end,
 960                             int max_count,
 961                             ghobject_t *next,
 962                             vector<ghobject_t> *out)
 963 {
 964   assert(out);
 965   return list_by_hash_bitwise(path, end, max_count, next, out);
 966 }
 967
 968 int HashIndex::list_by_hash_bitwise(
 969   const vector<string> &path,
 970   const ghobject_t& end,
 971   int max_count,
 972   ghobject_t *next,
 973   vector<ghobject_t> *out)
 974 {
 975   vector<string> next_path = path;
 976   next_path.push_back("");
 977   set<string, CmpHexdigitStringBitwise> hash_prefixes;
 978   set<pair<string, ghobject_t>, CmpPairBitwise> objects;
 979   int r = get_path_contents_by_hash_bitwise(path,
 980                                             next,
 981                                             &hash_prefixes,
 982                                             &objects);
 983   if (r < 0)
 984     return r;
 985   for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
 986        i != hash_prefixes.end();
 987        ++i) {
 988     dout(20) << __func__ << " prefix " << *i << dendl;
 989     set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
 990       make_pair(*i, ghobject_t()));
 991     if (j == objects.end() || j->first != *i) {
 992       *(next_path.rbegin()) = *(i->rbegin());
 993       ghobject_t next_recurse;
 994       if (next)
 995         next_recurse = *next;
 996       r = list_by_hash_bitwise(next_path,
 997                                end,
 998                                max_count,
 999                                &next_recurse,
1000                                out);
1001
1002       if (r < 0)
1003         return r;
1004       if (!next_recurse.is_max()) {
1005         if (next)
1006           *next = next_recurse;
1007         return 0;
1008       }
1009     } else {
1010       while (j != objects.end() && j->first == *i) {
1011         if (max_count > 0 && out->size() == (unsigned)max_count) {
1012           if (next)
1013             *next = j->second;
1014           return 0;
1015         }
1016         if (j->second >= end) {
1017           if (next)
1018             *next = j->second;
1019           return 0;
1020         }
1021         if (!next || j->second >= *next) {
1022           dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
1023           out->push_back(j->second);
1024         }
1025         ++j;
1026       }
1027     }
1028   }
1029   if (next)
1030     *next = ghobject_t::get_max();
1031   return 0;
1032 }
1033
1034