ceph/src/os/filestore/HashIndex.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include "include/compat.h"
  16 #include "include/types.h"
  17 #include "include/buffer.h"
  18 #include "osd/osd_types.h"
  19 #include <errno.h>
  20
  21 #include "HashIndex.h"
  22
  23 #include "common/errno.h"
  24 #include "common/debug.h"
  25 #define dout_context cct
  26 #define dout_subsys ceph_subsys_filestore
  27
  28 const string HashIndex::SUBDIR_ATTR = "contents";
  29 const string HashIndex::SETTINGS_ATTR = "settings";
  30 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
  31
  32 /// hex digit to integer value
  33 int hex_to_int(char c)
  34 {
  35   if (c >= '0' && c <= '9')
  36     return c - '0';
  37   if (c >= 'A' && c <= 'F')
  38     return c - 'A' + 10;
  39   ceph_abort();
  40 }
  41
  42 /// int value to hex digit
  43 char int_to_hex(int v)
  44 {
  45   ceph_assert(v < 16);
  46   if (v < 10)
  47     return '0' + v;
  48   return 'A' + v - 10;
  49 }
  50
  51 /// reverse bits in a nibble (0..15)
  52 int reverse_nibble_bits(int in)
  53 {
  54   ceph_assert(in < 16);
  55   return
  56     ((in & 8) >> 3) |
  57     ((in & 4) >> 1) |
  58     ((in & 2) << 1) |
  59     ((in & 1) << 3);
  60 }
  61
  62 /// reverse nibble bits in a hex digit
  63 char reverse_hexdigit_bits(char c)
  64 {
  65   return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
  66 }
  67
  68 /// reverse nibble bits in a hex string
  69 string reverse_hexdigit_bits_string(string s)
  70 {
  71   for (unsigned i=0; i<s.size(); ++i)
  72     s[i] = reverse_hexdigit_bits(s[i]);
  73   return s;
  74 }
  75
  76 /// compare hex digit (as length 1 string) bitwise
  77 bool cmp_hexdigit_bitwise(const string& l, const string& r)
  78 {
  79   ceph_assert(l.length() == 1 && r.length() == 1);
  80   int lv = hex_to_int(l[0]);
  81   int rv = hex_to_int(r[0]);
  82   ceph_assert(lv < 16);
  83   ceph_assert(rv < 16);
  84   return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
  85 }
  86
  87 /// compare hex digit string bitwise
  88 bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
  89 {
  90   string ll = reverse_hexdigit_bits_string(l);
  91   string rr = reverse_hexdigit_bits_string(r);
  92   return ll < rr;
  93 }
  94
  95 int HashIndex::cleanup() {
  96   bufferlist bl;
  97   int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
  98   if (r < 0) {
  99     // No in progress operations!
 100     return 0;
 101   }
 102   auto i = bl.cbegin();
 103   InProgressOp in_progress(i);
 104   subdir_info_s info;
 105   r = get_info(in_progress.path, &info);
 106   if (r == -ENOENT) {
 107     return end_split_or_merge(in_progress.path);
 108   } else if (r < 0) {
 109     return r;
 110   }
 111
 112   if (in_progress.is_split())
 113     return complete_split(in_progress.path, info);
 114   else if (in_progress.is_merge())
 115     return complete_merge(in_progress.path, info);
 116   else if (in_progress.is_col_split()) {
 117     for (vector<string>::iterator i = in_progress.path.begin();
 118          i != in_progress.path.end();
 119          ++i) {
 120       vector<string> path(in_progress.path.begin(), i);
 121       int r = reset_attr(path);
 122       if (r < 0)
 123         return r;
 124     }
 125     return 0;
 126   }
 127   else
 128     return -EINVAL;
 129 }
 130
 131 int HashIndex::reset_attr(
 132   const vector<string> &path)
 133 {
 134   int exists = 0;
 135   int r = path_exists(path, &exists);
 136   if (r < 0)
 137     return r;
 138   if (!exists)
 139     return 0;
 140   map<string, ghobject_t> objects;
 141   vector<string> subdirs;
 142   r = list_objects(path, 0, 0, &objects);
 143   if (r < 0)
 144     return r;
 145   r = list_subdirs(path, &subdirs);
 146   if (r < 0)
 147     return r;
 148
 149   subdir_info_s info;
 150   info.hash_level = path.size();
 151   info.objs = objects.size();
 152   info.subdirs = subdirs.size();
 153   return set_info(path, info);
 154 }
 155
 156 int HashIndex::col_split_level(
 157   HashIndex &from,
 158   HashIndex &to,
 159   const vector<string> &path,
 160   uint32_t inbits,
 161   uint32_t match,
 162   unsigned *mkdirred)
 163 {
 164   /* For each subdir, move, recurse, or ignore based on comparing the low order
 165    * bits of the hash represented by the subdir path with inbits, match passed
 166    * in.
 167    */
 168   vector<string> subdirs;
 169   int r = from.list_subdirs(path, &subdirs);
 170   if (r < 0)
 171     return r;
 172   map<string, ghobject_t> objects;
 173   r = from.list_objects(path, 0, 0, &objects);
 174   if (r < 0)
 175     return r;
 176
 177   set<string> to_move;
 178   for (vector<string>::iterator i = subdirs.begin();
 179        i != subdirs.end();
 180        ++i) {
 181     uint32_t bits = 0;
 182     uint32_t hash = 0;
 183     vector<string> sub_path(path.begin(), path.end());
 184     sub_path.push_back(*i);
 185     path_to_hobject_hash_prefix(sub_path, &bits, &hash);
 186     if (bits < inbits) {
 187       if (hobject_t::match_hash(hash, bits, match)) {
 188         r = col_split_level(
 189           from,
 190           to,
 191           sub_path,
 192           inbits,
 193           match,
 194           mkdirred);
 195         if (r < 0)
 196           return r;
 197         if (*mkdirred > path.size())
 198           *mkdirred = path.size();
 199       } // else, skip, doesn't need to be moved or recursed into
 200     } else {
 201       if (hobject_t::match_hash(hash, inbits, match)) {
 202         to_move.insert(*i);
 203       }
 204     } // else, skip, doesn't need to be moved or recursed into
 205   }
 206
 207   /* Then, do the same for each object */
 208   map<string, ghobject_t> objs_to_move;
 209   for (map<string, ghobject_t>::iterator i = objects.begin();
 210        i != objects.end();
 211        ++i) {
 212     if (i->second.match(inbits, match)) {
 213       objs_to_move.insert(*i);
 214     }
 215   }
 216
 217   if (objs_to_move.empty() && to_move.empty())
 218     return 0;
 219
 220   // Make parent directories as needed
 221   while (*mkdirred < path.size()) {
 222     ++*mkdirred;
 223     int exists = 0;
 224     vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
 225     r = to.path_exists(creating_path, &exists);
 226     if (r < 0)
 227       return r;
 228     if (exists)
 229       continue;
 230     subdir_info_s info;
 231     info.objs = 0;
 232     info.subdirs = 0;
 233     info.hash_level = creating_path.size();
 234     if (*mkdirred < path.size() - 1)
 235       info.subdirs = 1;
 236     r = to.start_col_split(creating_path);
 237     if (r < 0)
 238       return r;
 239     r = to.create_path(creating_path);
 240     if (r < 0)
 241       return r;
 242     r = to.set_info(creating_path, info);
 243     if (r < 0)
 244       return r;
 245     r = to.end_split_or_merge(creating_path);
 246     if (r < 0)
 247       return r;
 248   }
 249
 250   subdir_info_s from_info;
 251   subdir_info_s to_info;
 252   r = from.get_info(path, &from_info);
 253   if (r < 0)
 254     return r;
 255   r = to.get_info(path, &to_info);
 256   if (r < 0)
 257     return r;
 258
 259   from.start_col_split(path);
 260   to.start_col_split(path);
 261
 262   // Do subdir moves
 263   for (set<string>::iterator i = to_move.begin();
 264        i != to_move.end();
 265        ++i) {
 266     from_info.subdirs--;
 267     to_info.subdirs++;
 268     r = move_subdir(from, to, path, *i);
 269     if (r < 0)
 270       return r;
 271   }
 272
 273   for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
 274        i != objs_to_move.end();
 275        ++i) {
 276     from_info.objs--;
 277     to_info.objs++;
 278     r = move_object(from, to, path, *i);
 279     if (r < 0)
 280       return r;
 281   }
 282
 283
 284   r = to.set_info(path, to_info);
 285   if (r < 0)
 286     return r;
 287   r = from.set_info(path, from_info);
 288   if (r < 0)
 289     return r;
 290   from.end_split_or_merge(path);
 291   to.end_split_or_merge(path);
 292   return 0;
 293 }
 294
 295 int HashIndex::_merge(
 296   uint32_t bits,
 297   CollectionIndex* dest) {
 298   dout(20) << __func__ << " bits " << bits << dendl;
 299   ceph_assert(collection_version() == dest->collection_version());
 300
 301   vector<string> emptypath;
 302
 303   // pre-split to common/target level so that any shared prefix DIR_?
 304   // directories already exist at the destination.  Since each
 305   // directory is a nibble (4 bits),
 306   unsigned shared = bits / 4;
 307   dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
 308   if (shared) {
 309     split_dirs(emptypath, shared);
 310     ((HashIndex*)dest)->split_dirs(emptypath, shared);
 311   }
 312
 313   // now merge the contents
 314   _merge_dirs(*this, *(HashIndex*)dest, emptypath);
 315
 316   return 0;
 317 }
 318
 319 int HashIndex::_merge_dirs(
 320   HashIndex& from,
 321   HashIndex& to,
 322   const vector<string>& path)
 323 {
 324   dout(20) << __func__ << " path " << path << dendl;
 325   int r;
 326
 327   vector<string> src_subs, dst_subs;
 328   r = from.list_subdirs(path, &src_subs);
 329   if (r < 0) {
 330     lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 331                                                   << " r " << r << " from "
 332                                                   << "from.list_subdirs"
 333                                                   << dendl;
 334     return r;
 335   }
 336   r = to.list_subdirs(path, &dst_subs);
 337   if (r < 0) {
 338     lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 339                                                   << " r " << r << " from "
 340                                                   << "to.list_subdirs"
 341                                                   << dendl;
 342     return r;
 343   }
 344
 345   for (auto& i : src_subs) {
 346     if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
 347       // move it
 348       r = move_subdir(from, to, path, i);
 349       if (r < 0) {
 350         lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 351                                                       << " r " << r << " from "
 352                                                       << "move_subdir(...,"
 353                                                       << path << "," << i << ")"
 354                                                       << dendl;
 355         return r;
 356       }
 357     } else {
 358       // common, recurse!
 359       vector<string> nested = path;
 360       nested.push_back(i);
 361       r = _merge_dirs(from, to, nested);
 362       if (r < 0) {
 363         lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 364                                                       << " r " << r << " from "
 365                                                       << "rec _merge_dirs"
 366                                                       << dendl;
 367         return r;
 368       }
 369
 370       // now remove it
 371       r = remove_path(nested);
 372       if (r < 0) {
 373         lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 374                                                       << " r " << r << " from "
 375                                                       << "remove_path "
 376                                                       << nested
 377                                                       << dendl;
 378         return r;
 379       }
 380     }
 381   }
 382
 383   // objects
 384   map<string, ghobject_t> objects;
 385   r = from.list_objects(path, 0, 0, &objects);
 386   if (r < 0) {
 387     lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 388                                                   << " r " << r << " from "
 389                                                   << "from.list_objects"
 390                                                   << dendl;
 391     return r;
 392   }
 393
 394   for (auto& i : objects) {
 395     r = move_object(from, to, path, i);
 396     if (r < 0) {
 397       lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 398                                                     << " r " << r << " from "
 399                                                     << "move_object(...,"
 400                                                     << path << "," << i << ")"
 401                                                     << dendl;
 402       return r;
 403     }
 404   }
 405
 406   return 0;
 407 }
 408
 409
 410 int HashIndex::_split(
 411   uint32_t match,
 412   uint32_t bits,
 413   CollectionIndex* dest) {
 414   ceph_assert(collection_version() == dest->collection_version());
 415   unsigned mkdirred = 0;
 416
 417   return col_split_level(
 418     *this,
 419     *static_cast<HashIndex*>(dest),
 420     vector<string>(),
 421     bits,
 422     match,
 423     &mkdirred);
 424 }
 425
 426 int HashIndex::split_dirs(const vector<string> &path, int target_level) {
 427   dout(20) << __func__ << " " << path << " target level: "
 428            << target_level << dendl;
 429   subdir_info_s info;
 430   int r = get_info(path, &info);
 431   if (r < 0) {
 432     dout(10) << "error looking up info for " << path << ": "
 433              << cpp_strerror(r) << dendl;
 434     return r;
 435   }
 436
 437   if (must_split(info, target_level)) {
 438     dout(1) << __func__ << " " << path << " has " << info.objs
 439             << " objects, " << info.hash_level
 440             << " level, starting split in pg " << coll() << "." << dendl;
 441     r = initiate_split(path, info);
 442     if (r < 0) {
 443       dout(10) << "error initiating split on " << path << ": "
 444                << cpp_strerror(r) << dendl;
 445       return r;
 446     }
 447
 448     r = complete_split(path, info);
 449     dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
 450             << dendl;
 451     if (r < 0) {
 452       dout(10) << "error completing split on " << path << ": "
 453                << cpp_strerror(r) << dendl;
 454       return r;
 455     }
 456   }
 457
 458   vector<string> subdirs;
 459   r = list_subdirs(path, &subdirs);
 460   if (r < 0) {
 461     dout(10) << "error listing subdirs of " << path << ": "
 462              << cpp_strerror(r) << dendl;
 463     return r;
 464   }
 465   for (vector<string>::const_iterator it = subdirs.begin();
 466        it != subdirs.end(); ++it) {
 467     vector<string> subdir_path(path);
 468     subdir_path.push_back(*it);
 469     r = split_dirs(subdir_path, target_level);
 470     if (r < 0) {
 471       return r;
 472     }
 473   }
 474
 475   return r;
 476 }
 477
 478 int HashIndex::apply_layout_settings(int target_level) {
 479   vector<string> path;
 480   dout(10) << __func__ << " split multiple = " << split_multiplier
 481            << " merge threshold = " << merge_threshold
 482            << " split rand factor = " << cct->_conf->filestore_split_rand_factor
 483            << " target level = " << target_level
 484            << dendl;
 485   int r = write_settings();
 486   if (r < 0)
 487     return r;
 488   return split_dirs(path, target_level);
 489 }
 490
 491 int HashIndex::_init() {
 492   subdir_info_s info;
 493   vector<string> path;
 494   int r = set_info(path, info);
 495   if (r < 0)
 496     return r;
 497   return write_settings();
 498 }
 499
 500 int HashIndex::write_settings() {
 501   if (cct->_conf->filestore_split_rand_factor > 0) {
 502     settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
 503   } else {
 504     settings.split_rand_factor = 0;
 505   }
 506   vector<string> path;
 507   bufferlist bl;
 508   settings.encode(bl);
 509   return add_attr_path(path, SETTINGS_ATTR, bl);
 510 }
 511
 512 int HashIndex::read_settings() {
 513   vector<string> path;
 514   bufferlist bl;
 515   int r = get_attr_path(path, SETTINGS_ATTR, bl);
 516   if (r == -ENODATA)
 517     return 0;
 518   if (r < 0) {
 519     derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
 520     return r;
 521   }
 522   auto it = bl.cbegin();
 523   settings.decode(it);
 524   dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
 525   return 0;
 526 }
 527
 528 /* LFNIndex virtual method implementations */
 529 int HashIndex::_created(const vector<string> &path,
 530                         const ghobject_t &oid,
 531                         const string &mangled_name) {
 532   subdir_info_s info;
 533   int r;
 534   r = get_info(path, &info);
 535   if (r < 0)
 536     return r;
 537   info.objs++;
 538   r = set_info(path, info);
 539   if (r < 0)
 540     return r;
 541
 542   if (must_split(info)) {
 543     dout(1) << __func__ << " " << path << " has " << info.objs
 544             << " objects, starting split in pg " << coll() << "." << dendl;
 545     int r = initiate_split(path, info);
 546     if (r < 0)
 547       return r;
 548     r = complete_split(path, info);
 549     dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
 550             << dendl;
 551     return r;
 552   } else {
 553     return 0;
 554   }
 555 }
 556
 557 int HashIndex::_remove(const vector<string> &path,
 558                        const ghobject_t &oid,
 559                        const string &mangled_name) {
 560   int r;
 561   r = remove_object(path, oid);
 562   if (r < 0)
 563     return r;
 564   subdir_info_s info;
 565   r = get_info(path, &info);
 566   if (r < 0)
 567     return r;
 568   info.objs--;
 569   r = set_info(path, info);
 570   if (r < 0)
 571     return r;
 572   if (must_merge(info)) {
 573     r = initiate_merge(path, info);
 574     if (r < 0)
 575       return r;
 576     return complete_merge(path, info);
 577   } else {
 578     return 0;
 579   }
 580 }
 581
 582 int HashIndex::_lookup(const ghobject_t &oid,
 583                        vector<string> *path,
 584                        string *mangled_name,
 585                        int *hardlink) {
 586   vector<string> path_comp;
 587   get_path_components(oid, &path_comp);
 588   vector<string>::iterator next = path_comp.begin();
 589   int exists;
 590   while (1) {
 591     int r = path_exists(*path, &exists);
 592     if (r < 0)
 593       return r;
 594     if (!exists) {
 595       if (path->empty())
 596         return -ENOENT;
 597       path->pop_back();
 598       break;
 599     }
 600     if (next == path_comp.end())
 601       break;
 602     path->push_back(*(next++));
 603   }
 604   return get_mangled_name(*path, oid, mangled_name, hardlink);
 605 }
 606
 607 int HashIndex::_collection_list_partial(const ghobject_t &start,
 608                                         const ghobject_t &end,
 609                                         int max_count,
 610                                         vector<ghobject_t> *ls,
 611                                         ghobject_t *next) {
 612   vector<string> path;
 613   ghobject_t _next;
 614   if (!next)
 615     next = &_next;
 616   *next = start;
 617   dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
 618   return list_by_hash(path, end, max_count, next, ls);
 619 }
 620
 621 int HashIndex::prep_delete() {
 622   return recursive_remove(vector<string>());
 623 }
 624
 625 int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
 626   int ret;
 627   vector<string> path;
 628   subdir_info_s root_info;
 629   // Make sure there is neither objects nor sub-folders
 630   // in this collection
 631   ret = get_info(path, &root_info);
 632   if (ret < 0)
 633     return ret;
 634
 635   // Do the folder splitting first
 636   ret = pre_split_folder(pg_num, expected_num_objs);
 637   if (ret < 0)
 638     return ret;
 639   // Initialize the folder info starting from root
 640   return init_split_folder(path, 0);
 641 }
 642
 643 int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
 644 {
 645   // If folder merging is enabled (by setting the threshold positive),
 646   // no need to split
 647   if (merge_threshold > 0)
 648     return 0;
 649   const coll_t c = coll();
 650   // Do not split if the expected number of objects in this collection is zero (by default)
 651   if (expected_num_objs == 0)
 652     return 0;
 653
 654   // Calculate the number of leaf folders (which actually store files)
 655   // need to be created
 656   const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
 657   uint64_t leavies = expected_num_objs / objs_per_folder ;
 658   // No need to split
 659   if (leavies == 0 || expected_num_objs == objs_per_folder)
 660     return 0;
 661
 662   spg_t spgid;
 663   if (!c.is_pg_prefix(&spgid))
 664     return -EINVAL;
 665   const ps_t ps = spgid.pgid.ps();
 666
 667   // the most significant bits of pg_num
 668   const int pg_num_bits = calc_num_bits(pg_num - 1);
 669   ps_t tmp_id = ps;
 670   // calculate the number of levels we only create one sub folder
 671   int num = pg_num_bits / 4;
 672   // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
 673   // so that splitting starts at level 3
 674   if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
 675     --num;
 676   }
 677
 678   int ret;
 679   // Start with creation that only has one subfolder
 680   vector<string> paths;
 681   int dump_num = num;
 682   while (num-- > 0) {
 683     ps_t v = tmp_id & 0x0000000f;
 684     paths.push_back(to_hex(v));
 685     ret = create_path(paths);
 686     if (ret < 0 && ret != -EEXIST)
 687       return ret;
 688     tmp_id = tmp_id >> 4;
 689   }
 690
 691   // Starting from here, we can split by creating multiple subfolders
 692   const int left_bits = pg_num_bits - dump_num * 4;
 693   // this variable denotes how many bits (for this level) that can be
 694   // used for sub folder splitting
 695   int split_bits = 4 - left_bits;
 696   // the below logic is inspired by rados.h#ceph_stable_mod,
 697   // it basically determines how many sub-folders should we
 698   // create for splitting
 699   ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
 700   if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
 701     ++split_bits;
 702   }
 703   const uint32_t subs = (1 << split_bits);
 704   // Calculate how many levels we create starting from here
 705   int level  = 0;
 706   int level_limit = MAX_HASH_LEVEL - dump_num - 1;
 707   uint64_t actual_leaves = subs;
 708   while (actual_leaves < leavies && level < level_limit) {
 709     ++level;
 710     actual_leaves <<= 4;
 711   }
 712   for (uint32_t i = 0; i < subs; ++i) {
 713     ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
 714     int v = tmp_id | (i << ((4 - split_bits) % 4));
 715     paths.push_back(to_hex(v));
 716     ret = create_path(paths);
 717     if (ret < 0 && ret != -EEXIST)
 718       return ret;
 719     ret = recursive_create_path(paths, level);
 720     if (ret < 0)
 721       return ret;
 722     paths.pop_back();
 723   }
 724   return 0;
 725 }
 726
 727 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
 728 {
 729   // Get the number of sub directories for the current path
 730   vector<string> subdirs;
 731   int ret = list_subdirs(path, &subdirs);
 732   if (ret < 0)
 733     return ret;
 734   subdir_info_s info;
 735   info.subdirs = subdirs.size();
 736   info.hash_level = hash_level;
 737   ret = set_info(path, info);
 738   if (ret < 0)
 739     return ret;
 740   ret = fsync_dir(path);
 741   if (ret < 0)
 742     return ret;
 743
 744   // Do the same for subdirs
 745   vector<string>::const_iterator iter;
 746   for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
 747     path.push_back(*iter);
 748     ret = init_split_folder(path, hash_level + 1);
 749     if (ret < 0)
 750       return ret;
 751     path.pop_back();
 752   }
 753   return 0;
 754 }
 755
 756 int HashIndex::recursive_create_path(vector<string>& path, int level)
 757 {
 758   if (level == 0)
 759     return 0;
 760   for (int i = 0; i < 16; ++i) {
 761     path.push_back(to_hex(i));
 762     int ret = create_path(path);
 763     if (ret < 0 && ret != -EEXIST)
 764       return ret;
 765     ret = recursive_create_path(path, level - 1);
 766     if (ret < 0)
 767       return ret;
 768     path.pop_back();
 769   }
 770   return 0;
 771 }
 772
 773 int HashIndex::recursive_remove(const vector<string> &path) {
 774   return _recursive_remove(path, true);
 775 }
 776
 777 int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
 778   vector<string> subdirs;
 779   dout(20) << __func__ << " path=" << path << dendl;
 780   int r = list_subdirs(path, &subdirs);
 781   if (r < 0)
 782     return r;
 783   map<string, ghobject_t> objects;
 784   r = list_objects(path, 0, 0, &objects);
 785   if (r < 0)
 786     return r;
 787   if (!objects.empty())
 788     return -ENOTEMPTY;
 789   vector<string> subdir(path);
 790   for (vector<string>::iterator i = subdirs.begin();
 791        i != subdirs.end();
 792        ++i) {
 793     subdir.push_back(*i);
 794     r = _recursive_remove(subdir, false);
 795     if (r < 0)
 796       return r;
 797     subdir.pop_back();
 798   }
 799   if (top)
 800     return 0;
 801   else
 802     return remove_path(path);
 803 }
 804
 805 int HashIndex::start_col_split(const vector<string> &path) {
 806   bufferlist bl;
 807   InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
 808   op_tag.encode(bl);
 809   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 810   if (r < 0)
 811     return r;
 812   return fsync_dir(vector<string>());
 813 }
 814
 815 int HashIndex::start_split(const vector<string> &path) {
 816   bufferlist bl;
 817   InProgressOp op_tag(InProgressOp::SPLIT, path);
 818   op_tag.encode(bl);
 819   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 820   if (r < 0)
 821     return r;
 822   return fsync_dir(vector<string>());
 823 }
 824
 825 int HashIndex::start_merge(const vector<string> &path) {
 826   bufferlist bl;
 827   InProgressOp op_tag(InProgressOp::MERGE, path);
 828   op_tag.encode(bl);
 829   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 830   if (r < 0)
 831     return r;
 832   return fsync_dir(vector<string>());
 833 }
 834
 835 int HashIndex::end_split_or_merge(const vector<string> &path) {
 836   return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
 837 }
 838
 839 int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
 840   bufferlist buf;
 841   int r = get_attr_path(path, SUBDIR_ATTR, buf);
 842   if (r < 0)
 843     return r;
 844   auto bufiter = buf.cbegin();
 845   info->decode(bufiter);
 846   ceph_assert(path.size() == (unsigned)info->hash_level);
 847   return 0;
 848 }
 849
 850 int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
 851   bufferlist buf;
 852   ceph_assert(path.size() == (unsigned)info.hash_level);
 853   info.encode(buf);
 854   return add_attr_path(path, SUBDIR_ATTR, buf);
 855 }
 856
 857 bool HashIndex::must_merge(const subdir_info_s &info) {
 858   return (info.hash_level > 0 &&
 859           merge_threshold > 0 &&
 860           info.objs < (unsigned)merge_threshold &&
 861           info.subdirs == 0);
 862 }
 863
 864 bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
 865   // target_level is used for ceph-objectstore-tool to split dirs offline.
 866   // if it is set (defalult is 0) and current hash level < target_level,
 867   // this dir would be split no matters how many objects it has.
 868   return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
 869          ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
 870          (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
 871 }
 872
 873 int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
 874   return start_merge(path);
 875 }
 876
 877 int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
 878   vector<string> dst = path;
 879   dst.pop_back();
 880   subdir_info_s dstinfo;
 881   int r, exists;
 882   r = path_exists(path, &exists);
 883   if (r < 0)
 884     return r;
 885   r = get_info(dst, &dstinfo);
 886   if (r < 0)
 887     return r;
 888   if (exists) {
 889     r = move_objects(path, dst);
 890     if (r < 0)
 891       return r;
 892     r = reset_attr(dst);
 893     if (r < 0)
 894       return r;
 895     r = remove_path(path);
 896     if (r < 0)
 897       return r;
 898   }
 899   if (must_merge(dstinfo)) {
 900     r = initiate_merge(dst, dstinfo);
 901     if (r < 0)
 902       return r;
 903     r = fsync_dir(dst);
 904     if (r < 0)
 905       return r;
 906     return complete_merge(dst, dstinfo);
 907   }
 908   r = fsync_dir(dst);
 909   if (r < 0)
 910     return r;
 911   return end_split_or_merge(path);
 912 }
 913
 914 int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
 915   return start_split(path);
 916 }
 917
 918 int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
 919   int level = info.hash_level;
 920   map<string, ghobject_t> objects;
 921   vector<string> dst = path;
 922   int r;
 923   dst.push_back("");
 924   r = list_objects(path, 0, 0, &objects);
 925   if (r < 0)
 926     return r;
 927   vector<string> subdirs_vec;
 928   r = list_subdirs(path, &subdirs_vec);
 929   if (r < 0)
 930     return r;
 931   set<string> subdirs;
 932   subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
 933   map<string, map<string, ghobject_t> > mapped;
 934   map<string, ghobject_t> moved;
 935   int num_moved = 0;
 936   for (map<string, ghobject_t>::iterator i = objects.begin();
 937        i != objects.end();
 938        ++i) {
 939     vector<string> new_path;
 940     get_path_components(i->second, &new_path);
 941     mapped[new_path[level]][i->first] = i->second;
 942   }
 943   for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
 944        i != mapped.end();
 945        ) {
 946     dst[level] = i->first;
 947     /* If the info already exists, it must be correct,
 948      * we may be picking up a partially finished split */
 949     subdir_info_s temp;
 950     // subdir has already been fully copied
 951     if (subdirs.count(i->first) && !get_info(dst, &temp)) {
 952       for (map<string, ghobject_t>::iterator j = i->second.begin();
 953            j != i->second.end();
 954            ++j) {
 955         moved[j->first] = j->second;
 956         num_moved++;
 957         objects.erase(j->first);
 958       }
 959       ++i;
 960       continue;
 961     }
 962
 963     subdir_info_s info_new;
 964     info_new.objs = i->second.size();
 965     info_new.subdirs = 0;
 966     info_new.hash_level = level + 1;
 967     if (must_merge(info_new) && !subdirs.count(i->first)) {
 968       mapped.erase(i++);
 969       continue;
 970     }
 971
 972     // Subdir doesn't yet exist
 973     if (!subdirs.count(i->first)) {
 974       info.subdirs += 1;
 975       r = create_path(dst);
 976       if (r < 0)
 977         return r;
 978     } // else subdir has been created but only partially copied
 979
 980     for (map<string, ghobject_t>::iterator j = i->second.begin();
 981          j != i->second.end();
 982          ++j) {
 983       moved[j->first] = j->second;
 984       num_moved++;
 985       objects.erase(j->first);
 986       r = link_object(path, dst, j->second, j->first);
 987       // May be a partially finished split
 988       if (r < 0 && r != -EEXIST) {
 989         return r;
 990       }
 991     }
 992
 993     r = fsync_dir(dst);
 994     if (r < 0)
 995       return r;
 996
 997     // Presence of info must imply that all objects have been copied
 998     r = set_info(dst, info_new);
 999     if (r < 0)
1000       return r;
1001
1002     r = fsync_dir(dst);
1003     if (r < 0)
1004       return r;
1005
1006     ++i;
1007   }
1008   r = remove_objects(path, moved, &objects);
1009   if (r < 0)
1010     return r;
1011   info.objs = objects.size();
1012   r = reset_attr(path);
1013   if (r < 0)
1014     return r;
1015   r = fsync_dir(path);
1016   if (r < 0)
1017     return r;
1018   return end_split_or_merge(path);
1019 }
1020
1021 void HashIndex::get_path_components(const ghobject_t &oid,
1022                                     vector<string> *path) {
1023   char buf[MAX_HASH_LEVEL + 1];
1024   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
1025
1026   // Path components are the hex characters of oid.hobj.hash, least
1027   // significant first
1028   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1029     path->push_back(string(&buf[i], 1));
1030   }
1031 }
1032
1033 string HashIndex::get_hash_str(uint32_t hash) {
1034   char buf[MAX_HASH_LEVEL + 1];
1035   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
1036   string retval;
1037   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1038     retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
1039   }
1040   return retval;
1041 }
1042
1043 string HashIndex::get_path_str(const ghobject_t &oid) {
1044   ceph_assert(!oid.is_max());
1045   return get_hash_str(oid.hobj.get_hash());
1046 }
1047
1048 uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
1049   while (prefix.size() < sizeof(uint32_t) * 2) {
1050     prefix.push_back('0');
1051   }
1052   uint32_t hash;
1053   sscanf(prefix.c_str(), "%x", &hash);
1054   // nibble reverse
1055   hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
1056   hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
1057   hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
1058   return hash;
1059 }
1060
1061 int HashIndex::get_path_contents_by_hash_bitwise(
1062   const vector<string> &path,
1063   const ghobject_t *next_object,
1064   set<string, CmpHexdigitStringBitwise> *hash_prefixes,
1065   set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
1066 {
1067   map<string, ghobject_t> rev_objects;
1068   int r;
1069   r = list_objects(path, 0, 0, &rev_objects);
1070   if (r < 0)
1071     return r;
1072   // bitwise sort
1073   for (map<string, ghobject_t>::iterator i = rev_objects.begin();
1074        i != rev_objects.end();
1075        ++i) {
1076     if (next_object && i->second < *next_object)
1077       continue;
1078     string hash_prefix = get_path_str(i->second);
1079     hash_prefixes->insert(hash_prefix);
1080     objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
1081   }
1082   vector<string> subdirs;
1083   r = list_subdirs(path, &subdirs);
1084   if (r < 0)
1085     return r;
1086
1087   // sort subdirs bitwise (by reversing hex digit nibbles)
1088   std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
1089
1090   // Local to this function, we will convert the prefix strings
1091   // (previously simply the reversed hex digits) to also have each
1092   // digit's nibbles reversed.  This will make the strings sort
1093   // bitwise.
1094   string cur_prefix;
1095   for (vector<string>::const_iterator i = path.begin();
1096        i != path.end();
1097        ++i) {
1098     cur_prefix.append(reverse_hexdigit_bits_string(*i));
1099   }
1100   string next_object_string;
1101   if (next_object)
1102     next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
1103   for (vector<string>::iterator i = subdirs.begin();
1104        i != subdirs.end();
1105        ++i) {
1106     string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
1107     if (next_object) {
1108       if (next_object->is_max())
1109         continue;
1110       if (candidate < next_object_string.substr(0, candidate.size()))
1111         continue;
1112     }
1113     // re-reverse the hex digit nibbles for the caller
1114     hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
1115   }
1116   return 0;
1117 }
1118
1119 int HashIndex::list_by_hash(const vector<string> &path,
1120                             const ghobject_t &end,
1121                             int max_count,
1122                             ghobject_t *next,
1123                             vector<ghobject_t> *out)
1124 {
1125   ceph_assert(out);
1126   return list_by_hash_bitwise(path, end, max_count, next, out);
1127 }
1128
1129 int HashIndex::list_by_hash_bitwise(
1130   const vector<string> &path,
1131   const ghobject_t& end,
1132   int max_count,
1133   ghobject_t *next,
1134   vector<ghobject_t> *out)
1135 {
1136   vector<string> next_path = path;
1137   next_path.push_back("");
1138   set<string, CmpHexdigitStringBitwise> hash_prefixes;
1139   set<pair<string, ghobject_t>, CmpPairBitwise> objects;
1140   int r = get_path_contents_by_hash_bitwise(path,
1141                                             next,
1142                                             &hash_prefixes,
1143                                             &objects);
1144   if (r < 0)
1145     return r;
1146   for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
1147        i != hash_prefixes.end();
1148        ++i) {
1149     dout(20) << __func__ << " prefix " << *i << dendl;
1150     set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
1151       make_pair(*i, ghobject_t()));
1152     if (j == objects.end() || j->first != *i) {
1153       *(next_path.rbegin()) = *(i->rbegin());
1154       ghobject_t next_recurse;
1155       if (next)
1156         next_recurse = *next;
1157       r = list_by_hash_bitwise(next_path,
1158                                end,
1159                                max_count,
1160                                &next_recurse,
1161                                out);
1162
1163       if (r < 0)
1164         return r;
1165       if (!next_recurse.is_max()) {
1166         if (next)
1167           *next = next_recurse;
1168         return 0;
1169       }
1170     } else {
1171       while (j != objects.end() && j->first == *i) {
1172         if (max_count > 0 && out->size() == (unsigned)max_count) {
1173           if (next)
1174             *next = j->second;
1175           return 0;
1176         }
1177         if (j->second >= end) {
1178           if (next)
1179             *next = j->second;
1180           return 0;
1181         }
1182         if (!next || j->second >= *next) {
1183           dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
1184           out->push_back(j->second);
1185         }
1186         ++j;
1187       }
1188     }
1189   }
1190   if (next)
1191     *next = ghobject_t::get_max();
1192   return 0;
1193 }
1194
1195