ceph/src/os/filestore/HashIndex.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include "include/compat.h"
  16 #include "include/types.h"
  17 #include "include/buffer.h"
  18 #include "osd/osd_types.h"
  19 #include <errno.h>
  20
  21 #include "HashIndex.h"
  22
  23 #include "common/errno.h"
  24 #include "common/debug.h"
  25 #define dout_context cct
  26 #define dout_subsys ceph_subsys_filestore
  27
  28 using std::map;
  29 using std::pair;
  30 using std::set;
  31 using std::string;
  32 using std::vector;
  33
  34 using ceph::bufferptr;
  35 using ceph::bufferlist;
  36
  37 const string HashIndex::SUBDIR_ATTR = "contents";
  38 const string HashIndex::SETTINGS_ATTR = "settings";
  39 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
  40
  41 /// hex digit to integer value
  42 int hex_to_int(char c)
  43 {
  44   if (c >= '0' && c <= '9')
  45     return c - '0';
  46   if (c >= 'A' && c <= 'F')
  47     return c - 'A' + 10;
  48   ceph_abort();
  49 }
  50
  51 /// int value to hex digit
  52 char int_to_hex(int v)
  53 {
  54   ceph_assert(v < 16);
  55   if (v < 10)
  56     return '0' + v;
  57   return 'A' + v - 10;
  58 }
  59
  60 /// reverse bits in a nibble (0..15)
  61 int reverse_nibble_bits(int in)
  62 {
  63   ceph_assert(in < 16);
  64   return
  65     ((in & 8) >> 3) |
  66     ((in & 4) >> 1) |
  67     ((in & 2) << 1) |
  68     ((in & 1) << 3);
  69 }
  70
  71 /// reverse nibble bits in a hex digit
  72 char reverse_hexdigit_bits(char c)
  73 {
  74   return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
  75 }
  76
  77 /// reverse nibble bits in a hex string
  78 string reverse_hexdigit_bits_string(string s)
  79 {
  80   for (unsigned i=0; i<s.size(); ++i)
  81     s[i] = reverse_hexdigit_bits(s[i]);
  82   return s;
  83 }
  84
  85 /// compare hex digit (as length 1 string) bitwise
  86 bool cmp_hexdigit_bitwise(const string& l, const string& r)
  87 {
  88   ceph_assert(l.length() == 1 && r.length() == 1);
  89   int lv = hex_to_int(l[0]);
  90   int rv = hex_to_int(r[0]);
  91   ceph_assert(lv < 16);
  92   ceph_assert(rv < 16);
  93   return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
  94 }
  95
  96 /// compare hex digit string bitwise
  97 bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
  98 {
  99   string ll = reverse_hexdigit_bits_string(l);
 100   string rr = reverse_hexdigit_bits_string(r);
 101   return ll < rr;
 102 }
 103
 104 int HashIndex::cleanup() {
 105   bufferlist bl;
 106   int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 107   if (r < 0) {
 108     // No in progress operations!
 109     return 0;
 110   }
 111   auto i = bl.cbegin();
 112   InProgressOp in_progress(i);
 113   subdir_info_s info;
 114   r = get_info(in_progress.path, &info);
 115   if (r == -ENOENT) {
 116     return end_split_or_merge(in_progress.path);
 117   } else if (r < 0) {
 118     return r;
 119   }
 120
 121   if (in_progress.is_split())
 122     return complete_split(in_progress.path, info);
 123   else if (in_progress.is_merge())
 124     return complete_merge(in_progress.path, info);
 125   else if (in_progress.is_col_split()) {
 126     for (vector<string>::iterator i = in_progress.path.begin();
 127          i != in_progress.path.end();
 128          ++i) {
 129       vector<string> path(in_progress.path.begin(), i);
 130       int r = reset_attr(path);
 131       if (r < 0)
 132         return r;
 133     }
 134     return 0;
 135   }
 136   else
 137     return -EINVAL;
 138 }
 139
 140 int HashIndex::reset_attr(
 141   const vector<string> &path)
 142 {
 143   int exists = 0;
 144   int r = path_exists(path, &exists);
 145   if (r < 0)
 146     return r;
 147   if (!exists)
 148     return 0;
 149   map<string, ghobject_t> objects;
 150   vector<string> subdirs;
 151   r = list_objects(path, 0, 0, &objects);
 152   if (r < 0)
 153     return r;
 154   r = list_subdirs(path, &subdirs);
 155   if (r < 0)
 156     return r;
 157
 158   subdir_info_s info;
 159   info.hash_level = path.size();
 160   info.objs = objects.size();
 161   info.subdirs = subdirs.size();
 162   return set_info(path, info);
 163 }
 164
 165 int HashIndex::col_split_level(
 166   HashIndex &from,
 167   HashIndex &to,
 168   const vector<string> &path,
 169   uint32_t inbits,
 170   uint32_t match,
 171   unsigned *mkdirred)
 172 {
 173   /* For each subdir, move, recurse, or ignore based on comparing the low order
 174    * bits of the hash represented by the subdir path with inbits, match passed
 175    * in.
 176    */
 177   vector<string> subdirs;
 178   int r = from.list_subdirs(path, &subdirs);
 179   if (r < 0)
 180     return r;
 181   map<string, ghobject_t> objects;
 182   r = from.list_objects(path, 0, 0, &objects);
 183   if (r < 0)
 184     return r;
 185
 186   set<string> to_move;
 187   for (vector<string>::iterator i = subdirs.begin();
 188        i != subdirs.end();
 189        ++i) {
 190     uint32_t bits = 0;
 191     uint32_t hash = 0;
 192     vector<string> sub_path(path.begin(), path.end());
 193     sub_path.push_back(*i);
 194     path_to_hobject_hash_prefix(sub_path, &bits, &hash);
 195     if (bits < inbits) {
 196       if (hobject_t::match_hash(hash, bits, match)) {
 197         r = col_split_level(
 198           from,
 199           to,
 200           sub_path,
 201           inbits,
 202           match,
 203           mkdirred);
 204         if (r < 0)
 205           return r;
 206         if (*mkdirred > path.size())
 207           *mkdirred = path.size();
 208       } // else, skip, doesn't need to be moved or recursed into
 209     } else {
 210       if (hobject_t::match_hash(hash, inbits, match)) {
 211         to_move.insert(*i);
 212       }
 213     } // else, skip, doesn't need to be moved or recursed into
 214   }
 215
 216   /* Then, do the same for each object */
 217   map<string, ghobject_t> objs_to_move;
 218   for (map<string, ghobject_t>::iterator i = objects.begin();
 219        i != objects.end();
 220        ++i) {
 221     if (i->second.match(inbits, match)) {
 222       objs_to_move.insert(*i);
 223     }
 224   }
 225
 226   if (objs_to_move.empty() && to_move.empty())
 227     return 0;
 228
 229   // Make parent directories as needed
 230   while (*mkdirred < path.size()) {
 231     ++*mkdirred;
 232     int exists = 0;
 233     vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
 234     r = to.path_exists(creating_path, &exists);
 235     if (r < 0)
 236       return r;
 237     if (exists)
 238       continue;
 239     subdir_info_s info;
 240     info.objs = 0;
 241     info.subdirs = 0;
 242     info.hash_level = creating_path.size();
 243     if (*mkdirred < path.size() - 1)
 244       info.subdirs = 1;
 245     r = to.start_col_split(creating_path);
 246     if (r < 0)
 247       return r;
 248     r = to.create_path(creating_path);
 249     if (r < 0)
 250       return r;
 251     r = to.set_info(creating_path, info);
 252     if (r < 0)
 253       return r;
 254     r = to.end_split_or_merge(creating_path);
 255     if (r < 0)
 256       return r;
 257   }
 258
 259   subdir_info_s from_info;
 260   subdir_info_s to_info;
 261   r = from.get_info(path, &from_info);
 262   if (r < 0)
 263     return r;
 264   r = to.get_info(path, &to_info);
 265   if (r < 0)
 266     return r;
 267
 268   from.start_col_split(path);
 269   to.start_col_split(path);
 270
 271   // Do subdir moves
 272   for (set<string>::iterator i = to_move.begin();
 273        i != to_move.end();
 274        ++i) {
 275     from_info.subdirs--;
 276     to_info.subdirs++;
 277     r = move_subdir(from, to, path, *i);
 278     if (r < 0)
 279       return r;
 280   }
 281
 282   for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
 283        i != objs_to_move.end();
 284        ++i) {
 285     from_info.objs--;
 286     to_info.objs++;
 287     r = move_object(from, to, path, *i);
 288     if (r < 0)
 289       return r;
 290   }
 291
 292
 293   r = to.set_info(path, to_info);
 294   if (r < 0)
 295     return r;
 296   r = from.set_info(path, from_info);
 297   if (r < 0)
 298     return r;
 299   from.end_split_or_merge(path);
 300   to.end_split_or_merge(path);
 301   return 0;
 302 }
 303
 304 int HashIndex::_merge(
 305   uint32_t bits,
 306   CollectionIndex* dest) {
 307   dout(20) << __func__ << " bits " << bits << dendl;
 308   ceph_assert(collection_version() == dest->collection_version());
 309
 310   vector<string> emptypath;
 311
 312   // pre-split to common/target level so that any shared prefix DIR_?
 313   // directories already exist at the destination.  Since each
 314   // directory is a nibble (4 bits),
 315   unsigned shared = bits / 4;
 316   dout(20) << __func__ << " pre-splitting to shared level " << shared << dendl;
 317   if (shared) {
 318     split_dirs(emptypath, shared);
 319     ((HashIndex*)dest)->split_dirs(emptypath, shared);
 320   }
 321
 322   // now merge the contents
 323   _merge_dirs(*this, *(HashIndex*)dest, emptypath);
 324
 325   return 0;
 326 }
 327
 328 int HashIndex::_merge_dirs(
 329   HashIndex& from,
 330   HashIndex& to,
 331   const vector<string>& path)
 332 {
 333   dout(20) << __func__ << " path " << path << dendl;
 334   int r;
 335
 336   vector<string> src_subs, dst_subs;
 337   r = from.list_subdirs(path, &src_subs);
 338   if (r < 0) {
 339     lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 340                                                   << " r " << r << " from "
 341                                                   << "from.list_subdirs"
 342                                                   << dendl;
 343     return r;
 344   }
 345   r = to.list_subdirs(path, &dst_subs);
 346   if (r < 0) {
 347     lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 348                                                   << " r " << r << " from "
 349                                                   << "to.list_subdirs"
 350                                                   << dendl;
 351     return r;
 352   }
 353
 354   for (auto& i : src_subs) {
 355     if (std::find(dst_subs.begin(), dst_subs.end(), i) == dst_subs.end()) {
 356       // move it
 357       r = move_subdir(from, to, path, i);
 358       if (r < 0) {
 359         lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 360                                                       << " r " << r << " from "
 361                                                       << "move_subdir(...,"
 362                                                       << path << "," << i << ")"
 363                                                       << dendl;
 364         return r;
 365       }
 366     } else {
 367       // common, recurse!
 368       vector<string> nested = path;
 369       nested.push_back(i);
 370       r = _merge_dirs(from, to, nested);
 371       if (r < 0) {
 372         lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 373                                                       << " r " << r << " from "
 374                                                       << "rec _merge_dirs"
 375                                                       << dendl;
 376         return r;
 377       }
 378
 379       // now remove it
 380       r = remove_path(nested);
 381       if (r < 0) {
 382         lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 383                                                       << " r " << r << " from "
 384                                                       << "remove_path "
 385                                                       << nested
 386                                                       << dendl;
 387         return r;
 388       }
 389     }
 390   }
 391
 392   // objects
 393   map<string, ghobject_t> objects;
 394   r = from.list_objects(path, 0, 0, &objects);
 395   if (r < 0) {
 396     lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 397                                                   << " r " << r << " from "
 398                                                   << "from.list_objects"
 399                                                   << dendl;
 400     return r;
 401   }
 402
 403   for (auto& i : objects) {
 404     r = move_object(from, to, path, i);
 405     if (r < 0) {
 406       lgeneric_subdout(g_ceph_context,filestore,20) << __func__
 407                                                     << " r " << r << " from "
 408                                                     << "move_object(...,"
 409                                                     << path << "," << i << ")"
 410                                                     << dendl;
 411       return r;
 412     }
 413   }
 414
 415   return 0;
 416 }
 417
 418
 419 int HashIndex::_split(
 420   uint32_t match,
 421   uint32_t bits,
 422   CollectionIndex* dest) {
 423   ceph_assert(collection_version() == dest->collection_version());
 424   unsigned mkdirred = 0;
 425
 426   return col_split_level(
 427     *this,
 428     *static_cast<HashIndex*>(dest),
 429     vector<string>(),
 430     bits,
 431     match,
 432     &mkdirred);
 433 }
 434
 435 int HashIndex::split_dirs(const vector<string> &path, int target_level) {
 436   dout(20) << __func__ << " " << path << " target level: "
 437            << target_level << dendl;
 438   subdir_info_s info;
 439   int r = get_info(path, &info);
 440   if (r < 0) {
 441     dout(10) << "error looking up info for " << path << ": "
 442              << cpp_strerror(r) << dendl;
 443     return r;
 444   }
 445
 446   if (must_split(info, target_level)) {
 447     dout(1) << __func__ << " " << path << " has " << info.objs
 448             << " objects, " << info.hash_level
 449             << " level, starting split in pg " << coll() << "." << dendl;
 450     r = initiate_split(path, info);
 451     if (r < 0) {
 452       dout(10) << "error initiating split on " << path << ": "
 453                << cpp_strerror(r) << dendl;
 454       return r;
 455     }
 456
 457     r = complete_split(path, info);
 458     dout(1) << __func__ << " " << path << " split completed in pg " << coll() << "."
 459             << dendl;
 460     if (r < 0) {
 461       dout(10) << "error completing split on " << path << ": "
 462                << cpp_strerror(r) << dendl;
 463       return r;
 464     }
 465   }
 466
 467   vector<string> subdirs;
 468   r = list_subdirs(path, &subdirs);
 469   if (r < 0) {
 470     dout(10) << "error listing subdirs of " << path << ": "
 471              << cpp_strerror(r) << dendl;
 472     return r;
 473   }
 474   for (vector<string>::const_iterator it = subdirs.begin();
 475        it != subdirs.end(); ++it) {
 476     vector<string> subdir_path(path);
 477     subdir_path.push_back(*it);
 478     r = split_dirs(subdir_path, target_level);
 479     if (r < 0) {
 480       return r;
 481     }
 482   }
 483
 484   return r;
 485 }
 486
 487 int HashIndex::apply_layout_settings(int target_level) {
 488   vector<string> path;
 489   dout(10) << __func__ << " split multiple = " << split_multiplier
 490            << " merge threshold = " << merge_threshold
 491            << " split rand factor = " << cct->_conf->filestore_split_rand_factor
 492            << " target level = " << target_level
 493            << dendl;
 494   int r = write_settings();
 495   if (r < 0)
 496     return r;
 497   return split_dirs(path, target_level);
 498 }
 499
 500 int HashIndex::_init() {
 501   subdir_info_s info;
 502   vector<string> path;
 503   int r = set_info(path, info);
 504   if (r < 0)
 505     return r;
 506   return write_settings();
 507 }
 508
 509 int HashIndex::write_settings() {
 510   if (cct->_conf->filestore_split_rand_factor > 0) {
 511     settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
 512   } else {
 513     settings.split_rand_factor = 0;
 514   }
 515   vector<string> path;
 516   bufferlist bl;
 517   settings.encode(bl);
 518   return add_attr_path(path, SETTINGS_ATTR, bl);
 519 }
 520
 521 int HashIndex::read_settings() {
 522   vector<string> path;
 523   bufferlist bl;
 524   int r = get_attr_path(path, SETTINGS_ATTR, bl);
 525   if (r == -ENODATA)
 526     return 0;
 527   if (r < 0) {
 528     derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
 529     return r;
 530   }
 531   auto it = bl.cbegin();
 532   settings.decode(it);
 533   dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
 534   return 0;
 535 }
 536
 537 /* LFNIndex virtual method implementations */
 538 int HashIndex::_created(const vector<string> &path,
 539                         const ghobject_t &oid,
 540                         const string &mangled_name) {
 541   subdir_info_s info;
 542   int r;
 543   r = get_info(path, &info);
 544   if (r < 0)
 545     return r;
 546   info.objs++;
 547   r = set_info(path, info);
 548   if (r < 0)
 549     return r;
 550
 551   if (must_split(info)) {
 552     dout(1) << __func__ << " " << path << " has " << info.objs
 553             << " objects, starting split in pg " << coll() << "." << dendl;
 554     int r = initiate_split(path, info);
 555     if (r < 0) {
 556       derr << __func__ << " error starting split " << path << " in pg "
 557            << coll() << ": " << cpp_strerror(r) << dendl;
 558       ceph_assert(!cct->_conf->filestore_fail_eio);
 559     } else {
 560       r = complete_split(path, info);
 561       if (r < 0) {
 562         derr << __func__ << " error completing split " << path << " in pg "
 563              << coll() << ": " << cpp_strerror(r) << dendl;
 564         ceph_assert(!cct->_conf->filestore_fail_eio);
 565       }
 566       dout(1) << __func__ << " " << path << " split completed in pg " << coll()
 567               << "." << dendl;
 568     }
 569   }
 570
 571   return 0;
 572 }
 573
 574 int HashIndex::_remove(const vector<string> &path,
 575                        const ghobject_t &oid,
 576                        const string &mangled_name) {
 577   int r;
 578   r = remove_object(path, oid);
 579   if (r < 0)
 580     return r;
 581   subdir_info_s info;
 582   r = get_info(path, &info);
 583   if (r < 0)
 584     return r;
 585   info.objs--;
 586   r = set_info(path, info);
 587   if (r < 0)
 588     return r;
 589
 590   if (must_merge(info)) {
 591     dout(1) << __func__ << " " << path << " has " << info.objs
 592             << " objects, starting merge in pg " << coll() << "." << dendl;
 593     r = initiate_merge(path, info);
 594     if (r < 0) {
 595       derr << __func__ << " error starting merge " << path << " in pg "
 596            << coll() << ": " << cpp_strerror(r) << dendl;
 597       ceph_assert(!cct->_conf->filestore_fail_eio);
 598     } else {
 599       r = complete_merge(path, info);
 600       if (r < 0) {
 601         derr << __func__ << " error completing merge " << path << " in pg "
 602              << coll() << ": " << cpp_strerror(r) << dendl;
 603         ceph_assert(!cct->_conf->filestore_fail_eio);
 604       }
 605       dout(1) << __func__ << " " << path << " merge completed in pg " << coll()
 606               << "." << dendl;
 607     }
 608   }
 609
 610   return 0;
 611 }
 612
 613 int HashIndex::_lookup(const ghobject_t &oid,
 614                        vector<string> *path,
 615                        string *mangled_name,
 616                        int *hardlink) {
 617   vector<string> path_comp;
 618   get_path_components(oid, &path_comp);
 619   vector<string>::iterator next = path_comp.begin();
 620   int exists;
 621   while (1) {
 622     int r = path_exists(*path, &exists);
 623     if (r < 0)
 624       return r;
 625     if (!exists) {
 626       if (path->empty())
 627         return -ENOENT;
 628       path->pop_back();
 629       break;
 630     }
 631     if (next == path_comp.end())
 632       break;
 633     path->push_back(*(next++));
 634   }
 635   return get_mangled_name(*path, oid, mangled_name, hardlink);
 636 }
 637
 638 int HashIndex::_collection_list_partial(const ghobject_t &start,
 639                                         const ghobject_t &end,
 640                                         int max_count,
 641                                         vector<ghobject_t> *ls,
 642                                         ghobject_t *next) {
 643   vector<string> path;
 644   ghobject_t _next;
 645   if (!next)
 646     next = &_next;
 647   *next = start;
 648   dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
 649   return list_by_hash(path, end, max_count, next, ls);
 650 }
 651
 652 int HashIndex::prep_delete() {
 653   return recursive_remove(vector<string>());
 654 }
 655
 656 int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
 657   int ret;
 658   vector<string> path;
 659   subdir_info_s root_info;
 660   // Make sure there is neither objects nor sub-folders
 661   // in this collection
 662   ret = get_info(path, &root_info);
 663   if (ret < 0)
 664     return ret;
 665
 666   // Do the folder splitting first
 667   ret = pre_split_folder(pg_num, expected_num_objs);
 668   if (ret < 0)
 669     return ret;
 670   // Initialize the folder info starting from root
 671   return init_split_folder(path, 0);
 672 }
 673
 674 int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
 675 {
 676   // If folder merging is enabled (by setting the threshold positive),
 677   // no need to split
 678   if (merge_threshold > 0)
 679     return 0;
 680   const coll_t c = coll();
 681   // Do not split if the expected number of objects in this collection is zero (by default)
 682   if (expected_num_objs == 0)
 683     return 0;
 684
 685   // Calculate the number of leaf folders (which actually store files)
 686   // need to be created
 687   const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
 688   uint64_t leavies = expected_num_objs / objs_per_folder ;
 689   // No need to split
 690   if (leavies == 0 || expected_num_objs == objs_per_folder)
 691     return 0;
 692
 693   spg_t spgid;
 694   if (!c.is_pg_prefix(&spgid))
 695     return -EINVAL;
 696   const ps_t ps = spgid.pgid.ps();
 697
 698   // the most significant bits of pg_num
 699   const int pg_num_bits = calc_num_bits(pg_num - 1);
 700   ps_t tmp_id = ps;
 701   // calculate the number of levels we only create one sub folder
 702   int num = pg_num_bits / 4;
 703   // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
 704   // so that splitting starts at level 3
 705   if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
 706     --num;
 707   }
 708
 709   int ret;
 710   // Start with creation that only has one subfolder
 711   vector<string> paths;
 712   int dump_num = num;
 713   while (num-- > 0) {
 714     ps_t v = tmp_id & 0x0000000f;
 715     paths.push_back(to_hex(v));
 716     ret = create_path(paths);
 717     if (ret < 0 && ret != -EEXIST)
 718       return ret;
 719     tmp_id = tmp_id >> 4;
 720   }
 721
 722   // Starting from here, we can split by creating multiple subfolders
 723   const int left_bits = pg_num_bits - dump_num * 4;
 724   // this variable denotes how many bits (for this level) that can be
 725   // used for sub folder splitting
 726   int split_bits = 4 - left_bits;
 727   // the below logic is inspired by rados.h#ceph_stable_mod,
 728   // it basically determines how many sub-folders should we
 729   // create for splitting
 730   ceph_assert(pg_num_bits > 0); // otherwise BAD_SHIFT
 731   if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
 732     ++split_bits;
 733   }
 734   const uint32_t subs = (1 << split_bits);
 735   // Calculate how many levels we create starting from here
 736   int level  = 0;
 737   int level_limit = MAX_HASH_LEVEL - dump_num - 1;
 738   uint64_t actual_leaves = subs;
 739   while (actual_leaves < leavies && level < level_limit) {
 740     ++level;
 741     actual_leaves <<= 4;
 742   }
 743   for (uint32_t i = 0; i < subs; ++i) {
 744     ceph_assert(split_bits <= 4); // otherwise BAD_SHIFT
 745     int v = tmp_id | (i << ((4 - split_bits) % 4));
 746     paths.push_back(to_hex(v));
 747     ret = create_path(paths);
 748     if (ret < 0 && ret != -EEXIST)
 749       return ret;
 750     ret = recursive_create_path(paths, level);
 751     if (ret < 0)
 752       return ret;
 753     paths.pop_back();
 754   }
 755   return 0;
 756 }
 757
 758 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
 759 {
 760   // Get the number of sub directories for the current path
 761   vector<string> subdirs;
 762   int ret = list_subdirs(path, &subdirs);
 763   if (ret < 0)
 764     return ret;
 765   subdir_info_s info;
 766   info.subdirs = subdirs.size();
 767   info.hash_level = hash_level;
 768   ret = set_info(path, info);
 769   if (ret < 0)
 770     return ret;
 771   ret = fsync_dir(path);
 772   if (ret < 0)
 773     return ret;
 774
 775   // Do the same for subdirs
 776   vector<string>::const_iterator iter;
 777   for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
 778     path.push_back(*iter);
 779     ret = init_split_folder(path, hash_level + 1);
 780     if (ret < 0)
 781       return ret;
 782     path.pop_back();
 783   }
 784   return 0;
 785 }
 786
 787 int HashIndex::recursive_create_path(vector<string>& path, int level)
 788 {
 789   if (level == 0)
 790     return 0;
 791   for (int i = 0; i < 16; ++i) {
 792     path.push_back(to_hex(i));
 793     int ret = create_path(path);
 794     if (ret < 0 && ret != -EEXIST)
 795       return ret;
 796     ret = recursive_create_path(path, level - 1);
 797     if (ret < 0)
 798       return ret;
 799     path.pop_back();
 800   }
 801   return 0;
 802 }
 803
 804 int HashIndex::recursive_remove(const vector<string> &path) {
 805   return _recursive_remove(path, true);
 806 }
 807
 808 int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
 809   vector<string> subdirs;
 810   dout(20) << __func__ << " path=" << path << dendl;
 811   int r = list_subdirs(path, &subdirs);
 812   if (r < 0)
 813     return r;
 814   map<string, ghobject_t> objects;
 815   r = list_objects(path, 0, 0, &objects);
 816   if (r < 0)
 817     return r;
 818   if (!objects.empty())
 819     return -ENOTEMPTY;
 820   vector<string> subdir(path);
 821   for (vector<string>::iterator i = subdirs.begin();
 822        i != subdirs.end();
 823        ++i) {
 824     subdir.push_back(*i);
 825     r = _recursive_remove(subdir, false);
 826     if (r < 0)
 827       return r;
 828     subdir.pop_back();
 829   }
 830   if (top)
 831     return 0;
 832   else
 833     return remove_path(path);
 834 }
 835
 836 int HashIndex::start_col_split(const vector<string> &path) {
 837   bufferlist bl;
 838   InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
 839   op_tag.encode(bl);
 840   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 841   if (r < 0)
 842     return r;
 843   return fsync_dir(vector<string>());
 844 }
 845
 846 int HashIndex::start_split(const vector<string> &path) {
 847   bufferlist bl;
 848   InProgressOp op_tag(InProgressOp::SPLIT, path);
 849   op_tag.encode(bl);
 850   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 851   if (r < 0)
 852     return r;
 853   return fsync_dir(vector<string>());
 854 }
 855
 856 int HashIndex::start_merge(const vector<string> &path) {
 857   bufferlist bl;
 858   InProgressOp op_tag(InProgressOp::MERGE, path);
 859   op_tag.encode(bl);
 860   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 861   if (r < 0)
 862     return r;
 863   return fsync_dir(vector<string>());
 864 }
 865
 866 int HashIndex::end_split_or_merge(const vector<string> &path) {
 867   return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
 868 }
 869
 870 int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
 871   bufferlist buf;
 872   int r = get_attr_path(path, SUBDIR_ATTR, buf);
 873   if (r < 0)
 874     return r;
 875   auto bufiter = buf.cbegin();
 876   info->decode(bufiter);
 877   ceph_assert(path.size() == (unsigned)info->hash_level);
 878   return 0;
 879 }
 880
 881 int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
 882   bufferlist buf;
 883   ceph_assert(path.size() == (unsigned)info.hash_level);
 884   info.encode(buf);
 885   return add_attr_path(path, SUBDIR_ATTR, buf);
 886 }
 887
 888 bool HashIndex::must_merge(const subdir_info_s &info) {
 889   return (info.hash_level > 0 &&
 890           merge_threshold > 0 &&
 891           info.objs < (unsigned)merge_threshold &&
 892           info.subdirs == 0);
 893 }
 894
 895 bool HashIndex::must_split(const subdir_info_s &info, int target_level) {
 896   // target_level is used for ceph-objectstore-tool to split dirs offline.
 897   // if it is set (defalult is 0) and current hash level < target_level,
 898   // this dir would be split no matters how many objects it has.
 899   return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
 900          ((target_level > 0 && info.hash_level < (unsigned)target_level) ||
 901          (info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16))));
 902 }
 903
 904 int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
 905   return start_merge(path);
 906 }
 907
 908 int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
 909   vector<string> dst = path;
 910   dst.pop_back();
 911   subdir_info_s dstinfo;
 912   int r, exists;
 913   r = path_exists(path, &exists);
 914   if (r < 0)
 915     return r;
 916   r = get_info(dst, &dstinfo);
 917   if (r < 0)
 918     return r;
 919   if (exists) {
 920     r = move_objects(path, dst);
 921     if (r < 0)
 922       return r;
 923     r = reset_attr(dst);
 924     if (r < 0)
 925       return r;
 926     r = remove_path(path);
 927     if (r < 0)
 928       return r;
 929   }
 930   if (must_merge(dstinfo)) {
 931     r = initiate_merge(dst, dstinfo);
 932     if (r < 0)
 933       return r;
 934     r = fsync_dir(dst);
 935     if (r < 0)
 936       return r;
 937     return complete_merge(dst, dstinfo);
 938   }
 939   r = fsync_dir(dst);
 940   if (r < 0)
 941     return r;
 942   return end_split_or_merge(path);
 943 }
 944
 945 int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
 946   return start_split(path);
 947 }
 948
 949 int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
 950   int level = info.hash_level;
 951   map<string, ghobject_t> objects;
 952   vector<string> dst = path;
 953   int r;
 954   dst.push_back("");
 955   r = list_objects(path, 0, 0, &objects);
 956   if (r < 0)
 957     return r;
 958   vector<string> subdirs_vec;
 959   r = list_subdirs(path, &subdirs_vec);
 960   if (r < 0)
 961     return r;
 962   set<string> subdirs;
 963   subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
 964   map<string, map<string, ghobject_t> > mapped;
 965   map<string, ghobject_t> moved;
 966   int num_moved = 0;
 967   for (map<string, ghobject_t>::iterator i = objects.begin();
 968        i != objects.end();
 969        ++i) {
 970     vector<string> new_path;
 971     get_path_components(i->second, &new_path);
 972     mapped[new_path[level]][i->first] = i->second;
 973   }
 974   for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
 975        i != mapped.end();
 976        ) {
 977     dst[level] = i->first;
 978     /* If the info already exists, it must be correct,
 979      * we may be picking up a partially finished split */
 980     subdir_info_s temp;
 981     // subdir has already been fully copied
 982     if (subdirs.count(i->first) && !get_info(dst, &temp)) {
 983       for (map<string, ghobject_t>::iterator j = i->second.begin();
 984            j != i->second.end();
 985            ++j) {
 986         moved[j->first] = j->second;
 987         num_moved++;
 988         objects.erase(j->first);
 989       }
 990       ++i;
 991       continue;
 992     }
 993
 994     subdir_info_s info_new;
 995     info_new.objs = i->second.size();
 996     info_new.subdirs = 0;
 997     info_new.hash_level = level + 1;
 998     if (must_merge(info_new) && !subdirs.count(i->first)) {
 999       mapped.erase(i++);
1000       continue;
1001     }
1002
1003     // Subdir doesn't yet exist
1004     if (!subdirs.count(i->first)) {
1005       info.subdirs += 1;
1006       r = create_path(dst);
1007       if (r < 0)
1008         return r;
1009     } // else subdir has been created but only partially copied
1010
1011     for (map<string, ghobject_t>::iterator j = i->second.begin();
1012          j != i->second.end();
1013          ++j) {
1014       moved[j->first] = j->second;
1015       num_moved++;
1016       objects.erase(j->first);
1017       r = link_object(path, dst, j->second, j->first);
1018       // May be a partially finished split
1019       if (r < 0 && r != -EEXIST) {
1020         return r;
1021       }
1022     }
1023
1024     r = fsync_dir(dst);
1025     if (r < 0)
1026       return r;
1027
1028     // Presence of info must imply that all objects have been copied
1029     r = set_info(dst, info_new);
1030     if (r < 0)
1031       return r;
1032
1033     r = fsync_dir(dst);
1034     if (r < 0)
1035       return r;
1036
1037     ++i;
1038   }
1039   r = remove_objects(path, moved, &objects);
1040   if (r < 0)
1041     return r;
1042   info.objs = objects.size();
1043   r = reset_attr(path);
1044   if (r < 0)
1045     return r;
1046   r = fsync_dir(path);
1047   if (r < 0)
1048     return r;
1049   return end_split_or_merge(path);
1050 }
1051
1052 void HashIndex::get_path_components(const ghobject_t &oid,
1053                                     vector<string> *path) {
1054   char buf[MAX_HASH_LEVEL + 1];
1055   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
1056
1057   // Path components are the hex characters of oid.hobj.hash, least
1058   // significant first
1059   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1060     path->push_back(string(&buf[i], 1));
1061   }
1062 }
1063
1064 string HashIndex::get_hash_str(uint32_t hash) {
1065   char buf[MAX_HASH_LEVEL + 1];
1066   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
1067   string retval;
1068   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
1069     retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
1070   }
1071   return retval;
1072 }
1073
1074 string HashIndex::get_path_str(const ghobject_t &oid) {
1075   ceph_assert(!oid.is_max());
1076   return get_hash_str(oid.hobj.get_hash());
1077 }
1078
1079 uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
1080   while (prefix.size() < sizeof(uint32_t) * 2) {
1081     prefix.push_back('0');
1082   }
1083   uint32_t hash;
1084   sscanf(prefix.c_str(), "%x", &hash);
1085   // nibble reverse
1086   hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
1087   hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
1088   hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
1089   return hash;
1090 }
1091
1092 int HashIndex::get_path_contents_by_hash_bitwise(
1093   const vector<string> &path,
1094   const ghobject_t *next_object,
1095   set<string, CmpHexdigitStringBitwise> *hash_prefixes,
1096   set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
1097 {
1098   map<string, ghobject_t> rev_objects;
1099   int r;
1100   r = list_objects(path, 0, 0, &rev_objects);
1101   if (r < 0)
1102     return r;
1103   // bitwise sort
1104   for (map<string, ghobject_t>::iterator i = rev_objects.begin();
1105        i != rev_objects.end();
1106        ++i) {
1107     if (next_object && i->second < *next_object)
1108       continue;
1109     string hash_prefix = get_path_str(i->second);
1110     hash_prefixes->insert(hash_prefix);
1111     objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
1112   }
1113   vector<string> subdirs;
1114   r = list_subdirs(path, &subdirs);
1115   if (r < 0)
1116     return r;
1117
1118   // sort subdirs bitwise (by reversing hex digit nibbles)
1119   std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
1120
1121   // Local to this function, we will convert the prefix strings
1122   // (previously simply the reversed hex digits) to also have each
1123   // digit's nibbles reversed.  This will make the strings sort
1124   // bitwise.
1125   string cur_prefix;
1126   for (vector<string>::const_iterator i = path.begin();
1127        i != path.end();
1128        ++i) {
1129     cur_prefix.append(reverse_hexdigit_bits_string(*i));
1130   }
1131   string next_object_string;
1132   if (next_object)
1133     next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
1134   for (vector<string>::iterator i = subdirs.begin();
1135        i != subdirs.end();
1136        ++i) {
1137     string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
1138     if (next_object) {
1139       if (next_object->is_max())
1140         continue;
1141       if (candidate < next_object_string.substr(0, candidate.size()))
1142         continue;
1143     }
1144     // re-reverse the hex digit nibbles for the caller
1145     hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
1146   }
1147   return 0;
1148 }
1149
1150 int HashIndex::list_by_hash(const vector<string> &path,
1151                             const ghobject_t &end,
1152                             int max_count,
1153                             ghobject_t *next,
1154                             vector<ghobject_t> *out)
1155 {
1156   ceph_assert(out);
1157   return list_by_hash_bitwise(path, end, max_count, next, out);
1158 }
1159
1160 int HashIndex::list_by_hash_bitwise(
1161   const vector<string> &path,
1162   const ghobject_t& end,
1163   int max_count,
1164   ghobject_t *next,
1165   vector<ghobject_t> *out)
1166 {
1167   vector<string> next_path = path;
1168   next_path.push_back("");
1169   set<string, CmpHexdigitStringBitwise> hash_prefixes;
1170   set<pair<string, ghobject_t>, CmpPairBitwise> objects;
1171   int r = get_path_contents_by_hash_bitwise(path,
1172                                             next,
1173                                             &hash_prefixes,
1174                                             &objects);
1175   if (r < 0)
1176     return r;
1177   for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
1178        i != hash_prefixes.end();
1179        ++i) {
1180     dout(20) << __func__ << " prefix " << *i << dendl;
1181     set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
1182       make_pair(*i, ghobject_t()));
1183     if (j == objects.end() || j->first != *i) {
1184       *(next_path.rbegin()) = *(i->rbegin());
1185       ghobject_t next_recurse;
1186       if (next)
1187         next_recurse = *next;
1188       r = list_by_hash_bitwise(next_path,
1189                                end,
1190                                max_count,
1191                                &next_recurse,
1192                                out);
1193
1194       if (r < 0)
1195         return r;
1196       if (!next_recurse.is_max()) {
1197         if (next)
1198           *next = next_recurse;
1199         return 0;
1200       }
1201     } else {
1202       while (j != objects.end() && j->first == *i) {
1203         if (max_count > 0 && out->size() == (unsigned)max_count) {
1204           if (next)
1205             *next = j->second;
1206           return 0;
1207         }
1208         if (j->second >= end) {
1209           if (next)
1210             *next = j->second;
1211           return 0;
1212         }
1213         if (!next || j->second >= *next) {
1214           dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
1215           out->push_back(j->second);
1216         }
1217         ++j;
1218       }
1219     }
1220   }
1221   if (next)
1222     *next = ghobject_t::get_max();
1223   return 0;
1224 }
1225
1226