ceph/src/os/filestore/HashIndex.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include "include/types.h"
  16 #include "include/buffer.h"
  17 #include "osd/osd_types.h"
  18 #include <errno.h>
  19
  20 #include "HashIndex.h"
  21
  22 #include "common/errno.h"
  23 #include "common/debug.h"
  24 #define dout_context cct
  25 #define dout_subsys ceph_subsys_filestore
  26
  27 const string HashIndex::SUBDIR_ATTR = "contents";
  28 const string HashIndex::SETTINGS_ATTR = "settings";
  29 const string HashIndex::IN_PROGRESS_OP_TAG = "in_progress_op";
  30
  31 /// hex digit to integer value
  32 int hex_to_int(char c)
  33 {
  34   if (c >= '0' && c <= '9')
  35     return c - '0';
  36   if (c >= 'A' && c <= 'F')
  37     return c - 'A' + 10;
  38   ceph_abort();
  39 }
  40
  41 /// int value to hex digit
  42 char int_to_hex(int v)
  43 {
  44   assert(v < 16);
  45   if (v < 10)
  46     return '0' + v;
  47   return 'A' + v - 10;
  48 }
  49
  50 /// reverse bits in a nibble (0..15)
  51 int reverse_nibble_bits(int in)
  52 {
  53   assert(in < 16);
  54   return
  55     ((in & 8) >> 3) |
  56     ((in & 4) >> 1) |
  57     ((in & 2) << 1) |
  58     ((in & 1) << 3);
  59 }
  60
  61 /// reverse nibble bits in a hex digit
  62 char reverse_hexdigit_bits(char c)
  63 {
  64   return int_to_hex(reverse_nibble_bits(hex_to_int(c)));
  65 }
  66
  67 /// reverse nibble bits in a hex string
  68 string reverse_hexdigit_bits_string(string s)
  69 {
  70   for (unsigned i=0; i<s.size(); ++i)
  71     s[i] = reverse_hexdigit_bits(s[i]);
  72   return s;
  73 }
  74
  75 /// compare hex digit (as length 1 string) bitwise
  76 bool cmp_hexdigit_bitwise(const string& l, const string& r)
  77 {
  78   assert(l.length() == 1 && r.length() == 1);
  79   int lv = hex_to_int(l[0]);
  80   int rv = hex_to_int(r[0]);
  81   assert(lv < 16);
  82   assert(rv < 16);
  83   return reverse_nibble_bits(lv) < reverse_nibble_bits(rv);
  84 }
  85
  86 /// compare hex digit string bitwise
  87 bool cmp_hexdigit_string_bitwise(const string& l, const string& r)
  88 {
  89   string ll = reverse_hexdigit_bits_string(l);
  90   string rr = reverse_hexdigit_bits_string(r);
  91   return ll < rr;
  92 }
  93
  94 int HashIndex::cleanup() {
  95   bufferlist bl;
  96   int r = get_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
  97   if (r < 0) {
  98     // No in progress operations!
  99     return 0;
 100   }
 101   bufferlist::iterator i = bl.begin();
 102   InProgressOp in_progress(i);
 103   subdir_info_s info;
 104   r = get_info(in_progress.path, &info);
 105   if (r == -ENOENT) {
 106     return end_split_or_merge(in_progress.path);
 107   } else if (r < 0) {
 108     return r;
 109   }
 110
 111   if (in_progress.is_split())
 112     return complete_split(in_progress.path, info);
 113   else if (in_progress.is_merge())
 114     return complete_merge(in_progress.path, info);
 115   else if (in_progress.is_col_split()) {
 116     for (vector<string>::iterator i = in_progress.path.begin();
 117          i != in_progress.path.end();
 118          ++i) {
 119       vector<string> path(in_progress.path.begin(), i);
 120       int r = reset_attr(path);
 121       if (r < 0)
 122         return r;
 123     }
 124     return 0;
 125   }
 126   else
 127     return -EINVAL;
 128 }
 129
 130 int HashIndex::reset_attr(
 131   const vector<string> &path)
 132 {
 133   int exists = 0;
 134   int r = path_exists(path, &exists);
 135   if (r < 0)
 136     return r;
 137   if (!exists)
 138     return 0;
 139   map<string, ghobject_t> objects;
 140   vector<string> subdirs;
 141   r = list_objects(path, 0, 0, &objects);
 142   if (r < 0)
 143     return r;
 144   r = list_subdirs(path, &subdirs);
 145   if (r < 0)
 146     return r;
 147
 148   subdir_info_s info;
 149   info.hash_level = path.size();
 150   info.objs = objects.size();
 151   info.subdirs = subdirs.size();
 152   return set_info(path, info);
 153 }
 154
 155 int HashIndex::col_split_level(
 156   HashIndex &from,
 157   HashIndex &to,
 158   const vector<string> &path,
 159   uint32_t inbits,
 160   uint32_t match,
 161   unsigned *mkdirred)
 162 {
 163   /* For each subdir, move, recurse, or ignore based on comparing the low order
 164    * bits of the hash represented by the subdir path with inbits, match passed
 165    * in.
 166    */
 167   vector<string> subdirs;
 168   int r = from.list_subdirs(path, &subdirs);
 169   if (r < 0)
 170     return r;
 171   map<string, ghobject_t> objects;
 172   r = from.list_objects(path, 0, 0, &objects);
 173   if (r < 0)
 174     return r;
 175
 176   set<string> to_move;
 177   for (vector<string>::iterator i = subdirs.begin();
 178        i != subdirs.end();
 179        ++i) {
 180     uint32_t bits = 0;
 181     uint32_t hash = 0;
 182     vector<string> sub_path(path.begin(), path.end());
 183     sub_path.push_back(*i);
 184     path_to_hobject_hash_prefix(sub_path, &bits, &hash);
 185     if (bits < inbits) {
 186       if (hobject_t::match_hash(hash, bits, match)) {
 187         r = col_split_level(
 188           from,
 189           to,
 190           sub_path,
 191           inbits,
 192           match,
 193           mkdirred);
 194         if (r < 0)
 195           return r;
 196         if (*mkdirred > path.size())
 197           *mkdirred = path.size();
 198       } // else, skip, doesn't need to be moved or recursed into
 199     } else {
 200       if (hobject_t::match_hash(hash, inbits, match)) {
 201         to_move.insert(*i);
 202       }
 203     } // else, skip, doesn't need to be moved or recursed into
 204   }
 205
 206   /* Then, do the same for each object */
 207   map<string, ghobject_t> objs_to_move;
 208   for (map<string, ghobject_t>::iterator i = objects.begin();
 209        i != objects.end();
 210        ++i) {
 211     if (i->second.match(inbits, match)) {
 212       objs_to_move.insert(*i);
 213     }
 214   }
 215
 216   if (objs_to_move.empty() && to_move.empty())
 217     return 0;
 218
 219   // Make parent directories as needed
 220   while (*mkdirred < path.size()) {
 221     ++*mkdirred;
 222     int exists = 0;
 223     vector<string> creating_path(path.begin(), path.begin()+*mkdirred);
 224     r = to.path_exists(creating_path, &exists);
 225     if (r < 0)
 226       return r;
 227     if (exists)
 228       continue;
 229     subdir_info_s info;
 230     info.objs = 0;
 231     info.subdirs = 0;
 232     info.hash_level = creating_path.size();
 233     if (*mkdirred < path.size() - 1)
 234       info.subdirs = 1;
 235     r = to.start_col_split(creating_path);
 236     if (r < 0)
 237       return r;
 238     r = to.create_path(creating_path);
 239     if (r < 0)
 240       return r;
 241     r = to.set_info(creating_path, info);
 242     if (r < 0)
 243       return r;
 244     r = to.end_split_or_merge(creating_path);
 245     if (r < 0)
 246       return r;
 247   }
 248
 249   subdir_info_s from_info;
 250   subdir_info_s to_info;
 251   r = from.get_info(path, &from_info);
 252   if (r < 0)
 253     return r;
 254   r = to.get_info(path, &to_info);
 255   if (r < 0)
 256     return r;
 257
 258   from.start_col_split(path);
 259   to.start_col_split(path);
 260
 261   // Do subdir moves
 262   for (set<string>::iterator i = to_move.begin();
 263        i != to_move.end();
 264        ++i) {
 265     from_info.subdirs--;
 266     to_info.subdirs++;
 267     r = move_subdir(from, to, path, *i);
 268     if (r < 0)
 269       return r;
 270   }
 271
 272   for (map<string, ghobject_t>::iterator i = objs_to_move.begin();
 273        i != objs_to_move.end();
 274        ++i) {
 275     from_info.objs--;
 276     to_info.objs++;
 277     r = move_object(from, to, path, *i);
 278     if (r < 0)
 279       return r;
 280   }
 281
 282
 283   r = to.set_info(path, to_info);
 284   if (r < 0)
 285     return r;
 286   r = from.set_info(path, from_info);
 287   if (r < 0)
 288     return r;
 289   from.end_split_or_merge(path);
 290   to.end_split_or_merge(path);
 291   return 0;
 292 }
 293
 294 int HashIndex::_split(
 295   uint32_t match,
 296   uint32_t bits,
 297   CollectionIndex* dest) {
 298   assert(collection_version() == dest->collection_version());
 299   unsigned mkdirred = 0;
 300   return col_split_level(
 301     *this,
 302     *static_cast<HashIndex*>(dest),
 303     vector<string>(),
 304     bits,
 305     match,
 306     &mkdirred);
 307 }
 308
 309 int HashIndex::split_dirs(const vector<string> &path) {
 310   dout(20) << __func__ << " " << path << dendl;
 311   subdir_info_s info;
 312   int r = get_info(path, &info);
 313   if (r < 0) {
 314     dout(10) << "error looking up info for " << path << ": "
 315              << cpp_strerror(r) << dendl;
 316     return r;
 317   }
 318
 319   if (must_split(info)) {
 320     dout(1) << __func__ << " " << path << " has " << info.objs
 321             << " objects, starting split." << dendl;
 322     r = initiate_split(path, info);
 323     if (r < 0) {
 324       dout(10) << "error initiating split on " << path << ": "
 325                << cpp_strerror(r) << dendl;
 326       return r;
 327     }
 328
 329     r = complete_split(path, info);
 330     dout(1) << __func__ << " " << path << " split completed."
 331             << dendl;
 332     if (r < 0) {
 333       dout(10) << "error completing split on " << path << ": "
 334                << cpp_strerror(r) << dendl;
 335       return r;
 336     }
 337   }
 338
 339   vector<string> subdirs;
 340   r = list_subdirs(path, &subdirs);
 341   if (r < 0) {
 342     dout(10) << "error listing subdirs of " << path << ": "
 343              << cpp_strerror(r) << dendl;
 344     return r;
 345   }
 346   for (vector<string>::const_iterator it = subdirs.begin();
 347        it != subdirs.end(); ++it) {
 348     vector<string> subdir_path(path);
 349     subdir_path.push_back(*it);
 350     r = split_dirs(subdir_path);
 351     if (r < 0) {
 352       return r;
 353     }
 354   }
 355
 356   return r;
 357 }
 358
 359 int HashIndex::apply_layout_settings() {
 360   vector<string> path;
 361   dout(10) << __func__ << " split multiple = " << split_multiplier
 362            << " merge threshold = " << merge_threshold
 363            << " split rand factor = " << cct->_conf->filestore_split_rand_factor
 364            << dendl;
 365   int r = write_settings();
 366   if (r < 0)
 367     return r;
 368   return split_dirs(path);
 369 }
 370
 371 int HashIndex::_init() {
 372   subdir_info_s info;
 373   vector<string> path;
 374   int r = set_info(path, info);
 375   if (r < 0)
 376     return r;
 377   return write_settings();
 378 }
 379
 380 int HashIndex::write_settings() {
 381   if (cct->_conf->filestore_split_rand_factor > 0) {
 382     settings.split_rand_factor = rand() % cct->_conf->filestore_split_rand_factor;
 383   } else {
 384     settings.split_rand_factor = 0;
 385   }
 386   vector<string> path;
 387   bufferlist bl;
 388   settings.encode(bl);
 389   return add_attr_path(path, SETTINGS_ATTR, bl);
 390 }
 391
 392 int HashIndex::read_settings() {
 393   vector<string> path;
 394   bufferlist bl;
 395   int r = get_attr_path(path, SETTINGS_ATTR, bl);
 396   if (r == -ENODATA)
 397     return 0;
 398   if (r < 0) {
 399     derr << __func__ << " error reading settings: " << cpp_strerror(r) << dendl;
 400     return r;
 401   }
 402   bufferlist::iterator it = bl.begin();
 403   settings.decode(it);
 404   dout(20) << __func__ << " split_rand_factor = " << settings.split_rand_factor << dendl;
 405   return 0;
 406 }
 407
 408 /* LFNIndex virtual method implementations */
 409 int HashIndex::_created(const vector<string> &path,
 410                         const ghobject_t &oid,
 411                         const string &mangled_name) {
 412   subdir_info_s info;
 413   int r;
 414   r = get_info(path, &info);
 415   if (r < 0)
 416     return r;
 417   info.objs++;
 418   r = set_info(path, info);
 419   if (r < 0)
 420     return r;
 421
 422   if (must_split(info)) {
 423     dout(1) << __func__ << " " << path << " has " << info.objs
 424             << " objects, starting split." << dendl;
 425     int r = initiate_split(path, info);
 426     if (r < 0)
 427       return r;
 428     r = complete_split(path, info);
 429     dout(1) << __func__ << " " << path << " split completed."
 430             << dendl;
 431     return r;
 432   } else {
 433     return 0;
 434   }
 435 }
 436
 437 int HashIndex::_remove(const vector<string> &path,
 438                        const ghobject_t &oid,
 439                        const string &mangled_name) {
 440   int r;
 441   r = remove_object(path, oid);
 442   if (r < 0)
 443     return r;
 444   subdir_info_s info;
 445   r = get_info(path, &info);
 446   if (r < 0)
 447     return r;
 448   info.objs--;
 449   r = set_info(path, info);
 450   if (r < 0)
 451     return r;
 452   if (must_merge(info)) {
 453     r = initiate_merge(path, info);
 454     if (r < 0)
 455       return r;
 456     return complete_merge(path, info);
 457   } else {
 458     return 0;
 459   }
 460 }
 461
 462 int HashIndex::_lookup(const ghobject_t &oid,
 463                        vector<string> *path,
 464                        string *mangled_name,
 465                        int *hardlink) {
 466   vector<string> path_comp;
 467   get_path_components(oid, &path_comp);
 468   vector<string>::iterator next = path_comp.begin();
 469   int exists;
 470   while (1) {
 471     int r = path_exists(*path, &exists);
 472     if (r < 0)
 473       return r;
 474     if (!exists) {
 475       if (path->empty())
 476         return -ENOENT;
 477       path->pop_back();
 478       break;
 479     }
 480     if (next == path_comp.end())
 481       break;
 482     path->push_back(*(next++));
 483   }
 484   return get_mangled_name(*path, oid, mangled_name, hardlink);
 485 }
 486
 487 int HashIndex::_collection_list_partial(const ghobject_t &start,
 488                                         const ghobject_t &end,
 489                                         int max_count,
 490                                         vector<ghobject_t> *ls,
 491                                         ghobject_t *next) {
 492   vector<string> path;
 493   ghobject_t _next;
 494   if (!next)
 495     next = &_next;
 496   *next = start;
 497   dout(20) << __func__ << " start:" << start << " end:" << end << "-" << max_count << " ls.size " << ls->size() << dendl;
 498   return list_by_hash(path, end, max_count, next, ls);
 499 }
 500
 501 int HashIndex::prep_delete() {
 502   return recursive_remove(vector<string>());
 503 }
 504
 505 int HashIndex::_pre_hash_collection(uint32_t pg_num, uint64_t expected_num_objs) {
 506   int ret;
 507   vector<string> path;
 508   subdir_info_s root_info;
 509   // Make sure there is neither objects nor sub-folders
 510   // in this collection
 511   ret = get_info(path, &root_info);
 512   if (ret < 0)
 513     return ret;
 514
 515   // Do the folder splitting first
 516   ret = pre_split_folder(pg_num, expected_num_objs);
 517   if (ret < 0)
 518     return ret;
 519   // Initialize the folder info starting from root
 520   return init_split_folder(path, 0);
 521 }
 522
 523 int HashIndex::pre_split_folder(uint32_t pg_num, uint64_t expected_num_objs)
 524 {
 525   // If folder merging is enabled (by setting the threshold positive),
 526   // no need to split
 527   if (merge_threshold > 0)
 528     return 0;
 529   const coll_t c = coll();
 530   // Do not split if the expected number of objects in this collection is zero (by default)
 531   if (expected_num_objs == 0)
 532     return 0;
 533
 534   // Calculate the number of leaf folders (which actually store files)
 535   // need to be created
 536   const uint64_t objs_per_folder = ((uint64_t)(abs(merge_threshold)) * (uint64_t)split_multiplier + settings.split_rand_factor) * 16;
 537   uint64_t leavies = expected_num_objs / objs_per_folder ;
 538   // No need to split
 539   if (leavies == 0 || expected_num_objs == objs_per_folder)
 540     return 0;
 541
 542   spg_t spgid;
 543   if (!c.is_pg_prefix(&spgid))
 544     return -EINVAL;
 545   const ps_t ps = spgid.pgid.ps();
 546
 547   // the most significant bits of pg_num
 548   const int pg_num_bits = calc_num_bits(pg_num - 1);
 549   ps_t tmp_id = ps;
 550   // calculate the number of levels we only create one sub folder
 551   int num = pg_num_bits / 4;
 552   // pg num's hex value is like 1xxx,xxxx,xxxx but not 1111,1111,1111,
 553   // so that splitting starts at level 3
 554   if (pg_num_bits % 4 == 0 && pg_num < ((uint32_t)1 << pg_num_bits)) {
 555     --num;
 556   }
 557
 558   int ret;
 559   // Start with creation that only has one subfolder
 560   vector<string> paths;
 561   int dump_num = num;
 562   while (num-- > 0) {
 563     ps_t v = tmp_id & 0x0000000f;
 564     paths.push_back(to_hex(v));
 565     ret = create_path(paths);
 566     if (ret < 0 && ret != -EEXIST)
 567       return ret;
 568     tmp_id = tmp_id >> 4;
 569   }
 570
 571   // Starting from here, we can split by creating multiple subfolders
 572   const int left_bits = pg_num_bits - dump_num * 4;
 573   // this variable denotes how many bits (for this level) that can be
 574   // used for sub folder splitting
 575   int split_bits = 4 - left_bits;
 576   // the below logic is inspired by rados.h#ceph_stable_mod,
 577   // it basically determines how many sub-folders should we
 578   // create for splitting
 579   assert(pg_num_bits > 0); // otherwise BAD_SHIFT
 580   if (((1 << (pg_num_bits - 1)) | ps) >= pg_num) {
 581     ++split_bits;
 582   }
 583   const uint32_t subs = (1 << split_bits);
 584   // Calculate how many levels we create starting from here
 585   int level  = 0;
 586   leavies /= subs;
 587   while (leavies > 1) {
 588     ++level;
 589     leavies = leavies >> 4;
 590   }
 591   for (uint32_t i = 0; i < subs; ++i) {
 592     assert(split_bits <= 4); // otherwise BAD_SHIFT
 593     int v = tmp_id | (i << ((4 - split_bits) % 4));
 594     paths.push_back(to_hex(v));
 595     ret = create_path(paths);
 596     if (ret < 0 && ret != -EEXIST)
 597       return ret;
 598     ret = recursive_create_path(paths, level);
 599     if (ret < 0)
 600       return ret;
 601     paths.pop_back();
 602   }
 603   return 0;
 604 }
 605
 606 int HashIndex::init_split_folder(vector<string> &path, uint32_t hash_level)
 607 {
 608   // Get the number of sub directories for the current path
 609   vector<string> subdirs;
 610   int ret = list_subdirs(path, &subdirs);
 611   if (ret < 0)
 612     return ret;
 613   subdir_info_s info;
 614   info.subdirs = subdirs.size();
 615   info.hash_level = hash_level;
 616   ret = set_info(path, info);
 617   if (ret < 0)
 618     return ret;
 619   ret = fsync_dir(path);
 620   if (ret < 0)
 621     return ret;
 622
 623   // Do the same for subdirs
 624   vector<string>::const_iterator iter;
 625   for (iter = subdirs.begin(); iter != subdirs.end(); ++iter) {
 626     path.push_back(*iter);
 627     ret = init_split_folder(path, hash_level + 1);
 628     if (ret < 0)
 629       return ret;
 630     path.pop_back();
 631   }
 632   return 0;
 633 }
 634
 635 int HashIndex::recursive_create_path(vector<string>& path, int level)
 636 {
 637   if (level == 0)
 638     return 0;
 639   for (int i = 0; i < 16; ++i) {
 640     path.push_back(to_hex(i));
 641     int ret = create_path(path);
 642     if (ret < 0 && ret != -EEXIST)
 643       return ret;
 644     ret = recursive_create_path(path, level - 1);
 645     if (ret < 0)
 646       return ret;
 647     path.pop_back();
 648   }
 649   return 0;
 650 }
 651
 652 int HashIndex::recursive_remove(const vector<string> &path) {
 653   return _recursive_remove(path, true);
 654 }
 655
 656 int HashIndex::_recursive_remove(const vector<string> &path, bool top) {
 657   vector<string> subdirs;
 658   dout(20) << __func__ << " path=" << path << dendl;
 659   int r = list_subdirs(path, &subdirs);
 660   if (r < 0)
 661     return r;
 662   map<string, ghobject_t> objects;
 663   r = list_objects(path, 0, 0, &objects);
 664   if (r < 0)
 665     return r;
 666   if (!objects.empty())
 667     return -ENOTEMPTY;
 668   vector<string> subdir(path);
 669   for (vector<string>::iterator i = subdirs.begin();
 670        i != subdirs.end();
 671        ++i) {
 672     subdir.push_back(*i);
 673     r = _recursive_remove(subdir, false);
 674     if (r < 0)
 675       return r;
 676     subdir.pop_back();
 677   }
 678   if (top)
 679     return 0;
 680   else
 681     return remove_path(path);
 682 }
 683
 684 int HashIndex::start_col_split(const vector<string> &path) {
 685   bufferlist bl;
 686   InProgressOp op_tag(InProgressOp::COL_SPLIT, path);
 687   op_tag.encode(bl);
 688   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 689   if (r < 0)
 690     return r;
 691   return fsync_dir(vector<string>());
 692 }
 693
 694 int HashIndex::start_split(const vector<string> &path) {
 695   bufferlist bl;
 696   InProgressOp op_tag(InProgressOp::SPLIT, path);
 697   op_tag.encode(bl);
 698   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 699   if (r < 0)
 700     return r;
 701   return fsync_dir(vector<string>());
 702 }
 703
 704 int HashIndex::start_merge(const vector<string> &path) {
 705   bufferlist bl;
 706   InProgressOp op_tag(InProgressOp::MERGE, path);
 707   op_tag.encode(bl);
 708   int r = add_attr_path(vector<string>(), IN_PROGRESS_OP_TAG, bl);
 709   if (r < 0)
 710     return r;
 711   return fsync_dir(vector<string>());
 712 }
 713
 714 int HashIndex::end_split_or_merge(const vector<string> &path) {
 715   return remove_attr_path(vector<string>(), IN_PROGRESS_OP_TAG);
 716 }
 717
 718 int HashIndex::get_info(const vector<string> &path, subdir_info_s *info) {
 719   bufferlist buf;
 720   int r = get_attr_path(path, SUBDIR_ATTR, buf);
 721   if (r < 0)
 722     return r;
 723   bufferlist::iterator bufiter = buf.begin();
 724   info->decode(bufiter);
 725   assert(path.size() == (unsigned)info->hash_level);
 726   return 0;
 727 }
 728
 729 int HashIndex::set_info(const vector<string> &path, const subdir_info_s &info) {
 730   bufferlist buf;
 731   assert(path.size() == (unsigned)info.hash_level);
 732   info.encode(buf);
 733   return add_attr_path(path, SUBDIR_ATTR, buf);
 734 }
 735
 736 bool HashIndex::must_merge(const subdir_info_s &info) {
 737   return (info.hash_level > 0 &&
 738           merge_threshold > 0 &&
 739           info.objs < (unsigned)merge_threshold &&
 740           info.subdirs == 0);
 741 }
 742
 743 bool HashIndex::must_split(const subdir_info_s &info) {
 744   return (info.hash_level < (unsigned)MAX_HASH_LEVEL &&
 745           info.objs > ((unsigned)(abs(merge_threshold) * split_multiplier + settings.split_rand_factor) * 16));
 746
 747 }
 748
 749 int HashIndex::initiate_merge(const vector<string> &path, subdir_info_s info) {
 750   return start_merge(path);
 751 }
 752
 753 int HashIndex::complete_merge(const vector<string> &path, subdir_info_s info) {
 754   vector<string> dst = path;
 755   dst.pop_back();
 756   subdir_info_s dstinfo;
 757   int r, exists;
 758   r = path_exists(path, &exists);
 759   if (r < 0)
 760     return r;
 761   r = get_info(dst, &dstinfo);
 762   if (r < 0)
 763     return r;
 764   if (exists) {
 765     r = move_objects(path, dst);
 766     if (r < 0)
 767       return r;
 768     r = reset_attr(dst);
 769     if (r < 0)
 770       return r;
 771     r = remove_path(path);
 772     if (r < 0)
 773       return r;
 774   }
 775   if (must_merge(dstinfo)) {
 776     r = initiate_merge(dst, dstinfo);
 777     if (r < 0)
 778       return r;
 779     r = fsync_dir(dst);
 780     if (r < 0)
 781       return r;
 782     return complete_merge(dst, dstinfo);
 783   }
 784   r = fsync_dir(dst);
 785   if (r < 0)
 786     return r;
 787   return end_split_or_merge(path);
 788 }
 789
 790 int HashIndex::initiate_split(const vector<string> &path, subdir_info_s info) {
 791   return start_split(path);
 792 }
 793
 794 int HashIndex::complete_split(const vector<string> &path, subdir_info_s info) {
 795   int level = info.hash_level;
 796   map<string, ghobject_t> objects;
 797   vector<string> dst = path;
 798   int r;
 799   dst.push_back("");
 800   r = list_objects(path, 0, 0, &objects);
 801   if (r < 0)
 802     return r;
 803   vector<string> subdirs_vec;
 804   r = list_subdirs(path, &subdirs_vec);
 805   if (r < 0)
 806     return r;
 807   set<string> subdirs;
 808   subdirs.insert(subdirs_vec.begin(), subdirs_vec.end());
 809   map<string, map<string, ghobject_t> > mapped;
 810   map<string, ghobject_t> moved;
 811   int num_moved = 0;
 812   for (map<string, ghobject_t>::iterator i = objects.begin();
 813        i != objects.end();
 814        ++i) {
 815     vector<string> new_path;
 816     get_path_components(i->second, &new_path);
 817     mapped[new_path[level]][i->first] = i->second;
 818   }
 819   for (map<string, map<string, ghobject_t> >::iterator i = mapped.begin();
 820        i != mapped.end();
 821        ) {
 822     dst[level] = i->first;
 823     /* If the info already exists, it must be correct,
 824      * we may be picking up a partially finished split */
 825     subdir_info_s temp;
 826     // subdir has already been fully copied
 827     if (subdirs.count(i->first) && !get_info(dst, &temp)) {
 828       for (map<string, ghobject_t>::iterator j = i->second.begin();
 829            j != i->second.end();
 830            ++j) {
 831         moved[j->first] = j->second;
 832         num_moved++;
 833         objects.erase(j->first);
 834       }
 835       ++i;
 836       continue;
 837     }
 838
 839     subdir_info_s info_new;
 840     info_new.objs = i->second.size();
 841     info_new.subdirs = 0;
 842     info_new.hash_level = level + 1;
 843     if (must_merge(info_new) && !subdirs.count(i->first)) {
 844       mapped.erase(i++);
 845       continue;
 846     }
 847
 848     // Subdir doesn't yet exist
 849     if (!subdirs.count(i->first)) {
 850       info.subdirs += 1;
 851       r = create_path(dst);
 852       if (r < 0)
 853         return r;
 854     } // else subdir has been created but only partially copied
 855
 856     for (map<string, ghobject_t>::iterator j = i->second.begin();
 857          j != i->second.end();
 858          ++j) {
 859       moved[j->first] = j->second;
 860       num_moved++;
 861       objects.erase(j->first);
 862       r = link_object(path, dst, j->second, j->first);
 863       // May be a partially finished split
 864       if (r < 0 && r != -EEXIST) {
 865         return r;
 866       }
 867     }
 868
 869     r = fsync_dir(dst);
 870     if (r < 0)
 871       return r;
 872
 873     // Presence of info must imply that all objects have been copied
 874     r = set_info(dst, info_new);
 875     if (r < 0)
 876       return r;
 877
 878     r = fsync_dir(dst);
 879     if (r < 0)
 880       return r;
 881
 882     ++i;
 883   }
 884   r = remove_objects(path, moved, &objects);
 885   if (r < 0)
 886     return r;
 887   info.objs = objects.size();
 888   r = reset_attr(path);
 889   if (r < 0)
 890     return r;
 891   r = fsync_dir(path);
 892   if (r < 0)
 893     return r;
 894   return end_split_or_merge(path);
 895 }
 896
 897 void HashIndex::get_path_components(const ghobject_t &oid,
 898                                     vector<string> *path) {
 899   char buf[MAX_HASH_LEVEL + 1];
 900   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, (uint32_t)oid.hobj.get_nibblewise_key());
 901
 902   // Path components are the hex characters of oid.hobj.hash, least
 903   // significant first
 904   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
 905     path->push_back(string(&buf[i], 1));
 906   }
 907 }
 908
 909 string HashIndex::get_hash_str(uint32_t hash) {
 910   char buf[MAX_HASH_LEVEL + 1];
 911   snprintf(buf, sizeof(buf), "%.*X", MAX_HASH_LEVEL, hash);
 912   string retval;
 913   for (int i = 0; i < MAX_HASH_LEVEL; ++i) {
 914     retval.push_back(buf[MAX_HASH_LEVEL - 1 - i]);
 915   }
 916   return retval;
 917 }
 918
 919 string HashIndex::get_path_str(const ghobject_t &oid) {
 920   assert(!oid.is_max());
 921   return get_hash_str(oid.hobj.get_hash());
 922 }
 923
 924 uint32_t HashIndex::hash_prefix_to_hash(string prefix) {
 925   while (prefix.size() < sizeof(uint32_t) * 2) {
 926     prefix.push_back('0');
 927   }
 928   uint32_t hash;
 929   sscanf(prefix.c_str(), "%x", &hash);
 930   // nibble reverse
 931   hash = ((hash & 0x0f0f0f0f) << 4) | ((hash & 0xf0f0f0f0) >> 4);
 932   hash = ((hash & 0x00ff00ff) << 8) | ((hash & 0xff00ff00) >> 8);
 933   hash = ((hash & 0x0000ffff) << 16) | ((hash & 0xffff0000) >> 16);
 934   return hash;
 935 }
 936
 937 int HashIndex::get_path_contents_by_hash_bitwise(
 938   const vector<string> &path,
 939   const ghobject_t *next_object,
 940   set<string, CmpHexdigitStringBitwise> *hash_prefixes,
 941   set<pair<string, ghobject_t>, CmpPairBitwise> *objects)
 942 {
 943   map<string, ghobject_t> rev_objects;
 944   int r;
 945   r = list_objects(path, 0, 0, &rev_objects);
 946   if (r < 0)
 947     return r;
 948   // bitwise sort
 949   for (map<string, ghobject_t>::iterator i = rev_objects.begin();
 950        i != rev_objects.end();
 951        ++i) {
 952     if (next_object && i->second < *next_object)
 953       continue;
 954     string hash_prefix = get_path_str(i->second);
 955     hash_prefixes->insert(hash_prefix);
 956     objects->insert(pair<string, ghobject_t>(hash_prefix, i->second));
 957   }
 958   vector<string> subdirs;
 959   r = list_subdirs(path, &subdirs);
 960   if (r < 0)
 961     return r;
 962
 963   // sort subdirs bitwise (by reversing hex digit nibbles)
 964   std::sort(subdirs.begin(), subdirs.end(), cmp_hexdigit_bitwise);
 965
 966   // Local to this function, we will convert the prefix strings
 967   // (previously simply the reversed hex digits) to also have each
 968   // digit's nibbles reversed.  This will make the strings sort
 969   // bitwise.
 970   string cur_prefix;
 971   for (vector<string>::const_iterator i = path.begin();
 972        i != path.end();
 973        ++i) {
 974     cur_prefix.append(reverse_hexdigit_bits_string(*i));
 975   }
 976   string next_object_string;
 977   if (next_object)
 978     next_object_string = reverse_hexdigit_bits_string(get_path_str(*next_object));
 979   for (vector<string>::iterator i = subdirs.begin();
 980        i != subdirs.end();
 981        ++i) {
 982     string candidate = cur_prefix + reverse_hexdigit_bits_string(*i);
 983     if (next_object) {
 984       if (next_object->is_max())
 985         continue;
 986       if (candidate < next_object_string.substr(0, candidate.size()))
 987         continue;
 988     }
 989     // re-reverse the hex digit nibbles for the caller
 990     hash_prefixes->insert(reverse_hexdigit_bits_string(candidate));
 991   }
 992   return 0;
 993 }
 994
 995 int HashIndex::list_by_hash(const vector<string> &path,
 996                             const ghobject_t &end,
 997                             int max_count,
 998                             ghobject_t *next,
 999                             vector<ghobject_t> *out)
1000 {
1001   assert(out);
1002   return list_by_hash_bitwise(path, end, max_count, next, out);
1003 }
1004
1005 int HashIndex::list_by_hash_bitwise(
1006   const vector<string> &path,
1007   const ghobject_t& end,
1008   int max_count,
1009   ghobject_t *next,
1010   vector<ghobject_t> *out)
1011 {
1012   vector<string> next_path = path;
1013   next_path.push_back("");
1014   set<string, CmpHexdigitStringBitwise> hash_prefixes;
1015   set<pair<string, ghobject_t>, CmpPairBitwise> objects;
1016   int r = get_path_contents_by_hash_bitwise(path,
1017                                             next,
1018                                             &hash_prefixes,
1019                                             &objects);
1020   if (r < 0)
1021     return r;
1022   for (set<string, CmpHexdigitStringBitwise>::iterator i = hash_prefixes.begin();
1023        i != hash_prefixes.end();
1024        ++i) {
1025     dout(20) << __func__ << " prefix " << *i << dendl;
1026     set<pair<string, ghobject_t>, CmpPairBitwise>::iterator j = objects.lower_bound(
1027       make_pair(*i, ghobject_t()));
1028     if (j == objects.end() || j->first != *i) {
1029       *(next_path.rbegin()) = *(i->rbegin());
1030       ghobject_t next_recurse;
1031       if (next)
1032         next_recurse = *next;
1033       r = list_by_hash_bitwise(next_path,
1034                                end,
1035                                max_count,
1036                                &next_recurse,
1037                                out);
1038
1039       if (r < 0)
1040         return r;
1041       if (!next_recurse.is_max()) {
1042         if (next)
1043           *next = next_recurse;
1044         return 0;
1045       }
1046     } else {
1047       while (j != objects.end() && j->first == *i) {
1048         if (max_count > 0 && out->size() == (unsigned)max_count) {
1049           if (next)
1050             *next = j->second;
1051           return 0;
1052         }
1053         if (j->second >= end) {
1054           if (next)
1055             *next = j->second;
1056           return 0;
1057         }
1058         if (!next || j->second >= *next) {
1059           dout(20) << __func__ << " prefix " << *i << " ob " << j->second << dendl;
1060           out->push_back(j->second);
1061         }
1062         ++j;
1063       }
1064     }
1065   }
1066   if (next)
1067     *next = ghobject_t::get_max();
1068   return 0;
1069 }
1070
1071