ceph/src/common/FastCDC.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3
   4 #include <random>
   5
   6 #include "FastCDC.h"
   7
   8
   9 // Unlike FastCDC described in the paper, if we are close to the
  10 // target, use the target mask.  If we are very small or very large,
  11 // use an adjusted mask--like the paper.  This tries to keep more
  12 // cut points using the same mask, and fewer using the small or large
  13 // masks.
  14
  15 // How many more/fewer bits to set in the small/large masks.
  16 //
  17 // This is the "normalization level" or "NC level" in the FastCDC
  18 // paper.
  19 #define TARGET_WINDOW_MASK_BITS  2
  20
  21 // How big the 'target window' is (in which we use the target mask).
  22 //
  23 // In the FastCDC paper, this is always 0: there is not "target
  24 // window," and either small_mask (maskS) or large_mask (maskL) is
  25 // used--never target_mask (maskA).
  26 #define TARGET_WINDOW_BITS       1
  27
  28 // How many bits larger/smaller than target for hard limits on chunk
  29 // size.
  30 //
  31 // We assume the min and max sizes are always this many bits
  32 // larger/smaller than the target.  (Note that the FastCDC paper 8KB
  33 // example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
  34 // larger), although it is not clear why they chose those values.)
  35 #define SIZE_WINDOW_BITS         2
  36
  37 void FastCDC::_setup(int target, int size_window_bits)
  38 {
  39   target_bits = target;
  40
  41   if (!size_window_bits) {
  42     size_window_bits = SIZE_WINDOW_BITS;
  43   }
  44   min_bits = target - size_window_bits;
  45   max_bits = target + size_window_bits;
  46
  47   std::mt19937_64 engine;
  48
  49   // prefill table
  50   for (unsigned i = 0; i < 256; ++i) {
  51     table[i] = engine();
  52   }
  53
  54   // set mask
  55   int did = 0;
  56   uint64_t m = 0;
  57   while (did < target_bits + TARGET_WINDOW_MASK_BITS) {
  58     uint64_t bit = 1ull << (engine() & 63);
  59     if (m & bit) {
  60       continue; // this bit is already set
  61     }
  62     m |= bit;
  63     ++did;
  64     if (did == target_bits - TARGET_WINDOW_MASK_BITS) {
  65       large_mask = m;
  66     } else if (did == target_bits) {
  67       target_mask = m;
  68     } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) {
  69       small_mask = m;
  70     }
  71   }
  72 }
  73
  74 static inline bool _scan(
  75   // these are our cursor/postion...
  76   bufferlist::buffers_t::const_iterator *p,
  77   const char **pp, const char **pe,
  78   size_t& pos,
  79   size_t max,   // how much to read
  80   uint64_t& fp, // fingerprint
  81   uint64_t mask, const uint64_t *table)
  82 {
  83   while (pos < max) {
  84     if (*pp == *pe) {
  85       ++(*p);
  86       *pp = (*p)->c_str();
  87       *pe = *pp + (*p)->length();
  88     }
  89     const char *te = std::min(*pe, *pp + max - pos);
  90     for (; *pp < te; ++(*pp), ++pos) {
  91       if ((fp & mask) == mask) {
  92         return false;
  93       }
  94       fp = (fp << 1) ^ table[*(unsigned char*)*pp];
  95     }
  96     if (pos >= max) {
  97       return true;
  98     }
  99   }
 100   return true;
 101 }
 102
 103 void FastCDC::calc_chunks(
 104   const bufferlist& bl,
 105   std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
 106 {
 107   if (bl.length() == 0) {
 108     return;
 109   }
 110   auto p = bl.buffers().begin();
 111   const char *pp = p->c_str();
 112   const char *pe = pp + p->length();
 113
 114   size_t pos = 0;
 115   size_t len = bl.length();
 116   while (pos < len) {
 117     size_t cstart = pos;
 118     uint64_t fp = 0;
 119
 120     // are we left with a min-sized (or smaller) chunk?
 121     if (len - pos <= (1ul << min_bits)) {
 122       chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos));
 123       break;
 124     }
 125
 126     // skip forward to the min chunk size cut point (minus the window, so
 127     // we can initialize the rolling fingerprint).
 128     size_t skip = (1 << min_bits) - window;
 129     pos += skip;
 130     while (skip) {
 131       size_t s = std::min<size_t>(pe - pp, skip);
 132       skip -= s;
 133       pp += s;
 134       if (pp == pe) {
 135         ++p;
 136         pp = p->c_str();
 137         pe = pp + p->length();
 138       }
 139     }
 140
 141     // first fill the window
 142     size_t max = pos + window;
 143     while (pos < max) {
 144       if (pp == pe) {
 145         ++p;
 146         pp = p->c_str();
 147         pe = pp + p->length();
 148       }
 149       const char *te = std::min(pe, pp + (max - pos));
 150       for (; pp < te; ++pp, ++pos) {
 151         fp = (fp << 1) ^ table[*(unsigned char*)pp];
 152       }
 153     }
 154     ceph_assert(pos < len);
 155
 156     // find an end marker
 157     if (
 158       // for the first "small" region
 159       _scan(&p, &pp, &pe, pos,
 160             std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))),
 161             fp, small_mask, table) &&
 162       // for the middle range (close to our target)
 163       (TARGET_WINDOW_BITS == 0 ||
 164        _scan(&p, &pp, &pe, pos,
 165              std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))),
 166              fp, target_mask, table)) &&
 167       // we're past target, use large_mask!
 168       _scan(&p, &pp, &pe, pos,
 169             std::min(len,
 170                      cstart + (1 << max_bits)),
 171             fp, large_mask, table))
 172       ;
 173
 174     chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart));
 175   }
 176 }