[ceph.git] / ceph / src / common / FastCDC.cc

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <random>

#include "FastCDC.h"


// Unlike FastCDC described in the paper, if we are close to the
// target, use the target mask.  If we are very small or very large,
// use an adjusted mask--like the paper.  This tries to keep more
// cut points using the same mask, and fewer using the small or large
// masks.

// How many more/fewer bits to set in the small/large masks.
//
// This is the "normalization level" or "NC level" in the FastCDC
// paper.
#define TARGET_WINDOW_MASK_BITS  2

// How big the 'target window' is (in which we use the target mask).
//
// In the FastCDC paper, this is always 0: there is not "target
// window," and either small_mask (maskS) or large_mask (maskL) is
// used--never target_mask (maskA).
#define TARGET_WINDOW_BITS       1

// How many bits larger/smaller than target for hard limits on chunk
// size.
//
// We assume the min and max sizes are always this many bits
// larger/smaller than the target.  (Note that the FastCDC paper 8KB
// example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
// larger), although it is not clear why they chose those values.)
#define SIZE_WINDOW_BITS         2

void FastCDC::_setup(int target, int size_window_bits)
{
  target_bits = target;

  if (!size_window_bits) {
    size_window_bits = SIZE_WINDOW_BITS;
  }
  min_bits = target - size_window_bits;
  max_bits = target + size_window_bits;

  std::mt19937_64 engine;

  // prefill table
  for (unsigned i = 0; i < 256; ++i) {
    table[i] = engine();
  }

  // set mask
  int did = 0;
  uint64_t m = 0;
  while (did < target_bits + TARGET_WINDOW_MASK_BITS) {
    uint64_t bit = 1ull << (engine() & 63);
    if (m & bit) {
      continue;	// this bit is already set
    }
    m |= bit;
    ++did;
    if (did == target_bits - TARGET_WINDOW_MASK_BITS) {
      large_mask = m;
    } else if (did == target_bits) {
      target_mask = m;
    } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) {
      small_mask = m;
    }
  }
}

static inline bool _scan(
  // these are our cursor/postion...
  bufferlist::buffers_t::const_iterator *p,
  const char **pp, const char **pe,
  size_t& pos,
  size_t max,   // how much to read
  uint64_t& fp, // fingerprint
  uint64_t mask, const uint64_t *table)
{
  while (pos < max) {
    if (*pp == *pe) {
      ++(*p);
      *pp = (*p)->c_str();
      *pe = *pp + (*p)->length();
    }
    const char *te = std::min(*pe, *pp + max - pos);
    for (; *pp < te; ++(*pp), ++pos) {
      if ((fp & mask) == mask) {
	return false;
      }
      fp = (fp << 1) ^ table[*(unsigned char*)*pp];
    }
    if (pos >= max) {
      return true;
    }
  }
  return true;
}

void FastCDC::calc_chunks(
  const bufferlist& bl,
  std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
{
  if (bl.length() == 0) {
    return;
  }
  auto p = bl.buffers().begin();
  const char *pp = p->c_str();
  const char *pe = pp + p->length();

  size_t pos = 0;
  size_t len = bl.length();
  while (pos < len) {
    size_t cstart = pos;
    uint64_t fp = 0;

    // are we left with a min-sized (or smaller) chunk?
    if (len - pos <= (1ul << min_bits)) {
      chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos));
      break;
    }

    // skip forward to the min chunk size cut point (minus the window, so
    // we can initialize the rolling fingerprint).
    size_t skip = (1 << min_bits) - window;
    pos += skip;
    while (skip) {
      size_t s = std::min<size_t>(pe - pp, skip);
      skip -= s;
      pp += s;
      if (pp == pe) {
	++p;
	pp = p->c_str();
	pe = pp + p->length();
      }
    }

    // first fill the window
    size_t max = pos + window;
    while (pos < max) {
      if (pp == pe) {
	++p;
	pp = p->c_str();
	pe = pp + p->length();
      }
      const char *te = std::min(pe, pp + (max - pos));
      for (; pp < te; ++pp, ++pos) {
	fp = (fp << 1) ^ table[*(unsigned char*)pp];
      }
    }
    ceph_assert(pos < len);

    // find an end marker
    if (
      // for the first "small" region
      _scan(&p, &pp, &pe, pos,
	    std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))),
	    fp, small_mask, table) &&
      // for the middle range (close to our target)
      (TARGET_WINDOW_BITS == 0 ||
       _scan(&p, &pp, &pe, pos,
	     std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))),
	     fp, target_mask, table)) &&
      // we're past target, use large_mask!
      _scan(&p, &pp, &pe, pos,
	    std::min(len,
		     cstart + (1 << max_bits)),
	    fp, large_mask, table))
      ;

    chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart));
  }
}
Commit	Line	Data
f67539c2 TL	1	// -- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t --
	2	// vim: ts=8 sw=2 smarttab
	3
	4	#include <random>
	5
	6	#include "FastCDC.h"
	7
	8
	9	// Unlike FastCDC described in the paper, if we are close to the
	10	// target, use the target mask. If we are very small or very large,
	11	// use an adjusted mask--like the paper. This tries to keep more
	12	// cut points using the same mask, and fewer using the small or large
	13	// masks.
	14
	15	// How many more/fewer bits to set in the small/large masks.
	16	//
	17	// This is the "normalization level" or "NC level" in the FastCDC
	18	// paper.
	19	#define TARGET_WINDOW_MASK_BITS 2
	20
	21	// How big the 'target window' is (in which we use the target mask).
	22	//
	23	// In the FastCDC paper, this is always 0: there is not "target
	24	// window," and either small_mask (maskS) or large_mask (maskL) is
	25	// used--never target_mask (maskA).
	26	#define TARGET_WINDOW_BITS 1
	27
	28	// How many bits larger/smaller than target for hard limits on chunk
	29	// size.
	30	//
	31	// We assume the min and max sizes are always this many bits
	32	// larger/smaller than the target. (Note that the FastCDC paper 8KB
	33	// example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
	34	// larger), although it is not clear why they chose those values.)
	35	#define SIZE_WINDOW_BITS 2
	36
	37	void FastCDC::_setup(int target, int size_window_bits)
	38	{
	39	target_bits = target;
	40
	41	if (!size_window_bits) {
	42	size_window_bits = SIZE_WINDOW_BITS;
	43	}
	44	min_bits = target - size_window_bits;
	45	max_bits = target + size_window_bits;
	46
	47	std::mt19937_64 engine;
	48
	49	// prefill table
	50	for (unsigned i = 0; i < 256; ++i) {
	51	table[i] = engine();
	52	}
	53
	54	// set mask
	55	int did = 0;
	56	uint64_t m = 0;
	57	while (did < target_bits + TARGET_WINDOW_MASK_BITS) {
	58	uint64_t bit = 1ull << (engine() & 63);
	59	if (m & bit) {
	60	continue; // this bit is already set
	61	}
	62	m \|= bit;
	63	++did;
	64	if (did == target_bits - TARGET_WINDOW_MASK_BITS) {
65	large_mask = m;
66	} else if (did == target_bits) {
67	target_mask = m;
68	} else if (did == target_bits + TARGET_WINDOW_MASK_BITS) {
69	small_mask = m;
70	}
71	}
72	}
73
74	static inline bool _scan(
75	// these are our cursor/postion...
76	bufferlist::buffers_t::const_iterator *p,
77	const char pp, const char pe,
78	size_t& pos,
79	size_t max, // how much to read
80	uint64_t& fp, // fingerprint
81	uint64_t mask, const uint64_t *table)
82	{
83	while (pos < max) {
84	if (pp == pe) {
85	++(*p);
86	pp = (p)->c_str();
87	pe = pp + (*p)->length();
88	}
89	const char te = std::min(pe, *pp + max - pos);
90	for (; pp < te; ++(pp), ++pos) {
91	if ((fp & mask) == mask) {
92	return false;
93	}
94	fp = (fp << 1) ^ table[(unsigned char)*pp];
95	}
96	if (pos >= max) {
97	return true;
98	}
99	}
100	return true;
101	}
102
103	void FastCDC::calc_chunks(
104	const bufferlist& bl,
105	std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
106	{
107	if (bl.length() == 0) {
108	return;
109	}
110	auto p = bl.buffers().begin();
111	const char *pp = p->c_str();
112	const char *pe = pp + p->length();
113
114	size_t pos = 0;
115	size_t len = bl.length();
116	while (pos < len) {
117	size_t cstart = pos;
118	uint64_t fp = 0;
119
120	// are we left with a min-sized (or smaller) chunk?
121	if (len - pos <= (1ul << min_bits)) {
122	chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos));
123	break;
124	}
125
126	// skip forward to the min chunk size cut point (minus the window, so
127	// we can initialize the rolling fingerprint).
128	size_t skip = (1 << min_bits) - window;
129	pos += skip;
130	while (skip) {
131	size_t s = std::min<size_t>(pe - pp, skip);
132	skip -= s;
133	pp += s;
134	if (pp == pe) {
135	++p;
136	pp = p->c_str();
137	pe = pp + p->length();
138	}
139	}
140
141	// first fill the window
142	size_t max = pos + window;
143	while (pos < max) {
144	if (pp == pe) {
145	++p;
146	pp = p->c_str();
147	pe = pp + p->length();
148	}
149	const char *te = std::min(pe, pp + (max - pos));
150	for (; pp < te; ++pp, ++pos) {
151	fp = (fp << 1) ^ table[(unsigned char)pp];
152	}
153	}
154	ceph_assert(pos < len);
155
156	// find an end marker
157	if (
158	// for the first "small" region
159	_scan(&p, &pp, &pe, pos,
160	std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))),
161	fp, small_mask, table) &&
162	// for the middle range (close to our target)
163	(TARGET_WINDOW_BITS == 0 \|\|
164	_scan(&p, &pp, &pe, pos,
165	std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))),
166	fp, target_mask, table)) &&
167	// we're past target, use large_mask!
168	_scan(&p, &pp, &pe, pos,
169	std::min(len,
170	cstart + (1 << max_bits)),
171	fp, large_mask, table))
172	;
173
174	chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart));
175	}
176	}