]> git.proxmox.com Git - ceph.git/blame - ceph/src/common/FastCDC.cc
import ceph quincy 17.2.6
[ceph.git] / ceph / src / common / FastCDC.cc
CommitLineData
f67539c2
TL
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include <random>
5
6#include "FastCDC.h"
7
8
9// Unlike FastCDC described in the paper, if we are close to the
10// target, use the target mask. If we are very small or very large,
11// use an adjusted mask--like the paper. This tries to keep more
12// cut points using the same mask, and fewer using the small or large
13// masks.
14
15// How many more/fewer bits to set in the small/large masks.
16//
17// This is the "normalization level" or "NC level" in the FastCDC
18// paper.
19#define TARGET_WINDOW_MASK_BITS 2
20
21// How big the 'target window' is (in which we use the target mask).
22//
23// In the FastCDC paper, this is always 0: there is not "target
24// window," and either small_mask (maskS) or large_mask (maskL) is
25// used--never target_mask (maskA).
26#define TARGET_WINDOW_BITS 1
27
28// How many bits larger/smaller than target for hard limits on chunk
29// size.
30//
31// We assume the min and max sizes are always this many bits
32// larger/smaller than the target. (Note that the FastCDC paper 8KB
33// example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
34// larger), although it is not clear why they chose those values.)
35#define SIZE_WINDOW_BITS 2
36
37void FastCDC::_setup(int target, int size_window_bits)
38{
39 target_bits = target;
40
41 if (!size_window_bits) {
42 size_window_bits = SIZE_WINDOW_BITS;
43 }
44 min_bits = target - size_window_bits;
45 max_bits = target + size_window_bits;
46
47 std::mt19937_64 engine;
48
49 // prefill table
50 for (unsigned i = 0; i < 256; ++i) {
51 table[i] = engine();
52 }
53
54 // set mask
55 int did = 0;
56 uint64_t m = 0;
57 while (did < target_bits + TARGET_WINDOW_MASK_BITS) {
58 uint64_t bit = 1ull << (engine() & 63);
59 if (m & bit) {
60 continue; // this bit is already set
61 }
62 m |= bit;
63 ++did;
64 if (did == target_bits - TARGET_WINDOW_MASK_BITS) {
65 large_mask = m;
66 } else if (did == target_bits) {
67 target_mask = m;
68 } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) {
69 small_mask = m;
70 }
71 }
72}
73
74static inline bool _scan(
75 // these are our cursor/postion...
76 bufferlist::buffers_t::const_iterator *p,
77 const char **pp, const char **pe,
78 size_t& pos,
79 size_t max, // how much to read
80 uint64_t& fp, // fingerprint
81 uint64_t mask, const uint64_t *table)
82{
83 while (pos < max) {
84 if (*pp == *pe) {
85 ++(*p);
86 *pp = (*p)->c_str();
87 *pe = *pp + (*p)->length();
88 }
89 const char *te = std::min(*pe, *pp + max - pos);
90 for (; *pp < te; ++(*pp), ++pos) {
91 if ((fp & mask) == mask) {
92 return false;
93 }
94 fp = (fp << 1) ^ table[*(unsigned char*)*pp];
95 }
96 if (pos >= max) {
97 return true;
98 }
99 }
100 return true;
101}
102
103void FastCDC::calc_chunks(
104 const bufferlist& bl,
105 std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
106{
107 if (bl.length() == 0) {
108 return;
109 }
110 auto p = bl.buffers().begin();
111 const char *pp = p->c_str();
112 const char *pe = pp + p->length();
113
114 size_t pos = 0;
115 size_t len = bl.length();
116 while (pos < len) {
117 size_t cstart = pos;
118 uint64_t fp = 0;
119
120 // are we left with a min-sized (or smaller) chunk?
121 if (len - pos <= (1ul << min_bits)) {
122 chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos));
123 break;
124 }
125
126 // skip forward to the min chunk size cut point (minus the window, so
127 // we can initialize the rolling fingerprint).
128 size_t skip = (1 << min_bits) - window;
129 pos += skip;
130 while (skip) {
131 size_t s = std::min<size_t>(pe - pp, skip);
132 skip -= s;
133 pp += s;
134 if (pp == pe) {
135 ++p;
136 pp = p->c_str();
137 pe = pp + p->length();
138 }
139 }
140
141 // first fill the window
142 size_t max = pos + window;
143 while (pos < max) {
144 if (pp == pe) {
145 ++p;
146 pp = p->c_str();
147 pe = pp + p->length();
148 }
149 const char *te = std::min(pe, pp + (max - pos));
150 for (; pp < te; ++pp, ++pos) {
151 fp = (fp << 1) ^ table[*(unsigned char*)pp];
152 }
153 }
154 ceph_assert(pos < len);
155
156 // find an end marker
157 if (
158 // for the first "small" region
159 _scan(&p, &pp, &pe, pos,
160 std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))),
161 fp, small_mask, table) &&
162 // for the middle range (close to our target)
163 (TARGET_WINDOW_BITS == 0 ||
164 _scan(&p, &pp, &pe, pos,
165 std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))),
166 fp, target_mask, table)) &&
167 // we're past target, use large_mask!
168 _scan(&p, &pp, &pe, pos,
169 std::min(len,
170 cstart + (1 << max_bits)),
171 fp, large_mask, table))
172 ;
173
174 chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart));
175 }
176}