]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | #include <random> | |
5 | ||
6 | #include "FastCDC.h" | |
7 | ||
8 | ||
9 | // Unlike FastCDC described in the paper, if we are close to the | |
10 | // target, use the target mask. If we are very small or very large, | |
11 | // use an adjusted mask--like the paper. This tries to keep more | |
12 | // cut points using the same mask, and fewer using the small or large | |
13 | // masks. | |
14 | ||
15 | // How many more/fewer bits to set in the small/large masks. | |
16 | // | |
17 | // This is the "normalization level" or "NC level" in the FastCDC | |
18 | // paper. | |
19 | #define TARGET_WINDOW_MASK_BITS 2 | |
20 | ||
21 | // How big the 'target window' is (in which we use the target mask). | |
22 | // | |
23 | // In the FastCDC paper, this is always 0: there is not "target | |
24 | // window," and either small_mask (maskS) or large_mask (maskL) is | |
25 | // used--never target_mask (maskA). | |
26 | #define TARGET_WINDOW_BITS 1 | |
27 | ||
28 | // How many bits larger/smaller than target for hard limits on chunk | |
29 | // size. | |
30 | // | |
31 | // We assume the min and max sizes are always this many bits | |
32 | // larger/smaller than the target. (Note that the FastCDC paper 8KB | |
33 | // example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits | |
34 | // larger), although it is not clear why they chose those values.) | |
35 | #define SIZE_WINDOW_BITS 2 | |
36 | ||
37 | void FastCDC::_setup(int target, int size_window_bits) | |
38 | { | |
39 | target_bits = target; | |
40 | ||
41 | if (!size_window_bits) { | |
42 | size_window_bits = SIZE_WINDOW_BITS; | |
43 | } | |
44 | min_bits = target - size_window_bits; | |
45 | max_bits = target + size_window_bits; | |
46 | ||
47 | std::mt19937_64 engine; | |
48 | ||
49 | // prefill table | |
50 | for (unsigned i = 0; i < 256; ++i) { | |
51 | table[i] = engine(); | |
52 | } | |
53 | ||
54 | // set mask | |
55 | int did = 0; | |
56 | uint64_t m = 0; | |
57 | while (did < target_bits + TARGET_WINDOW_MASK_BITS) { | |
58 | uint64_t bit = 1ull << (engine() & 63); | |
59 | if (m & bit) { | |
60 | continue; // this bit is already set | |
61 | } | |
62 | m |= bit; | |
63 | ++did; | |
64 | if (did == target_bits - TARGET_WINDOW_MASK_BITS) { | |
65 | large_mask = m; | |
66 | } else if (did == target_bits) { | |
67 | target_mask = m; | |
68 | } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) { | |
69 | small_mask = m; | |
70 | } | |
71 | } | |
72 | } | |
73 | ||
74 | static inline bool _scan( | |
75 | // these are our cursor/postion... | |
76 | bufferlist::buffers_t::const_iterator *p, | |
77 | const char **pp, const char **pe, | |
78 | size_t& pos, | |
79 | size_t max, // how much to read | |
80 | uint64_t& fp, // fingerprint | |
81 | uint64_t mask, const uint64_t *table) | |
82 | { | |
83 | while (pos < max) { | |
84 | if (*pp == *pe) { | |
85 | ++(*p); | |
86 | *pp = (*p)->c_str(); | |
87 | *pe = *pp + (*p)->length(); | |
88 | } | |
89 | const char *te = std::min(*pe, *pp + max - pos); | |
90 | for (; *pp < te; ++(*pp), ++pos) { | |
91 | if ((fp & mask) == mask) { | |
92 | return false; | |
93 | } | |
94 | fp = (fp << 1) ^ table[*(unsigned char*)*pp]; | |
95 | } | |
96 | if (pos >= max) { | |
97 | return true; | |
98 | } | |
99 | } | |
100 | return true; | |
101 | } | |
102 | ||
103 | void FastCDC::calc_chunks( | |
104 | const bufferlist& bl, | |
105 | std::vector<std::pair<uint64_t, uint64_t>> *chunks) const | |
106 | { | |
107 | if (bl.length() == 0) { | |
108 | return; | |
109 | } | |
110 | auto p = bl.buffers().begin(); | |
111 | const char *pp = p->c_str(); | |
112 | const char *pe = pp + p->length(); | |
113 | ||
114 | size_t pos = 0; | |
115 | size_t len = bl.length(); | |
116 | while (pos < len) { | |
117 | size_t cstart = pos; | |
118 | uint64_t fp = 0; | |
119 | ||
120 | // are we left with a min-sized (or smaller) chunk? | |
121 | if (len - pos <= (1ul << min_bits)) { | |
122 | chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos)); | |
123 | break; | |
124 | } | |
125 | ||
126 | // skip forward to the min chunk size cut point (minus the window, so | |
127 | // we can initialize the rolling fingerprint). | |
128 | size_t skip = (1 << min_bits) - window; | |
129 | pos += skip; | |
130 | while (skip) { | |
131 | size_t s = std::min<size_t>(pe - pp, skip); | |
132 | skip -= s; | |
133 | pp += s; | |
134 | if (pp == pe) { | |
135 | ++p; | |
136 | pp = p->c_str(); | |
137 | pe = pp + p->length(); | |
138 | } | |
139 | } | |
140 | ||
141 | // first fill the window | |
142 | size_t max = pos + window; | |
143 | while (pos < max) { | |
144 | if (pp == pe) { | |
145 | ++p; | |
146 | pp = p->c_str(); | |
147 | pe = pp + p->length(); | |
148 | } | |
149 | const char *te = std::min(pe, pp + (max - pos)); | |
150 | for (; pp < te; ++pp, ++pos) { | |
151 | fp = (fp << 1) ^ table[*(unsigned char*)pp]; | |
152 | } | |
153 | } | |
154 | ceph_assert(pos < len); | |
155 | ||
156 | // find an end marker | |
157 | if ( | |
158 | // for the first "small" region | |
159 | _scan(&p, &pp, &pe, pos, | |
160 | std::min(len, cstart + (1 << (target_bits - TARGET_WINDOW_BITS))), | |
161 | fp, small_mask, table) && | |
162 | // for the middle range (close to our target) | |
163 | (TARGET_WINDOW_BITS == 0 || | |
164 | _scan(&p, &pp, &pe, pos, | |
165 | std::min(len, cstart + (1 << (target_bits + TARGET_WINDOW_BITS))), | |
166 | fp, target_mask, table)) && | |
167 | // we're past target, use large_mask! | |
168 | _scan(&p, &pp, &pe, pos, | |
169 | std::min(len, | |
170 | cstart + (1 << max_bits)), | |
171 | fp, large_mask, table)) | |
172 | ; | |
173 | ||
174 | chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart)); | |
175 | } | |
176 | } |