]>
git.proxmox.com Git - ceph.git/blob - ceph/src/common/FastCDC.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
9 // Unlike FastCDC described in the paper, if we are close to the
10 // target, use the target mask. If we are very small or very large,
11 // use an adjusted mask--like the paper. This tries to keep more
12 // cut points using the same mask, and fewer using the small or large
15 // How many more/fewer bits to set in the small/large masks.
17 // This is the "normalization level" or "NC level" in the FastCDC
19 #define TARGET_WINDOW_MASK_BITS 2
21 // How big the 'target window' is (in which we use the target mask).
23 // In the FastCDC paper, this is always 0: there is not "target
24 // window," and either small_mask (maskS) or large_mask (maskL) is
25 // used--never target_mask (maskA).
26 #define TARGET_WINDOW_BITS 1
28 // How many bits larger/smaller than target for hard limits on chunk
31 // We assume the min and max sizes are always this many bits
32 // larger/smaller than the target. (Note that the FastCDC paper 8KB
33 // example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
34 // larger), although it is not clear why they chose those values.)
35 #define SIZE_WINDOW_BITS 2
37 void FastCDC::_setup(int target
, int size_window_bits
)
41 if (!size_window_bits
) {
42 size_window_bits
= SIZE_WINDOW_BITS
;
44 min_bits
= target
- size_window_bits
;
45 max_bits
= target
+ size_window_bits
;
47 std::mt19937_64 engine
;
50 for (unsigned i
= 0; i
< 256; ++i
) {
57 while (did
< target_bits
+ TARGET_WINDOW_MASK_BITS
) {
58 uint64_t bit
= 1ull << (engine() & 63);
60 continue; // this bit is already set
64 if (did
== target_bits
- TARGET_WINDOW_MASK_BITS
) {
66 } else if (did
== target_bits
) {
68 } else if (did
== target_bits
+ TARGET_WINDOW_MASK_BITS
) {
74 static inline bool _scan(
75 // these are our cursor/postion...
76 bufferlist::buffers_t::const_iterator
*p
,
77 const char **pp
, const char **pe
,
79 size_t max
, // how much to read
80 uint64_t& fp
, // fingerprint
81 uint64_t mask
, const uint64_t *table
)
87 *pe
= *pp
+ (*p
)->length();
89 const char *te
= std::min(*pe
, *pp
+ max
- pos
);
90 for (; *pp
< te
; ++(*pp
), ++pos
) {
91 if ((fp
& mask
) == mask
) {
94 fp
= (fp
<< 1) ^ table
[*(unsigned char*)*pp
];
103 void FastCDC::calc_chunks(
104 const bufferlist
& bl
,
105 std::vector
<std::pair
<uint64_t, uint64_t>> *chunks
) const
107 if (bl
.length() == 0) {
110 auto p
= bl
.buffers().begin();
111 const char *pp
= p
->c_str();
112 const char *pe
= pp
+ p
->length();
115 size_t len
= bl
.length();
120 // are we left with a min-sized (or smaller) chunk?
121 if (len
- pos
<= (1ul << min_bits
)) {
122 chunks
->push_back(std::pair
<uint64_t,uint64_t>(pos
, len
- pos
));
126 // skip forward to the min chunk size cut point (minus the window, so
127 // we can initialize the rolling fingerprint).
128 size_t skip
= (1 << min_bits
) - window
;
131 size_t s
= std::min
<size_t>(pe
- pp
, skip
);
137 pe
= pp
+ p
->length();
141 // first fill the window
142 size_t max
= pos
+ window
;
147 pe
= pp
+ p
->length();
149 const char *te
= std::min(pe
, pp
+ (max
- pos
));
150 for (; pp
< te
; ++pp
, ++pos
) {
151 fp
= (fp
<< 1) ^ table
[*(unsigned char*)pp
];
154 ceph_assert(pos
< len
);
156 // find an end marker
158 // for the first "small" region
159 _scan(&p
, &pp
, &pe
, pos
,
160 std::min(len
, cstart
+ (1 << (target_bits
- TARGET_WINDOW_BITS
))),
161 fp
, small_mask
, table
) &&
162 // for the middle range (close to our target)
163 (TARGET_WINDOW_BITS
== 0 ||
164 _scan(&p
, &pp
, &pe
, pos
,
165 std::min(len
, cstart
+ (1 << (target_bits
+ TARGET_WINDOW_BITS
))),
166 fp
, target_mask
, table
)) &&
167 // we're past target, use large_mask!
168 _scan(&p
, &pp
, &pe
, pos
,
170 cstart
+ (1 << max_bits
)),
171 fp
, large_mask
, table
))
174 chunks
->push_back(std::pair
<uint64_t,uint64_t>(cstart
, pos
- cstart
));