]>
Commit | Line | Data |
---|---|---|
06178f13 WB |
1 | /// Note: window size 32 or 64, is faster because we can |
2 | /// speedup modulo operations, but always computes hash 0 | |
3 | /// for constant data streams .. 0,0,0,0,0,0 | |
4 | /// so we use a modified the chunk boundary test too not | |
5 | /// use hash value 0 to detect a boundary. | |
6 | const CA_CHUNKER_WINDOW_SIZE: usize = 64; | |
7 | ||
2ff4c2cd | 8 | /// Sliding window chunker (Buzhash) |
06178f13 WB |
9 | /// |
10 | /// This is a rewrite of *casync* chunker (cachunker.h) in rust. | |
11 | /// | |
12 | /// Hashing by cyclic polynomial (also called Buzhash) has the benefit | |
13 | /// of avoiding multiplications, using barrel shifts instead. For more | |
14 | /// information please take a look at the [Rolling | |
2ff4c2cd TL |
15 | /// Hash](https://en.wikipedia.org/wiki/Rolling_hash) article from |
16 | /// Wikipedia. | |
06178f13 WB |
17 | |
18 | pub struct Chunker { | |
19 | h: u32, | |
20 | window_size: usize, | |
21 | chunk_size: usize, | |
22 | ||
23 | chunk_size_min: usize, | |
24 | chunk_size_max: usize, | |
25 | _chunk_size_avg: usize, | |
26 | ||
27 | _discriminator: u32, | |
28 | ||
29 | break_test_mask: u32, | |
30 | break_test_minimum: u32, | |
31 | ||
32 | window: [u8; CA_CHUNKER_WINDOW_SIZE], | |
33 | } | |
34 | ||
35 | const BUZHASH_TABLE: [u32; 256] = [ | |
7d83440c WB |
36 | 0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68, 0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801, |
37 | 0x7ebf5191, 0x841135c7, 0x65cc53b3, 0x280a597c, 0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494, | |
38 | 0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa, 0x882bf287, 0x3116737c, 0x05569956, 0xe8cc1f68, | |
39 | 0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7, 0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93, | |
40 | 0x9bfd7c64, 0x0b3e7276, 0xf2688e77, 0x8fad8abc, 0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00, | |
41 | 0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2, 0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2, | |
42 | 0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e, 0x7b7c222f, 0x2955ed16, 0x9f10ca59, 0xe840c4c9, | |
43 | 0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c, 0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5, | |
44 | 0xb19165cd, 0x9891c393, 0x325384ac, 0x0308459d, 0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed, | |
45 | 0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae, 0x977eb18c, 0xd8770976, 0x9833466a, 0xc674df7f, | |
46 | 0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874, 0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4, | |
47 | 0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493, 0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85, | |
48 | 0xbef8f0e1, 0x21d73653, 0x4e3d977a, 0x1e7b3929, 0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6, | |
49 | 0x275f0892, 0xc8acc273, 0x4cc21580, 0xecc4a617, 0xf5f7be70, 0xe795248a, 0x375a2fe9, 0x425570b6, | |
50 | 0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f, 0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f, | |
51 | 0x1bc0dfb5, 0xfb273589, 0x0ea47f7b, 0x3c1c2b50, 0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2, | |
52 | 0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c, 0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b, | |
53 | 0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729, 0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf, 0xe0d8f8ae, | |
54 | 0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328, 0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f, | |
55 | 0x686a5b83, 0x50e072e5, 0xd9d3bb2a, 0x8befc475, 0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18, | |
56 | 0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09, 0xc0d0a81c, 0x7fa3429b, 0xe9158a1b, 0x225ea19a, | |
57 | 0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293, 0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140, | |
58 | 0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661, 0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9, | |
59 | 0x19727a23, 0x15a7e374, 0xc43a18d5, 0x3fb1aa73, 0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9, | |
60 | 0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2, 0x5388e5ee, 0xcd8a7510, 0xf901b4fd, 0xdbc13dbc, | |
61 | 0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49, 0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b, | |
62 | 0x32baf4a9, 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a, 0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c, | |
63 | 0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1, 0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319, | |
64 | 0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1, 0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3, 0xc6eb57bb, | |
65 | 0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c, 0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b, | |
66 | 0x329e5388, 0x91dd236b, 0x2ecb0d93, 0xf4d82a3d, 0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964, | |
67 | 0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131, 0x7bf7cabc, 0xf9c18d66, 0x593ade65, 0xd95ddf11, | |
06178f13 WB |
68 | ]; |
69 | ||
70 | impl Chunker { | |
06178f13 WB |
71 | /// Create a new Chunker instance, which produces and average |
72 | /// chunk size of `chunk_size_avg` (need to be a power of two). We | |
73 | /// allow variation from `chunk_size_avg/4` up to a maximum of | |
74 | /// `chunk_size_avg*4`. | |
75 | pub fn new(chunk_size_avg: usize) -> Self { | |
76 | // The chunk cut discriminator. In order to get an average | |
77 | // chunk size of avg, we cut whenever for a hash value "h" at | |
78 | // byte "i" given the descriminator "d(avg)": h(i) mod d(avg) | |
79 | // == d(avg) - 1. Note that the discriminator calculated like | |
80 | // this only yields correct results as long as the minimal | |
81 | // chunk size is picked as avg/4, and the maximum chunk size | |
82 | // as avg*4. If they are picked differently the result might | |
83 | // be skewed into either direction. | |
84 | let avg = chunk_size_avg as f64; | |
85 | let discriminator = (avg / (-1.42888852e-7 * avg + 1.33237515)) as u32; | |
86 | ||
87 | if chunk_size_avg.count_ones() != 1 { | |
88 | panic!("got unexpected chunk size - not a power of two."); | |
89 | } | |
90 | ||
7d83440c WB |
91 | let break_test_mask = (chunk_size_avg * 2 - 1) as u32; |
92 | let break_test_minimum = break_test_mask - 2; | |
06178f13 WB |
93 | |
94 | Self { | |
95 | h: 0, | |
96 | window_size: 0, | |
97 | chunk_size: 0, | |
7d83440c WB |
98 | chunk_size_min: chunk_size_avg >> 2, |
99 | chunk_size_max: chunk_size_avg << 2, | |
06178f13 | 100 | _chunk_size_avg: chunk_size_avg, |
7d83440c | 101 | _discriminator: discriminator, |
653b1ca1 WB |
102 | break_test_mask, |
103 | break_test_minimum, | |
06178f13 WB |
104 | window: [0u8; CA_CHUNKER_WINDOW_SIZE], |
105 | } | |
106 | } | |
107 | ||
108 | /// Scans the specified data for a chunk border. Returns 0 if none | |
109 | /// was found (and the function should be called with more data | |
110 | /// later on), or another value indicating the position of a | |
111 | /// border. | |
112 | pub fn scan(&mut self, data: &[u8]) -> usize { | |
06178f13 WB |
113 | let window_len = self.window.len(); |
114 | let data_len = data.len(); | |
115 | ||
116 | let mut pos = 0; | |
117 | ||
7d83440c WB |
118 | if self.window_size < window_len { |
119 | let need = window_len - self.window_size; | |
06178f13 WB |
120 | let copy_len = if need < data_len { need } else { data_len }; |
121 | ||
122 | for _i in 0..copy_len { | |
123 | let byte = data[pos]; | |
124 | self.window[self.window_size] = byte; | |
125 | self.h = self.h.rotate_left(1) ^ BUZHASH_TABLE[byte as usize]; | |
126 | pos += 1; | |
127 | self.window_size += 1; | |
128 | } | |
129 | ||
130 | self.chunk_size += copy_len; | |
131 | ||
132 | // return if window is still not full | |
7d83440c | 133 | if self.window_size < window_len { |
06178f13 WB |
134 | return 0; |
135 | } | |
136 | } | |
137 | ||
138 | //let mut idx = self.chunk_size % CA_CHUNKER_WINDOW_SIZE; | |
139 | let mut idx = self.chunk_size & 0x3f; | |
140 | ||
141 | while pos < data_len { | |
142 | // roll window | |
143 | let enter = data[pos]; | |
144 | let leave = self.window[idx]; | |
145 | self.h = self.h.rotate_left(1) ^ | |
146 | //BUZHASH_TABLE[leave as usize].rotate_left(CA_CHUNKER_WINDOW_SIZE as u32) ^ | |
147 | BUZHASH_TABLE[leave as usize] ^ | |
148 | BUZHASH_TABLE[enter as usize]; | |
149 | ||
150 | self.chunk_size += 1; | |
151 | pos += 1; | |
152 | ||
153 | self.window[idx] = enter; | |
154 | ||
155 | if self.shall_break() { | |
156 | self.h = 0; | |
157 | self.chunk_size = 0; | |
158 | self.window_size = 0; | |
159 | return pos; | |
160 | } | |
161 | ||
162 | //idx = self.chunk_size % CA_CHUNKER_WINDOW_SIZE; | |
163 | idx = self.chunk_size & 0x3f; | |
164 | //idx += 1; if idx >= CA_CHUNKER_WINDOW_SIZE { idx = 0 }; | |
165 | } | |
166 | ||
167 | 0 | |
168 | } | |
169 | ||
170 | // fast implementation avoiding modulo | |
7d83440c | 171 | // #[inline(always)] |
06178f13 | 172 | fn shall_break(&self) -> bool { |
7d83440c WB |
173 | if self.chunk_size >= self.chunk_size_max { |
174 | return true; | |
175 | } | |
06178f13 | 176 | |
7d83440c WB |
177 | if self.chunk_size < self.chunk_size_min { |
178 | return false; | |
179 | } | |
06178f13 WB |
180 | |
181 | //(self.h & 0x1ffff) <= 2 //THIS IS SLOW!!! | |
182 | ||
183 | //(self.h & self.break_test_mask) <= 2 // Bad on 0 streams | |
184 | ||
185 | (self.h & self.break_test_mask) >= self.break_test_minimum | |
186 | } | |
187 | ||
188 | // This is the original implementation from casync | |
189 | /* | |
190 | #[inline(always)] | |
191 | fn shall_break_orig(&self) -> bool { | |
192 | ||
193 | if self.chunk_size >= self.chunk_size_max { return true; } | |
194 | ||
195 | if self.chunk_size < self.chunk_size_min { return false; } | |
196 | ||
197 | (self.h % self.discriminator) == (self.discriminator - 1) | |
198 | } | |
199 | */ | |
200 | } | |
201 | ||
202 | #[test] | |
203 | fn test_chunker1() { | |
06178f13 WB |
204 | let mut buffer = Vec::new(); |
205 | ||
7d83440c | 206 | for i in 0..(256 * 1024) { |
06178f13 | 207 | for j in 0..4 { |
7d83440c | 208 | let byte = ((i >> (j << 3)) & 0xff) as u8; |
06178f13 WB |
209 | buffer.push(byte); |
210 | } | |
211 | } | |
7d83440c | 212 | let mut chunker = Chunker::new(64 * 1024); |
06178f13 WB |
213 | |
214 | let mut pos = 0; | |
215 | let mut last = 0; | |
216 | ||
217 | let mut chunks1: Vec<(usize, usize)> = vec![]; | |
218 | let mut chunks2: Vec<(usize, usize)> = vec![]; | |
219 | ||
220 | // test1: feed single bytes | |
221 | while pos < buffer.len() { | |
7d83440c | 222 | let k = chunker.scan(&buffer[pos..pos + 1]); |
06178f13 WB |
223 | pos += 1; |
224 | if k != 0 { | |
225 | let prev = last; | |
226 | last = pos; | |
7d83440c WB |
227 | chunks1.push((prev, pos - prev)); |
228 | } | |
06178f13 WB |
229 | } |
230 | chunks1.push((last, buffer.len() - last)); | |
231 | ||
7d83440c | 232 | let mut chunker = Chunker::new(64 * 1024); |
06178f13 WB |
233 | |
234 | let mut pos = 0; | |
235 | ||
236 | // test2: feed with whole buffer | |
237 | while pos < buffer.len() { | |
238 | let k = chunker.scan(&buffer[pos..]); | |
239 | if k != 0 { | |
240 | chunks2.push((pos, k)); | |
241 | pos += k; | |
7d83440c | 242 | } else { |
06178f13 WB |
243 | break; |
244 | } | |
245 | } | |
246 | ||
247 | chunks2.push((pos, buffer.len() - pos)); | |
248 | ||
249 | if chunks1 != chunks2 { | |
06178f13 WB |
250 | let mut size1 = 0; |
251 | for (_offset, len) in &chunks1 { | |
252 | size1 += len; | |
253 | } | |
254 | println!("Chunks1:{}\n{:?}\n", size1, chunks1); | |
255 | ||
256 | let mut size2 = 0; | |
257 | for (_offset, len) in &chunks2 { | |
258 | size2 += len; | |
259 | } | |
260 | println!("Chunks2:{}\n{:?}\n", size2, chunks2); | |
261 | ||
7d83440c | 262 | if size1 != 256 * 4 * 1024 { |
06178f13 WB |
263 | panic!("wrong size for chunks1"); |
264 | } | |
7d83440c | 265 | if size2 != 256 * 4 * 1024 { |
06178f13 WB |
266 | panic!("wrong size for chunks2"); |
267 | } | |
268 | ||
269 | panic!("got different chunks"); | |
270 | } | |
06178f13 | 271 | } |