]> git.proxmox.com Git - libgit2.git/blame - src/delta.c
update internal index API to avoid cast
[libgit2.git] / src / delta.c
CommitLineData
ec1d42b7
MS
1/*
2 * diff-delta.c: generate a delta between two buffers
3 *
4 * This code was greatly inspired by parts of LibXDiff from Davide Libenzi
5 * http://www.xmailserver.org/xdiff-lib.html
6 *
7 * Rewritten for GIT by Nicolas Pitre <nico@fluxnic.net>, (C) 2005-2007
8 *
9 * Modified for libgit2 by Michael Schubert <schu@schu.io>, (C) 2012
10 *
11 * This code is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15
16#include "delta.h"
17
18/* maximum hash entry list for the same hash bucket */
19#define HASH_LIMIT 64
20
21#define RABIN_SHIFT 23
22#define RABIN_WINDOW 16
23
24static const unsigned int T[256] = {
25 0x00000000, 0xab59b4d1, 0x56b369a2, 0xfdeadd73, 0x063f6795, 0xad66d344,
26 0x508c0e37, 0xfbd5bae6, 0x0c7ecf2a, 0xa7277bfb, 0x5acda688, 0xf1941259,
27 0x0a41a8bf, 0xa1181c6e, 0x5cf2c11d, 0xf7ab75cc, 0x18fd9e54, 0xb3a42a85,
28 0x4e4ef7f6, 0xe5174327, 0x1ec2f9c1, 0xb59b4d10, 0x48719063, 0xe32824b2,
29 0x1483517e, 0xbfdae5af, 0x423038dc, 0xe9698c0d, 0x12bc36eb, 0xb9e5823a,
30 0x440f5f49, 0xef56eb98, 0x31fb3ca8, 0x9aa28879, 0x6748550a, 0xcc11e1db,
31 0x37c45b3d, 0x9c9defec, 0x6177329f, 0xca2e864e, 0x3d85f382, 0x96dc4753,
32 0x6b369a20, 0xc06f2ef1, 0x3bba9417, 0x90e320c6, 0x6d09fdb5, 0xc6504964,
33 0x2906a2fc, 0x825f162d, 0x7fb5cb5e, 0xd4ec7f8f, 0x2f39c569, 0x846071b8,
34 0x798aaccb, 0xd2d3181a, 0x25786dd6, 0x8e21d907, 0x73cb0474, 0xd892b0a5,
35 0x23470a43, 0x881ebe92, 0x75f463e1, 0xdeadd730, 0x63f67950, 0xc8afcd81,
36 0x354510f2, 0x9e1ca423, 0x65c91ec5, 0xce90aa14, 0x337a7767, 0x9823c3b6,
37 0x6f88b67a, 0xc4d102ab, 0x393bdfd8, 0x92626b09, 0x69b7d1ef, 0xc2ee653e,
38 0x3f04b84d, 0x945d0c9c, 0x7b0be704, 0xd05253d5, 0x2db88ea6, 0x86e13a77,
39 0x7d348091, 0xd66d3440, 0x2b87e933, 0x80de5de2, 0x7775282e, 0xdc2c9cff,
40 0x21c6418c, 0x8a9ff55d, 0x714a4fbb, 0xda13fb6a, 0x27f92619, 0x8ca092c8,
41 0x520d45f8, 0xf954f129, 0x04be2c5a, 0xafe7988b, 0x5432226d, 0xff6b96bc,
42 0x02814bcf, 0xa9d8ff1e, 0x5e738ad2, 0xf52a3e03, 0x08c0e370, 0xa39957a1,
43 0x584ced47, 0xf3155996, 0x0eff84e5, 0xa5a63034, 0x4af0dbac, 0xe1a96f7d,
44 0x1c43b20e, 0xb71a06df, 0x4ccfbc39, 0xe79608e8, 0x1a7cd59b, 0xb125614a,
45 0x468e1486, 0xedd7a057, 0x103d7d24, 0xbb64c9f5, 0x40b17313, 0xebe8c7c2,
46 0x16021ab1, 0xbd5bae60, 0x6cb54671, 0xc7ecf2a0, 0x3a062fd3, 0x915f9b02,
47 0x6a8a21e4, 0xc1d39535, 0x3c394846, 0x9760fc97, 0x60cb895b, 0xcb923d8a,
48 0x3678e0f9, 0x9d215428, 0x66f4eece, 0xcdad5a1f, 0x3047876c, 0x9b1e33bd,
49 0x7448d825, 0xdf116cf4, 0x22fbb187, 0x89a20556, 0x7277bfb0, 0xd92e0b61,
50 0x24c4d612, 0x8f9d62c3, 0x7836170f, 0xd36fa3de, 0x2e857ead, 0x85dcca7c,
51 0x7e09709a, 0xd550c44b, 0x28ba1938, 0x83e3ade9, 0x5d4e7ad9, 0xf617ce08,
52 0x0bfd137b, 0xa0a4a7aa, 0x5b711d4c, 0xf028a99d, 0x0dc274ee, 0xa69bc03f,
53 0x5130b5f3, 0xfa690122, 0x0783dc51, 0xacda6880, 0x570fd266, 0xfc5666b7,
54 0x01bcbbc4, 0xaae50f15, 0x45b3e48d, 0xeeea505c, 0x13008d2f, 0xb85939fe,
55 0x438c8318, 0xe8d537c9, 0x153feaba, 0xbe665e6b, 0x49cd2ba7, 0xe2949f76,
56 0x1f7e4205, 0xb427f6d4, 0x4ff24c32, 0xe4abf8e3, 0x19412590, 0xb2189141,
57 0x0f433f21, 0xa41a8bf0, 0x59f05683, 0xf2a9e252, 0x097c58b4, 0xa225ec65,
58 0x5fcf3116, 0xf49685c7, 0x033df00b, 0xa86444da, 0x558e99a9, 0xfed72d78,
59 0x0502979e, 0xae5b234f, 0x53b1fe3c, 0xf8e84aed, 0x17bea175, 0xbce715a4,
60 0x410dc8d7, 0xea547c06, 0x1181c6e0, 0xbad87231, 0x4732af42, 0xec6b1b93,
61 0x1bc06e5f, 0xb099da8e, 0x4d7307fd, 0xe62ab32c, 0x1dff09ca, 0xb6a6bd1b,
62 0x4b4c6068, 0xe015d4b9, 0x3eb80389, 0x95e1b758, 0x680b6a2b, 0xc352defa,
63 0x3887641c, 0x93ded0cd, 0x6e340dbe, 0xc56db96f, 0x32c6cca3, 0x999f7872,
64 0x6475a501, 0xcf2c11d0, 0x34f9ab36, 0x9fa01fe7, 0x624ac294, 0xc9137645,
65 0x26459ddd, 0x8d1c290c, 0x70f6f47f, 0xdbaf40ae, 0x207afa48, 0x8b234e99,
66 0x76c993ea, 0xdd90273b, 0x2a3b52f7, 0x8162e626, 0x7c883b55, 0xd7d18f84,
67 0x2c043562, 0x875d81b3, 0x7ab75cc0, 0xd1eee811
68};
69
70static const unsigned int U[256] = {
71 0x00000000, 0x7eb5200d, 0x5633f4cb, 0x2886d4c6, 0x073e5d47, 0x798b7d4a,
72 0x510da98c, 0x2fb88981, 0x0e7cba8e, 0x70c99a83, 0x584f4e45, 0x26fa6e48,
73 0x0942e7c9, 0x77f7c7c4, 0x5f711302, 0x21c4330f, 0x1cf9751c, 0x624c5511,
74 0x4aca81d7, 0x347fa1da, 0x1bc7285b, 0x65720856, 0x4df4dc90, 0x3341fc9d,
75 0x1285cf92, 0x6c30ef9f, 0x44b63b59, 0x3a031b54, 0x15bb92d5, 0x6b0eb2d8,
76 0x4388661e, 0x3d3d4613, 0x39f2ea38, 0x4747ca35, 0x6fc11ef3, 0x11743efe,
77 0x3eccb77f, 0x40799772, 0x68ff43b4, 0x164a63b9, 0x378e50b6, 0x493b70bb,
78 0x61bda47d, 0x1f088470, 0x30b00df1, 0x4e052dfc, 0x6683f93a, 0x1836d937,
79 0x250b9f24, 0x5bbebf29, 0x73386bef, 0x0d8d4be2, 0x2235c263, 0x5c80e26e,
80 0x740636a8, 0x0ab316a5, 0x2b7725aa, 0x55c205a7, 0x7d44d161, 0x03f1f16c,
81 0x2c4978ed, 0x52fc58e0, 0x7a7a8c26, 0x04cfac2b, 0x73e5d470, 0x0d50f47d,
82 0x25d620bb, 0x5b6300b6, 0x74db8937, 0x0a6ea93a, 0x22e87dfc, 0x5c5d5df1,
83 0x7d996efe, 0x032c4ef3, 0x2baa9a35, 0x551fba38, 0x7aa733b9, 0x041213b4,
84 0x2c94c772, 0x5221e77f, 0x6f1ca16c, 0x11a98161, 0x392f55a7, 0x479a75aa,
85 0x6822fc2b, 0x1697dc26, 0x3e1108e0, 0x40a428ed, 0x61601be2, 0x1fd53bef,
86 0x3753ef29, 0x49e6cf24, 0x665e46a5, 0x18eb66a8, 0x306db26e, 0x4ed89263,
87 0x4a173e48, 0x34a21e45, 0x1c24ca83, 0x6291ea8e, 0x4d29630f, 0x339c4302,
88 0x1b1a97c4, 0x65afb7c9, 0x446b84c6, 0x3adea4cb, 0x1258700d, 0x6ced5000,
89 0x4355d981, 0x3de0f98c, 0x15662d4a, 0x6bd30d47, 0x56ee4b54, 0x285b6b59,
90 0x00ddbf9f, 0x7e689f92, 0x51d01613, 0x2f65361e, 0x07e3e2d8, 0x7956c2d5,
91 0x5892f1da, 0x2627d1d7, 0x0ea10511, 0x7014251c, 0x5facac9d, 0x21198c90,
92 0x099f5856, 0x772a785b, 0x4c921c31, 0x32273c3c, 0x1aa1e8fa, 0x6414c8f7,
93 0x4bac4176, 0x3519617b, 0x1d9fb5bd, 0x632a95b0, 0x42eea6bf, 0x3c5b86b2,
94 0x14dd5274, 0x6a687279, 0x45d0fbf8, 0x3b65dbf5, 0x13e30f33, 0x6d562f3e,
95 0x506b692d, 0x2ede4920, 0x06589de6, 0x78edbdeb, 0x5755346a, 0x29e01467,
96 0x0166c0a1, 0x7fd3e0ac, 0x5e17d3a3, 0x20a2f3ae, 0x08242768, 0x76910765,
97 0x59298ee4, 0x279caee9, 0x0f1a7a2f, 0x71af5a22, 0x7560f609, 0x0bd5d604,
98 0x235302c2, 0x5de622cf, 0x725eab4e, 0x0ceb8b43, 0x246d5f85, 0x5ad87f88,
99 0x7b1c4c87, 0x05a96c8a, 0x2d2fb84c, 0x539a9841, 0x7c2211c0, 0x029731cd,
100 0x2a11e50b, 0x54a4c506, 0x69998315, 0x172ca318, 0x3faa77de, 0x411f57d3,
101 0x6ea7de52, 0x1012fe5f, 0x38942a99, 0x46210a94, 0x67e5399b, 0x19501996,
102 0x31d6cd50, 0x4f63ed5d, 0x60db64dc, 0x1e6e44d1, 0x36e89017, 0x485db01a,
103 0x3f77c841, 0x41c2e84c, 0x69443c8a, 0x17f11c87, 0x38499506, 0x46fcb50b,
104 0x6e7a61cd, 0x10cf41c0, 0x310b72cf, 0x4fbe52c2, 0x67388604, 0x198da609,
105 0x36352f88, 0x48800f85, 0x6006db43, 0x1eb3fb4e, 0x238ebd5d, 0x5d3b9d50,
106 0x75bd4996, 0x0b08699b, 0x24b0e01a, 0x5a05c017, 0x728314d1, 0x0c3634dc,
107 0x2df207d3, 0x534727de, 0x7bc1f318, 0x0574d315, 0x2acc5a94, 0x54797a99,
108 0x7cffae5f, 0x024a8e52, 0x06852279, 0x78300274, 0x50b6d6b2, 0x2e03f6bf,
109 0x01bb7f3e, 0x7f0e5f33, 0x57888bf5, 0x293dabf8, 0x08f998f7, 0x764cb8fa,
110 0x5eca6c3c, 0x207f4c31, 0x0fc7c5b0, 0x7172e5bd, 0x59f4317b, 0x27411176,
111 0x1a7c5765, 0x64c97768, 0x4c4fa3ae, 0x32fa83a3, 0x1d420a22, 0x63f72a2f,
112 0x4b71fee9, 0x35c4dee4, 0x1400edeb, 0x6ab5cde6, 0x42331920, 0x3c86392d,
113 0x133eb0ac, 0x6d8b90a1, 0x450d4467, 0x3bb8646a
114};
115
116struct index_entry {
117 const unsigned char *ptr;
118 unsigned int val;
119};
120
121struct unpacked_index_entry {
122 struct index_entry entry;
123 struct unpacked_index_entry *next;
124};
125
126struct git_delta_index {
127 unsigned long memsize;
128 const void *src_buf;
129 unsigned long src_size;
130 unsigned int hash_mask;
131 struct index_entry *hash[GIT_FLEX_ARRAY];
132};
133
134struct git_delta_index *
135git_delta_create_index(const void *buf, unsigned long bufsize)
136{
137 unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
138 const unsigned char *data, *buffer = buf;
139 struct git_delta_index *index;
140 struct unpacked_index_entry *entry, **hash;
141 struct index_entry *packed_entry, **packed_hash;
142 void *mem;
143 unsigned long memsize;
144
145 if (!buf || !bufsize)
146 return NULL;
147
148 /* Determine index hash size. Note that indexing skips the
149 first byte to allow for optimizing the Rabin's polynomial
150 initialization in create_delta(). */
151 entries = (bufsize - 1) / RABIN_WINDOW;
152 if (bufsize >= 0xffffffffUL) {
153 /*
154 * Current delta format can't encode offsets into
155 * reference buffer with more than 32 bits.
156 */
157 entries = 0xfffffffeU / RABIN_WINDOW;
158 }
159 hsize = entries / 4;
160 for (i = 4; (1u << i) < hsize && i < 31; i++);
161 hsize = 1 << i;
162 hmask = hsize - 1;
163
164 /* allocate lookup index */
165 memsize = sizeof(*hash) * hsize +
166 sizeof(*entry) * entries;
167 mem = git__malloc(memsize);
168 if (!mem)
169 return NULL;
170 hash = mem;
171 mem = hash + hsize;
172 entry = mem;
173
174 memset(hash, 0, hsize * sizeof(*hash));
175
176 /* allocate an array to count hash entries */
177 hash_count = calloc(hsize, sizeof(*hash_count));
178 if (!hash_count) {
179 git__free(hash);
180 return NULL;
181 }
182
183 /* then populate the index */
184 prev_val = ~0;
185 for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
186 data >= buffer;
187 data -= RABIN_WINDOW) {
188 unsigned int val = 0;
189 for (i = 1; i <= RABIN_WINDOW; i++)
190 val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
191 if (val == prev_val) {
192 /* keep the lowest of consecutive identical blocks */
193 entry[-1].entry.ptr = data + RABIN_WINDOW;
194 --entries;
195 } else {
196 prev_val = val;
197 i = val & hmask;
198 entry->entry.ptr = data + RABIN_WINDOW;
199 entry->entry.val = val;
200 entry->next = hash[i];
201 hash[i] = entry++;
202 hash_count[i]++;
203 }
204 }
205
206 /*
207 * Determine a limit on the number of entries in the same hash
208 * bucket. This guards us against pathological data sets causing
209 * really bad hash distribution with most entries in the same hash
210 * bucket that would bring us to O(m*n) computing costs (m and n
211 * corresponding to reference and target buffer sizes).
212 *
213 * Make sure none of the hash buckets has more entries than
214 * we're willing to test. Otherwise we cull the entry list
215 * uniformly to still preserve a good repartition across
216 * the reference buffer.
217 */
218 for (i = 0; i < hsize; i++) {
219 int acc;
220
221 if (hash_count[i] <= HASH_LIMIT)
222 continue;
223
224 /* We leave exactly HASH_LIMIT entries in the bucket */
225 entries -= hash_count[i] - HASH_LIMIT;
226
227 entry = hash[i];
228 acc = 0;
229
230 /*
231 * Assume that this loop is gone through exactly
232 * HASH_LIMIT times and is entered and left with
233 * acc==0. So the first statement in the loop
234 * contributes (hash_count[i]-HASH_LIMIT)*HASH_LIMIT
235 * to the accumulator, and the inner loop consequently
236 * is run (hash_count[i]-HASH_LIMIT) times, removing
237 * one element from the list each time. Since acc
238 * balances out to 0 at the final run, the inner loop
239 * body can't be left with entry==NULL. So we indeed
240 * encounter entry==NULL in the outer loop only.
241 */
242 do {
243 acc += hash_count[i] - HASH_LIMIT;
244 if (acc > 0) {
245 struct unpacked_index_entry *keep = entry;
246 do {
247 entry = entry->next;
248 acc -= HASH_LIMIT;
249 } while (acc > 0);
250 keep->next = entry->next;
251 }
252 entry = entry->next;
253 } while (entry);
254 }
255 git__free(hash_count);
256
257 /*
258 * Now create the packed index in array form
259 * rather than linked lists.
260 */
261 memsize = sizeof(*index)
262 + sizeof(*packed_hash) * (hsize+1)
263 + sizeof(*packed_entry) * entries;
264 mem = git__malloc(memsize);
265 if (!mem) {
266 git__free(hash);
267 return NULL;
268 }
269
270 index = mem;
271 index->memsize = memsize;
272 index->src_buf = buf;
273 index->src_size = bufsize;
274 index->hash_mask = hmask;
275
276 mem = index->hash;
277 packed_hash = mem;
278 mem = packed_hash + (hsize+1);
279 packed_entry = mem;
280
281 for (i = 0; i < hsize; i++) {
282 /*
283 * Coalesce all entries belonging to one linked list
284 * into consecutive array entries.
285 */
286 packed_hash[i] = packed_entry;
287 for (entry = hash[i]; entry; entry = entry->next)
288 *packed_entry++ = entry->entry;
289 }
290
291 /* Sentinel value to indicate the length of the last hash bucket */
292 packed_hash[hsize] = packed_entry;
293
294 assert(packed_entry - (struct index_entry *)mem == entries);
295 git__free(hash);
296
297 return index;
298}
299
300void git_delta_free_index(struct git_delta_index *index)
301{
302 git__free(index);
303}
304
305unsigned long git_delta_sizeof_index(struct git_delta_index *index)
306{
307 if (index)
308 return index->memsize;
309 else
310 return 0;
311}
312
313/*
314 * The maximum size for any opcode sequence, including the initial header
315 * plus Rabin window plus biggest copy.
316 */
317#define MAX_OP_SIZE (5 + 5 + 1 + RABIN_WINDOW + 7)
318
319void *
320git_delta_create(const struct git_delta_index *index,
321 const void *trg_buf, unsigned long trg_size,
322 unsigned long *delta_size, unsigned long max_size)
323{
324 unsigned int i, outpos, outsize, moff, msize, val;
325 int inscnt;
326 const unsigned char *ref_data, *ref_top, *data, *top;
327 unsigned char *out;
328
329 if (!trg_buf || !trg_size)
330 return NULL;
331
332 outpos = 0;
333 outsize = 8192;
334 if (max_size && outsize >= max_size)
335 outsize = max_size + MAX_OP_SIZE + 1;
336 out = git__malloc(outsize);
337 if (!out)
338 return NULL;
339
340 /* store reference buffer size */
341 i = index->src_size;
342 while (i >= 0x80) {
343 out[outpos++] = i | 0x80;
344 i >>= 7;
345 }
346 out[outpos++] = i;
347
348 /* store target buffer size */
349 i = trg_size;
350 while (i >= 0x80) {
351 out[outpos++] = i | 0x80;
352 i >>= 7;
353 }
354 out[outpos++] = i;
355
356 ref_data = index->src_buf;
357 ref_top = ref_data + index->src_size;
358 data = trg_buf;
359 top = (const unsigned char *) trg_buf + trg_size;
360
361 outpos++;
362 val = 0;
363 for (i = 0; i < RABIN_WINDOW && data < top; i++, data++) {
364 out[outpos++] = *data;
365 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
366 }
367 inscnt = i;
368
369 moff = 0;
370 msize = 0;
371 while (data < top) {
372 if (msize < 4096) {
373 struct index_entry *entry;
374 val ^= U[data[-RABIN_WINDOW]];
375 val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
376 i = val & index->hash_mask;
377 for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
378 const unsigned char *ref = entry->ptr;
379 const unsigned char *src = data;
380 unsigned int ref_size = ref_top - ref;
381 if (entry->val != val)
382 continue;
383 if (ref_size > (unsigned int)(top - src))
384 ref_size = top - src;
385 if (ref_size <= msize)
386 break;
387 while (ref_size-- && *src++ == *ref)
388 ref++;
389 if (msize < (unsigned int)(ref - entry->ptr)) {
390 /* this is our best match so far */
391 msize = ref - entry->ptr;
392 moff = entry->ptr - ref_data;
393 if (msize >= 4096) /* good enough */
394 break;
395 }
396 }
397 }
398
399 if (msize < 4) {
400 if (!inscnt)
401 outpos++;
402 out[outpos++] = *data++;
403 inscnt++;
404 if (inscnt == 0x7f) {
405 out[outpos - inscnt - 1] = inscnt;
406 inscnt = 0;
407 }
408 msize = 0;
409 } else {
410 unsigned int left;
411 unsigned char *op;
412
413 if (inscnt) {
414 while (moff && ref_data[moff-1] == data[-1]) {
415 /* we can match one byte back */
416 msize++;
417 moff--;
418 data--;
419 outpos--;
420 if (--inscnt)
421 continue;
422 outpos--; /* remove count slot */
423 inscnt--; /* make it -1 */
424 break;
425 }
426 out[outpos - inscnt - 1] = inscnt;
427 inscnt = 0;
428 }
429
430 /* A copy op is currently limited to 64KB (pack v2) */
431 left = (msize < 0x10000) ? 0 : (msize - 0x10000);
432 msize -= left;
433
434 op = out + outpos++;
435 i = 0x80;
436
437 if (moff & 0x000000ff)
438 out[outpos++] = moff >> 0, i |= 0x01;
439 if (moff & 0x0000ff00)
440 out[outpos++] = moff >> 8, i |= 0x02;
441 if (moff & 0x00ff0000)
442 out[outpos++] = moff >> 16, i |= 0x04;
443 if (moff & 0xff000000)
444 out[outpos++] = moff >> 24, i |= 0x08;
445
446 if (msize & 0x00ff)
447 out[outpos++] = msize >> 0, i |= 0x10;
448 if (msize & 0xff00)
449 out[outpos++] = msize >> 8, i |= 0x20;
450
451 *op = i;
452
453 data += msize;
454 moff += msize;
455 msize = left;
456
457 if (msize < 4096) {
458 int j;
459 val = 0;
460 for (j = -RABIN_WINDOW; j < 0; j++)
461 val = ((val << 8) | data[j])
462 ^ T[val >> RABIN_SHIFT];
463 }
464 }
465
466 if (outpos >= outsize - MAX_OP_SIZE) {
467 void *tmp = out;
468 outsize = outsize * 3 / 2;
469 if (max_size && outsize >= max_size)
470 outsize = max_size + MAX_OP_SIZE + 1;
471 if (max_size && outpos > max_size)
472 break;
473 out = git__realloc(out, outsize);
474 if (!out) {
475 git__free(tmp);
476 return NULL;
477 }
478 }
479 }
480
481 if (inscnt)
482 out[outpos - inscnt - 1] = inscnt;
483
484 if (max_size && outpos > max_size) {
485 git__free(out);
486 return NULL;
487 }
488
489 *delta_size = outpos;
490 return out;
491}