#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)

#if USE_CHECKSUM
  #define LDM_HASH_ENTRY_SIZE_LOG 3
#else
  #define LDM_HASH_ENTRY_SIZE_LOG 2
#endif

// Entries are inserted into the table HASH_ONLY_EVERY + 1 times "on average".
#ifndef HASH_ONLY_EVERY_LOG
  #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
#endif

#define HASH_ONLY_EVERY ((1 << (HASH_ONLY_EVERY_LOG)) - 1)
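
// Worked example (illustrative values, not necessarily the defaults): with
// LDM_WINDOW_SIZE_LOG = 30, LDM_MEMORY_USAGE = 23 and
// LDM_HASH_ENTRY_SIZE_LOG = 3, the table holds 2^(23-3) = 2^20 entries, so
// HASH_ONLY_EVERY_LOG = 30 - 20 = 10 and HASH_ONLY_EVERY = 1023: roughly one
// position in every 1024 is inserted, matching the window to the table size.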

#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
#define NUM_HASH_BUCKETS_LOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))

#define HASH_CHAR_OFFSET 10

// Take the first match in the hash bucket only.
//#define ZSTD_SKIP

static const U64 prime8bytes = 11400714785074694791ULL;

// Type of the small hash used to index into the hash table.
typedef U32 hash_t;

#if USE_CHECKSUM
typedef struct LDM_hashEntry {
  U32 offset;
  U32 checksum;
} LDM_hashEntry;
#else
typedef struct LDM_hashEntry {
  U32 offset;
} LDM_hashEntry;
#endif

struct LDM_compressStats {
  U32 windowSizeLog, hashTableSizeLog;
  U32 numMatches;
  U64 totalMatchLength;
  U64 totalLiteralLength;
  U64 totalOffset;

  U32 matchLengthHistogram[32];

  U32 minOffset, maxOffset;
  U32 offsetHistogram[32];
};

typedef struct LDM_hashTable LDM_hashTable;

struct LDM_CCtx {
  size_t isize;             /* Input size */
  size_t maxOSize;          /* Maximum output size */

  const BYTE *ibase;        /* Base of input */
  const BYTE *ip;           /* Current input position */
  const BYTE *iend;         /* End of input */

  // Maximum input position such that hashing at the position does not exceed
  // end of input.
  const BYTE *ihashLimit;

  // Maximum input position such that finding a match of at least the minimum
  // match length does not exceed end of input.
  const BYTE *imatchLimit;

  const BYTE *obase;        /* Base of output */
  BYTE *op;                 /* Output */

  const BYTE *anchor;       /* Anchor to start of current (match) block */

  LDM_compressStats stats;  /* Compression statistics */

  LDM_hashTable *hashTable;

  const BYTE *lastPosHashed;  /* Last position hashed */
  U64 lastHash;

  const BYTE *nextIp;         // TODO: this is redundant (ip + step)
  const BYTE *nextPosHashed;
  U64 nextHash;

  unsigned step;              // ip step, should be 1.

  const BYTE *lagIp;
  U64 lagHash;
};

struct LDM_hashTable {
  U32 numBuckets;        // The number of buckets.
  U32 numEntries;        // numBuckets * HASH_BUCKET_SIZE.

  LDM_hashEntry *entries;
  BYTE *bucketOffsets;   // A pointer (per bucket) to the next insert position.
};

static void HASH_destroyTable(LDM_hashTable *table) {
  free(table->entries);
  free(table->bucketOffsets);
  free(table);
}

/**
 * Create a hash table that can contain size elements.
 * The number of buckets is determined by size >> HASH_BUCKET_SIZE_LOG.
 *
 * Returns NULL if table creation failed.
 */
static LDM_hashTable *HASH_createTable(U32 size) {
  LDM_hashTable *table = malloc(sizeof(LDM_hashTable));
  if (!table) return NULL;

  table->numBuckets = size >> HASH_BUCKET_SIZE_LOG;
  table->numEntries = size;
  table->entries = calloc(size, sizeof(LDM_hashEntry));
  table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE));

  if (!table->entries || !table->bucketOffsets) {
    HASH_destroyTable(table);
    return NULL;
  }

  return table;
}

static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) {
  return table->entries + (hash << HASH_BUCKET_SIZE_LOG);
}

static unsigned ZSTD_NbCommonBytes (register size_t val) {
  if (MEM_isLittleEndian()) {
    if (MEM_64bits()) {
#   if defined(_MSC_VER) && defined(_WIN64)
      unsigned long r = 0;
      _BitScanForward64( &r, (U64)val );
      return (unsigned)(r>>3);
#   elif defined(__GNUC__) && (__GNUC__ >= 3)
      return (__builtin_ctzll((U64)val) >> 3);
#   else
      static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
                                               0, 3, 1, 3, 1, 4, 2, 7,
                                               0, 2, 3, 6, 1, 5, 3, 5,
                                               1, 3, 4, 4, 2, 5, 6, 7,
                                               7, 0, 1, 2, 3, 3, 4, 6,
                                               2, 6, 5, 5, 3, 4, 5, 6,
                                               7, 1, 2, 4, 6, 4, 4, 5,
                                               7, 2, 6, 5, 7, 6, 7, 7 };
      return DeBruijnBytePos[
          ((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
#   endif
    } else { /* 32 bits */
#   if defined(_MSC_VER)
      unsigned long r = 0;
      _BitScanForward( &r, (U32)val );
      return (unsigned)(r>>3);
#   elif defined(__GNUC__) && (__GNUC__ >= 3)
      return (__builtin_ctz((U32)val) >> 3);
#   else
      static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
                                               3, 2, 2, 1, 3, 2, 0, 1,
                                               3, 3, 1, 2, 2, 2, 2, 0,
                                               3, 1, 2, 0, 1, 0, 1, 1 };
      return DeBruijnBytePos[
          ((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
#   endif
    }
  } else { /* Big Endian CPU */
    if (MEM_64bits()) {
#   if defined(_MSC_VER) && defined(_WIN64)
      unsigned long r = 0;
      _BitScanReverse64( &r, val );
      return (unsigned)(r>>3);
#   elif defined(__GNUC__) && (__GNUC__ >= 3)
      return (__builtin_clzll(val) >> 3);
#   else
      unsigned r;
      /* calculate this way due to compiler complaining in 32-bits mode */
      const unsigned n32 = sizeof(size_t)*4;
      if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
      if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
      r += (!val);
      return r;
#   endif
    } else { /* 32 bits */
#   if defined(_MSC_VER)
      unsigned long r = 0;
      _BitScanReverse( &r, (unsigned long)val );
      return (unsigned)(r>>3);
#   elif defined(__GNUC__) && (__GNUC__ >= 3)
      return (__builtin_clz((U32)val) >> 3);
#   else
      unsigned r;
      if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
      r += (!val);
      return r;
#   endif
    }
  }
}

// From lib/compress/zstd_compress.c
static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch,
                         const BYTE *const pInLimit) {
  const BYTE * const pStart = pIn;
  const BYTE * const pInLoopLimit = pInLimit - (sizeof(size_t)-1);

  while (pIn < pInLoopLimit) {
    size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
    if (!diff) {
      pIn += sizeof(size_t);
      pMatch += sizeof(size_t);
      continue;
    }
    pIn += ZSTD_NbCommonBytes(diff);
    return (size_t)(pIn - pStart);
  }

  if (MEM_64bits()) {
    if ((pIn < (pInLimit - 3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) {
      pIn += 4;
      pMatch += 4;
    }
  }
  if ((pIn < (pInLimit - 1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) {
    pIn += 2;
    pMatch += 2;
  }
  if ((pIn < pInLimit) && (*pMatch == *pIn)) {
    pIn++;
  }
  return (size_t)(pIn - pStart);
}

/**
 * Count number of bytes that match backwards before pIn and pMatch.
 *
 * We count only bytes where pMatch > pBase and pIn > pAnchor.
 */
static size_t countBackwardsMatch(const BYTE *pIn, const BYTE *pAnchor,
                                  const BYTE *pMatch, const BYTE *pBase) {
  size_t matchLength = 0;
  while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
    pIn--;
    pMatch--;
    matchLength++;
  }
  return matchLength;
}

/**
 * Returns a pointer to the entry in the hash table matching the hash and
 * checksum with the "longest match length" as defined below. The forward and
 * backward match lengths are written to *pForwardMatchLength and
 * *pBackwardMatchLength.
 *
 * The match length is defined based on cctx->ip and the entry's offset.
 * The forward match is computed from cctx->ip and entry->offset + cctx->ibase.
 * The backward match is computed backwards from cctx->ip and
 * cctx->ibase only if the forward match is longer than LDM_MIN_MATCH_LENGTH.
 */
static LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
                                        const hash_t hash,
                                        const U32 checksum,
                                        U64 *pForwardMatchLength,
                                        U64 *pBackwardMatchLength) {
  LDM_hashTable *table = cctx->hashTable;
  LDM_hashEntry *bucket = getBucket(table, hash);
  LDM_hashEntry *cur;
  LDM_hashEntry *bestEntry = NULL;
  U64 bestMatchLength = 0;
#if !(USE_CHECKSUM)
  (void)checksum;
#endif
  for (cur = bucket; cur < bucket + HASH_BUCKET_SIZE; ++cur) {
    const BYTE *pMatch = cur->offset + cctx->ibase;

    // Check checksum for faster check.
#if USE_CHECKSUM
    if (cur->checksum == checksum &&
        cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
#else
    if (cctx->ip - pMatch <= LDM_WINDOW_SIZE) {
#endif
      U64 forwardMatchLength = ZSTD_count(cctx->ip, pMatch, cctx->iend);
      U64 backwardMatchLength, totalMatchLength;

      // Only take matches where the forward match length is large enough
      // for speed.
      if (forwardMatchLength < LDM_MIN_MATCH_LENGTH) {
        continue;
      }
      backwardMatchLength =
          countBackwardsMatch(cctx->ip, cctx->anchor,
                              cur->offset + cctx->ibase,
                              cctx->ibase);

      totalMatchLength = forwardMatchLength + backwardMatchLength;

      if (totalMatchLength >= bestMatchLength) {
        bestMatchLength = totalMatchLength;
        *pForwardMatchLength = forwardMatchLength;
        *pBackwardMatchLength = backwardMatchLength;

        bestEntry = cur;
#ifdef ZSTD_SKIP
        return cur;
#endif
      }
    }
  }
  if (bestEntry != NULL) {
    return bestEntry;
  }
  return NULL;
}

/**
 * Insert an entry into the hash table. The table uses a "circular buffer",
 * with the oldest entry overwritten.
 */
static void HASH_insert(LDM_hashTable *table,
                        const hash_t hash, const LDM_hashEntry entry) {
  *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry;
  table->bucketOffsets[hash]++;
  table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1;
}
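
// For example, with HASH_BUCKET_SIZE_LOG = 2 each bucket holds 4 entries and
// bucketOffsets[hash] cycles through 0, 1, 2, 3, 0, ...; the fifth insert
// into a bucket overwrites that bucket's oldest entry.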

static void HASH_outputTableOccupancy(const LDM_hashTable *table) {
  U32 ctr = 0;
  LDM_hashEntry *cur = table->entries;
  LDM_hashEntry *end = table->entries + (table->numBuckets * HASH_BUCKET_SIZE);
  for (; cur < end; ++cur) {
    if (cur->offset == 0) {
      ctr++;
    }
  }

  // The number of buckets is repeated as a check for now.
  printf("Num buckets, bucket size: %d (2^%d), %d\n",
         table->numBuckets, NUM_HASH_BUCKETS_LOG, HASH_BUCKET_SIZE);
  printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n",
         table->numEntries, ctr,
         100.0 * (double)(ctr) / table->numEntries);
}

// TODO: This can be done more efficiently, for example by using builtin
// functions (but it is not that important as it is only used for computing
// stats).
static int intLog2(U64 x) {
  int ret = 0;
  while (x >>= 1) {
    ret++;
  }
  return ret;
}

void LDM_printCompressStats(const LDM_compressStats *stats) {
  printf("=====================\n");
  printf("Compression statistics\n");
  printf("Window size, hash table size (bytes): 2^%u, 2^%u\n",
         stats->windowSizeLog, stats->hashTableSizeLog);
  printf("num matches, total match length, %% matched: %u, %llu, %.3f\n",
         stats->numMatches,
         stats->totalMatchLength,
         100.0 * (double)stats->totalMatchLength /
             (double)(stats->totalMatchLength + stats->totalLiteralLength));
  printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) /
                                         (double)stats->numMatches);
  printf("avg literal length, total literalLength: %.1f, %llu\n",
         ((double)stats->totalLiteralLength) / (double)stats->numMatches,
         stats->totalLiteralLength);
  printf("avg offset length: %.1f\n",
         ((double)stats->totalOffset) / (double)stats->numMatches);
  printf("min offset, max offset: %u, %u\n",
         stats->minOffset, stats->maxOffset);

  printf("\n");
  printf("offset histogram | match length histogram\n");
  printf("offset/ML, num matches, %% of matches | num matches, %% of matches\n");

  {
    int i;
    int logMaxOffset = intLog2(stats->maxOffset);
    for (i = 0; i <= logMaxOffset; i++) {
      printf("2^%*d: %10u %6.3f%% |2^%*d: %10u %6.3f \n",
             2, i,
             stats->offsetHistogram[i],
             100.0 * (double) stats->offsetHistogram[i] /
                 (double) stats->numMatches,
             2, i,
             stats->matchLengthHistogram[i],
             100.0 * (double) stats->matchLengthHistogram[i] /
                 (double) stats->numMatches);
    }
  }
  printf("\n");
  printf("=====================\n");
}

/**
 * Return the upper (most significant) NUM_HASH_BUCKETS_LOG bits.
 */
static hash_t getSmallHash(U64 hash) {
  return hash >> (64 - NUM_HASH_BUCKETS_LOG);
}

/**
 * Return the 32 bits after the upper NUM_HASH_BUCKETS_LOG bits.
 */
static U32 getChecksum(U64 hash) {
  return (hash >> (64 - 32 - NUM_HASH_BUCKETS_LOG)) & 0xFFFFFFFF;
}

static U32 lowerBitsFromHfHash(U64 hash) {
  // The number of bits used so far is NUM_HASH_BUCKETS_LOG + 32.
  // So there are 32 - NUM_HASH_BUCKETS_LOG bits left.
  // Occasional hashing requires HASH_ONLY_EVERY_LOG bits.
  // So if 32 - NUM_HASH_BUCKETS_LOG < HASH_ONLY_EVERY_LOG, just return the
  // lower bits, allowing for reuse of bits.
  if (32 - NUM_HASH_BUCKETS_LOG < HASH_ONLY_EVERY_LOG) {
    return hash & HASH_ONLY_EVERY;
  }
  // Otherwise shift by
  // (32 - NUM_HASH_BUCKETS_LOG - HASH_ONLY_EVERY_LOG) bits first.
  return (hash >> (32 - NUM_HASH_BUCKETS_LOG - HASH_ONLY_EVERY_LOG)) &
         HASH_ONLY_EVERY;
}
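
// Bit layout of the 64-bit hash consumed by the three helpers above, from
// most significant to least significant:
//
//   | NUM_HASH_BUCKETS_LOG bits | 32 bits  | HASH_ONLY_EVERY_LOG bits | ... |
//   | small hash (bucket index) | checksum | insertion tag            |     |
//
// When NUM_HASH_BUCKETS_LOG + 32 + HASH_ONLY_EVERY_LOG exceeds 64, the tag
// falls back to the lowest bits of the hash and overlaps the checksum.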

/**
 * Get a 64-bit hash using the first len bytes from buf.
 *
 * Giving bytes s = s_1, s_2, ... s_k, the hash is defined to be
 * H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... + s_k*(a^0),
 *
 * where the constant a is defined to be prime8bytes.
 *
 * The implementation adds an offset to each byte, so
 * H(s) = (s_1 + HASH_CHAR_OFFSET)*(a^(k-1)) + ...
 */
static U64 getHash(const BYTE *buf, U32 len) {
  U64 ret = 0;
  U32 i;
  for (i = 0; i < len; i++) {
    ret *= prime8bytes;
    ret += buf[i] + HASH_CHAR_OFFSET;
  }
  return ret;
}

static U64 ipow(U64 base, U64 exp) {
  U64 ret = 1;
  while (exp) {
    if (exp & 1) {
      ret *= base;
    }
    exp >>= 1;
    base *= base;
  }
  return ret;
}

static U64 updateHash(U64 hash, U32 len,
                      BYTE toRemove, BYTE toAdd) {
  // TODO: this relies on compiler optimization.
  // The exponential can be calculated explicitly as len is constant.
  hash -= ((toRemove + HASH_CHAR_OFFSET) *
           ipow(prime8bytes, len - 1));
  hash *= prime8bytes;
  hash += toAdd + HASH_CHAR_OFFSET;
  return hash;
}
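
// Why this works: with a = prime8bytes and OFF = HASH_CHAR_OFFSET,
// H(s_1 .. s_k) = (s_1+OFF)*a^(k-1) + ... + (s_k+OFF)*a^0, so subtracting
// (s_1+OFF)*a^(k-1), multiplying by a, and adding (s_(k+1)+OFF) yields
// H(s_2 .. s_(k+1)) exactly; all arithmetic wraps modulo 2^64.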

/**
 * Update cctx->nextHash and cctx->nextPosHashed
 * based on cctx->lastHash and cctx->lastPosHashed.
 *
 * This uses a rolling hash and requires that the last position hashed
 * corresponds to cctx->nextIp - step.
 */
static void setNextHash(LDM_CCtx *cctx) {
  cctx->nextHash = updateHash(
      cctx->lastHash, LDM_HASH_LENGTH,
      cctx->lastPosHashed[0],
      cctx->lastPosHashed[LDM_HASH_LENGTH]);
  cctx->nextPosHashed = cctx->nextIp;

#if LDM_LAG
  if (cctx->ip - cctx->ibase > LDM_LAG) {
    cctx->lagHash = updateHash(
        cctx->lagHash, LDM_HASH_LENGTH,
        cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]);
    cctx->lagIp++;
  }
#endif
}

static void putHashOfCurrentPositionFromHash(LDM_CCtx *cctx, U64 hash) {
  // Hash only every HASH_ONLY_EVERY times, based on cctx->ip.
  // Note: this works only when cctx->step is 1.
#if LDM_LAG
  if (cctx->lagIp - cctx->ibase > 0) {
#if INSERT_BY_TAG
    U32 hashEveryMask = lowerBitsFromHfHash(cctx->lagHash);
    if (hashEveryMask == HASH_ONLY_EVERY) {
#else
    if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#endif
      U32 smallHash = getSmallHash(cctx->lagHash);

#if USE_CHECKSUM
      U32 checksum = getChecksum(cctx->lagHash);
      const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, checksum };
#else
      const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase };
#endif
      HASH_insert(cctx->hashTable, smallHash, entry);
    }
  } else {
#endif // LDM_LAG
#if INSERT_BY_TAG
    U32 hashEveryMask = lowerBitsFromHfHash(hash);
    if (hashEveryMask == HASH_ONLY_EVERY) {
#else
    if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
#endif
      U32 smallHash = getSmallHash(hash);

#if USE_CHECKSUM
      U32 checksum = getChecksum(hash);
      const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
#else
      const LDM_hashEntry entry = { cctx->ip - cctx->ibase };
#endif
      HASH_insert(cctx->hashTable, smallHash, entry);
    }
#if LDM_LAG
  }
#endif

  cctx->lastPosHashed = cctx->ip;
  cctx->lastHash = hash;
}

/**
 * Copy over the cctx->lastHash, and cctx->lastPosHashed
 * fields from the "next" fields.
 *
 * This requires that cctx->ip == cctx->nextPosHashed.
 */
static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) {
  putHashOfCurrentPositionFromHash(cctx, cctx->nextHash);
}

/**
 * Insert hash of the current position into the hash table.
 */
static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
  U64 hash = getHash(cctx->ip, LDM_HASH_LENGTH);

  putHashOfCurrentPositionFromHash(cctx, hash);
}

size_t LDM_initializeCCtx(LDM_CCtx *cctx,
                          const void *src, size_t srcSize,
                          void *dst, size_t maxDstSize) {
  cctx->isize = srcSize;
  cctx->maxOSize = maxDstSize;

  cctx->ibase = (const BYTE *)src;
  cctx->ip = cctx->ibase;
  cctx->iend = cctx->ibase + srcSize;

  cctx->ihashLimit = cctx->iend - LDM_HASH_LENGTH;
  cctx->imatchLimit = cctx->iend - LDM_MIN_MATCH_LENGTH;

  cctx->obase = (BYTE *)dst;
  cctx->op = (BYTE *)dst;

  cctx->anchor = cctx->ibase;

  memset(&(cctx->stats), 0, sizeof(cctx->stats));
#if USE_CHECKSUM
  cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64);
#else
  cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32);
#endif

  if (!cctx->hashTable) return 1;

  cctx->stats.minOffset = UINT_MAX;
  cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG;
  cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE;

  cctx->lastPosHashed = NULL;

  cctx->step = 1;   // Fixed to be 1 for now. Changing may break things.
  cctx->nextIp = cctx->ip + cctx->step;
  cctx->nextPosHashed = 0;

  return 0;
}

void LDM_destroyCCtx(LDM_CCtx *cctx) {
  HASH_destroyTable(cctx->hashTable);
}

/**
 * Finds the "best" match.
 *
 * Returns 0 if successful and 1 otherwise (i.e. no match can be found
 * in the remaining input that is long enough).
 *
 * forwardMatchLength contains the forward length of the match.
 */
static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
                             U64 *forwardMatchLength,
                             U64 *backwardMatchLength) {
  LDM_hashEntry *entry = NULL;
  cctx->nextIp = cctx->ip + cctx->step;

  while (entry == NULL) {
    U64 hash;
    hash_t smallHash;
    U32 checksum;
#if INSERT_BY_TAG
    U32 hashEveryMask;
#endif
    setNextHash(cctx);

    hash = cctx->nextHash;
    smallHash = getSmallHash(hash);
    checksum = getChecksum(hash);
#if INSERT_BY_TAG
    hashEveryMask = lowerBitsFromHfHash(hash);
#endif

    cctx->ip = cctx->nextIp;
    cctx->nextIp += cctx->step;

    if (cctx->ip > cctx->imatchLimit) {
      return 1;
    }
#if INSERT_BY_TAG
    if (hashEveryMask == HASH_ONLY_EVERY) {
      entry = HASH_getBestEntry(cctx, smallHash, checksum,
                                forwardMatchLength, backwardMatchLength);
    }
#else
    entry = HASH_getBestEntry(cctx, smallHash, checksum,
                              forwardMatchLength, backwardMatchLength);
#endif
    if (entry != NULL) {
      *match = entry->offset + cctx->ibase;
    }

    putHashOfCurrentPositionFromHash(cctx, hash);
  }
  setNextHash(cctx);
  return 0;
}

void LDM_encodeLiteralLengthAndLiterals(
    LDM_CCtx *cctx, BYTE *pToken, const U64 literalLength) {
  /* Encode the literal length. */
  if (literalLength >= RUN_MASK) {
    U64 len = (U64)literalLength - RUN_MASK;
    *pToken = (RUN_MASK << ML_BITS);
    for (; len >= 255; len -= 255) {
      *(cctx->op)++ = 255;
    }
    *(cctx->op)++ = (BYTE)len;
  } else {
    *pToken = (BYTE)(literalLength << ML_BITS);
  }

  /* Encode the literals. */
  memcpy(cctx->op, cctx->anchor, literalLength);
  cctx->op += literalLength;
}
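
// Worked example, assuming the LZ4-style constants from ldm.h (ML_BITS = 4,
// RUN_MASK = 15): literalLength = 300 sets the token's high nibble to 15 and
// emits the extension bytes 255 and 30, since 300 - 15 = 285 = 255 + 30.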

void LDM_outputBlock(LDM_CCtx *cctx,
                     const U64 literalLength,
                     const U32 offset,
                     const U64 matchLength) {
  BYTE *pToken = cctx->op++;

  /* Encode the literal length and literals. */
  LDM_encodeLiteralLengthAndLiterals(cctx, pToken, literalLength);

  /* Encode the offset. */
  MEM_write32(cctx->op, offset);
  cctx->op += LDM_OFFSET_SIZE;

  /* Encode the match length. */
  if (matchLength >= ML_MASK) {
    U64 matchLengthRemaining = matchLength;
    *pToken += ML_MASK;
    matchLengthRemaining -= ML_MASK;
    MEM_write32(cctx->op, 0xFFFFFFFF);
    while (matchLengthRemaining >= 4*0xFF) {
      cctx->op += 4;
      MEM_write32(cctx->op, 0xffffffff);
      matchLengthRemaining -= 4*0xFF;
    }
    cctx->op += matchLengthRemaining / 255;
    *(cctx->op)++ = (BYTE)(matchLengthRemaining % 255);
  } else {
    *pToken += (BYTE)(matchLength);
  }
}

// TODO: maxDstSize is unused. This function may seg fault when writing
// beyond the size of dst, as it does not check maxDstSize. Writing to
// a buffer and performing checks is a possible solution.

// This is based upon lz4.
size_t LDM_compress(const void *src, size_t srcSize,
                    void *dst, size_t maxDstSize) {
  LDM_CCtx cctx;
  const BYTE *match = NULL;
  U64 forwardMatchLength = 0;
  U64 backwardsMatchLength = 0;

  if (LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize)) {
    // Initialization failed.
    return 0;
  }

#ifdef OUTPUT_CONFIGURATION
  LDM_outputConfiguration();
#endif

  /* Hash the first position and put it into the hash table. */
  LDM_putHashOfCurrentPosition(&cctx);

#if LDM_LAG
  cctx.lagIp = cctx.ip;
  cctx.lagHash = cctx.lastHash;
#endif

  /**
   * If no more matches can be found (i.e. the length of the remaining input
   * is less than the minimum match length), then stop searching for matches
   * and encode the final literals.
   */
  while (!LDM_findBestMatch(&cctx, &match, &forwardMatchLength,
                            &backwardsMatchLength)) {
#ifdef COMPUTE_STATS
    cctx.stats.numMatches++;
#endif

    cctx.ip -= backwardsMatchLength;
    match -= backwardsMatchLength;

    /**
     * Write current block (literals, literal length, match offset, match
     * length) and update pointers and hashes.
     */
    {
      const U64 literalLength = cctx.ip - cctx.anchor;
      const U32 offset = cctx.ip - match;
      const U64 matchLength = forwardMatchLength +
                              backwardsMatchLength -
                              LDM_MIN_MATCH_LENGTH;

      LDM_outputBlock(&cctx, literalLength, offset, matchLength);

#ifdef COMPUTE_STATS
      cctx.stats.totalLiteralLength += literalLength;
      cctx.stats.totalOffset += offset;
      cctx.stats.totalMatchLength += matchLength + LDM_MIN_MATCH_LENGTH;
      cctx.stats.minOffset =
          offset < cctx.stats.minOffset ? offset : cctx.stats.minOffset;
      cctx.stats.maxOffset =
          offset > cctx.stats.maxOffset ? offset : cctx.stats.maxOffset;
      cctx.stats.offsetHistogram[(U32)intLog2(offset)]++;
      cctx.stats.matchLengthHistogram[
          (U32)intLog2(matchLength + LDM_MIN_MATCH_LENGTH)]++;
#endif

      // Move ip to end of block, inserting hashes at each position.
      cctx.nextIp = cctx.ip + cctx.step;
      while (cctx.ip < cctx.anchor + LDM_MIN_MATCH_LENGTH +
                       matchLength + literalLength) {
        if (cctx.ip > cctx.lastPosHashed) {
          LDM_updateLastHashFromNextHash(&cctx);
          setNextHash(&cctx);
        }
        cctx.ip++;
        cctx.nextIp++;
      }
    }

    // Set start of next block to current input pointer.
    cctx.anchor = cctx.ip;
    LDM_updateLastHashFromNextHash(&cctx);
  }

  /* Encode the last literals (no more matches). */
  {
    const U64 lastRun = cctx.iend - cctx.anchor;
    BYTE *pToken = cctx.op++;
    LDM_encodeLiteralLengthAndLiterals(&cctx, pToken, lastRun);
  }

#ifdef COMPUTE_STATS
  LDM_printCompressStats(&cctx.stats);
  HASH_outputTableOccupancy(cctx.hashTable);
#endif

  {
    const size_t ret = cctx.op - cctx.obase;
    LDM_destroyCCtx(&cctx);
    return ret;
  }
}

void LDM_outputConfiguration(void) {
  printf("=====================\n");
  printf("Configuration\n");
  printf("LDM_WINDOW_SIZE_LOG: %d\n", LDM_WINDOW_SIZE_LOG);
  printf("LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH: %d, %d\n",
         LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH);
  printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE);
  printf("HASH_ONLY_EVERY_LOG: %d\n", HASH_ONLY_EVERY_LOG);
  printf("HASH_BUCKET_SIZE_LOG: %d\n", HASH_BUCKET_SIZE_LOG);
  printf("LDM_LAG: %d\n", LDM_LAG);
  printf("USE_CHECKSUM: %d\n", USE_CHECKSUM);
  printf("INSERT_BY_TAG: %d\n", INSERT_BY_TAG);
  printf("HASH_CHAR_OFFSET: %d\n", HASH_CHAR_OFFSET);
  printf("=====================\n");
}
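
// A minimal usage sketch, not part of the original file: it compresses an
// in-memory buffer with LDM_compress(). The guard name LDM_EXAMPLE_MAIN and
// the capacity formula are illustrative assumptions; LDM_compress() does not
// bound-check dst (see the TODO above it), so the sketch simply
// over-allocates. It relies on the includes already present in this file
// (printf, malloc, and free are all used above).
#ifdef LDM_EXAMPLE_MAIN
int main(void) {
  static char src[1 << 20];             // 1 MiB of sample input
  size_t const srcSize = sizeof(src);
  size_t const dstCapacity = srcSize + srcSize / 255 + 64;  // assumed bound
  void *dst = malloc(dstCapacity);
  size_t i;
  if (!dst) return 1;
  // A long repeating pattern guarantees matches well above the minimum
  // match length.
  for (i = 0; i < srcSize; i++) {
    src[i] = (char)('A' + (i % 26));
  }
  {
    size_t const cSize = LDM_compress(src, srcSize, dst, dstCapacity);
    printf("compressed %u bytes into %u bytes\n",
           (unsigned)srcSize, (unsigned)cSize);
  }
  free(dst);
  return 0;
}
#endif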