2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
12 /*-**************************************
14 ****************************************/
15 #define MINRATIO 4 /* minimum nb of apparition to be selected in dictionary */
16 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
17 #define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
20 /*-**************************************
22 ****************************************/
23 /* Unix Large Files support (>4GB) */
24 #define _FILE_OFFSET_BITS 64
25 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
26 # define _LARGEFILE_SOURCE
27 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
28 # define _LARGEFILE64_SOURCE
32 /*-*************************************
34 ***************************************/
35 #include <stdlib.h> /* malloc, free */
36 #include <string.h> /* memset */
37 #include <stdio.h> /* fprintf, fopen, ftello64 */
38 #include <time.h> /* clock */
40 #include "mem.h" /* read */
41 #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42 #define HUF_STATIC_LINKING_ONLY
43 #include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44 #include "zstd_internal.h" /* includes zstd.h */
45 #include "xxhash.h" /* XXH64 */
46 #include "divsufsort.h"
47 #ifndef ZDICT_STATIC_LINKING_ONLY
48 # define ZDICT_STATIC_LINKING_ONLY
53 /*-*************************************
55 ***************************************/
60 #define DICTLISTSIZE_DEFAULT 10000
62 #define NOISELENGTH 32
64 static const int g_compressionLevel_default
= 3;
65 static const U32 g_selectivity_default
= 9;
68 /*-*************************************
70 ***************************************/
71 #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
72 #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
74 static clock_t ZDICT_clockSpan(clock_t nPrevious
) { return clock() - nPrevious
; }
76 static void ZDICT_printHex(const void* ptr
, size_t length
)
78 const BYTE
* const b
= (const BYTE
*)ptr
;
80 for (u
=0; u
<length
; u
++) {
82 if (c
<32 || c
>126) c
= '.'; /* non-printable char */
88 /*-********************************************************
90 **********************************************************/
91 unsigned ZDICT_isError(size_t errorCode
) { return ERR_isError(errorCode
); }
93 const char* ZDICT_getErrorName(size_t errorCode
) { return ERR_getErrorName(errorCode
); }
95 unsigned ZDICT_getDictID(const void* dictBuffer
, size_t dictSize
)
97 if (dictSize
< 8) return 0;
98 if (MEM_readLE32(dictBuffer
) != ZSTD_MAGIC_DICTIONARY
) return 0;
99 return MEM_readLE32((const char*)dictBuffer
+ 4);
103 /*-********************************************************
104 * Dictionary training functions
105 **********************************************************/
106 static unsigned ZDICT_NbCommonBytes (size_t val
)
108 if (MEM_isLittleEndian()) {
110 # if defined(_MSC_VER) && defined(_WIN64)
112 _BitScanForward64( &r
, (U64
)val
);
113 return (unsigned)(r
>>3);
114 # elif defined(__GNUC__) && (__GNUC__ >= 3)
115 return (__builtin_ctzll((U64
)val
) >> 3);
117 static const int DeBruijnBytePos
[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
118 return DeBruijnBytePos
[((U64
)((val
& -(long long)val
) * 0x0218A392CDABBD3FULL
)) >> 58];
120 } else { /* 32 bits */
121 # if defined(_MSC_VER)
123 _BitScanForward( &r
, (U32
)val
);
124 return (unsigned)(r
>>3);
125 # elif defined(__GNUC__) && (__GNUC__ >= 3)
126 return (__builtin_ctz((U32
)val
) >> 3);
128 static const int DeBruijnBytePos
[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
129 return DeBruijnBytePos
[((U32
)((val
& -(S32
)val
) * 0x077CB531U
)) >> 27];
132 } else { /* Big Endian CPU */
134 # if defined(_MSC_VER) && defined(_WIN64)
136 _BitScanReverse64( &r
, val
);
137 return (unsigned)(r
>>3);
138 # elif defined(__GNUC__) && (__GNUC__ >= 3)
139 return (__builtin_clzll(val
) >> 3);
142 const unsigned n32
= sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
143 if (!(val
>>n32
)) { r
=4; } else { r
=0; val
>>=n32
; }
144 if (!(val
>>16)) { r
+=2; val
>>=8; } else { val
>>=24; }
148 } else { /* 32 bits */
149 # if defined(_MSC_VER)
151 _BitScanReverse( &r
, (unsigned long)val
);
152 return (unsigned)(r
>>3);
153 # elif defined(__GNUC__) && (__GNUC__ >= 3)
154 return (__builtin_clz((U32
)val
) >> 3);
157 if (!(val
>>16)) { r
=2; val
>>=8; } else { r
=0; val
>>=24; }
166 Count the nb of common bytes between 2 pointers.
167 Note : this function presumes end of buffer followed by noisy guard band.
169 static size_t ZDICT_count(const void* pIn
, const void* pMatch
)
171 const char* const pStart
= (const char*)pIn
;
173 size_t const diff
= MEM_readST(pMatch
) ^ MEM_readST(pIn
);
175 pIn
= (const char*)pIn
+sizeof(size_t);
176 pMatch
= (const char*)pMatch
+sizeof(size_t);
179 pIn
= (const char*)pIn
+ZDICT_NbCommonBytes(diff
);
180 return (size_t)((const char*)pIn
- pStart
);
191 static void ZDICT_initDictItem(dictItem
* d
)
195 d
->savings
= (U32
)(-1);
199 #define LLIMIT 64 /* heuristic determined experimentally */
200 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */
201 static dictItem
ZDICT_analyzePos(
203 const int* suffix
, U32 start
,
204 const void* buffer
, U32 minRatio
, U32 notificationLevel
)
206 U32 lengthList
[LLIMIT
] = {0};
207 U32 cumulLength
[LLIMIT
] = {0};
208 U32 savings
[LLIMIT
] = {0};
209 const BYTE
* b
= (const BYTE
*)buffer
;
210 size_t maxLength
= LLIMIT
;
211 size_t pos
= suffix
[start
];
216 memset(&solution
, 0, sizeof(solution
));
219 /* trivial repetition cases */
220 if ( (MEM_read16(b
+pos
+0) == MEM_read16(b
+pos
+2))
221 ||(MEM_read16(b
+pos
+1) == MEM_read16(b
+pos
+3))
222 ||(MEM_read16(b
+pos
+2) == MEM_read16(b
+pos
+4)) ) {
223 /* skip and mark segment */
224 U16
const pattern16
= MEM_read16(b
+pos
+4);
225 U32 u
, patternEnd
= 6;
226 while (MEM_read16(b
+pos
+patternEnd
) == pattern16
) patternEnd
+=2 ;
227 if (b
[pos
+patternEnd
] == b
[pos
+patternEnd
-1]) patternEnd
++;
228 for (u
=1; u
<patternEnd
; u
++)
229 doneMarks
[pos
+u
] = 1;
237 length
= ZDICT_count(b
+ pos
, b
+ suffix
[end
]);
238 } while (length
>= MINMATCHLENGTH
);
244 length
= ZDICT_count(b
+ pos
, b
+ *(suffix
+start
-1));
245 if (length
>=MINMATCHLENGTH
) start
--;
246 } while(length
>= MINMATCHLENGTH
);
249 /* exit if not found a minimum nb of repetitions */
250 if (end
-start
< minRatio
) {
252 for(idx
=start
; idx
<end
; idx
++)
253 doneMarks
[suffix
[idx
]] = 1;
259 U32 refinedStart
= start
;
260 U32 refinedEnd
= end
;
262 DISPLAYLEVEL(4, "\n");
263 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (unsigned)(end
-start
), MINMATCHLENGTH
, (unsigned)pos
);
264 DISPLAYLEVEL(4, "\n");
266 for (mml
= MINMATCHLENGTH
; ; mml
++) {
267 BYTE currentChar
= 0;
268 U32 currentCount
= 0;
269 U32 currentID
= refinedStart
;
271 U32 selectedCount
= 0;
272 U32 selectedID
= currentID
;
273 for (id
=refinedStart
; id
< refinedEnd
; id
++) {
274 if (b
[suffix
[id
] + mml
] != currentChar
) {
275 if (currentCount
> selectedCount
) {
276 selectedCount
= currentCount
;
277 selectedID
= currentID
;
280 currentChar
= b
[ suffix
[id
] + mml
];
285 if (currentCount
> selectedCount
) { /* for last */
286 selectedCount
= currentCount
;
287 selectedID
= currentID
;
290 if (selectedCount
< minRatio
)
292 refinedStart
= selectedID
;
293 refinedEnd
= refinedStart
+ selectedCount
;
296 /* evaluate gain based on new dict */
297 start
= refinedStart
;
298 pos
= suffix
[refinedStart
];
300 memset(lengthList
, 0, sizeof(lengthList
));
306 length
= ZDICT_count(b
+ pos
, b
+ suffix
[end
]);
307 if (length
>= LLIMIT
) length
= LLIMIT
-1;
308 lengthList
[length
]++;
309 } while (length
>=MINMATCHLENGTH
);
313 { size_t length
= MINMATCHLENGTH
;
314 while ((length
>= MINMATCHLENGTH
) & (start
> 0)) {
315 length
= ZDICT_count(b
+ pos
, b
+ suffix
[start
- 1]);
316 if (length
>= LLIMIT
) length
= LLIMIT
- 1;
317 lengthList
[length
]++;
318 if (length
>= MINMATCHLENGTH
) start
--;
322 /* largest useful length */
323 memset(cumulLength
, 0, sizeof(cumulLength
));
324 cumulLength
[maxLength
-1] = lengthList
[maxLength
-1];
325 for (i
=(int)(maxLength
-2); i
>=0; i
--)
326 cumulLength
[i
] = cumulLength
[i
+1] + lengthList
[i
];
328 for (i
=LLIMIT
-1; i
>=MINMATCHLENGTH
; i
--) if (cumulLength
[i
]>=minRatio
) break;
331 /* reduce maxLength in case of final into repetitive data */
332 { U32 l
= (U32
)maxLength
;
333 BYTE
const c
= b
[pos
+ maxLength
-1];
334 while (b
[pos
+l
-2]==c
) l
--;
337 if (maxLength
< MINMATCHLENGTH
) return solution
; /* skip : no long-enough solution */
339 /* calculate savings */
341 for (i
=MINMATCHLENGTH
; i
<=(int)maxLength
; i
++)
342 savings
[i
] = savings
[i
-1] + (lengthList
[i
] * (i
-3));
344 DISPLAYLEVEL(4, "Selected dict at position %u, of length %u : saves %u (ratio: %.2f) \n",
345 (unsigned)pos
, (unsigned)maxLength
, (unsigned)savings
[maxLength
], (double)savings
[maxLength
] / maxLength
);
347 solution
.pos
= (U32
)pos
;
348 solution
.length
= (U32
)maxLength
;
349 solution
.savings
= savings
[maxLength
];
351 /* mark positions done */
353 for (id
=start
; id
<end
; id
++) {
355 U32
const testedPos
= suffix
[id
];
356 if (testedPos
== pos
)
357 length
= solution
.length
;
359 length
= (U32
)ZDICT_count(b
+pos
, b
+testedPos
);
360 if (length
> solution
.length
) length
= solution
.length
;
362 pEnd
= (U32
)(testedPos
+ length
);
363 for (p
=testedPos
; p
<pEnd
; p
++)
371 static int isIncluded(const void* in
, const void* container
, size_t length
)
373 const char* const ip
= (const char*) in
;
374 const char* const into
= (const char*) container
;
377 for (u
=0; u
<length
; u
++) { /* works because end of buffer is a noisy guard band */
378 if (ip
[u
] != into
[u
]) break;
384 /*! ZDICT_tryMerge() :
385 check if dictItem can be merged, do it if possible
386 @return : id of destination elt, 0 if not merged
388 static U32
ZDICT_tryMerge(dictItem
* table
, dictItem elt
, U32 eltNbToSkip
, const void* buffer
)
390 const U32 tableSize
= table
->pos
;
391 const U32 eltEnd
= elt
.pos
+ elt
.length
;
392 const char* const buf
= (const char*) buffer
;
395 U32 u
; for (u
=1; u
<tableSize
; u
++) {
396 if (u
==eltNbToSkip
) continue;
397 if ((table
[u
].pos
> elt
.pos
) && (table
[u
].pos
<= eltEnd
)) { /* overlap, existing > new */
399 U32
const addedLength
= table
[u
].pos
- elt
.pos
;
400 table
[u
].length
+= addedLength
;
401 table
[u
].pos
= elt
.pos
;
402 table
[u
].savings
+= elt
.savings
* addedLength
/ elt
.length
; /* rough approx */
403 table
[u
].savings
+= elt
.length
/ 8; /* rough approx bonus */
405 /* sort : improve rank */
406 while ((u
>1) && (table
[u
-1].savings
< elt
.savings
))
407 table
[u
] = table
[u
-1], u
--;
413 for (u
=1; u
<tableSize
; u
++) {
414 if (u
==eltNbToSkip
) continue;
416 if ((table
[u
].pos
+ table
[u
].length
>= elt
.pos
) && (table
[u
].pos
< elt
.pos
)) { /* overlap, existing < new */
418 int const addedLength
= (int)eltEnd
- (table
[u
].pos
+ table
[u
].length
);
419 table
[u
].savings
+= elt
.length
/ 8; /* rough approx bonus */
420 if (addedLength
> 0) { /* otherwise, elt fully included into existing */
421 table
[u
].length
+= addedLength
;
422 table
[u
].savings
+= elt
.savings
* addedLength
/ elt
.length
; /* rough approx */
424 /* sort : improve rank */
426 while ((u
>1) && (table
[u
-1].savings
< elt
.savings
))
427 table
[u
] = table
[u
-1], u
--;
432 if (MEM_read64(buf
+ table
[u
].pos
) == MEM_read64(buf
+ elt
.pos
+ 1)) {
433 if (isIncluded(buf
+ table
[u
].pos
, buf
+ elt
.pos
+ 1, table
[u
].length
)) {
434 size_t const addedLength
= MAX( (int)elt
.length
- (int)table
[u
].length
, 1 );
435 table
[u
].pos
= elt
.pos
;
436 table
[u
].savings
+= (U32
)(elt
.savings
* addedLength
/ elt
.length
);
437 table
[u
].length
= MIN(elt
.length
, table
[u
].length
+ 1);
447 static void ZDICT_removeDictItem(dictItem
* table
, U32 id
)
449 /* convention : table[0].pos stores nb of elts */
450 U32
const max
= table
[0].pos
;
452 if (!id
) return; /* protection, should never happen */
453 for (u
=id
; u
<max
-1; u
++)
454 table
[u
] = table
[u
+1];
459 static void ZDICT_insertDictItem(dictItem
* table
, U32 maxSize
, dictItem elt
, const void* buffer
)
461 /* merge if possible */
462 U32 mergeId
= ZDICT_tryMerge(table
, elt
, 0, buffer
);
466 newMerge
= ZDICT_tryMerge(table
, table
[mergeId
], mergeId
, buffer
);
467 if (newMerge
) ZDICT_removeDictItem(table
, mergeId
);
475 U32 nextElt
= table
->pos
;
476 if (nextElt
>= maxSize
) nextElt
= maxSize
-1;
478 while (table
[current
].savings
< elt
.savings
) {
479 table
[current
+1] = table
[current
];
482 table
[current
+1] = elt
;
483 table
->pos
= nextElt
+1;
488 static U32
ZDICT_dictSize(const dictItem
* dictList
)
491 for (u
=1; u
<dictList
[0].pos
; u
++)
492 dictSize
+= dictList
[u
].length
;
497 static size_t ZDICT_trainBuffer_legacy(dictItem
* dictList
, U32 dictListSize
,
498 const void* const buffer
, size_t bufferSize
, /* buffer must end with noisy guard band */
499 const size_t* fileSizes
, unsigned nbFiles
,
500 unsigned minRatio
, U32 notificationLevel
)
502 int* const suffix0
= (int*)malloc((bufferSize
+2)*sizeof(*suffix0
));
503 int* const suffix
= suffix0
+1;
504 U32
* reverseSuffix
= (U32
*)malloc((bufferSize
)*sizeof(*reverseSuffix
));
505 BYTE
* doneMarks
= (BYTE
*)malloc((bufferSize
+16)*sizeof(*doneMarks
)); /* +16 for overflow security */
506 U32
* filePos
= (U32
*)malloc(nbFiles
* sizeof(*filePos
));
508 clock_t displayClock
= 0;
509 clock_t const refreshRate
= CLOCKS_PER_SEC
* 3 / 10;
511 # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
512 if (ZDICT_clockSpan(displayClock) > refreshRate) \
513 { displayClock = clock(); DISPLAY(__VA_ARGS__); \
514 if (notificationLevel>=4) fflush(stderr); } }
517 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
518 if (!suffix0
|| !reverseSuffix
|| !doneMarks
|| !filePos
) {
519 result
= ERROR(memory_allocation
);
522 if (minRatio
< MINRATIO
) minRatio
= MINRATIO
;
523 memset(doneMarks
, 0, bufferSize
+16);
525 /* limit sample set size (divsufsort limitation)*/
526 if (bufferSize
> ZDICT_MAX_SAMPLES_SIZE
) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (unsigned)(ZDICT_MAX_SAMPLES_SIZE
>>20));
527 while (bufferSize
> ZDICT_MAX_SAMPLES_SIZE
) bufferSize
-= fileSizes
[--nbFiles
];
530 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles
, (unsigned)(bufferSize
>>20));
531 { int const divSuftSortResult
= divsufsort((const unsigned char*)buffer
, suffix
, (int)bufferSize
, 0);
532 if (divSuftSortResult
!= 0) { result
= ERROR(GENERIC
); goto _cleanup
; }
534 suffix
[bufferSize
] = (int)bufferSize
; /* leads into noise */
535 suffix0
[0] = (int)bufferSize
; /* leads into noise */
536 /* build reverse suffix sort */
538 for (pos
=0; pos
< bufferSize
; pos
++)
539 reverseSuffix
[suffix
[pos
]] = (U32
)pos
;
540 /* note filePos tracks borders between samples.
541 It's not used at this stage, but planned to become useful in a later update */
543 for (pos
=1; pos
<nbFiles
; pos
++)
544 filePos
[pos
] = (U32
)(filePos
[pos
-1] + fileSizes
[pos
-1]);
547 DISPLAYLEVEL(2, "finding patterns ... \n");
548 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio
);
550 { U32 cursor
; for (cursor
=0; cursor
< bufferSize
; ) {
552 if (doneMarks
[cursor
]) { cursor
++; continue; }
553 solution
= ZDICT_analyzePos(doneMarks
, suffix
, reverseSuffix
[cursor
], buffer
, minRatio
, notificationLevel
);
554 if (solution
.length
==0) { cursor
++; continue; }
555 ZDICT_insertDictItem(dictList
, dictListSize
, solution
, buffer
);
556 cursor
+= solution
.length
;
557 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor
/ bufferSize
* 100);
569 static void ZDICT_fillNoise(void* buffer
, size_t length
)
571 unsigned const prime1
= 2654435761U;
572 unsigned const prime2
= 2246822519U;
573 unsigned acc
= prime1
;
575 for (p
=0; p
<length
; p
++) {
577 ((unsigned char*)buffer
)[p
] = (unsigned char)(acc
>> 21);
584 ZSTD_CDict
* dict
; /* dictionary */
585 ZSTD_CCtx
* zc
; /* working context */
586 void* workPlace
; /* must be ZSTD_BLOCKSIZE_MAX allocated */
589 #define MAXREPOFFSET 1024
591 static void ZDICT_countEStats(EStats_ress_t esr
, ZSTD_parameters params
,
592 unsigned* countLit
, unsigned* offsetcodeCount
, unsigned* matchlengthCount
, unsigned* litlengthCount
, U32
* repOffsets
,
593 const void* src
, size_t srcSize
,
594 U32 notificationLevel
)
596 size_t const blockSizeMax
= MIN (ZSTD_BLOCKSIZE_MAX
, 1 << params
.cParams
.windowLog
);
599 if (srcSize
> blockSizeMax
) srcSize
= blockSizeMax
; /* protection vs large samples */
600 { size_t const errorCode
= ZSTD_compressBegin_usingCDict(esr
.zc
, esr
.dict
);
601 if (ZSTD_isError(errorCode
)) { DISPLAYLEVEL(1, "warning : ZSTD_compressBegin_usingCDict failed \n"); return; }
604 cSize
= ZSTD_compressBlock(esr
.zc
, esr
.workPlace
, ZSTD_BLOCKSIZE_MAX
, src
, srcSize
);
605 if (ZSTD_isError(cSize
)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (unsigned)srcSize
); return; }
607 if (cSize
) { /* if == 0; block is not compressible */
608 const seqStore_t
* const seqStorePtr
= ZSTD_getSeqStore(esr
.zc
);
611 { const BYTE
* bytePtr
;
612 for(bytePtr
= seqStorePtr
->litStart
; bytePtr
< seqStorePtr
->lit
; bytePtr
++)
613 countLit
[*bytePtr
]++;
617 { U32
const nbSeq
= (U32
)(seqStorePtr
->sequences
- seqStorePtr
->sequencesStart
);
618 ZSTD_seqToCodes(seqStorePtr
);
620 { const BYTE
* codePtr
= seqStorePtr
->ofCode
;
622 for (u
=0; u
<nbSeq
; u
++) offsetcodeCount
[codePtr
[u
]]++;
625 { const BYTE
* codePtr
= seqStorePtr
->mlCode
;
627 for (u
=0; u
<nbSeq
; u
++) matchlengthCount
[codePtr
[u
]]++;
630 { const BYTE
* codePtr
= seqStorePtr
->llCode
;
632 for (u
=0; u
<nbSeq
; u
++) litlengthCount
[codePtr
[u
]]++;
635 if (nbSeq
>= 2) { /* rep offsets */
636 const seqDef
* const seq
= seqStorePtr
->sequencesStart
;
637 U32 offset1
= seq
[0].offset
- 3;
638 U32 offset2
= seq
[1].offset
- 3;
639 if (offset1
>= MAXREPOFFSET
) offset1
= 0;
640 if (offset2
>= MAXREPOFFSET
) offset2
= 0;
641 repOffsets
[offset1
] += 3;
642 repOffsets
[offset2
] += 1;
646 static size_t ZDICT_totalSampleSize(const size_t* fileSizes
, unsigned nbFiles
)
650 for (u
=0; u
<nbFiles
; u
++) total
+= fileSizes
[u
];
654 typedef struct { U32 offset
; U32 count
; } offsetCount_t
;
656 static void ZDICT_insertSortCount(offsetCount_t table
[ZSTD_REP_NUM
+1], U32 val
, U32 count
)
659 table
[ZSTD_REP_NUM
].offset
= val
;
660 table
[ZSTD_REP_NUM
].count
= count
;
661 for (u
=ZSTD_REP_NUM
; u
>0; u
--) {
663 if (table
[u
-1].count
>= table
[u
].count
) break;
665 table
[u
-1] = table
[u
];
671 * rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
672 * necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
674 static void ZDICT_flatLit(unsigned* countLit
)
677 for (u
=1; u
<256; u
++) countLit
[u
] = 2;
683 #define OFFCODE_MAX 30 /* only applicable to first block */
684 static size_t ZDICT_analyzeEntropy(void* dstBuffer
, size_t maxDstSize
,
685 unsigned compressionLevel
,
686 const void* srcBuffer
, const size_t* fileSizes
, unsigned nbFiles
,
687 const void* dictBuffer
, size_t dictBufferSize
,
688 unsigned notificationLevel
)
690 unsigned countLit
[256];
691 HUF_CREATE_STATIC_CTABLE(hufTable
, 255);
692 unsigned offcodeCount
[OFFCODE_MAX
+1];
693 short offcodeNCount
[OFFCODE_MAX
+1];
694 U32 offcodeMax
= ZSTD_highbit32((U32
)(dictBufferSize
+ 128 KB
));
695 unsigned matchLengthCount
[MaxML
+1];
696 short matchLengthNCount
[MaxML
+1];
697 unsigned litLengthCount
[MaxLL
+1];
698 short litLengthNCount
[MaxLL
+1];
699 U32 repOffset
[MAXREPOFFSET
];
700 offsetCount_t bestRepOffset
[ZSTD_REP_NUM
+1];
701 EStats_ress_t esr
= { NULL
, NULL
, NULL
};
702 ZSTD_parameters params
;
703 U32 u
, huffLog
= 11, Offlog
= OffFSELog
, mlLog
= MLFSELog
, llLog
= LLFSELog
, total
;
704 size_t pos
= 0, errorCode
;
706 size_t const totalSrcSize
= ZDICT_totalSampleSize(fileSizes
, nbFiles
);
707 size_t const averageSampleSize
= totalSrcSize
/ (nbFiles
+ !nbFiles
);
708 BYTE
* dstPtr
= (BYTE
*)dstBuffer
;
711 DEBUGLOG(4, "ZDICT_analyzeEntropy");
712 if (offcodeMax
>OFFCODE_MAX
) { eSize
= ERROR(dictionaryCreation_failed
); goto _cleanup
; } /* too large dictionary */
713 for (u
=0; u
<256; u
++) countLit
[u
] = 1; /* any character must be described */
714 for (u
=0; u
<=offcodeMax
; u
++) offcodeCount
[u
] = 1;
715 for (u
=0; u
<=MaxML
; u
++) matchLengthCount
[u
] = 1;
716 for (u
=0; u
<=MaxLL
; u
++) litLengthCount
[u
] = 1;
717 memset(repOffset
, 0, sizeof(repOffset
));
718 repOffset
[1] = repOffset
[4] = repOffset
[8] = 1;
719 memset(bestRepOffset
, 0, sizeof(bestRepOffset
));
720 if (compressionLevel
==0) compressionLevel
= g_compressionLevel_default
;
721 params
= ZSTD_getParams(compressionLevel
, averageSampleSize
, dictBufferSize
);
723 esr
.dict
= ZSTD_createCDict_advanced(dictBuffer
, dictBufferSize
, ZSTD_dlm_byRef
, ZSTD_dct_rawContent
, params
.cParams
, ZSTD_defaultCMem
);
724 esr
.zc
= ZSTD_createCCtx();
725 esr
.workPlace
= malloc(ZSTD_BLOCKSIZE_MAX
);
726 if (!esr
.dict
|| !esr
.zc
|| !esr
.workPlace
) {
727 eSize
= ERROR(memory_allocation
);
728 DISPLAYLEVEL(1, "Not enough memory \n");
732 /* collect stats on all samples */
733 for (u
=0; u
<nbFiles
; u
++) {
734 ZDICT_countEStats(esr
, params
,
735 countLit
, offcodeCount
, matchLengthCount
, litLengthCount
, repOffset
,
736 (const char*)srcBuffer
+ pos
, fileSizes
[u
],
741 /* analyze, build stats, starting with literals */
742 { size_t maxNbBits
= HUF_buildCTable (hufTable
, countLit
, 255, huffLog
);
743 if (HUF_isError(maxNbBits
)) {
744 eSize
= ERROR(GENERIC
);
745 DISPLAYLEVEL(1, " HUF_buildCTable error \n");
748 if (maxNbBits
==8) { /* not compressible : will fail on HUF_writeCTable() */
749 DISPLAYLEVEL(2, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
750 ZDICT_flatLit(countLit
); /* replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode */
751 maxNbBits
= HUF_buildCTable (hufTable
, countLit
, 255, huffLog
);
752 assert(maxNbBits
==9);
754 huffLog
= (U32
)maxNbBits
;
757 /* looking for most common first offsets */
759 for (offset
=1; offset
<MAXREPOFFSET
; offset
++)
760 ZDICT_insertSortCount(bestRepOffset
, offset
, repOffset
[offset
]);
762 /* note : the result of this phase should be used to better appreciate the impact on statistics */
764 total
=0; for (u
=0; u
<=offcodeMax
; u
++) total
+=offcodeCount
[u
];
765 errorCode
= FSE_normalizeCount(offcodeNCount
, Offlog
, offcodeCount
, total
, offcodeMax
);
766 if (FSE_isError(errorCode
)) {
767 eSize
= ERROR(GENERIC
);
768 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
771 Offlog
= (U32
)errorCode
;
773 total
=0; for (u
=0; u
<=MaxML
; u
++) total
+=matchLengthCount
[u
];
774 errorCode
= FSE_normalizeCount(matchLengthNCount
, mlLog
, matchLengthCount
, total
, MaxML
);
775 if (FSE_isError(errorCode
)) {
776 eSize
= ERROR(GENERIC
);
777 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
780 mlLog
= (U32
)errorCode
;
782 total
=0; for (u
=0; u
<=MaxLL
; u
++) total
+=litLengthCount
[u
];
783 errorCode
= FSE_normalizeCount(litLengthNCount
, llLog
, litLengthCount
, total
, MaxLL
);
784 if (FSE_isError(errorCode
)) {
785 eSize
= ERROR(GENERIC
);
786 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
789 llLog
= (U32
)errorCode
;
791 /* write result to buffer */
792 { size_t const hhSize
= HUF_writeCTable(dstPtr
, maxDstSize
, hufTable
, 255, huffLog
);
793 if (HUF_isError(hhSize
)) {
794 eSize
= ERROR(GENERIC
);
795 DISPLAYLEVEL(1, "HUF_writeCTable error \n");
799 maxDstSize
-= hhSize
;
803 { size_t const ohSize
= FSE_writeNCount(dstPtr
, maxDstSize
, offcodeNCount
, OFFCODE_MAX
, Offlog
);
804 if (FSE_isError(ohSize
)) {
805 eSize
= ERROR(GENERIC
);
806 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
810 maxDstSize
-= ohSize
;
814 { size_t const mhSize
= FSE_writeNCount(dstPtr
, maxDstSize
, matchLengthNCount
, MaxML
, mlLog
);
815 if (FSE_isError(mhSize
)) {
816 eSize
= ERROR(GENERIC
);
817 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
821 maxDstSize
-= mhSize
;
825 { size_t const lhSize
= FSE_writeNCount(dstPtr
, maxDstSize
, litLengthNCount
, MaxLL
, llLog
);
826 if (FSE_isError(lhSize
)) {
827 eSize
= ERROR(GENERIC
);
828 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
832 maxDstSize
-= lhSize
;
837 eSize
= ERROR(GENERIC
);
838 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
842 MEM_writeLE32(dstPtr
+0, bestRepOffset
[0].offset
);
843 MEM_writeLE32(dstPtr
+4, bestRepOffset
[1].offset
);
844 MEM_writeLE32(dstPtr
+8, bestRepOffset
[2].offset
);
846 /* at this stage, we don't use the result of "most common first offset",
847 as the impact of statistics is not properly evaluated */
848 MEM_writeLE32(dstPtr
+0, repStartValue
[0]);
849 MEM_writeLE32(dstPtr
+4, repStartValue
[1]);
850 MEM_writeLE32(dstPtr
+8, repStartValue
[2]);
855 ZSTD_freeCDict(esr
.dict
);
856 ZSTD_freeCCtx(esr
.zc
);
864 size_t ZDICT_finalizeDictionary(void* dictBuffer
, size_t dictBufferCapacity
,
865 const void* customDictContent
, size_t dictContentSize
,
866 const void* samplesBuffer
, const size_t* samplesSizes
,
867 unsigned nbSamples
, ZDICT_params_t params
)
870 #define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
871 BYTE header
[HBUFFSIZE
];
872 int const compressionLevel
= (params
.compressionLevel
== 0) ? g_compressionLevel_default
: params
.compressionLevel
;
873 U32
const notificationLevel
= params
.notificationLevel
;
875 /* check conditions */
876 DEBUGLOG(4, "ZDICT_finalizeDictionary");
877 if (dictBufferCapacity
< dictContentSize
) return ERROR(dstSize_tooSmall
);
878 if (dictContentSize
< ZDICT_CONTENTSIZE_MIN
) return ERROR(srcSize_wrong
);
879 if (dictBufferCapacity
< ZDICT_DICTSIZE_MIN
) return ERROR(dstSize_tooSmall
);
881 /* dictionary header */
882 MEM_writeLE32(header
, ZSTD_MAGIC_DICTIONARY
);
883 { U64
const randomID
= XXH64(customDictContent
, dictContentSize
, 0);
884 U32
const compliantID
= (randomID
% ((1U<<31)-32768)) + 32768;
885 U32
const dictID
= params
.dictID
? params
.dictID
: compliantID
;
886 MEM_writeLE32(header
+4, dictID
);
891 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
892 DISPLAYLEVEL(2, "statistics ... \n");
893 { size_t const eSize
= ZDICT_analyzeEntropy(header
+hSize
, HBUFFSIZE
-hSize
,
895 samplesBuffer
, samplesSizes
, nbSamples
,
896 customDictContent
, dictContentSize
,
898 if (ZDICT_isError(eSize
)) return eSize
;
902 /* copy elements in final buffer ; note : src and dst buffer can overlap */
903 if (hSize
+ dictContentSize
> dictBufferCapacity
) dictContentSize
= dictBufferCapacity
- hSize
;
904 { size_t const dictSize
= hSize
+ dictContentSize
;
905 char* dictEnd
= (char*)dictBuffer
+ dictSize
;
906 memmove(dictEnd
- dictContentSize
, customDictContent
, dictContentSize
);
907 memcpy(dictBuffer
, header
, hSize
);
913 static size_t ZDICT_addEntropyTablesFromBuffer_advanced(
914 void* dictBuffer
, size_t dictContentSize
, size_t dictBufferCapacity
,
915 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
916 ZDICT_params_t params
)
918 int const compressionLevel
= (params
.compressionLevel
== 0) ? g_compressionLevel_default
: params
.compressionLevel
;
919 U32
const notificationLevel
= params
.notificationLevel
;
922 /* calculate entropy tables */
923 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
924 DISPLAYLEVEL(2, "statistics ... \n");
925 { size_t const eSize
= ZDICT_analyzeEntropy((char*)dictBuffer
+hSize
, dictBufferCapacity
-hSize
,
927 samplesBuffer
, samplesSizes
, nbSamples
,
928 (char*)dictBuffer
+ dictBufferCapacity
- dictContentSize
, dictContentSize
,
930 if (ZDICT_isError(eSize
)) return eSize
;
934 /* add dictionary header (after entropy tables) */
935 MEM_writeLE32(dictBuffer
, ZSTD_MAGIC_DICTIONARY
);
936 { U64
const randomID
= XXH64((char*)dictBuffer
+ dictBufferCapacity
- dictContentSize
, dictContentSize
, 0);
937 U32
const compliantID
= (randomID
% ((1U<<31)-32768)) + 32768;
938 U32
const dictID
= params
.dictID
? params
.dictID
: compliantID
;
939 MEM_writeLE32((char*)dictBuffer
+4, dictID
);
942 if (hSize
+ dictContentSize
< dictBufferCapacity
)
943 memmove((char*)dictBuffer
+ hSize
, (char*)dictBuffer
+ dictBufferCapacity
- dictContentSize
, dictContentSize
);
944 return MIN(dictBufferCapacity
, hSize
+dictContentSize
);
947 /* Hidden declaration for dbio.c */
948 size_t ZDICT_trainFromBuffer_unsafe_legacy(
949 void* dictBuffer
, size_t maxDictSize
,
950 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
951 ZDICT_legacy_params_t params
);
952 /*! ZDICT_trainFromBuffer_unsafe_legacy() :
953 * Warning : `samplesBuffer` must be followed by noisy guard band.
954 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
956 size_t ZDICT_trainFromBuffer_unsafe_legacy(
957 void* dictBuffer
, size_t maxDictSize
,
958 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
959 ZDICT_legacy_params_t params
)
961 U32
const dictListSize
= MAX(MAX(DICTLISTSIZE_DEFAULT
, nbSamples
), (U32
)(maxDictSize
/16));
962 dictItem
* const dictList
= (dictItem
*)malloc(dictListSize
* sizeof(*dictList
));
963 unsigned const selectivity
= params
.selectivityLevel
== 0 ? g_selectivity_default
: params
.selectivityLevel
;
964 unsigned const minRep
= (selectivity
> 30) ? MINRATIO
: nbSamples
>> selectivity
;
965 size_t const targetDictSize
= maxDictSize
;
966 size_t const samplesBuffSize
= ZDICT_totalSampleSize(samplesSizes
, nbSamples
);
968 U32
const notificationLevel
= params
.zParams
.notificationLevel
;
971 if (!dictList
) return ERROR(memory_allocation
);
972 if (maxDictSize
< ZDICT_DICTSIZE_MIN
) { free(dictList
); return ERROR(dstSize_tooSmall
); } /* requested dictionary size is too small */
973 if (samplesBuffSize
< ZDICT_MIN_SAMPLES_SIZE
) { free(dictList
); return ERROR(dictionaryCreation_failed
); } /* not enough source to create dictionary */
976 ZDICT_initDictItem(dictList
);
978 /* build dictionary */
979 ZDICT_trainBuffer_legacy(dictList
, dictListSize
,
980 samplesBuffer
, samplesBuffSize
,
981 samplesSizes
, nbSamples
,
982 minRep
, notificationLevel
);
984 /* display best matches */
985 if (params
.zParams
.notificationLevel
>= 3) {
986 unsigned const nb
= MIN(25, dictList
[0].pos
);
987 unsigned const dictContentSize
= ZDICT_dictSize(dictList
);
989 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", (unsigned)dictList
[0].pos
-1, dictContentSize
);
990 DISPLAYLEVEL(3, "list %u best segments \n", nb
-1);
991 for (u
=1; u
<nb
; u
++) {
992 unsigned const pos
= dictList
[u
].pos
;
993 unsigned const length
= dictList
[u
].length
;
994 U32
const printedLength
= MIN(40, length
);
995 if ((pos
> samplesBuffSize
) || ((pos
+ length
) > samplesBuffSize
)) {
997 return ERROR(GENERIC
); /* should never happen */
999 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
1000 u
, length
, pos
, (unsigned)dictList
[u
].savings
);
1001 ZDICT_printHex((const char*)samplesBuffer
+pos
, printedLength
);
1002 DISPLAYLEVEL(3, "| \n");
1006 /* create dictionary */
1007 { unsigned dictContentSize
= ZDICT_dictSize(dictList
);
1008 if (dictContentSize
< ZDICT_CONTENTSIZE_MIN
) { free(dictList
); return ERROR(dictionaryCreation_failed
); } /* dictionary content too small */
1009 if (dictContentSize
< targetDictSize
/4) {
1010 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize
, (unsigned)maxDictSize
);
1011 if (samplesBuffSize
< 10 * targetDictSize
)
1012 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (unsigned)(samplesBuffSize
>>20));
1013 if (minRep
> MINRATIO
) {
1014 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity
+1);
1015 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
1019 if ((dictContentSize
> targetDictSize
*3) && (nbSamples
> 2*MINRATIO
) && (selectivity
>1)) {
1020 unsigned proposedSelectivity
= selectivity
-1;
1021 while ((nbSamples
>> proposedSelectivity
) <= MINRATIO
) { proposedSelectivity
--; }
1022 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize
, (unsigned)maxDictSize
);
1023 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity
);
1024 DISPLAYLEVEL(2, "! always test dictionary efficiency on real samples \n");
1027 /* limit dictionary size */
1028 { U32
const max
= dictList
->pos
; /* convention : nb of useful elts within dictList */
1029 U32 currentSize
= 0;
1030 U32 n
; for (n
=1; n
<max
; n
++) {
1031 currentSize
+= dictList
[n
].length
;
1032 if (currentSize
> targetDictSize
) { currentSize
-= dictList
[n
].length
; break; }
1035 dictContentSize
= currentSize
;
1038 /* build dict content */
1040 BYTE
* ptr
= (BYTE
*)dictBuffer
+ maxDictSize
;
1041 for (u
=1; u
<dictList
->pos
; u
++) {
1042 U32 l
= dictList
[u
].length
;
1044 if (ptr
<(BYTE
*)dictBuffer
) { free(dictList
); return ERROR(GENERIC
); } /* should not happen */
1045 memcpy(ptr
, (const char*)samplesBuffer
+dictList
[u
].pos
, l
);
1048 dictSize
= ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer
, dictContentSize
, maxDictSize
,
1049 samplesBuffer
, samplesSizes
, nbSamples
,
1059 /* ZDICT_trainFromBuffer_legacy() :
1060 * issue : samplesBuffer need to be followed by a noisy guard band.
1061 * work around : duplicate the buffer, and add the noise */
1062 size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer
, size_t dictBufferCapacity
,
1063 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
1064 ZDICT_legacy_params_t params
)
1068 size_t const sBuffSize
= ZDICT_totalSampleSize(samplesSizes
, nbSamples
);
1069 if (sBuffSize
< ZDICT_MIN_SAMPLES_SIZE
) return 0; /* not enough content => no dictionary */
1071 newBuff
= malloc(sBuffSize
+ NOISELENGTH
);
1072 if (!newBuff
) return ERROR(memory_allocation
);
1074 memcpy(newBuff
, samplesBuffer
, sBuffSize
);
1075 ZDICT_fillNoise((char*)newBuff
+ sBuffSize
, NOISELENGTH
); /* guard band, for end of buffer condition */
1078 ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer
, dictBufferCapacity
, newBuff
,
1079 samplesSizes
, nbSamples
, params
);
1085 size_t ZDICT_trainFromBuffer(void* dictBuffer
, size_t dictBufferCapacity
,
1086 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
)
1088 ZDICT_fastCover_params_t params
;
1089 DEBUGLOG(3, "ZDICT_trainFromBuffer");
1090 memset(¶ms
, 0, sizeof(params
));
1093 /* Default to level 6 since no compression level information is available */
1094 params
.zParams
.compressionLevel
= 3;
1095 #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
1096 params
.zParams
.notificationLevel
= DEBUGLEVEL
;
1098 return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer
, dictBufferCapacity
,
1099 samplesBuffer
, samplesSizes
, nbSamples
,
1103 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer
, size_t dictContentSize
, size_t dictBufferCapacity
,
1104 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
)
1106 ZDICT_params_t params
;
1107 memset(¶ms
, 0, sizeof(params
));
1108 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer
, dictContentSize
, dictBufferCapacity
,
1109 samplesBuffer
, samplesSizes
, nbSamples
,