2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree. An additional grant
7 * of patent rights can be found in the PATENTS file in the same directory.
11 /*-**************************************
13 ****************************************/
14 #define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)
15 #define ZDICT_MIN_SAMPLES_SIZE 512
18 /*-**************************************
20 ****************************************/
21 /* Unix Large Files support (>4GB) */
22 #define _FILE_OFFSET_BITS 64
23 #if (defined(__sun__) && (!defined(__LP64__))) /* Sun Solaris 32-bits requires specific definitions */
24 # define _LARGEFILE_SOURCE
25 #elif ! defined(__LP64__) /* No point defining Large file for 64 bit */
26 # define _LARGEFILE64_SOURCE
30 /*-*************************************
32 ***************************************/
33 #include <stdlib.h> /* malloc, free */
34 #include <string.h> /* memset */
35 #include <stdio.h> /* fprintf, fopen, ftello64 */
36 #include <time.h> /* clock */
38 #include "mem.h" /* read */
39 #include "error_private.h"
40 #include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
41 #define HUF_STATIC_LINKING_ONLY
43 #include "zstd_internal.h" /* includes zstd.h */
45 #include "divsufsort.h"
46 #ifndef ZDICT_STATIC_LINKING_ONLY
47 # define ZDICT_STATIC_LINKING_ONLY
52 /*-*************************************
54 ***************************************/
59 #define DICTLISTSIZE_DEFAULT 10000
61 #define NOISELENGTH 32
64 static const int g_compressionLevel_default
= 5;
65 static const U32 g_selectivity_default
= 9;
66 static const size_t g_provision_entropySize
= 200;
67 static const size_t g_min_fast_dictContent
= 192;
70 /*-*************************************
72 ***************************************/
73 #define DISPLAY(...) { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
74 #define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); } /* 0 : no display; 1: errors; 2: default; 3: details; 4: debug */
76 static clock_t ZDICT_clockSpan(clock_t nPrevious
) { return clock() - nPrevious
; }
78 static void ZDICT_printHex(const void* ptr
, size_t length
)
80 const BYTE
* const b
= (const BYTE
*)ptr
;
82 for (u
=0; u
<length
; u
++) {
84 if (c
<32 || c
>126) c
= '.'; /* non-printable char */
90 /*-********************************************************
92 **********************************************************/
93 unsigned ZDICT_isError(size_t errorCode
) { return ERR_isError(errorCode
); }
95 const char* ZDICT_getErrorName(size_t errorCode
) { return ERR_getErrorName(errorCode
); }
97 unsigned ZDICT_getDictID(const void* dictBuffer
, size_t dictSize
)
99 if (dictSize
< 8) return 0;
100 if (MEM_readLE32(dictBuffer
) != ZSTD_DICT_MAGIC
) return 0;
101 return MEM_readLE32((const char*)dictBuffer
+ 4);
105 /*-********************************************************
106 * Dictionary training functions
107 **********************************************************/
108 static unsigned ZDICT_NbCommonBytes (register size_t val
)
110 if (MEM_isLittleEndian()) {
112 # if defined(_MSC_VER) && defined(_WIN64)
114 _BitScanForward64( &r
, (U64
)val
);
115 return (unsigned)(r
>>3);
116 # elif defined(__GNUC__) && (__GNUC__ >= 3)
117 return (__builtin_ctzll((U64
)val
) >> 3);
119 static const int DeBruijnBytePos
[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
120 return DeBruijnBytePos
[((U64
)((val
& -(long long)val
) * 0x0218A392CDABBD3FULL
)) >> 58];
122 } else { /* 32 bits */
123 # if defined(_MSC_VER)
125 _BitScanForward( &r
, (U32
)val
);
126 return (unsigned)(r
>>3);
127 # elif defined(__GNUC__) && (__GNUC__ >= 3)
128 return (__builtin_ctz((U32
)val
) >> 3);
130 static const int DeBruijnBytePos
[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
131 return DeBruijnBytePos
[((U32
)((val
& -(S32
)val
) * 0x077CB531U
)) >> 27];
134 } else { /* Big Endian CPU */
136 # if defined(_MSC_VER) && defined(_WIN64)
138 _BitScanReverse64( &r
, val
);
139 return (unsigned)(r
>>3);
140 # elif defined(__GNUC__) && (__GNUC__ >= 3)
141 return (__builtin_clzll(val
) >> 3);
144 const unsigned n32
= sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
145 if (!(val
>>n32
)) { r
=4; } else { r
=0; val
>>=n32
; }
146 if (!(val
>>16)) { r
+=2; val
>>=8; } else { val
>>=24; }
150 } else { /* 32 bits */
151 # if defined(_MSC_VER)
153 _BitScanReverse( &r
, (unsigned long)val
);
154 return (unsigned)(r
>>3);
155 # elif defined(__GNUC__) && (__GNUC__ >= 3)
156 return (__builtin_clz((U32
)val
) >> 3);
159 if (!(val
>>16)) { r
=2; val
>>=8; } else { r
=0; val
>>=24; }
168 Count the nb of common bytes between 2 pointers.
169 Note : this function presumes end of buffer followed by noisy guard band.
171 static size_t ZDICT_count(const void* pIn
, const void* pMatch
)
173 const char* const pStart
= (const char*)pIn
;
175 size_t const diff
= MEM_readST(pMatch
) ^ MEM_readST(pIn
);
177 pIn
= (const char*)pIn
+sizeof(size_t);
178 pMatch
= (const char*)pMatch
+sizeof(size_t);
181 pIn
= (const char*)pIn
+ZDICT_NbCommonBytes(diff
);
182 return (size_t)((const char*)pIn
- pStart
);
193 static void ZDICT_initDictItem(dictItem
* d
)
197 d
->savings
= (U32
)(-1);
201 #define LLIMIT 64 /* heuristic determined experimentally */
202 #define MINMATCHLENGTH 7 /* heuristic determined experimentally */
203 static dictItem
ZDICT_analyzePos(
205 const int* suffix
, U32 start
,
206 const void* buffer
, U32 minRatio
, U32 notificationLevel
)
208 U32 lengthList
[LLIMIT
] = {0};
209 U32 cumulLength
[LLIMIT
] = {0};
210 U32 savings
[LLIMIT
] = {0};
211 const BYTE
* b
= (const BYTE
*)buffer
;
213 size_t maxLength
= LLIMIT
;
214 size_t pos
= suffix
[start
];
219 memset(&solution
, 0, sizeof(solution
));
222 /* trivial repetition cases */
223 if ( (MEM_read16(b
+pos
+0) == MEM_read16(b
+pos
+2))
224 ||(MEM_read16(b
+pos
+1) == MEM_read16(b
+pos
+3))
225 ||(MEM_read16(b
+pos
+2) == MEM_read16(b
+pos
+4)) ) {
226 /* skip and mark segment */
227 U16 u16
= MEM_read16(b
+pos
+4);
229 while (MEM_read16(b
+pos
+e
) == u16
) e
+=2 ;
230 if (b
[pos
+e
] == b
[pos
+e
-1]) e
++;
232 doneMarks
[pos
+u
] = 1;
239 length
= ZDICT_count(b
+ pos
, b
+ suffix
[end
]);
240 } while (length
>=MINMATCHLENGTH
);
244 length
= ZDICT_count(b
+ pos
, b
+ *(suffix
+start
-1));
245 if (length
>=MINMATCHLENGTH
) start
--;
246 } while(length
>= MINMATCHLENGTH
);
248 /* exit if not found a minimum nb of repetitions */
249 if (end
-start
< minRatio
) {
251 for(idx
=start
; idx
<end
; idx
++)
252 doneMarks
[suffix
[idx
]] = 1;
258 U32 refinedStart
= start
;
259 U32 refinedEnd
= end
;
261 DISPLAYLEVEL(4, "\n");
262 DISPLAYLEVEL(4, "found %3u matches of length >= %i at pos %7u ", (U32
)(end
-start
), MINMATCHLENGTH
, (U32
)pos
);
263 DISPLAYLEVEL(4, "\n");
265 for (searchLength
= MINMATCHLENGTH
; ; searchLength
++) {
266 BYTE currentChar
= 0;
267 U32 currentCount
= 0;
268 U32 currentID
= refinedStart
;
270 U32 selectedCount
= 0;
271 U32 selectedID
= currentID
;
272 for (id
=refinedStart
; id
< refinedEnd
; id
++) {
273 if (b
[ suffix
[id
] + searchLength
] != currentChar
) {
274 if (currentCount
> selectedCount
) {
275 selectedCount
= currentCount
;
276 selectedID
= currentID
;
279 currentChar
= b
[ suffix
[id
] + searchLength
];
284 if (currentCount
> selectedCount
) { /* for last */
285 selectedCount
= currentCount
;
286 selectedID
= currentID
;
289 if (selectedCount
< minRatio
)
291 refinedStart
= selectedID
;
292 refinedEnd
= refinedStart
+ selectedCount
;
295 /* evaluate gain based on new ref */
296 start
= refinedStart
;
297 pos
= suffix
[refinedStart
];
299 memset(lengthList
, 0, sizeof(lengthList
));
304 length
= ZDICT_count(b
+ pos
, b
+ suffix
[end
]);
305 if (length
>= LLIMIT
) length
= LLIMIT
-1;
306 lengthList
[length
]++;
307 } while (length
>=MINMATCHLENGTH
);
310 length
= MINMATCHLENGTH
;
311 while ((length
>= MINMATCHLENGTH
) & (start
> 0)) {
312 length
= ZDICT_count(b
+ pos
, b
+ suffix
[start
- 1]);
313 if (length
>= LLIMIT
) length
= LLIMIT
- 1;
314 lengthList
[length
]++;
315 if (length
>= MINMATCHLENGTH
) start
--;
318 /* largest useful length */
319 memset(cumulLength
, 0, sizeof(cumulLength
));
320 cumulLength
[maxLength
-1] = lengthList
[maxLength
-1];
321 for (i
=(int)(maxLength
-2); i
>=0; i
--)
322 cumulLength
[i
] = cumulLength
[i
+1] + lengthList
[i
];
324 for (i
=LLIMIT
-1; i
>=MINMATCHLENGTH
; i
--) if (cumulLength
[i
]>=minRatio
) break;
327 /* reduce maxLength in case of final into repetitive data */
328 { U32 l
= (U32
)maxLength
;
329 BYTE
const c
= b
[pos
+ maxLength
-1];
330 while (b
[pos
+l
-2]==c
) l
--;
333 if (maxLength
< MINMATCHLENGTH
) return solution
; /* skip : no long-enough solution */
335 /* calculate savings */
337 for (i
=MINMATCHLENGTH
; i
<=(int)maxLength
; i
++)
338 savings
[i
] = savings
[i
-1] + (lengthList
[i
] * (i
-3));
340 DISPLAYLEVEL(4, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
341 (U32
)pos
, (U32
)maxLength
, savings
[maxLength
], (double)savings
[maxLength
] / maxLength
);
343 solution
.pos
= (U32
)pos
;
344 solution
.length
= (U32
)maxLength
;
345 solution
.savings
= savings
[maxLength
];
347 /* mark positions done */
349 for (id
=start
; id
<end
; id
++) {
351 U32
const testedPos
= suffix
[id
];
352 if (testedPos
== pos
)
353 length
= solution
.length
;
355 length
= ZDICT_count(b
+pos
, b
+testedPos
);
356 if (length
> solution
.length
) length
= solution
.length
;
358 pEnd
= (U32
)(testedPos
+ length
);
359 for (p
=testedPos
; p
<pEnd
; p
++)
368 check if dictItem can be merged, do it if possible
369 @return : id of destination elt, 0 if not merged
371 static U32
ZDICT_checkMerge(dictItem
* table
, dictItem elt
, U32 eltNbToSkip
)
373 const U32 tableSize
= table
->pos
;
374 const U32 eltEnd
= elt
.pos
+ elt
.length
;
377 U32 u
; for (u
=1; u
<tableSize
; u
++) {
378 if (u
==eltNbToSkip
) continue;
379 if ((table
[u
].pos
> elt
.pos
) && (table
[u
].pos
<= eltEnd
)) { /* overlap, existing > new */
381 U32 addedLength
= table
[u
].pos
- elt
.pos
;
382 table
[u
].length
+= addedLength
;
383 table
[u
].pos
= elt
.pos
;
384 table
[u
].savings
+= elt
.savings
* addedLength
/ elt
.length
; /* rough approx */
385 table
[u
].savings
+= elt
.length
/ 8; /* rough approx bonus */
387 /* sort : improve rank */
388 while ((u
>1) && (table
[u
-1].savings
< elt
.savings
))
389 table
[u
] = table
[u
-1], u
--;
395 for (u
=1; u
<tableSize
; u
++) {
396 if (u
==eltNbToSkip
) continue;
397 if ((table
[u
].pos
+ table
[u
].length
>= elt
.pos
) && (table
[u
].pos
< elt
.pos
)) { /* overlap, existing < new */
399 int addedLength
= (int)eltEnd
- (table
[u
].pos
+ table
[u
].length
);
400 table
[u
].savings
+= elt
.length
/ 8; /* rough approx bonus */
401 if (addedLength
> 0) { /* otherwise, elt fully included into existing */
402 table
[u
].length
+= addedLength
;
403 table
[u
].savings
+= elt
.savings
* addedLength
/ elt
.length
; /* rough approx */
405 /* sort : improve rank */
407 while ((u
>1) && (table
[u
-1].savings
< elt
.savings
))
408 table
[u
] = table
[u
-1], u
--;
417 static void ZDICT_removeDictItem(dictItem
* table
, U32 id
)
419 /* convention : first element is nb of elts */
420 U32
const max
= table
->pos
;
422 if (!id
) return; /* protection, should never happen */
423 for (u
=id
; u
<max
-1; u
++)
424 table
[u
] = table
[u
+1];
429 static void ZDICT_insertDictItem(dictItem
* table
, U32 maxSize
, dictItem elt
)
431 /* merge if possible */
432 U32 mergeId
= ZDICT_checkMerge(table
, elt
, 0);
436 newMerge
= ZDICT_checkMerge(table
, table
[mergeId
], mergeId
);
437 if (newMerge
) ZDICT_removeDictItem(table
, mergeId
);
445 U32 nextElt
= table
->pos
;
446 if (nextElt
>= maxSize
) nextElt
= maxSize
-1;
448 while (table
[current
].savings
< elt
.savings
) {
449 table
[current
+1] = table
[current
];
452 table
[current
+1] = elt
;
453 table
->pos
= nextElt
+1;
458 static U32
ZDICT_dictSize(const dictItem
* dictList
)
461 for (u
=1; u
<dictList
[0].pos
; u
++)
462 dictSize
+= dictList
[u
].length
;
467 static size_t ZDICT_trainBuffer(dictItem
* dictList
, U32 dictListSize
,
468 const void* const buffer
, size_t bufferSize
, /* buffer must end with noisy guard band */
469 const size_t* fileSizes
, unsigned nbFiles
,
470 U32 minRatio
, U32 notificationLevel
)
472 int* const suffix0
= (int*)malloc((bufferSize
+2)*sizeof(*suffix0
));
473 int* const suffix
= suffix0
+1;
474 U32
* reverseSuffix
= (U32
*)malloc((bufferSize
)*sizeof(*reverseSuffix
));
475 BYTE
* doneMarks
= (BYTE
*)malloc((bufferSize
+16)*sizeof(*doneMarks
)); /* +16 for overflow security */
476 U32
* filePos
= (U32
*)malloc(nbFiles
* sizeof(*filePos
));
478 clock_t displayClock
= 0;
479 clock_t const refreshRate
= CLOCKS_PER_SEC
* 3 / 10;
481 # define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
482 if (ZDICT_clockSpan(displayClock) > refreshRate) \
483 { displayClock = clock(); DISPLAY(__VA_ARGS__); \
484 if (notificationLevel>=4) fflush(stdout); } }
487 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
488 if (!suffix0
|| !reverseSuffix
|| !doneMarks
|| !filePos
) {
489 result
= ERROR(memory_allocation
);
492 if (minRatio
< MINRATIO
) minRatio
= MINRATIO
;
493 memset(doneMarks
, 0, bufferSize
+16);
495 /* limit sample set size (divsufsort limitation)*/
496 if (bufferSize
> ZDICT_MAX_SAMPLES_SIZE
) DISPLAYLEVEL(3, "sample set too large : reduced to %u MB ...\n", (U32
)(ZDICT_MAX_SAMPLES_SIZE
>>20));
497 while (bufferSize
> ZDICT_MAX_SAMPLES_SIZE
) bufferSize
-= fileSizes
[--nbFiles
];
500 DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles
, (U32
)(bufferSize
>>20));
501 { int const divSuftSortResult
= divsufsort((const unsigned char*)buffer
, suffix
, (int)bufferSize
, 0);
502 if (divSuftSortResult
!= 0) { result
= ERROR(GENERIC
); goto _cleanup
; }
504 suffix
[bufferSize
] = (int)bufferSize
; /* leads into noise */
505 suffix0
[0] = (int)bufferSize
; /* leads into noise */
506 /* build reverse suffix sort */
508 for (pos
=0; pos
< bufferSize
; pos
++)
509 reverseSuffix
[suffix
[pos
]] = (U32
)pos
;
510 /* note filePos tracks borders between samples.
511 It's not used at this stage, but planned to become useful in a later update */
513 for (pos
=1; pos
<nbFiles
; pos
++)
514 filePos
[pos
] = (U32
)(filePos
[pos
-1] + fileSizes
[pos
-1]);
517 DISPLAYLEVEL(2, "finding patterns ... \n");
518 DISPLAYLEVEL(3, "minimum ratio : %u \n", minRatio
);
520 { U32 cursor
; for (cursor
=0; cursor
< bufferSize
; ) {
522 if (doneMarks
[cursor
]) { cursor
++; continue; }
523 solution
= ZDICT_analyzePos(doneMarks
, suffix
, reverseSuffix
[cursor
], buffer
, minRatio
, notificationLevel
);
524 if (solution
.length
==0) { cursor
++; continue; }
525 ZDICT_insertDictItem(dictList
, dictListSize
, solution
);
526 cursor
+= solution
.length
;
527 DISPLAYUPDATE(2, "\r%4.2f %% \r", (double)cursor
/ bufferSize
* 100);
539 static void ZDICT_fillNoise(void* buffer
, size_t length
)
541 unsigned const prime1
= 2654435761U;
542 unsigned const prime2
= 2246822519U;
543 unsigned acc
= prime1
;
545 for (p
=0; p
<length
; p
++) {
547 ((unsigned char*)buffer
)[p
] = (unsigned char)(acc
>> 21);
556 void* workPlace
; /* must be ZSTD_BLOCKSIZE_ABSOLUTEMAX allocated */
559 #define MAXREPOFFSET 1024
561 static void ZDICT_countEStats(EStats_ress_t esr
, ZSTD_parameters params
,
562 U32
* countLit
, U32
* offsetcodeCount
, U32
* matchlengthCount
, U32
* litlengthCount
, U32
* repOffsets
,
563 const void* src
, size_t srcSize
, U32 notificationLevel
)
565 size_t const blockSizeMax
= MIN (ZSTD_BLOCKSIZE_ABSOLUTEMAX
, 1 << params
.cParams
.windowLog
);
568 if (srcSize
> blockSizeMax
) srcSize
= blockSizeMax
; /* protection vs large samples */
569 { size_t const errorCode
= ZSTD_copyCCtx(esr
.zc
, esr
.ref
, 0);
570 if (ZSTD_isError(errorCode
)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; }
572 cSize
= ZSTD_compressBlock(esr
.zc
, esr
.workPlace
, ZSTD_BLOCKSIZE_ABSOLUTEMAX
, src
, srcSize
);
573 if (ZSTD_isError(cSize
)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32
)srcSize
); return; }
575 if (cSize
) { /* if == 0; block is not compressible */
576 const seqStore_t
* seqStorePtr
= ZSTD_getSeqStore(esr
.zc
);
579 { const BYTE
* bytePtr
;
580 for(bytePtr
= seqStorePtr
->litStart
; bytePtr
< seqStorePtr
->lit
; bytePtr
++)
581 countLit
[*bytePtr
]++;
585 { U32
const nbSeq
= (U32
)(seqStorePtr
->sequences
- seqStorePtr
->sequencesStart
);
586 ZSTD_seqToCodes(seqStorePtr
);
588 { const BYTE
* codePtr
= seqStorePtr
->ofCode
;
590 for (u
=0; u
<nbSeq
; u
++) offsetcodeCount
[codePtr
[u
]]++;
593 { const BYTE
* codePtr
= seqStorePtr
->mlCode
;
595 for (u
=0; u
<nbSeq
; u
++) matchlengthCount
[codePtr
[u
]]++;
598 { const BYTE
* codePtr
= seqStorePtr
->llCode
;
600 for (u
=0; u
<nbSeq
; u
++) litlengthCount
[codePtr
[u
]]++;
603 if (nbSeq
>= 2) { /* rep offsets */
604 const seqDef
* const seq
= seqStorePtr
->sequencesStart
;
605 U32 offset1
= seq
[0].offset
- 3;
606 U32 offset2
= seq
[1].offset
- 3;
607 if (offset1
>= MAXREPOFFSET
) offset1
= 0;
608 if (offset2
>= MAXREPOFFSET
) offset2
= 0;
609 repOffsets
[offset1
] += 3;
610 repOffsets
[offset2
] += 1;
615 static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
619 for (u=0; u<nbFiles; u++)
620 if (max < fileSizes[u]) max = fileSizes[u];
625 static size_t ZDICT_totalSampleSize(const size_t* fileSizes
, unsigned nbFiles
)
629 for (u
=0; u
<nbFiles
; u
++) total
+= fileSizes
[u
];
633 typedef struct { U32 offset
; U32 count
; } offsetCount_t
;
635 static void ZDICT_insertSortCount(offsetCount_t table
[ZSTD_REP_NUM
+1], U32 val
, U32 count
)
638 table
[ZSTD_REP_NUM
].offset
= val
;
639 table
[ZSTD_REP_NUM
].count
= count
;
640 for (u
=ZSTD_REP_NUM
; u
>0; u
--) {
642 if (table
[u
-1].count
>= table
[u
].count
) break;
644 table
[u
-1] = table
[u
];
650 #define OFFCODE_MAX 30 /* only applicable to first block */
651 static size_t ZDICT_analyzeEntropy(void* dstBuffer
, size_t maxDstSize
,
652 unsigned compressionLevel
,
653 const void* srcBuffer
, const size_t* fileSizes
, unsigned nbFiles
,
654 const void* dictBuffer
, size_t dictBufferSize
,
655 unsigned notificationLevel
)
658 HUF_CREATE_STATIC_CTABLE(hufTable
, 255);
659 U32 offcodeCount
[OFFCODE_MAX
+1];
660 short offcodeNCount
[OFFCODE_MAX
+1];
661 U32 offcodeMax
= ZSTD_highbit32((U32
)(dictBufferSize
+ 128 KB
));
662 U32 matchLengthCount
[MaxML
+1];
663 short matchLengthNCount
[MaxML
+1];
664 U32 litLengthCount
[MaxLL
+1];
665 short litLengthNCount
[MaxLL
+1];
666 U32 repOffset
[MAXREPOFFSET
];
667 offsetCount_t bestRepOffset
[ZSTD_REP_NUM
+1];
669 ZSTD_parameters params
;
670 U32 u
, huffLog
= 11, Offlog
= OffFSELog
, mlLog
= MLFSELog
, llLog
= LLFSELog
, total
;
671 size_t pos
= 0, errorCode
;
673 size_t const totalSrcSize
= ZDICT_totalSampleSize(fileSizes
, nbFiles
);
674 size_t const averageSampleSize
= totalSrcSize
/ (nbFiles
+ !nbFiles
);
675 BYTE
* dstPtr
= (BYTE
*)dstBuffer
;
678 esr
.ref
= ZSTD_createCCtx();
679 esr
.zc
= ZSTD_createCCtx();
680 esr
.workPlace
= malloc(ZSTD_BLOCKSIZE_ABSOLUTEMAX
);
681 if (!esr
.ref
|| !esr
.zc
|| !esr
.workPlace
) {
682 eSize
= ERROR(memory_allocation
);
683 DISPLAYLEVEL(1, "Not enough memory \n");
686 if (offcodeMax
>OFFCODE_MAX
) { eSize
= ERROR(dictionary_wrong
); goto _cleanup
; } /* too large dictionary */
687 for (u
=0; u
<256; u
++) countLit
[u
]=1; /* any character must be described */
688 for (u
=0; u
<=offcodeMax
; u
++) offcodeCount
[u
]=1;
689 for (u
=0; u
<=MaxML
; u
++) matchLengthCount
[u
]=1;
690 for (u
=0; u
<=MaxLL
; u
++) litLengthCount
[u
]=1;
691 memset(repOffset
, 0, sizeof(repOffset
));
692 repOffset
[1] = repOffset
[4] = repOffset
[8] = 1;
693 memset(bestRepOffset
, 0, sizeof(bestRepOffset
));
694 if (compressionLevel
==0) compressionLevel
=g_compressionLevel_default
;
695 params
= ZSTD_getParams(compressionLevel
, averageSampleSize
, dictBufferSize
);
696 { size_t const beginResult
= ZSTD_compressBegin_advanced(esr
.ref
, dictBuffer
, dictBufferSize
, params
, 0);
697 if (ZSTD_isError(beginResult
)) {
698 eSize
= ERROR(GENERIC
);
699 DISPLAYLEVEL(1, "error : ZSTD_compressBegin_advanced failed \n");
703 /* collect stats on all files */
704 for (u
=0; u
<nbFiles
; u
++) {
705 ZDICT_countEStats(esr
, params
,
706 countLit
, offcodeCount
, matchLengthCount
, litLengthCount
, repOffset
,
707 (const char*)srcBuffer
+ pos
, fileSizes
[u
],
713 errorCode
= HUF_buildCTable (hufTable
, countLit
, 255, huffLog
);
714 if (HUF_isError(errorCode
)) {
715 eSize
= ERROR(GENERIC
);
716 DISPLAYLEVEL(1, "HUF_buildCTable error \n");
719 huffLog
= (U32
)errorCode
;
721 /* looking for most common first offsets */
723 for (offset
=1; offset
<MAXREPOFFSET
; offset
++)
724 ZDICT_insertSortCount(bestRepOffset
, offset
, repOffset
[offset
]);
726 /* note : the result of this phase should be used to better appreciate the impact on statistics */
728 total
=0; for (u
=0; u
<=offcodeMax
; u
++) total
+=offcodeCount
[u
];
729 errorCode
= FSE_normalizeCount(offcodeNCount
, Offlog
, offcodeCount
, total
, offcodeMax
);
730 if (FSE_isError(errorCode
)) {
731 eSize
= ERROR(GENERIC
);
732 DISPLAYLEVEL(1, "FSE_normalizeCount error with offcodeCount \n");
735 Offlog
= (U32
)errorCode
;
737 total
=0; for (u
=0; u
<=MaxML
; u
++) total
+=matchLengthCount
[u
];
738 errorCode
= FSE_normalizeCount(matchLengthNCount
, mlLog
, matchLengthCount
, total
, MaxML
);
739 if (FSE_isError(errorCode
)) {
740 eSize
= ERROR(GENERIC
);
741 DISPLAYLEVEL(1, "FSE_normalizeCount error with matchLengthCount \n");
744 mlLog
= (U32
)errorCode
;
746 total
=0; for (u
=0; u
<=MaxLL
; u
++) total
+=litLengthCount
[u
];
747 errorCode
= FSE_normalizeCount(litLengthNCount
, llLog
, litLengthCount
, total
, MaxLL
);
748 if (FSE_isError(errorCode
)) {
749 eSize
= ERROR(GENERIC
);
750 DISPLAYLEVEL(1, "FSE_normalizeCount error with litLengthCount \n");
753 llLog
= (U32
)errorCode
;
755 /* write result to buffer */
756 { size_t const hhSize
= HUF_writeCTable(dstPtr
, maxDstSize
, hufTable
, 255, huffLog
);
757 if (HUF_isError(hhSize
)) {
758 eSize
= ERROR(GENERIC
);
759 DISPLAYLEVEL(1, "HUF_writeCTable error \n");
763 maxDstSize
-= hhSize
;
767 { size_t const ohSize
= FSE_writeNCount(dstPtr
, maxDstSize
, offcodeNCount
, OFFCODE_MAX
, Offlog
);
768 if (FSE_isError(ohSize
)) {
769 eSize
= ERROR(GENERIC
);
770 DISPLAYLEVEL(1, "FSE_writeNCount error with offcodeNCount \n");
774 maxDstSize
-= ohSize
;
778 { size_t const mhSize
= FSE_writeNCount(dstPtr
, maxDstSize
, matchLengthNCount
, MaxML
, mlLog
);
779 if (FSE_isError(mhSize
)) {
780 eSize
= ERROR(GENERIC
);
781 DISPLAYLEVEL(1, "FSE_writeNCount error with matchLengthNCount \n");
785 maxDstSize
-= mhSize
;
789 { size_t const lhSize
= FSE_writeNCount(dstPtr
, maxDstSize
, litLengthNCount
, MaxLL
, llLog
);
790 if (FSE_isError(lhSize
)) {
791 eSize
= ERROR(GENERIC
);
792 DISPLAYLEVEL(1, "FSE_writeNCount error with litlengthNCount \n");
796 maxDstSize
-= lhSize
;
801 eSize
= ERROR(GENERIC
);
802 DISPLAYLEVEL(1, "not enough space to write RepOffsets \n");
806 MEM_writeLE32(dstPtr
+0, bestRepOffset
[0].offset
);
807 MEM_writeLE32(dstPtr
+4, bestRepOffset
[1].offset
);
808 MEM_writeLE32(dstPtr
+8, bestRepOffset
[2].offset
);
810 /* at this stage, we don't use the result of "most common first offset",
811 as the impact of statistics is not properly evaluated */
812 MEM_writeLE32(dstPtr
+0, repStartValue
[0]);
813 MEM_writeLE32(dstPtr
+4, repStartValue
[1]);
814 MEM_writeLE32(dstPtr
+8, repStartValue
[2]);
820 ZSTD_freeCCtx(esr
.ref
);
821 ZSTD_freeCCtx(esr
.zc
);
828 size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer
, size_t dictContentSize
, size_t dictBufferCapacity
,
829 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
830 ZDICT_params_t params
)
833 int const compressionLevel
= (params
.compressionLevel
<= 0) ? g_compressionLevel_default
: params
.compressionLevel
;
834 U32
const notificationLevel
= params
.notificationLevel
;
836 /* dictionary header */
837 MEM_writeLE32(dictBuffer
, ZSTD_DICT_MAGIC
);
838 { U64
const randomID
= XXH64((char*)dictBuffer
+ dictBufferCapacity
- dictContentSize
, dictContentSize
, 0);
839 U32
const compliantID
= (randomID
% ((1U<<31)-32768)) + 32768;
840 U32
const dictID
= params
.dictID
? params
.dictID
: compliantID
;
841 MEM_writeLE32((char*)dictBuffer
+4, dictID
);
846 DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
847 DISPLAYLEVEL(2, "statistics ... \n");
848 { size_t const eSize
= ZDICT_analyzeEntropy((char*)dictBuffer
+hSize
, dictBufferCapacity
-hSize
,
850 samplesBuffer
, samplesSizes
, nbSamples
,
851 (char*)dictBuffer
+ dictBufferCapacity
- dictContentSize
, dictContentSize
,
853 if (ZDICT_isError(eSize
)) return eSize
;
858 if (hSize
+ dictContentSize
< dictBufferCapacity
)
859 memmove((char*)dictBuffer
+ hSize
, (char*)dictBuffer
+ dictBufferCapacity
- dictContentSize
, dictContentSize
);
860 return MIN(dictBufferCapacity
, hSize
+dictContentSize
);
864 /*! ZDICT_trainFromBuffer_unsafe() :
865 * Warning : `samplesBuffer` must be followed by noisy guard band.
866 * @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
868 size_t ZDICT_trainFromBuffer_unsafe(
869 void* dictBuffer
, size_t maxDictSize
,
870 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
871 ZDICT_params_t params
)
873 U32
const dictListSize
= MAX(MAX(DICTLISTSIZE_DEFAULT
, nbSamples
), (U32
)(maxDictSize
/16));
874 dictItem
* const dictList
= (dictItem
*)malloc(dictListSize
* sizeof(*dictList
));
875 unsigned const selectivity
= params
.selectivityLevel
== 0 ? g_selectivity_default
: params
.selectivityLevel
;
876 unsigned const minRep
= (selectivity
> 30) ? MINRATIO
: nbSamples
>> selectivity
;
877 size_t const targetDictSize
= maxDictSize
;
878 size_t const samplesBuffSize
= ZDICT_totalSampleSize(samplesSizes
, nbSamples
);
880 U32
const notificationLevel
= params
.notificationLevel
;
883 if (!dictList
) return ERROR(memory_allocation
);
884 if (maxDictSize
<= g_provision_entropySize
+ g_min_fast_dictContent
) { free(dictList
); return ERROR(dstSize_tooSmall
); }
885 if (samplesBuffSize
< ZDICT_MIN_SAMPLES_SIZE
) { free(dictList
); return 0; } /* not enough source to create dictionary */
888 ZDICT_initDictItem(dictList
);
890 /* build dictionary */
891 ZDICT_trainBuffer(dictList
, dictListSize
,
892 samplesBuffer
, samplesBuffSize
,
893 samplesSizes
, nbSamples
,
894 minRep
, notificationLevel
);
896 /* display best matches */
897 if (params
.notificationLevel
>= 3) {
898 U32
const nb
= MIN(25, dictList
[0].pos
);
899 U32
const dictContentSize
= ZDICT_dictSize(dictList
);
901 DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList
[0].pos
-1, dictContentSize
);
902 DISPLAYLEVEL(3, "list %u best segments \n", nb
-1);
903 for (u
=1; u
<nb
; u
++) {
904 U32
const pos
= dictList
[u
].pos
;
905 U32
const length
= dictList
[u
].length
;
906 U32
const printedLength
= MIN(40, length
);
907 if ((pos
> samplesBuffSize
) || ((pos
+ length
) > samplesBuffSize
))
908 return ERROR(GENERIC
); /* should never happen */
909 DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
910 u
, length
, pos
, dictList
[u
].savings
);
911 ZDICT_printHex((const char*)samplesBuffer
+pos
, printedLength
);
912 DISPLAYLEVEL(3, "| \n");
916 /* create dictionary */
917 { U32 dictContentSize
= ZDICT_dictSize(dictList
);
918 if (dictContentSize
< targetDictSize
/3) {
919 DISPLAYLEVEL(2, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize
, (U32
)maxDictSize
);
920 if (minRep
> MINRATIO
) {
921 DISPLAYLEVEL(2, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity
+1);
922 DISPLAYLEVEL(2, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
924 if (samplesBuffSize
< 10 * targetDictSize
)
925 DISPLAYLEVEL(2, "! consider increasing the number of samples (total size : %u MB)\n", (U32
)(samplesBuffSize
>>20));
928 if ((dictContentSize
> targetDictSize
*3) && (nbSamples
> 2*MINRATIO
) && (selectivity
>1)) {
929 U32 proposedSelectivity
= selectivity
-1;
930 while ((nbSamples
>> proposedSelectivity
) <= MINRATIO
) { proposedSelectivity
--; }
931 DISPLAYLEVEL(2, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize
, (U32
)maxDictSize
);
932 DISPLAYLEVEL(2, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity
);
933 DISPLAYLEVEL(2, "! always test dictionary efficiency on samples \n");
936 /* limit dictionary size */
937 { U32
const max
= dictList
->pos
; /* convention : nb of useful elts within dictList */
939 U32 n
; for (n
=1; n
<max
; n
++) {
940 currentSize
+= dictList
[n
].length
;
941 if (currentSize
> targetDictSize
) { currentSize
-= dictList
[n
].length
; break; }
944 dictContentSize
= currentSize
;
947 /* build dict content */
949 BYTE
* ptr
= (BYTE
*)dictBuffer
+ maxDictSize
;
950 for (u
=1; u
<dictList
->pos
; u
++) {
951 U32 l
= dictList
[u
].length
;
953 if (ptr
<(BYTE
*)dictBuffer
) { free(dictList
); return ERROR(GENERIC
); } /* should not happen */
954 memcpy(ptr
, (const char*)samplesBuffer
+dictList
[u
].pos
, l
);
957 dictSize
= ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer
, dictContentSize
, maxDictSize
,
958 samplesBuffer
, samplesSizes
, nbSamples
,
968 /* issue : samplesBuffer need to be followed by a noisy guard band.
969 * work around : duplicate the buffer, and add the noise */
970 size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer
, size_t dictBufferCapacity
,
971 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
,
972 ZDICT_params_t params
)
976 size_t const sBuffSize
= ZDICT_totalSampleSize(samplesSizes
, nbSamples
);
977 if (sBuffSize
< ZDICT_MIN_SAMPLES_SIZE
) return 0; /* not enough content => no dictionary */
979 newBuff
= malloc(sBuffSize
+ NOISELENGTH
);
980 if (!newBuff
) return ERROR(memory_allocation
);
982 memcpy(newBuff
, samplesBuffer
, sBuffSize
);
983 ZDICT_fillNoise((char*)newBuff
+ sBuffSize
, NOISELENGTH
); /* guard band, for end of buffer condition */
985 result
= ZDICT_trainFromBuffer_unsafe(
986 dictBuffer
, dictBufferCapacity
,
987 newBuff
, samplesSizes
, nbSamples
,
994 size_t ZDICT_trainFromBuffer(void* dictBuffer
, size_t dictBufferCapacity
,
995 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
)
997 ZDICT_params_t params
;
998 memset(¶ms
, 0, sizeof(params
));
999 return ZDICT_trainFromBuffer_advanced(dictBuffer
, dictBufferCapacity
,
1000 samplesBuffer
, samplesSizes
, nbSamples
,
1004 size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer
, size_t dictContentSize
, size_t dictBufferCapacity
,
1005 const void* samplesBuffer
, const size_t* samplesSizes
, unsigned nbSamples
)
1007 ZDICT_params_t params
;
1008 memset(¶ms
, 0, sizeof(params
));
1009 return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer
, dictContentSize
, dictBufferCapacity
,
1010 samplesBuffer
, samplesSizes
, nbSamples
,