]>
Commit | Line | Data |
---|---|---|
11fdf7f2 | 1 | /* |
f67539c2 | 2 | * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. |
7c673cae FG |
3 | * All rights reserved. |
4 | * | |
11fdf7f2 TL |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found | |
7 | * in the COPYING file in the root directory of this source tree). | |
8 | * You may select, at your option, one of the above-listed licenses. | |
7c673cae FG |
9 | */ |
10 | ||
11 | ||
12 | ||
7c673cae FG |
13 | /*-************************************ |
14 | * Dependencies | |
15 | **************************************/ | |
9f95a23c | 16 | #include "datagen.h" |
11fdf7f2 | 17 | #include "platform.h" /* SET_BINARY_MODE */ |
7c673cae FG |
18 | #include <stdlib.h> /* malloc, free */ |
19 | #include <stdio.h> /* FILE, fwrite, fprintf */ | |
20 | #include <string.h> /* memcpy */ | |
f67539c2 | 21 | #include "../lib/common/mem.h" /* U32 */ |
7c673cae FG |
22 | |
23 | ||
7c673cae FG |
24 | /*-************************************ |
25 | * Macros | |
26 | **************************************/ | |
27 | #define KB *(1 <<10) | |
28 | #define MIN(a,b) ( (a) < (b) ? (a) : (b) ) | |
29 | ||
30 | #define RDG_DEBUG 0 | |
31 | #define TRACE(...) if (RDG_DEBUG) fprintf(stderr, __VA_ARGS__ ) | |
32 | ||
33 | ||
34 | /*-************************************ | |
35 | * Local constants | |
36 | **************************************/ | |
37 | #define LTLOG 13 | |
38 | #define LTSIZE (1<<LTLOG) | |
39 | #define LTMASK (LTSIZE-1) | |
40 | ||
41 | ||
42 | /*-******************************************************* | |
43 | * Local Functions | |
44 | *********************************************************/ | |
45 | #define RDG_rotl32(x,r) ((x << r) | (x >> (32 - r))) | |
46 | static U32 RDG_rand(U32* src) | |
47 | { | |
48 | static const U32 prime1 = 2654435761U; | |
49 | static const U32 prime2 = 2246822519U; | |
50 | U32 rand32 = *src; | |
51 | rand32 *= prime1; | |
52 | rand32 ^= prime2; | |
53 | rand32 = RDG_rotl32(rand32, 13); | |
54 | *src = rand32; | |
55 | return rand32 >> 5; | |
56 | } | |
57 | ||
f67539c2 | 58 | typedef U32 fixedPoint_24_8; |
7c673cae | 59 | |
f67539c2 | 60 | static void RDG_fillLiteralDistrib(BYTE* ldt, fixedPoint_24_8 ld) |
7c673cae FG |
61 | { |
62 | BYTE const firstChar = (ld<=0.0) ? 0 : '('; | |
63 | BYTE const lastChar = (ld<=0.0) ? 255 : '}'; | |
64 | BYTE character = (ld<=0.0) ? 0 : '0'; | |
65 | U32 u; | |
66 | ||
f67539c2 | 67 | if (ld<=0) ld = 0; |
7c673cae | 68 | for (u=0; u<LTSIZE; ) { |
f67539c2 | 69 | U32 const weight = (((LTSIZE - u) * ld) >> 8) + 1; |
7c673cae FG |
70 | U32 const end = MIN ( u + weight , LTSIZE); |
71 | while (u < end) ldt[u++] = character; | |
72 | character++; | |
73 | if (character > lastChar) character = firstChar; | |
74 | } | |
75 | } | |
76 | ||
77 | ||
78 | static BYTE RDG_genChar(U32* seed, const BYTE* ldt) | |
79 | { | |
80 | U32 const id = RDG_rand(seed) & LTMASK; | |
81 | return ldt[id]; /* memory-sanitizer fails here, stating "uninitialized value" when table initialized with P==0.0. Checked : table is fully initialized */ | |
82 | } | |
83 | ||
84 | ||
9f95a23c | 85 | static U32 RDG_rand15Bits (U32* seedPtr) |
7c673cae FG |
86 | { |
87 | return RDG_rand(seedPtr) & 0x7FFF; | |
88 | } | |
89 | ||
9f95a23c | 90 | static U32 RDG_randLength(U32* seedPtr) |
7c673cae FG |
91 | { |
92 | if (RDG_rand(seedPtr) & 7) return (RDG_rand(seedPtr) & 0xF); /* small length */ | |
93 | return (RDG_rand(seedPtr) & 0x1FF) + 0xF; | |
94 | } | |
95 | ||
f67539c2 TL |
96 | static void RDG_genBlock(void* buffer, size_t buffSize, size_t prefixSize, |
97 | double matchProba, const BYTE* ldt, U32* seedPtr) | |
7c673cae FG |
98 | { |
99 | BYTE* const buffPtr = (BYTE*)buffer; | |
100 | U32 const matchProba32 = (U32)(32768 * matchProba); | |
101 | size_t pos = prefixSize; | |
102 | U32 prevOffset = 1; | |
103 | ||
104 | /* special case : sparse content */ | |
105 | while (matchProba >= 1.0) { | |
106 | size_t size0 = RDG_rand(seedPtr) & 3; | |
107 | size0 = (size_t)1 << (16 + size0 * 2); | |
108 | size0 += RDG_rand(seedPtr) & (size0-1); /* because size0 is power of 2*/ | |
109 | if (buffSize < pos + size0) { | |
110 | memset(buffPtr+pos, 0, buffSize-pos); | |
111 | return; | |
112 | } | |
113 | memset(buffPtr+pos, 0, size0); | |
114 | pos += size0; | |
115 | buffPtr[pos-1] = RDG_genChar(seedPtr, ldt); | |
116 | continue; | |
117 | } | |
118 | ||
119 | /* init */ | |
120 | if (pos==0) buffPtr[0] = RDG_genChar(seedPtr, ldt), pos=1; | |
121 | ||
122 | /* Generate compressible data */ | |
123 | while (pos < buffSize) { | |
124 | /* Select : Literal (char) or Match (within 32K) */ | |
125 | if (RDG_rand15Bits(seedPtr) < matchProba32) { | |
126 | /* Copy (within 32K) */ | |
127 | U32 const length = RDG_randLength(seedPtr) + 4; | |
128 | U32 const d = (U32) MIN(pos + length , buffSize); | |
129 | U32 const repeatOffset = (RDG_rand(seedPtr) & 15) == 2; | |
130 | U32 const randOffset = RDG_rand15Bits(seedPtr) + 1; | |
131 | U32 const offset = repeatOffset ? prevOffset : (U32) MIN(randOffset , pos); | |
132 | size_t match = pos - offset; | |
f67539c2 | 133 | while (pos < d) { buffPtr[pos++] = buffPtr[match++]; /* correctly manages overlaps */ } |
7c673cae FG |
134 | prevOffset = offset; |
135 | } else { | |
136 | /* Literal (noise) */ | |
137 | U32 const length = RDG_randLength(seedPtr); | |
138 | U32 const d = (U32) MIN(pos + length, buffSize); | |
f67539c2 | 139 | while (pos < d) { buffPtr[pos++] = RDG_genChar(seedPtr, ldt); } |
7c673cae FG |
140 | } } |
141 | } | |
142 | ||
143 | ||
144 | void RDG_genBuffer(void* buffer, size_t size, double matchProba, double litProba, unsigned seed) | |
145 | { | |
9f95a23c | 146 | U32 seed32 = seed; |
7c673cae FG |
147 | BYTE ldt[LTSIZE]; |
148 | memset(ldt, '0', sizeof(ldt)); /* yes, character '0', this is intentional */ | |
149 | if (litProba<=0.0) litProba = matchProba / 4.5; | |
f67539c2 | 150 | RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001)); |
9f95a23c | 151 | RDG_genBlock(buffer, size, 0, matchProba, ldt, &seed32); |
7c673cae FG |
152 | } |
153 | ||
154 | ||
155 | void RDG_genStdout(unsigned long long size, double matchProba, double litProba, unsigned seed) | |
156 | { | |
9f95a23c | 157 | U32 seed32 = seed; |
7c673cae FG |
158 | size_t const stdBlockSize = 128 KB; |
159 | size_t const stdDictSize = 32 KB; | |
160 | BYTE* const buff = (BYTE*)malloc(stdDictSize + stdBlockSize); | |
161 | U64 total = 0; | |
162 | BYTE ldt[LTSIZE]; /* literals distribution table */ | |
163 | ||
164 | /* init */ | |
165 | if (buff==NULL) { perror("datagen"); exit(1); } | |
166 | if (litProba<=0.0) litProba = matchProba / 4.5; | |
167 | memset(ldt, '0', sizeof(ldt)); /* yes, character '0', this is intentional */ | |
f67539c2 | 168 | RDG_fillLiteralDistrib(ldt, (fixedPoint_24_8)(litProba * 256 + 0.001)); |
7c673cae FG |
169 | SET_BINARY_MODE(stdout); |
170 | ||
171 | /* Generate initial dict */ | |
9f95a23c | 172 | RDG_genBlock(buff, stdDictSize, 0, matchProba, ldt, &seed32); |
7c673cae FG |
173 | |
174 | /* Generate compressible data */ | |
175 | while (total < size) { | |
176 | size_t const genBlockSize = (size_t) (MIN (stdBlockSize, size-total)); | |
9f95a23c | 177 | RDG_genBlock(buff, stdDictSize+stdBlockSize, stdDictSize, matchProba, ldt, &seed32); |
7c673cae FG |
178 | total += genBlockSize; |
179 | { size_t const unused = fwrite(buff, 1, genBlockSize, stdout); (void)unused; } | |
180 | /* update dict */ | |
181 | memcpy(buff, buff + stdBlockSize, stdDictSize); | |
182 | } | |
183 | ||
184 | /* cleanup */ | |
185 | free(buff); | |
186 | } |