]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /********************************************************************** |
2 | Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ||
4 | Redistribution and use in source and binary forms, with or without | |
5 | modification, are permitted provided that the following conditions | |
6 | are met: | |
7 | * Redistributions of source code must retain the above copyright | |
8 | notice, this list of conditions and the following disclaimer. | |
9 | * Redistributions in binary form must reproduce the above copyright | |
10 | notice, this list of conditions and the following disclaimer in | |
11 | the documentation and/or other materials provided with the | |
12 | distribution. | |
13 | * Neither the name of Intel Corporation nor the names of its | |
14 | contributors may be used to endorse or promote products derived | |
15 | from this software without specific prior written permission. | |
16 | ||
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | **********************************************************************/ | |
29 | ||
30 | /* This program can be used to generate custom a custom huffman encoding to get | |
31 | * better data compression. This is most useful when the type of data being | |
32 | * compressed is well known. | |
33 | * | |
34 | * To use generate_custom_hufftables, pass a sequence of files to the program | |
35 | * that together form an accurate representation of the data that is being | |
36 | * compressed. Generate_custom_hufftables will then produce the file | |
37 | * hufftables_c.c, which should be moved to replace its counterpart in the igzip | |
38 | * source folder. After recompiling the Isa-l library, the igzip compression | |
39 | * functions will use the new hufftables. | |
40 | * | |
41 | * Generate_custom_hufftables should be compiled with the same compile time | |
42 | * parameters as the igzip source code. Generating custom hufftables with | |
43 | * different compile time parameters may cause igzip to produce invalid output | |
44 | * for the reasons described below. The default parameters used by | |
45 | * generate_custom_hufftables are the same as the default parameters used by | |
46 | * igzip. | |
47 | * | |
224ce89b WB |
48 | * *WARNING* generate custom hufftables must be compiled with a IGZIP_HIST_SIZE |
49 | * that is at least as large as the IGZIP_HIST_SIZE used by igzip. By default | |
50 | * IGZIP_HIST_SIZE is 32K, the maximum usable IGZIP_HIST_SIZE is 32K. The reason | |
51 | * for this is to generate better compression. Igzip cannot produce look back | |
52 | * distances with sizes larger than the IGZIP_HIST_SIZE igzip was compiled with, | |
53 | * so look back distances with sizes larger than IGZIP_HIST_SIZE are not | |
54 | * assigned a huffman code. The definition of LONGER_HUFFTABLES must be | |
55 | * consistent as well since that definition changes the size of the structures | |
56 | * printed by this tool. | |
7c673cae | 57 | * |
7c673cae FG |
58 | */ |
59 | ||
60 | #include <stdint.h> | |
61 | #include <stdio.h> | |
62 | #include <inttypes.h> | |
224ce89b WB |
63 | #include <string.h> |
64 | #include <stdlib.h> | |
65 | #include "igzip_lib.h" | |
7c673cae | 66 | |
20effc67 TL |
67 | #include "huff_codes.h" |
68 | #include "huffman.h" | |
69 | ||
7c673cae FG |
70 | /*These max code lengths are limited by how the data is stored in |
71 | * hufftables.asm. The deflate standard max is 15.*/ | |
72 | ||
224ce89b | 73 | #define MAX_HEADER_SIZE ISAL_DEF_MAX_HDR_SIZE |
7c673cae FG |
74 | |
75 | #define GZIP_HEADER_SIZE 10 | |
76 | #define GZIP_TRAILER_SIZE 8 | |
f91f0fd5 TL |
77 | #define ZLIB_HEADER_SIZE 2 |
78 | #define ZLIB_TRAILER_SIZE 4 | |
7c673cae FG |
79 | |
80 | /** | |
81 | * @brief Prints a table of uint8_t elements to a file. | |
82 | * @param outfile: the file the table is printed to. | |
83 | * @param table: the table to be printed. | |
84 | * @param length: number of elements to be printed. | |
85 | * @param header: header to append in front of the table. | |
86 | * @param footer: footer to append at the end of the table. | |
87 | * @param begin_line: string printed at beginning of new line | |
88 | */ | |
89 | void fprint_uint8_table(FILE * outfile, uint8_t * table, uint64_t length, char *header, | |
90 | char *footer, char *begin_line) | |
91 | { | |
92 | int i; | |
93 | fprintf(outfile, "%s", header); | |
94 | for (i = 0; i < length - 1; i++) { | |
95 | if ((i & 7) == 0) | |
96 | fprintf(outfile, "\n%s", begin_line); | |
97 | else | |
98 | fprintf(outfile, " "); | |
99 | fprintf(outfile, "0x%02x,", table[i]); | |
100 | } | |
101 | ||
102 | if ((i & 7) == 0) | |
103 | fprintf(outfile, "\n%s", begin_line); | |
104 | else | |
105 | fprintf(outfile, " "); | |
106 | fprintf(outfile, "0x%02x", table[i]); | |
107 | fprintf(outfile, "%s", footer); | |
108 | ||
109 | } | |
110 | ||
111 | /** | |
112 | * @brief Prints a table of uint16_t elements to a file. | |
113 | * @param outfile: the file the table is printed to. | |
114 | * @param table: the table to be printed. | |
115 | * @param length: number of elements to be printed. | |
116 | * @param header: header to append in front of the table. | |
117 | * @param footer: footer to append at the end of the table. | |
118 | * @param begin_line: string printed at beginning of new line | |
119 | */ | |
120 | void fprint_uint16_table(FILE * outfile, uint16_t * table, uint64_t length, char *header, | |
121 | char *footer, char *begin_line) | |
122 | { | |
123 | int i; | |
124 | fprintf(outfile, "%s", header); | |
125 | for (i = 0; i < length - 1; i++) { | |
126 | if ((i & 7) == 0) | |
127 | fprintf(outfile, "\n%s", begin_line); | |
128 | else | |
129 | fprintf(outfile, " "); | |
130 | fprintf(outfile, "0x%04x,", table[i]); | |
131 | } | |
132 | ||
133 | if ((i & 7) == 0) | |
134 | fprintf(outfile, "\n%s", begin_line); | |
135 | else | |
136 | fprintf(outfile, " "); | |
137 | fprintf(outfile, "0x%04x", table[i]); | |
138 | fprintf(outfile, "%s", footer); | |
139 | ||
140 | } | |
141 | ||
142 | /** | |
143 | * @brief Prints a table of uint32_t elements to a file. | |
144 | * @param outfile: the file the table is printed to. | |
145 | * @param table: the table to be printed. | |
146 | * @param length: number of elements to be printed. | |
147 | * @param header: header to append in front of the table. | |
148 | * @param footer: footer to append at the end of the table. | |
149 | * @param begin_line: string printed at beginning of new line | |
150 | */ | |
151 | void fprint_uint32_table(FILE * outfile, uint32_t * table, uint64_t length, char *header, | |
152 | char *footer, char *begin_line) | |
153 | { | |
154 | int i; | |
155 | fprintf(outfile, "%s", header); | |
156 | for (i = 0; i < length - 1; i++) { | |
157 | if ((i & 3) == 0) | |
158 | fprintf(outfile, "\n%s", begin_line); | |
159 | else | |
160 | fprintf(outfile, " "); | |
161 | fprintf(outfile, "0x%08x,", table[i]); | |
162 | } | |
163 | ||
164 | if ((i & 3) == 0) | |
165 | fprintf(outfile, "%s", begin_line); | |
166 | else | |
167 | fprintf(outfile, " "); | |
168 | fprintf(outfile, "0x%08x", table[i]); | |
169 | fprintf(outfile, "%s", footer); | |
170 | ||
171 | } | |
172 | ||
224ce89b WB |
173 | void fprint_hufftables(FILE * output_file, char *hufftables_name, |
174 | struct isal_hufftables *hufftables) | |
7c673cae | 175 | { |
224ce89b WB |
176 | fprintf(output_file, "struct isal_hufftables %s = {\n\n", hufftables_name); |
177 | ||
178 | fprint_uint8_table(output_file, hufftables->deflate_hdr, | |
179 | hufftables->deflate_hdr_count + | |
180 | (hufftables->deflate_hdr_extra_bits + 7) / 8, | |
181 | "\t.deflate_hdr = {", "},\n\n", "\t\t"); | |
182 | ||
183 | fprintf(output_file, "\t.deflate_hdr_count = %d,\n", hufftables->deflate_hdr_count); | |
184 | fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n", | |
185 | hufftables->deflate_hdr_extra_bits); | |
186 | ||
187 | fprint_uint32_table(output_file, hufftables->dist_table, IGZIP_DIST_TABLE_SIZE, | |
188 | "\t.dist_table = {", "},\n\n", "\t\t"); | |
189 | ||
190 | fprint_uint32_table(output_file, hufftables->len_table, IGZIP_LEN_TABLE_SIZE, | |
191 | "\t.len_table = {", "},\n\n", "\t\t"); | |
192 | ||
193 | fprint_uint16_table(output_file, hufftables->lit_table, IGZIP_LIT_TABLE_SIZE, | |
194 | "\t.lit_table = {", "},\n\n", "\t\t"); | |
195 | fprint_uint8_table(output_file, hufftables->lit_table_sizes, IGZIP_LIT_TABLE_SIZE, | |
196 | "\t.lit_table_sizes = {", "},\n\n", "\t\t"); | |
197 | ||
198 | fprint_uint16_table(output_file, hufftables->dcodes, | |
199 | ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, | |
200 | "\t.dcodes = {", "},\n\n", "\t\t"); | |
201 | fprint_uint8_table(output_file, hufftables->dcodes_sizes, | |
202 | ISAL_DEF_DIST_SYMBOLS - IGZIP_DECODE_OFFSET, | |
203 | "\t.dcodes_sizes = {", "}\n", "\t\t"); | |
7c673cae FG |
204 | fprintf(output_file, "};\n"); |
205 | } | |
206 | ||
224ce89b | 207 | void fprint_header(FILE * output_file) |
7c673cae | 208 | { |
224ce89b | 209 | |
7c673cae FG |
210 | fprintf(output_file, "#include <stdint.h>\n"); |
211 | fprintf(output_file, "#include <igzip_lib.h>\n\n"); | |
212 | ||
224ce89b WB |
213 | fprintf(output_file, "#if IGZIP_HIST_SIZE > %d\n" |
214 | "# error \"Invalid history size for the custom hufftable\"\n" | |
215 | "#endif\n", IGZIP_HIST_SIZE); | |
216 | ||
217 | #ifdef LONGER_HUFFTABLE | |
218 | fprintf(output_file, "#ifndef LONGER_HUFFTABLE\n" | |
219 | "# error \"Custom hufftable requires LONGER_HUFFTABLE to be defined \"\n" | |
220 | "#endif\n"); | |
221 | #else | |
222 | fprintf(output_file, "#ifdef LONGER_HUFFTABLE\n" | |
223 | "# error \"Custom hufftable requires LONGER_HUFFTABLE to not be defined \"\n" | |
224 | "#endif\n"); | |
225 | #endif | |
226 | fprintf(output_file, "\n"); | |
227 | ||
7c673cae FG |
228 | fprintf(output_file, "const uint8_t gzip_hdr[] = {\n" |
229 | "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n" "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n"); | |
230 | ||
231 | fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE); | |
f91f0fd5 TL |
232 | fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE); |
233 | ||
234 | fprintf(output_file, "const uint8_t zlib_hdr[] = { 0x78, 0x01 };\n\n"); | |
235 | fprintf(output_file, "const uint32_t zlib_hdr_bytes = %d;\n", ZLIB_HEADER_SIZE); | |
236 | fprintf(output_file, "const uint32_t zlib_trl_bytes = %d;\n", ZLIB_TRAILER_SIZE); | |
7c673cae FG |
237 | } |
238 | ||
20effc67 TL |
239 | static uint32_t convert_dist_to_dist_sym(uint32_t dist) |
240 | { | |
241 | assert(dist <= 32768 && dist > 0); | |
242 | if (dist <= 32768) { | |
243 | uint32_t msb = dist > 4 ? bsr(dist - 1) - 2 : 0; | |
244 | return (msb * 2) + ((dist - 1) >> msb); | |
245 | } else { | |
246 | return ~0; | |
247 | } | |
248 | } | |
249 | ||
250 | /** | |
251 | * @brief Returns the deflate symbol value for a repeat length. | |
252 | */ | |
253 | static uint32_t convert_length_to_len_sym(uint32_t length) | |
254 | { | |
255 | assert(length > 2 && length < 259); | |
256 | ||
257 | /* Based on tables on page 11 in RFC 1951 */ | |
258 | if (length < 11) | |
259 | return 257 + length - 3; | |
260 | else if (length < 19) | |
261 | return 261 + (length - 3) / 2; | |
262 | else if (length < 35) | |
263 | return 265 + (length - 3) / 4; | |
264 | else if (length < 67) | |
265 | return 269 + (length - 3) / 8; | |
266 | else if (length < 131) | |
267 | return 273 + (length - 3) / 16; | |
268 | else if (length < 258) | |
269 | return 277 + (length - 3) / 32; | |
270 | else | |
271 | return 285; | |
272 | } | |
273 | ||
274 | void isal_update_histogram_dict(uint8_t * start_stream, int dict_length, int length, | |
275 | struct isal_huff_histogram *histogram) | |
276 | { | |
277 | uint32_t literal = 0, hash; | |
278 | uint16_t seen, *last_seen = histogram->hash_table; | |
279 | uint8_t *current, *end_stream, *next_hash, *end, *end_dict; | |
280 | uint32_t match_length; | |
281 | uint32_t dist; | |
282 | uint64_t *lit_len_histogram = histogram->lit_len_histogram; | |
283 | uint64_t *dist_histogram = histogram->dist_histogram; | |
284 | ||
285 | if (length <= 0) | |
286 | return; | |
287 | ||
288 | end_stream = start_stream + dict_length + length; | |
289 | end_dict = start_stream + dict_length; | |
290 | ||
291 | memset(last_seen, 0, sizeof(histogram->hash_table)); /* Initialize last_seen to be 0. */ | |
292 | ||
293 | for (current = start_stream; current < end_dict - 4; current++) { | |
294 | literal = load_u32(current); | |
295 | hash = compute_hash(literal) & LVL0_HASH_MASK; | |
296 | last_seen[hash] = (current - start_stream) & 0xFFFF; | |
297 | } | |
298 | ||
299 | for (current = start_stream + dict_length; current < end_stream - 3; current++) { | |
300 | literal = load_u32(current); | |
301 | hash = compute_hash(literal) & LVL0_HASH_MASK; | |
302 | seen = last_seen[hash]; | |
303 | last_seen[hash] = (current - start_stream) & 0xFFFF; | |
304 | dist = (current - start_stream - seen) & 0xFFFF; | |
305 | if (dist - 1 < D - 1) { | |
306 | assert(start_stream <= current - dist); | |
307 | match_length = | |
308 | compare258(current - dist, current, end_stream - current); | |
309 | if (match_length >= SHORTEST_MATCH) { | |
310 | next_hash = current; | |
311 | #ifdef ISAL_LIMIT_HASH_UPDATE | |
312 | end = next_hash + 3; | |
313 | #else | |
314 | end = next_hash + match_length; | |
315 | #endif | |
316 | if (end > end_stream - 3) | |
317 | end = end_stream - 3; | |
318 | next_hash++; | |
319 | for (; next_hash < end; next_hash++) { | |
320 | literal = load_u32(next_hash); | |
321 | hash = compute_hash(literal) & LVL0_HASH_MASK; | |
322 | last_seen[hash] = (next_hash - start_stream) & 0xFFFF; | |
323 | } | |
324 | ||
325 | dist_histogram[convert_dist_to_dist_sym(dist)] += 1; | |
326 | lit_len_histogram[convert_length_to_len_sym(match_length)] += | |
327 | 1; | |
328 | current += match_length - 1; | |
329 | continue; | |
330 | } | |
331 | } | |
332 | lit_len_histogram[literal & 0xFF] += 1; | |
333 | } | |
334 | ||
335 | for (; current < end_stream; current++) | |
336 | lit_len_histogram[*current] += 1; | |
337 | ||
338 | lit_len_histogram[256] += 1; | |
339 | return; | |
340 | } | |
341 | ||
7c673cae FG |
342 | int main(int argc, char *argv[]) |
343 | { | |
344 | long int file_length; | |
20effc67 | 345 | int argi = 1; |
7c673cae | 346 | uint8_t *stream = NULL; |
224ce89b | 347 | struct isal_hufftables hufftables; |
7c673cae | 348 | struct isal_huff_histogram histogram; |
224ce89b | 349 | struct isal_zstream tmp_stream; |
20effc67 TL |
350 | FILE *file = NULL; |
351 | FILE *dict_file = NULL; | |
352 | FILE *hist_file = NULL; | |
353 | long int dict_file_length = 0; | |
354 | long int hist_file_length = 0; | |
355 | uint8_t *dict_stream = NULL; | |
7c673cae FG |
356 | |
357 | if (argc == 1) { | |
358 | printf("Error, no input file.\n"); | |
359 | return 1; | |
360 | } | |
361 | ||
20effc67 TL |
362 | if (argc > 3 && argv[1][0] == '-' && argv[1][1] == 'd') { |
363 | dict_file = fopen(argv[2], "r"); | |
364 | ||
365 | fseek(dict_file, 0, SEEK_END); | |
366 | dict_file_length = ftell(dict_file); | |
367 | fseek(dict_file, 0, SEEK_SET); | |
368 | dict_file_length -= ftell(dict_file); | |
369 | dict_stream = malloc(dict_file_length); | |
370 | if (dict_stream == NULL) { | |
371 | printf("Failed to allocate memory to read in dictionary file\n"); | |
372 | fclose(dict_file); | |
373 | return 1; | |
374 | } | |
375 | if (fread(dict_stream, 1, dict_file_length, dict_file) != dict_file_length) { | |
376 | printf("Error occurred when reading dictionary file"); | |
377 | fclose(dict_file); | |
378 | free(dict_stream); | |
379 | return 1; | |
380 | } | |
381 | isal_update_histogram(dict_stream, dict_file_length, &histogram); | |
382 | ||
383 | printf("Read %ld bytes of dictionary file %s\n", dict_file_length, argv[2]); | |
384 | argi += 2; | |
385 | fclose(dict_file); | |
386 | free(dict_stream); | |
387 | } | |
388 | ||
389 | if ((argc > argi + 1) && argv[argi][0] == '-' && argv[argi][1] == 'h') { | |
390 | hist_file = fopen(argv[argi + 1], "r+"); | |
391 | fseek(hist_file, 0, SEEK_END); | |
392 | hist_file_length = ftell(hist_file); | |
393 | fseek(hist_file, 0, SEEK_SET); | |
394 | hist_file_length -= ftell(hist_file); | |
395 | if (hist_file_length > sizeof(histogram)) { | |
396 | printf("Histogram file too long\n"); | |
397 | return 1; | |
398 | } | |
399 | if (fread(&histogram, 1, hist_file_length, hist_file) != hist_file_length) { | |
400 | printf("Error occurred when reading history file"); | |
401 | fclose(hist_file); | |
402 | return 1; | |
403 | } | |
404 | fseek(hist_file, 0, SEEK_SET); | |
7c673cae | 405 | |
20effc67 TL |
406 | printf("Read %ld bytes of history file %s\n", hist_file_length, |
407 | argv[argi + 1]); | |
408 | argi += 2; | |
409 | } else | |
410 | memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */ | |
411 | ||
412 | while (argi < argc) { | |
413 | printf("Processing %s\n", argv[argi]); | |
414 | file = fopen(argv[argi], "r"); | |
7c673cae FG |
415 | if (file == NULL) { |
416 | printf("Error opening file\n"); | |
417 | return 1; | |
418 | } | |
419 | fseek(file, 0, SEEK_END); | |
420 | file_length = ftell(file); | |
421 | fseek(file, 0, SEEK_SET); | |
422 | file_length -= ftell(file); | |
20effc67 | 423 | stream = malloc(file_length + dict_file_length); |
7c673cae FG |
424 | if (stream == NULL) { |
425 | printf("Failed to allocate memory to read in file\n"); | |
426 | fclose(file); | |
427 | return 1; | |
428 | } | |
20effc67 TL |
429 | if (dict_file_length > 0) |
430 | memcpy(stream, dict_stream, dict_file_length); | |
431 | ||
432 | if (fread(&stream[dict_file_length], 1, file_length, file) != file_length) { | |
7c673cae FG |
433 | printf("Error occurred when reading file"); |
434 | fclose(file); | |
435 | free(stream); | |
436 | return 1; | |
437 | } | |
438 | ||
439 | /* Create a histogram of frequency of symbols found in stream to | |
440 | * generate the huffman tree.*/ | |
20effc67 TL |
441 | if (0 == dict_file_length) |
442 | isal_update_histogram(stream, file_length, &histogram); | |
443 | else | |
444 | isal_update_histogram_dict(stream, dict_file_length, file_length, | |
445 | &histogram); | |
7c673cae FG |
446 | |
447 | fclose(file); | |
448 | free(stream); | |
20effc67 | 449 | argi++; |
7c673cae FG |
450 | } |
451 | ||
224ce89b | 452 | isal_create_hufftables(&hufftables, &histogram); |
7c673cae | 453 | |
224ce89b WB |
454 | file = fopen("hufftables_c.c", "w"); |
455 | if (file == NULL) { | |
456 | printf("Error creating file hufftables_c.c\n"); | |
7c673cae FG |
457 | return 1; |
458 | } | |
459 | ||
224ce89b | 460 | fprint_header(file); |
7c673cae | 461 | |
224ce89b | 462 | fprintf(file, "\n"); |
7c673cae | 463 | |
224ce89b | 464 | fprint_hufftables(file, "hufftables_default", &hufftables); |
7c673cae | 465 | |
224ce89b | 466 | fprintf(file, "\n"); |
7c673cae | 467 | |
224ce89b WB |
468 | isal_deflate_stateless_init(&tmp_stream); |
469 | isal_deflate_set_hufftables(&tmp_stream, NULL, IGZIP_HUFFTABLE_STATIC); | |
470 | fprint_hufftables(file, "hufftables_static", tmp_stream.hufftables); | |
7c673cae FG |
471 | |
472 | fclose(file); | |
473 | ||
20effc67 TL |
474 | if (hist_file) { |
475 | int len = fwrite(&histogram, 1, sizeof(histogram), hist_file); | |
476 | printf("wrote %d bytes of histogram file\n", len); | |
477 | fclose(hist_file); | |
478 | } | |
7c673cae FG |
479 | return 0; |
480 | } |