]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/igzip/generate_custom_hufftables.c
9e7ee5f922d57558883d9195351af84f67b8ae5d
[ceph.git] / ceph / src / isa-l / igzip / generate_custom_hufftables.c
1 /**********************************************************************
2 Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
29
30 /* This program can be used to generate custom a custom huffman encoding to get
31 * better data compression. This is most useful when the type of data being
32 * compressed is well known.
33 *
34 * To use generate_custom_hufftables, pass a sequence of files to the program
35 * that together form an accurate representation of the data that is being
36 * compressed. Generate_custom_hufftables will then produce the file
37 * hufftables_c.c, which should be moved to replace its counterpart in the igzip
38 * source folder. After recompiling the Isa-l library, the igzip compression
39 * functions will use the new hufftables.
40 *
41 * Generate_custom_hufftables should be compiled with the same compile time
42 * parameters as the igzip source code. Generating custom hufftables with
43 * different compile time parameters may cause igzip to produce invalid output
44 * for the reasons described below. The default parameters used by
45 * generate_custom_hufftables are the same as the default parameters used by
46 * igzip.
47 *
48 * *WARNING* generate custom hufftables must be compiled with a HIST_SIZE that
49 * is at least as large as the HIST_SIZE used by igzip. By default HIST_SIZE is
50 * 8, the maximum usable HIST_SIZE is 32. The reason for this is to generate
51 * better compression. Igzip cannot produce look back distances with sizes
52 * larger than the HIST_SIZE * 1024 igzip was compiled with, so look back
53 * distances with sizes larger than HIST_SIZE * 1024 are not assigned a huffman
54 * code.
55 *
56 * To improve compression ratio, the compile time option LIT_SUB is provided to
57 * allow generating custom hufftables which only use a subset of all possible
58 * literals. This can be useful for getting better compression when it is known
59 * that the data being compressed will never contain certain symbols, for
60 * example text files. If this option is used, it needs to be checked that every
61 * possible literal is in fact given a valid code in the output hufftable. This
62 * can be done by checking that every required literal has a positive value for
63 * the length of the code associated with that literal. Literals which have not
64 * been given codes will have a code length of zero. The compile time option
65 * PRINT_CODES (described below) can be used to help manually perform this
66 * check.
67 *
68 * The compile time parameter PRINT_CODES causes the literal/length huffman code
69 * and the distance huffman code created by generate_custom_hufftables to be
70 * printed out. This is printed out where each line corresponds to a different
71 * symbol. The first column is the symbol used to represent each literal (Lit),
72 * end of block symbol (EOB), length (Len) or distance (Dist), the second column
73 * is the associated code value, and the third column is the length in bits of
74 * that code.
75 */
76
77 #include <stdint.h>
78 #include <stdio.h>
79 #include <inttypes.h>
80 #include "huff_codes.h"
81 #include "bitbuf2.h"
82
83 /*These max code lengths are limited by how the data is stored in
84 * hufftables.asm. The deflate standard max is 15.*/
85
86 #define LONG_DCODE_OFFSET 26
87 #define SHORT_DCODE_OFFSET 20
88
89 #define MAX_HEADER_SIZE IGZIP_MAX_DEF_HDR_SIZE
90
91 #define GZIP_HEADER_SIZE 10
92 #define GZIP_TRAILER_SIZE 8
93
94 /**
95 * @brief Prints a table of uint8_t elements to a file.
96 * @param outfile: the file the table is printed to.
97 * @param table: the table to be printed.
98 * @param length: number of elements to be printed.
99 * @param header: header to append in front of the table.
100 * @param footer: footer to append at the end of the table.
101 * @param begin_line: string printed at beginning of new line
102 */
103 void fprint_uint8_table(FILE * outfile, uint8_t * table, uint64_t length, char *header,
104 char *footer, char *begin_line)
105 {
106 int i;
107 fprintf(outfile, "%s", header);
108 for (i = 0; i < length - 1; i++) {
109 if ((i & 7) == 0)
110 fprintf(outfile, "\n%s", begin_line);
111 else
112 fprintf(outfile, " ");
113 fprintf(outfile, "0x%02x,", table[i]);
114 }
115
116 if ((i & 7) == 0)
117 fprintf(outfile, "\n%s", begin_line);
118 else
119 fprintf(outfile, " ");
120 fprintf(outfile, "0x%02x", table[i]);
121 fprintf(outfile, "%s", footer);
122
123 }
124
125 /**
126 * @brief Prints a table of uint16_t elements to a file.
127 * @param outfile: the file the table is printed to.
128 * @param table: the table to be printed.
129 * @param length: number of elements to be printed.
130 * @param header: header to append in front of the table.
131 * @param footer: footer to append at the end of the table.
132 * @param begin_line: string printed at beginning of new line
133 */
134 void fprint_uint16_table(FILE * outfile, uint16_t * table, uint64_t length, char *header,
135 char *footer, char *begin_line)
136 {
137 int i;
138 fprintf(outfile, "%s", header);
139 for (i = 0; i < length - 1; i++) {
140 if ((i & 7) == 0)
141 fprintf(outfile, "\n%s", begin_line);
142 else
143 fprintf(outfile, " ");
144 fprintf(outfile, "0x%04x,", table[i]);
145 }
146
147 if ((i & 7) == 0)
148 fprintf(outfile, "\n%s", begin_line);
149 else
150 fprintf(outfile, " ");
151 fprintf(outfile, "0x%04x", table[i]);
152 fprintf(outfile, "%s", footer);
153
154 }
155
156 /**
157 * @brief Prints a table of uint32_t elements to a file.
158 * @param outfile: the file the table is printed to.
159 * @param table: the table to be printed.
160 * @param length: number of elements to be printed.
161 * @param header: header to append in front of the table.
162 * @param footer: footer to append at the end of the table.
163 * @param begin_line: string printed at beginning of new line
164 */
165 void fprint_uint32_table(FILE * outfile, uint32_t * table, uint64_t length, char *header,
166 char *footer, char *begin_line)
167 {
168 int i;
169 fprintf(outfile, "%s", header);
170 for (i = 0; i < length - 1; i++) {
171 if ((i & 3) == 0)
172 fprintf(outfile, "\n%s", begin_line);
173 else
174 fprintf(outfile, " ");
175 fprintf(outfile, "0x%08x,", table[i]);
176 }
177
178 if ((i & 3) == 0)
179 fprintf(outfile, "%s", begin_line);
180 else
181 fprintf(outfile, " ");
182 fprintf(outfile, "0x%08x", table[i]);
183 fprintf(outfile, "%s", footer);
184
185 }
186
187 /**
188 * @brief Prints a table of uint64_t elements to a file.
189 * @param outfile: the file the table is printed to.
190 * @param table: the table to be printed.
191 * @param length: number of elements to be printed.
192 * @param header: header to append in front of the table.
193 * @param footer: footer to append at the end of the table.
194 */
195 void fprint_uint64_table(FILE * outfile, uint64_t * table, uint64_t length, char *header,
196 char *footer)
197 {
198 int i;
199 fprintf(outfile, "%s\n", header);
200 for (i = 0; i < length - 1; i++)
201 fprintf(outfile, "\t0x%016" PRIx64 ",\n", table[i]);
202 fprintf(outfile, "\t0x%016" PRIx64, table[i]);
203 fprintf(outfile, "%s", footer);
204
205 }
206
207 void fprint_hufftables(FILE * output_file, uint8_t * header, uint32_t bit_count,
208 uint16_t * lit_code_table, uint8_t * lit_code_size_table,
209 uint16_t * dcodes_code_table, uint8_t * dcodes_code_size_table,
210 uint32_t * packed_len_table, uint32_t * packed_dist_table)
211 {
212 fprintf(output_file, "struct isal_hufftables hufftables_default = {\n\n");
213
214 fprint_uint8_table(output_file, header, (bit_count + 7) / 8,
215 "\t.deflate_hdr = {", "\t},\n\n", "\t\t");
216 fprintf(output_file, "\t.deflate_hdr_count = %d,\n", bit_count / 8);
217 fprintf(output_file, "\t.deflate_hdr_extra_bits = %d,\n\n", bit_count & 7);
218
219 fprint_uint32_table(output_file, packed_dist_table, SHORT_DIST_TABLE_SIZE,
220 "\t.dist_table = {", ",\n", "\t\t");
221 fprint_uint32_table(output_file, &packed_dist_table[SHORT_DIST_TABLE_SIZE],
222 LONG_DIST_TABLE_SIZE - SHORT_DIST_TABLE_SIZE,
223 "#ifdef LONGER_HUFFTABLE",
224 "\n#endif /* LONGER_HUFFTABLE */\n\t},\n\n", "\t\t");
225
226 fprint_uint32_table(output_file, packed_len_table, LEN_TABLE_SIZE, "\t.len_table = {",
227 "\t},\n\n", "\t\t");
228 fprint_uint16_table(output_file, lit_code_table, LIT_TABLE_SIZE, "\t.lit_table = {",
229 "\t},\n\n", "\t\t");
230 fprint_uint8_table(output_file, lit_code_size_table, LIT_TABLE_SIZE,
231 "\t.lit_table_sizes = {", "\t},\n\n", "\t\t");
232
233 fprintf(output_file, "#ifndef LONGER_HUFFTABLE\n");
234 fprint_uint16_table(output_file, dcodes_code_table + SHORT_DCODE_OFFSET,
235 DIST_LEN - SHORT_DCODE_OFFSET, "\t.dcodes = {", "\t},\n\n",
236 "\t\t");
237 fprint_uint8_table(output_file, dcodes_code_size_table + SHORT_DCODE_OFFSET,
238 DIST_LEN - SHORT_DCODE_OFFSET, "\t.dcodes_sizes = {", "\t}\n",
239 "\t\t");
240 fprintf(output_file, "#else\n");
241 fprint_uint16_table(output_file, dcodes_code_table + LONG_DCODE_OFFSET,
242 DIST_LEN - LONG_DCODE_OFFSET, "\t.dcodes = {", "\t},\n\n", "\t\t");
243 fprint_uint8_table(output_file, dcodes_code_size_table + LONG_DCODE_OFFSET,
244 DIST_LEN - LONG_DCODE_OFFSET, "\t.dcodes_sizes = {", "\t}\n",
245 "\t\t");
246 fprintf(output_file, "#endif\n");
247 fprintf(output_file, "};\n");
248 }
249
250 void fprint_header(FILE * output_file, uint8_t * header, uint32_t bit_count,
251 uint16_t * lit_code_table, uint8_t * lit_code_size_table,
252 uint16_t * dcodes_code_table, uint8_t * dcodes_code_size_table,
253 uint32_t * packed_len_table, uint32_t * packed_dist_table)
254 {
255 fprintf(output_file, "#include <stdint.h>\n");
256 fprintf(output_file, "#include <igzip_lib.h>\n\n");
257
258 fprintf(output_file, "const uint8_t gzip_hdr[] = {\n"
259 "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n" "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n");
260
261 fprintf(output_file, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE);
262 fprintf(output_file, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE);
263
264 fprint_hufftables(output_file, header, bit_count, lit_code_table, lit_code_size_table,
265 dcodes_code_table, dcodes_code_size_table, packed_len_table,
266 packed_dist_table);
267 }
268
269 int main(int argc, char *argv[])
270 {
271 long int file_length;
272 uint8_t *stream = NULL;
273 struct isal_huff_histogram histogram;
274 uint64_t *lit_histogram = histogram.lit_len_histogram;
275 uint64_t *dist_histogram = histogram.dist_histogram;
276 uint8_t header[MAX_HEADER_SIZE];
277 FILE *file;
278 struct huff_tree lit_tree, dist_tree;
279 struct huff_tree lit_tree_array[2 * LIT_LEN - 1], dist_tree_array[2 * DIST_LEN - 1];
280 struct huff_code lit_huff_table[LIT_LEN], dist_huff_table[DIST_LEN];
281 uint64_t bit_count;
282 uint32_t packed_len_table[LEN_TABLE_SIZE];
283 uint32_t packed_dist_table[LONG_DIST_TABLE_SIZE];
284 uint16_t lit_code_table[LIT_TABLE_SIZE];
285 uint16_t dcodes_code_table[DIST_LEN];
286 uint8_t lit_code_size_table[LIT_TABLE_SIZE];
287 uint8_t dcodes_code_size_table[DIST_LEN];
288 int max_dist = convert_dist_to_dist_sym(D);
289
290 if (argc == 1) {
291 printf("Error, no input file.\n");
292 return 1;
293 }
294
295 memset(&histogram, 0, sizeof(histogram)); /* Initialize histograms. */
296 memset(lit_tree_array, 0, sizeof(lit_tree_array));
297 memset(dist_tree_array, 0, sizeof(dist_tree_array));
298 memset(lit_huff_table, 0, sizeof(lit_huff_table));
299 memset(dist_huff_table, 0, sizeof(dist_huff_table));
300
301 while (argc > 1) {
302 printf("Processing %s\n", argv[argc - 1]);
303 file = fopen(argv[argc - 1], "r");
304 if (file == NULL) {
305 printf("Error opening file\n");
306 return 1;
307 }
308 fseek(file, 0, SEEK_END);
309 file_length = ftell(file);
310 fseek(file, 0, SEEK_SET);
311 file_length -= ftell(file);
312 stream = malloc(file_length);
313 if (stream == NULL) {
314 printf("Failed to allocate memory to read in file\n");
315 fclose(file);
316 return 1;
317 }
318 fread(stream, 1, file_length, file);
319 if (ferror(file)) {
320 printf("Error occurred when reading file");
321 fclose(file);
322 free(stream);
323 return 1;
324 }
325
326 /* Create a histogram of frequency of symbols found in stream to
327 * generate the huffman tree.*/
328 isal_update_histogram(stream, file_length, &histogram);
329
330 fclose(file);
331 free(stream);
332 argc--;
333 }
334
335 /* Create a huffman tree corresponding to the histograms created in
336 * gen_histogram*/
337 #ifdef LIT_SUB
338 int j;
339 /* Guarantee every possible repeat length is given a symbol. It is hard
340 * to guarantee data will never have a repeat of a given length */
341 for (j = LIT_TABLE_SIZE; j < LIT_LEN; j++)
342 if (lit_histogram[j] == 0)
343 lit_histogram[j]++;
344
345 lit_tree = create_symbol_subset_huff_tree(lit_tree_array, lit_histogram, LIT_LEN);
346 #else
347 lit_tree = create_huff_tree(lit_tree_array, lit_histogram, LIT_LEN);
348 #endif
349 dist_tree = create_huff_tree(dist_tree_array, dist_histogram, max_dist + 1);
350
351 /* Create a look up table to represent huffman tree above in deflate
352 * standard form after it has been modified to satisfy max depth
353 * criteria.*/
354 if (create_huff_lookup(lit_huff_table, LIT_LEN, lit_tree, MAX_DEFLATE_CODE_LEN) > 0) {
355 printf("Error, code with invalid length for Deflate standard.\n");
356 return 1;
357 }
358
359 if (create_huff_lookup(dist_huff_table, DIST_LEN, dist_tree, MAX_DEFLATE_CODE_LEN) > 0) {
360 printf("Error, code with invalid length for Deflate standard.\n");
361 return 1;
362 }
363
364 if (are_hufftables_useable(lit_huff_table, dist_huff_table)) {
365 if (create_huff_lookup
366 (lit_huff_table, LIT_LEN, lit_tree, MAX_SAFE_LIT_CODE_LEN) > 0)
367 printf("Error, code with invalid length for Deflate standard.\n");
368 return 1;
369
370 if (create_huff_lookup
371 (dist_huff_table, DIST_LEN, dist_tree, MAX_SAFE_DIST_CODE_LEN) > 0)
372 printf("Error, code with invalid length for Deflate standard.\n");
373 return 1;
374
375 if (are_hufftables_useable(lit_huff_table, dist_huff_table)) {
376 printf("Error, hufftable is not usable\n");
377 return 1;
378 }
379 }
380 #ifdef PRINT_CODES
381 int i;
382 printf("Lit/Len codes\n");
383 for (i = 0; i < LIT_TABLE_SIZE - 1; i++)
384 printf("Lit %3d: Code 0x%04x, Code_Len %d\n", i, lit_huff_table[i].code,
385 lit_huff_table[i].length);
386
387 printf("EOB %3d: Code 0x%04x, Code_Len %d\n", 256, lit_huff_table[256].code,
388 lit_huff_table[256].length);
389
390 for (i = LIT_TABLE_SIZE; i < LIT_LEN; i++)
391 printf("Len %d: Code 0x%04x, Code_Len %d\n", i, lit_huff_table[i].code,
392 lit_huff_table[i].length);
393 printf("\n");
394
395 printf("Dist codes \n");
396 for (i = 0; i < DIST_LEN; i++)
397 printf("Dist %2d: Code 0x%04x, Code_Len %d\n", i, dist_huff_table[i].code,
398 dist_huff_table[i].length);
399 printf("\n");
400 #endif
401
402 create_code_tables(lit_code_table, lit_code_size_table, LIT_TABLE_SIZE,
403 lit_huff_table);
404 create_code_tables(dcodes_code_table, dcodes_code_size_table, DIST_LEN,
405 dist_huff_table);
406 create_packed_len_table(packed_len_table, lit_huff_table);
407 create_packed_dist_table(packed_dist_table, LONG_DIST_TABLE_SIZE, dist_huff_table);
408
409 bit_count =
410 create_header(header, sizeof(header), lit_huff_table, dist_huff_table, LAST_BLOCK);
411
412 file = fopen("hufftables_c.c", "w");
413 if (file == NULL) {
414 printf("Error creating file hufftables_c.c\n");
415 return 1;
416 }
417
418 fprint_header(file, header, bit_count, lit_code_table, lit_code_size_table,
419 dcodes_code_table, dcodes_code_size_table, packed_len_table,
420 packed_dist_table);
421
422 fclose(file);
423
424 return 0;
425 }