1 /**********************************************************************
2 Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 * Neither the name of Intel Corporation nor the names of its
14 contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 **********************************************************************/
30 /* This program can be used to generate custom a custom huffman encoding to get
31 * better data compression. This is most useful when the type of data being
32 * compressed is well known.
34 * To use generate_custom_hufftables, pass a sequence of files to the program
35 * that together form an accurate representation of the data that is being
36 * compressed. Generate_custom_hufftables will then produce the file
37 * hufftables_c.c, which should be moved to replace its counterpart in the igzip
38 * source folder. After recompiling the Isa-l library, the igzip compression
39 * functions will use the new hufftables.
41 * Generate_custom_hufftables should be compiled with the same compile time
42 * parameters as the igzip source code. Generating custom hufftables with
43 * different compile time parameters may cause igzip to produce invalid output
44 * for the reasons described below. The default parameters used by
45 * generate_custom_hufftables are the same as the default parameters used by
48 * *WARNING* generate custom hufftables must be compiled with a HIST_SIZE that
49 * is at least as large as the HIST_SIZE used by igzip. By default HIST_SIZE is
50 * 8, the maximum usable HIST_SIZE is 32. The reason for this is to generate
51 * better compression. Igzip cannot produce look back distances with sizes
52 * larger than the HIST_SIZE * 1024 igzip was compiled with, so look back
53 * distances with sizes larger than HIST_SIZE * 1024 are not assigned a huffman
56 * To improve compression ratio, the compile time option LIT_SUB is provided to
57 * allow generating custom hufftables which only use a subset of all possible
58 * literals. This can be useful for getting better compression when it is known
59 * that the data being compressed will never contain certain symbols, for
60 * example text files. If this option is used, it needs to be checked that every
61 * possible literal is in fact given a valid code in the output hufftable. This
62 * can be done by checking that every required literal has a positive value for
63 * the length of the code associated with that literal. Literals which have not
64 * been given codes will have a code length of zero. The compile time option
65 * PRINT_CODES (described below) can be used to help manually perform this
68 * The compile time parameter PRINT_CODES causes the literal/length huffman code
69 * and the distance huffman code created by generate_custom_hufftables to be
70 * printed out. This is printed out where each line corresponds to a different
71 * symbol. The first column is the symbol used to represent each literal (Lit),
72 * end of block symbol (EOB), length (Len) or distance (Dist), the second column
73 * is the associated code value, and the third column is the length in bits of
80 #include "huff_codes.h"
83 /*These max code lengths are limited by how the data is stored in
84 * hufftables.asm. The deflate standard max is 15.*/
86 #define LONG_DCODE_OFFSET 26
87 #define SHORT_DCODE_OFFSET 20
89 #define MAX_HEADER_SIZE IGZIP_MAX_DEF_HDR_SIZE
91 #define GZIP_HEADER_SIZE 10
92 #define GZIP_TRAILER_SIZE 8
95 * @brief Prints a table of uint8_t elements to a file.
96 * @param outfile: the file the table is printed to.
97 * @param table: the table to be printed.
98 * @param length: number of elements to be printed.
99 * @param header: header to append in front of the table.
100 * @param footer: footer to append at the end of the table.
101 * @param begin_line: string printed at beginning of new line
103 void fprint_uint8_table(FILE * outfile
, uint8_t * table
, uint64_t length
, char *header
,
104 char *footer
, char *begin_line
)
107 fprintf(outfile
, "%s", header
);
108 for (i
= 0; i
< length
- 1; i
++) {
110 fprintf(outfile
, "\n%s", begin_line
);
112 fprintf(outfile
, " ");
113 fprintf(outfile
, "0x%02x,", table
[i
]);
117 fprintf(outfile
, "\n%s", begin_line
);
119 fprintf(outfile
, " ");
120 fprintf(outfile
, "0x%02x", table
[i
]);
121 fprintf(outfile
, "%s", footer
);
126 * @brief Prints a table of uint16_t elements to a file.
127 * @param outfile: the file the table is printed to.
128 * @param table: the table to be printed.
129 * @param length: number of elements to be printed.
130 * @param header: header to append in front of the table.
131 * @param footer: footer to append at the end of the table.
132 * @param begin_line: string printed at beginning of new line
134 void fprint_uint16_table(FILE * outfile
, uint16_t * table
, uint64_t length
, char *header
,
135 char *footer
, char *begin_line
)
138 fprintf(outfile
, "%s", header
);
139 for (i
= 0; i
< length
- 1; i
++) {
141 fprintf(outfile
, "\n%s", begin_line
);
143 fprintf(outfile
, " ");
144 fprintf(outfile
, "0x%04x,", table
[i
]);
148 fprintf(outfile
, "\n%s", begin_line
);
150 fprintf(outfile
, " ");
151 fprintf(outfile
, "0x%04x", table
[i
]);
152 fprintf(outfile
, "%s", footer
);
157 * @brief Prints a table of uint32_t elements to a file.
158 * @param outfile: the file the table is printed to.
159 * @param table: the table to be printed.
160 * @param length: number of elements to be printed.
161 * @param header: header to append in front of the table.
162 * @param footer: footer to append at the end of the table.
163 * @param begin_line: string printed at beginning of new line
165 void fprint_uint32_table(FILE * outfile
, uint32_t * table
, uint64_t length
, char *header
,
166 char *footer
, char *begin_line
)
169 fprintf(outfile
, "%s", header
);
170 for (i
= 0; i
< length
- 1; i
++) {
172 fprintf(outfile
, "\n%s", begin_line
);
174 fprintf(outfile
, " ");
175 fprintf(outfile
, "0x%08x,", table
[i
]);
179 fprintf(outfile
, "%s", begin_line
);
181 fprintf(outfile
, " ");
182 fprintf(outfile
, "0x%08x", table
[i
]);
183 fprintf(outfile
, "%s", footer
);
188 * @brief Prints a table of uint64_t elements to a file.
189 * @param outfile: the file the table is printed to.
190 * @param table: the table to be printed.
191 * @param length: number of elements to be printed.
192 * @param header: header to append in front of the table.
193 * @param footer: footer to append at the end of the table.
195 void fprint_uint64_table(FILE * outfile
, uint64_t * table
, uint64_t length
, char *header
,
199 fprintf(outfile
, "%s\n", header
);
200 for (i
= 0; i
< length
- 1; i
++)
201 fprintf(outfile
, "\t0x%016" PRIx64
",\n", table
[i
]);
202 fprintf(outfile
, "\t0x%016" PRIx64
, table
[i
]);
203 fprintf(outfile
, "%s", footer
);
207 void fprint_hufftables(FILE * output_file
, uint8_t * header
, uint32_t bit_count
,
208 uint16_t * lit_code_table
, uint8_t * lit_code_size_table
,
209 uint16_t * dcodes_code_table
, uint8_t * dcodes_code_size_table
,
210 uint32_t * packed_len_table
, uint32_t * packed_dist_table
)
212 fprintf(output_file
, "struct isal_hufftables hufftables_default = {\n\n");
214 fprint_uint8_table(output_file
, header
, (bit_count
+ 7) / 8,
215 "\t.deflate_hdr = {", "\t},\n\n", "\t\t");
216 fprintf(output_file
, "\t.deflate_hdr_count = %d,\n", bit_count
/ 8);
217 fprintf(output_file
, "\t.deflate_hdr_extra_bits = %d,\n\n", bit_count
& 7);
219 fprint_uint32_table(output_file
, packed_dist_table
, SHORT_DIST_TABLE_SIZE
,
220 "\t.dist_table = {", ",\n", "\t\t");
221 fprint_uint32_table(output_file
, &packed_dist_table
[SHORT_DIST_TABLE_SIZE
],
222 LONG_DIST_TABLE_SIZE
- SHORT_DIST_TABLE_SIZE
,
223 "#ifdef LONGER_HUFFTABLE",
224 "\n#endif /* LONGER_HUFFTABLE */\n\t},\n\n", "\t\t");
226 fprint_uint32_table(output_file
, packed_len_table
, LEN_TABLE_SIZE
, "\t.len_table = {",
228 fprint_uint16_table(output_file
, lit_code_table
, LIT_TABLE_SIZE
, "\t.lit_table = {",
230 fprint_uint8_table(output_file
, lit_code_size_table
, LIT_TABLE_SIZE
,
231 "\t.lit_table_sizes = {", "\t},\n\n", "\t\t");
233 fprintf(output_file
, "#ifndef LONGER_HUFFTABLE\n");
234 fprint_uint16_table(output_file
, dcodes_code_table
+ SHORT_DCODE_OFFSET
,
235 DIST_LEN
- SHORT_DCODE_OFFSET
, "\t.dcodes = {", "\t},\n\n",
237 fprint_uint8_table(output_file
, dcodes_code_size_table
+ SHORT_DCODE_OFFSET
,
238 DIST_LEN
- SHORT_DCODE_OFFSET
, "\t.dcodes_sizes = {", "\t}\n",
240 fprintf(output_file
, "#else\n");
241 fprint_uint16_table(output_file
, dcodes_code_table
+ LONG_DCODE_OFFSET
,
242 DIST_LEN
- LONG_DCODE_OFFSET
, "\t.dcodes = {", "\t},\n\n", "\t\t");
243 fprint_uint8_table(output_file
, dcodes_code_size_table
+ LONG_DCODE_OFFSET
,
244 DIST_LEN
- LONG_DCODE_OFFSET
, "\t.dcodes_sizes = {", "\t}\n",
246 fprintf(output_file
, "#endif\n");
247 fprintf(output_file
, "};\n");
250 void fprint_header(FILE * output_file
, uint8_t * header
, uint32_t bit_count
,
251 uint16_t * lit_code_table
, uint8_t * lit_code_size_table
,
252 uint16_t * dcodes_code_table
, uint8_t * dcodes_code_size_table
,
253 uint32_t * packed_len_table
, uint32_t * packed_dist_table
)
255 fprintf(output_file
, "#include <stdint.h>\n");
256 fprintf(output_file
, "#include <igzip_lib.h>\n\n");
258 fprintf(output_file
, "const uint8_t gzip_hdr[] = {\n"
259 "\t0x1f, 0x8b, 0x08, 0x00, 0x00,\n" "\t0x00, 0x00, 0x00, 0x00, 0xff\t};\n\n");
261 fprintf(output_file
, "const uint32_t gzip_hdr_bytes = %d;\n", GZIP_HEADER_SIZE
);
262 fprintf(output_file
, "const uint32_t gzip_trl_bytes = %d;\n\n", GZIP_TRAILER_SIZE
);
264 fprint_hufftables(output_file
, header
, bit_count
, lit_code_table
, lit_code_size_table
,
265 dcodes_code_table
, dcodes_code_size_table
, packed_len_table
,
269 int main(int argc
, char *argv
[])
271 long int file_length
;
272 uint8_t *stream
= NULL
;
273 struct isal_huff_histogram histogram
;
274 uint64_t *lit_histogram
= histogram
.lit_len_histogram
;
275 uint64_t *dist_histogram
= histogram
.dist_histogram
;
276 uint8_t header
[MAX_HEADER_SIZE
];
278 struct huff_tree lit_tree
, dist_tree
;
279 struct huff_tree lit_tree_array
[2 * LIT_LEN
- 1], dist_tree_array
[2 * DIST_LEN
- 1];
280 struct huff_code lit_huff_table
[LIT_LEN
], dist_huff_table
[DIST_LEN
];
282 uint32_t packed_len_table
[LEN_TABLE_SIZE
];
283 uint32_t packed_dist_table
[LONG_DIST_TABLE_SIZE
];
284 uint16_t lit_code_table
[LIT_TABLE_SIZE
];
285 uint16_t dcodes_code_table
[DIST_LEN
];
286 uint8_t lit_code_size_table
[LIT_TABLE_SIZE
];
287 uint8_t dcodes_code_size_table
[DIST_LEN
];
288 int max_dist
= convert_dist_to_dist_sym(D
);
291 printf("Error, no input file.\n");
295 memset(&histogram
, 0, sizeof(histogram
)); /* Initialize histograms. */
296 memset(lit_tree_array
, 0, sizeof(lit_tree_array
));
297 memset(dist_tree_array
, 0, sizeof(dist_tree_array
));
298 memset(lit_huff_table
, 0, sizeof(lit_huff_table
));
299 memset(dist_huff_table
, 0, sizeof(dist_huff_table
));
302 printf("Processing %s\n", argv
[argc
- 1]);
303 file
= fopen(argv
[argc
- 1], "r");
305 printf("Error opening file\n");
308 fseek(file
, 0, SEEK_END
);
309 file_length
= ftell(file
);
310 fseek(file
, 0, SEEK_SET
);
311 file_length
-= ftell(file
);
312 stream
= malloc(file_length
);
313 if (stream
== NULL
) {
314 printf("Failed to allocate memory to read in file\n");
318 fread(stream
, 1, file_length
, file
);
320 printf("Error occurred when reading file");
326 /* Create a histogram of frequency of symbols found in stream to
327 * generate the huffman tree.*/
328 isal_update_histogram(stream
, file_length
, &histogram
);
335 /* Create a huffman tree corresponding to the histograms created in
339 /* Guarantee every possible repeat length is given a symbol. It is hard
340 * to guarantee data will never have a repeat of a given length */
341 for (j
= LIT_TABLE_SIZE
; j
< LIT_LEN
; j
++)
342 if (lit_histogram
[j
] == 0)
345 lit_tree
= create_symbol_subset_huff_tree(lit_tree_array
, lit_histogram
, LIT_LEN
);
347 lit_tree
= create_huff_tree(lit_tree_array
, lit_histogram
, LIT_LEN
);
349 dist_tree
= create_huff_tree(dist_tree_array
, dist_histogram
, max_dist
+ 1);
351 /* Create a look up table to represent huffman tree above in deflate
352 * standard form after it has been modified to satisfy max depth
354 if (create_huff_lookup(lit_huff_table
, LIT_LEN
, lit_tree
, MAX_DEFLATE_CODE_LEN
) > 0) {
355 printf("Error, code with invalid length for Deflate standard.\n");
359 if (create_huff_lookup(dist_huff_table
, DIST_LEN
, dist_tree
, MAX_DEFLATE_CODE_LEN
) > 0) {
360 printf("Error, code with invalid length for Deflate standard.\n");
364 if (are_hufftables_useable(lit_huff_table
, dist_huff_table
)) {
365 if (create_huff_lookup
366 (lit_huff_table
, LIT_LEN
, lit_tree
, MAX_SAFE_LIT_CODE_LEN
) > 0)
367 printf("Error, code with invalid length for Deflate standard.\n");
370 if (create_huff_lookup
371 (dist_huff_table
, DIST_LEN
, dist_tree
, MAX_SAFE_DIST_CODE_LEN
) > 0)
372 printf("Error, code with invalid length for Deflate standard.\n");
375 if (are_hufftables_useable(lit_huff_table
, dist_huff_table
)) {
376 printf("Error, hufftable is not usable\n");
382 printf("Lit/Len codes\n");
383 for (i
= 0; i
< LIT_TABLE_SIZE
- 1; i
++)
384 printf("Lit %3d: Code 0x%04x, Code_Len %d\n", i
, lit_huff_table
[i
].code
,
385 lit_huff_table
[i
].length
);
387 printf("EOB %3d: Code 0x%04x, Code_Len %d\n", 256, lit_huff_table
[256].code
,
388 lit_huff_table
[256].length
);
390 for (i
= LIT_TABLE_SIZE
; i
< LIT_LEN
; i
++)
391 printf("Len %d: Code 0x%04x, Code_Len %d\n", i
, lit_huff_table
[i
].code
,
392 lit_huff_table
[i
].length
);
395 printf("Dist codes \n");
396 for (i
= 0; i
< DIST_LEN
; i
++)
397 printf("Dist %2d: Code 0x%04x, Code_Len %d\n", i
, dist_huff_table
[i
].code
,
398 dist_huff_table
[i
].length
);
402 create_code_tables(lit_code_table
, lit_code_size_table
, LIT_TABLE_SIZE
,
404 create_code_tables(dcodes_code_table
, dcodes_code_size_table
, DIST_LEN
,
406 create_packed_len_table(packed_len_table
, lit_huff_table
);
407 create_packed_dist_table(packed_dist_table
, LONG_DIST_TABLE_SIZE
, dist_huff_table
);
410 create_header(header
, sizeof(header
), lit_huff_table
, dist_huff_table
, LAST_BLOCK
);
412 file
= fopen("hufftables_c.c", "w");
414 printf("Error creating file hufftables_c.c\n");
418 fprint_header(file
, header
, bit_count
, lit_code_table
, lit_code_size_table
,
419 dcodes_code_table
, dcodes_code_size_table
, packed_len_table
,