1 /* Copyright 2013 Google Inc. All Rights Reserved.
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
7 #include "./static_dict.h"
9 #include "../common/dictionary.h"
10 #include "../common/platform.h"
11 #include "../common/transform.h"
12 #include "./encoder_dict.h"
13 #include "./find_match_length.h"
15 #if defined(__cplusplus) || defined(c_plusplus)
19 static BROTLI_INLINE
uint32_t Hash(const uint8_t* data
) {
20 uint32_t h
= BROTLI_UNALIGNED_LOAD32LE(data
) * kDictHashMul32
;
21 /* The higher bits contain more mixture from the multiplication,
22 so we take our results from there. */
23 return h
>> (32 - kDictNumBits
);
26 static BROTLI_INLINE
void AddMatch(size_t distance
, size_t len
, size_t len_code
,
28 uint32_t match
= (uint32_t)((distance
<< 5) + len_code
);
29 matches
[len
] = BROTLI_MIN(uint32_t, matches
[len
], match
);
32 static BROTLI_INLINE
size_t DictMatchLength(const BrotliDictionary
* dictionary
,
37 const size_t offset
= dictionary
->offsets_by_length
[len
] + len
* id
;
38 return FindMatchLengthWithLimit(&dictionary
->data
[offset
], data
,
39 BROTLI_MIN(size_t, len
, maxlen
));
42 static BROTLI_INLINE BROTLI_BOOL
IsMatch(const BrotliDictionary
* dictionary
,
43 DictWord w
, const uint8_t* data
, size_t max_length
) {
44 if (w
.len
> max_length
) {
47 const size_t offset
= dictionary
->offsets_by_length
[w
.len
] +
48 (size_t)w
.len
* (size_t)w
.idx
;
49 const uint8_t* dict
= &dictionary
->data
[offset
];
50 if (w
.transform
== 0) {
51 /* Match against base dictionary word. */
53 TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict
, data
, w
.len
) == w
.len
);
54 } else if (w
.transform
== 10) {
55 /* Match against uppercase first transform.
56 Note that there are only ASCII uppercase words in the lookup table. */
57 return TO_BROTLI_BOOL(dict
[0] >= 'a' && dict
[0] <= 'z' &&
58 (dict
[0] ^ 32) == data
[0] &&
59 FindMatchLengthWithLimit(&dict
[1], &data
[1], w
.len
- 1u) ==
62 /* Match against uppercase all transform.
63 Note that there are only ASCII uppercase words in the lookup table. */
65 for (i
= 0; i
< w
.len
; ++i
) {
66 if (dict
[i
] >= 'a' && dict
[i
] <= 'z') {
67 if ((dict
[i
] ^ 32) != data
[i
]) return BROTLI_FALSE
;
69 if (dict
[i
] != data
[i
]) return BROTLI_FALSE
;
77 BROTLI_BOOL
BrotliFindAllStaticDictionaryMatches(
78 const BrotliEncoderDictionary
* dictionary
, const uint8_t* data
,
79 size_t min_length
, size_t max_length
, uint32_t* matches
) {
80 BROTLI_BOOL has_found_match
= BROTLI_FALSE
;
82 size_t offset
= dictionary
->buckets
[Hash(data
)];
83 BROTLI_BOOL end
= !offset
;
85 DictWord w
= dictionary
->dict_words
[offset
++];
86 const size_t l
= w
.len
& 0x1F;
87 const size_t n
= (size_t)1 << dictionary
->words
->size_bits_by_length
[l
];
88 const size_t id
= w
.idx
;
89 end
= !!(w
.len
& 0x80);
91 if (w
.transform
== 0) {
92 const size_t matchlen
=
93 DictMatchLength(dictionary
->words
, data
, id
, l
, max_length
);
98 /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */
100 AddMatch(id
, l
, l
, matches
);
101 has_found_match
= BROTLI_TRUE
;
103 /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and
104 "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */
105 if (matchlen
>= l
- 1) {
106 AddMatch(id
+ 12 * n
, l
- 1, l
, matches
);
107 if (l
+ 2 < max_length
&&
108 data
[l
- 1] == 'i' && data
[l
] == 'n' && data
[l
+ 1] == 'g' &&
109 data
[l
+ 2] == ' ') {
110 AddMatch(id
+ 49 * n
, l
+ 3, l
, matches
);
112 has_found_match
= BROTLI_TRUE
;
114 /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */
116 if (l
> 9) minlen
= BROTLI_MAX(size_t, minlen
, l
- 9);
117 maxlen
= BROTLI_MIN(size_t, matchlen
, l
- 2);
118 for (len
= minlen
; len
<= maxlen
; ++len
) {
119 size_t cut
= l
- len
;
120 size_t transform_id
= (cut
<< 2) +
121 (size_t)((dictionary
->cutoffTransforms
>> (cut
* 6)) & 0x3F);
122 AddMatch(id
+ transform_id
* n
, len
, l
, matches
);
123 has_found_match
= BROTLI_TRUE
;
125 if (matchlen
< l
|| l
+ 6 >= max_length
) {
129 /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */
131 AddMatch(id
+ n
, l
+ 1, l
, matches
);
134 AddMatch(id
+ 28 * n
, l
+ 3, l
, matches
);
135 } else if (s
[2] == 's') {
136 if (s
[3] == ' ') AddMatch(id
+ 46 * n
, l
+ 4, l
, matches
);
137 } else if (s
[2] == 't') {
138 if (s
[3] == ' ') AddMatch(id
+ 60 * n
, l
+ 4, l
, matches
);
139 } else if (s
[2] == 'n') {
140 if (s
[3] == 'd' && s
[4] == ' ') {
141 AddMatch(id
+ 10 * n
, l
+ 5, l
, matches
);
144 } else if (s
[1] == 'b') {
145 if (s
[2] == 'y' && s
[3] == ' ') {
146 AddMatch(id
+ 38 * n
, l
+ 4, l
, matches
);
148 } else if (s
[1] == 'i') {
150 if (s
[3] == ' ') AddMatch(id
+ 16 * n
, l
+ 4, l
, matches
);
151 } else if (s
[2] == 's') {
152 if (s
[3] == ' ') AddMatch(id
+ 47 * n
, l
+ 4, l
, matches
);
154 } else if (s
[1] == 'f') {
156 if (s
[3] == 'r' && s
[4] == ' ') {
157 AddMatch(id
+ 25 * n
, l
+ 5, l
, matches
);
159 } else if (s
[2] == 'r') {
160 if (s
[3] == 'o' && s
[4] == 'm' && s
[5] == ' ') {
161 AddMatch(id
+ 37 * n
, l
+ 6, l
, matches
);
164 } else if (s
[1] == 'o') {
166 if (s
[3] == ' ') AddMatch(id
+ 8 * n
, l
+ 4, l
, matches
);
167 } else if (s
[2] == 'n') {
168 if (s
[3] == ' ') AddMatch(id
+ 45 * n
, l
+ 4, l
, matches
);
170 } else if (s
[1] == 'n') {
171 if (s
[2] == 'o' && s
[3] == 't' && s
[4] == ' ') {
172 AddMatch(id
+ 80 * n
, l
+ 5, l
, matches
);
174 } else if (s
[1] == 't') {
177 if (s
[4] == ' ') AddMatch(id
+ 5 * n
, l
+ 5, l
, matches
);
178 } else if (s
[3] == 'a') {
179 if (s
[4] == 't' && s
[5] == ' ') {
180 AddMatch(id
+ 29 * n
, l
+ 6, l
, matches
);
183 } else if (s
[2] == 'o') {
184 if (s
[3] == ' ') AddMatch(id
+ 17 * n
, l
+ 4, l
, matches
);
186 } else if (s
[1] == 'w') {
187 if (s
[2] == 'i' && s
[3] == 't' && s
[4] == 'h' && s
[5] == ' ') {
188 AddMatch(id
+ 35 * n
, l
+ 6, l
, matches
);
191 } else if (s
[0] == '"') {
192 AddMatch(id
+ 19 * n
, l
+ 1, l
, matches
);
194 AddMatch(id
+ 21 * n
, l
+ 2, l
, matches
);
196 } else if (s
[0] == '.') {
197 AddMatch(id
+ 20 * n
, l
+ 1, l
, matches
);
199 AddMatch(id
+ 31 * n
, l
+ 2, l
, matches
);
200 if (s
[2] == 'T' && s
[3] == 'h') {
202 if (s
[5] == ' ') AddMatch(id
+ 43 * n
, l
+ 6, l
, matches
);
203 } else if (s
[4] == 'i') {
204 if (s
[5] == 's' && s
[6] == ' ') {
205 AddMatch(id
+ 75 * n
, l
+ 7, l
, matches
);
210 } else if (s
[0] == ',') {
211 AddMatch(id
+ 76 * n
, l
+ 1, l
, matches
);
213 AddMatch(id
+ 14 * n
, l
+ 2, l
, matches
);
215 } else if (s
[0] == '\n') {
216 AddMatch(id
+ 22 * n
, l
+ 1, l
, matches
);
218 AddMatch(id
+ 50 * n
, l
+ 2, l
, matches
);
220 } else if (s
[0] == ']') {
221 AddMatch(id
+ 24 * n
, l
+ 1, l
, matches
);
222 } else if (s
[0] == '\'') {
223 AddMatch(id
+ 36 * n
, l
+ 1, l
, matches
);
224 } else if (s
[0] == ':') {
225 AddMatch(id
+ 51 * n
, l
+ 1, l
, matches
);
226 } else if (s
[0] == '(') {
227 AddMatch(id
+ 57 * n
, l
+ 1, l
, matches
);
228 } else if (s
[0] == '=') {
230 AddMatch(id
+ 70 * n
, l
+ 2, l
, matches
);
231 } else if (s
[1] == '\'') {
232 AddMatch(id
+ 86 * n
, l
+ 2, l
, matches
);
234 } else if (s
[0] == 'a') {
235 if (s
[1] == 'l' && s
[2] == ' ') {
236 AddMatch(id
+ 84 * n
, l
+ 3, l
, matches
);
238 } else if (s
[0] == 'e') {
240 if (s
[2] == ' ') AddMatch(id
+ 53 * n
, l
+ 3, l
, matches
);
241 } else if (s
[1] == 'r') {
242 if (s
[2] == ' ') AddMatch(id
+ 82 * n
, l
+ 3, l
, matches
);
243 } else if (s
[1] == 's') {
244 if (s
[2] == 't' && s
[3] == ' ') {
245 AddMatch(id
+ 95 * n
, l
+ 4, l
, matches
);
248 } else if (s
[0] == 'f') {
249 if (s
[1] == 'u' && s
[2] == 'l' && s
[3] == ' ') {
250 AddMatch(id
+ 90 * n
, l
+ 4, l
, matches
);
252 } else if (s
[0] == 'i') {
254 if (s
[2] == 'e' && s
[3] == ' ') {
255 AddMatch(id
+ 92 * n
, l
+ 4, l
, matches
);
257 } else if (s
[1] == 'z') {
258 if (s
[2] == 'e' && s
[3] == ' ') {
259 AddMatch(id
+ 100 * n
, l
+ 4, l
, matches
);
262 } else if (s
[0] == 'l') {
264 if (s
[2] == 's' && s
[3] == 's' && s
[4] == ' ') {
265 AddMatch(id
+ 93 * n
, l
+ 5, l
, matches
);
267 } else if (s
[1] == 'y') {
268 if (s
[2] == ' ') AddMatch(id
+ 61 * n
, l
+ 3, l
, matches
);
270 } else if (s
[0] == 'o') {
271 if (s
[1] == 'u' && s
[2] == 's' && s
[3] == ' ') {
272 AddMatch(id
+ 106 * n
, l
+ 4, l
, matches
);
276 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
277 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
279 const BROTLI_BOOL is_all_caps
=
280 TO_BROTLI_BOOL(w
.transform
!= BROTLI_TRANSFORM_UPPERCASE_FIRST
);
282 if (!IsMatch(dictionary
->words
, w
, data
, max_length
)) {
285 /* Transform "" + kUppercase{First,All} + "" */
286 AddMatch(id
+ (is_all_caps
? 44 : 9) * n
, l
, l
, matches
);
287 has_found_match
= BROTLI_TRUE
;
288 if (l
+ 1 >= max_length
) {
291 /* Transforms "" + kUppercase{First,All} + <suffix> */
294 AddMatch(id
+ (is_all_caps
? 68 : 4) * n
, l
+ 1, l
, matches
);
295 } else if (s
[0] == '"') {
296 AddMatch(id
+ (is_all_caps
? 87 : 66) * n
, l
+ 1, l
, matches
);
298 AddMatch(id
+ (is_all_caps
? 97 : 69) * n
, l
+ 2, l
, matches
);
300 } else if (s
[0] == '.') {
301 AddMatch(id
+ (is_all_caps
? 101 : 79) * n
, l
+ 1, l
, matches
);
303 AddMatch(id
+ (is_all_caps
? 114 : 88) * n
, l
+ 2, l
, matches
);
305 } else if (s
[0] == ',') {
306 AddMatch(id
+ (is_all_caps
? 112 : 99) * n
, l
+ 1, l
, matches
);
308 AddMatch(id
+ (is_all_caps
? 107 : 58) * n
, l
+ 2, l
, matches
);
310 } else if (s
[0] == '\'') {
311 AddMatch(id
+ (is_all_caps
? 94 : 74) * n
, l
+ 1, l
, matches
);
312 } else if (s
[0] == '(') {
313 AddMatch(id
+ (is_all_caps
? 113 : 78) * n
, l
+ 1, l
, matches
);
314 } else if (s
[0] == '=') {
316 AddMatch(id
+ (is_all_caps
? 105 : 104) * n
, l
+ 2, l
, matches
);
317 } else if (s
[1] == '\'') {
318 AddMatch(id
+ (is_all_caps
? 116 : 108) * n
, l
+ 2, l
, matches
);
324 /* Transforms with prefixes " " and "." */
325 if (max_length
>= 5 && (data
[0] == ' ' || data
[0] == '.')) {
326 BROTLI_BOOL is_space
= TO_BROTLI_BOOL(data
[0] == ' ');
327 size_t offset
= dictionary
->buckets
[Hash(&data
[1])];
328 BROTLI_BOOL end
= !offset
;
330 DictWord w
= dictionary
->dict_words
[offset
++];
331 const size_t l
= w
.len
& 0x1F;
332 const size_t n
= (size_t)1 << dictionary
->words
->size_bits_by_length
[l
];
333 const size_t id
= w
.idx
;
334 end
= !!(w
.len
& 0x80);
336 if (w
.transform
== 0) {
338 if (!IsMatch(dictionary
->words
, w
, &data
[1], max_length
- 1)) {
341 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and
342 "." + BROTLI_TRANSFORM_IDENTITY + "" */
343 AddMatch(id
+ (is_space
? 6 : 32) * n
, l
+ 1, l
, matches
);
344 has_found_match
= BROTLI_TRUE
;
345 if (l
+ 2 >= max_length
) {
348 /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and
349 "." + BROTLI_TRANSFORM_IDENTITY + <suffix>
353 AddMatch(id
+ (is_space
? 2 : 77) * n
, l
+ 2, l
, matches
);
354 } else if (s
[0] == '(') {
355 AddMatch(id
+ (is_space
? 89 : 67) * n
, l
+ 2, l
, matches
);
356 } else if (is_space
) {
358 AddMatch(id
+ 103 * n
, l
+ 2, l
, matches
);
360 AddMatch(id
+ 33 * n
, l
+ 3, l
, matches
);
362 } else if (s
[0] == '.') {
363 AddMatch(id
+ 71 * n
, l
+ 2, l
, matches
);
365 AddMatch(id
+ 52 * n
, l
+ 3, l
, matches
);
367 } else if (s
[0] == '=') {
369 AddMatch(id
+ 81 * n
, l
+ 3, l
, matches
);
370 } else if (s
[1] == '\'') {
371 AddMatch(id
+ 98 * n
, l
+ 3, l
, matches
);
375 } else if (is_space
) {
376 /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
377 is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
379 const BROTLI_BOOL is_all_caps
=
380 TO_BROTLI_BOOL(w
.transform
!= BROTLI_TRANSFORM_UPPERCASE_FIRST
);
382 if (!IsMatch(dictionary
->words
, w
, &data
[1], max_length
- 1)) {
385 /* Transforms " " + kUppercase{First,All} + "" */
386 AddMatch(id
+ (is_all_caps
? 85 : 30) * n
, l
+ 1, l
, matches
);
387 has_found_match
= BROTLI_TRUE
;
388 if (l
+ 2 >= max_length
) {
391 /* Transforms " " + kUppercase{First,All} + <suffix> */
394 AddMatch(id
+ (is_all_caps
? 83 : 15) * n
, l
+ 2, l
, matches
);
395 } else if (s
[0] == ',') {
397 AddMatch(id
+ 109 * n
, l
+ 2, l
, matches
);
400 AddMatch(id
+ (is_all_caps
? 111 : 65) * n
, l
+ 3, l
, matches
);
402 } else if (s
[0] == '.') {
403 AddMatch(id
+ (is_all_caps
? 115 : 96) * n
, l
+ 2, l
, matches
);
405 AddMatch(id
+ (is_all_caps
? 117 : 91) * n
, l
+ 3, l
, matches
);
407 } else if (s
[0] == '=') {
409 AddMatch(id
+ (is_all_caps
? 110 : 118) * n
, l
+ 3, l
, matches
);
410 } else if (s
[1] == '\'') {
411 AddMatch(id
+ (is_all_caps
? 119 : 120) * n
, l
+ 3, l
, matches
);
417 if (max_length
>= 6) {
418 /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */
419 if ((data
[1] == ' ' &&
420 (data
[0] == 'e' || data
[0] == 's' || data
[0] == ',')) ||
421 (data
[0] == 0xC2 && data
[1] == 0xA0)) {
422 size_t offset
= dictionary
->buckets
[Hash(&data
[2])];
423 BROTLI_BOOL end
= !offset
;
425 DictWord w
= dictionary
->dict_words
[offset
++];
426 const size_t l
= w
.len
& 0x1F;
427 const size_t n
= (size_t)1 << dictionary
->words
->size_bits_by_length
[l
];
428 const size_t id
= w
.idx
;
429 end
= !!(w
.len
& 0x80);
431 if (w
.transform
== 0 &&
432 IsMatch(dictionary
->words
, w
, &data
[2], max_length
- 2)) {
433 if (data
[0] == 0xC2) {
434 AddMatch(id
+ 102 * n
, l
+ 2, l
, matches
);
435 has_found_match
= BROTLI_TRUE
;
436 } else if (l
+ 2 < max_length
&& data
[l
+ 2] == ' ') {
437 size_t t
= data
[0] == 'e' ? 18 : (data
[0] == 's' ? 7 : 13);
438 AddMatch(id
+ t
* n
, l
+ 3, l
, matches
);
439 has_found_match
= BROTLI_TRUE
;
445 if (max_length
>= 9) {
446 /* Transforms with prefixes " the " and ".com/" */
447 if ((data
[0] == ' ' && data
[1] == 't' && data
[2] == 'h' &&
448 data
[3] == 'e' && data
[4] == ' ') ||
449 (data
[0] == '.' && data
[1] == 'c' && data
[2] == 'o' &&
450 data
[3] == 'm' && data
[4] == '/')) {
451 size_t offset
= dictionary
->buckets
[Hash(&data
[5])];
452 BROTLI_BOOL end
= !offset
;
454 DictWord w
= dictionary
->dict_words
[offset
++];
455 const size_t l
= w
.len
& 0x1F;
456 const size_t n
= (size_t)1 << dictionary
->words
->size_bits_by_length
[l
];
457 const size_t id
= w
.idx
;
458 end
= !!(w
.len
& 0x80);
460 if (w
.transform
== 0 &&
461 IsMatch(dictionary
->words
, w
, &data
[5], max_length
- 5)) {
462 AddMatch(id
+ (data
[0] == ' ' ? 41 : 72) * n
, l
+ 5, l
, matches
);
463 has_found_match
= BROTLI_TRUE
;
464 if (l
+ 5 < max_length
) {
465 const uint8_t* s
= &data
[l
+ 5];
466 if (data
[0] == ' ') {
467 if (l
+ 8 < max_length
&&
468 s
[0] == ' ' && s
[1] == 'o' && s
[2] == 'f' && s
[3] == ' ') {
469 AddMatch(id
+ 62 * n
, l
+ 9, l
, matches
);
470 if (l
+ 12 < max_length
&&
471 s
[4] == 't' && s
[5] == 'h' && s
[6] == 'e' && s
[7] == ' ') {
472 AddMatch(id
+ 73 * n
, l
+ 13, l
, matches
);
481 return has_found_match
;
484 #if defined(__cplusplus) || defined(c_plusplus)