/* Copyright 2013 Google Inc. All Rights Reserved.

   Distributed under MIT license.
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
*/
7 #include "./static_dict.h"
9 #include "../common/dictionary.h"
10 #include "./find_match_length.h"
12 #include "./static_dict_lut.h"
14 #if defined(__cplusplus) || defined(c_plusplus)
/* Transform id of the "uppercase first" transform. Dictionary words whose
   transform field is neither 0 nor this value are handled as
   "uppercase all". */
static const uint8_t kUppercaseFirst = 10;

/* kOmitLastNTransforms[N] is the id of the transform that drops the last N
   bytes of a dictionary word; index 0 is the identity transform. */
static const uint8_t kOmitLastNTransforms[10] = {
  0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
};
23 static BROTLI_INLINE
uint32_t Hash(const uint8_t *data
) {
24 uint32_t h
= BROTLI_UNALIGNED_LOAD32(data
) * kDictHashMul32
;
25 /* The higher bits contain more mixture from the multiplication,
26 so we take our results from there. */
27 return h
>> (32 - kDictNumBits
);
30 static BROTLI_INLINE
void AddMatch(size_t distance
, size_t len
, size_t len_code
,
32 uint32_t match
= (uint32_t)((distance
<< 5) + len_code
);
33 matches
[len
] = BROTLI_MIN(uint32_t, matches
[len
], match
);
36 static BROTLI_INLINE
size_t DictMatchLength(const uint8_t* data
,
40 const size_t offset
= kBrotliDictionaryOffsetsByLength
[len
] + len
* id
;
41 return FindMatchLengthWithLimit(&kBrotliDictionary
[offset
], data
,
42 BROTLI_MIN(size_t, len
, maxlen
));
45 static BROTLI_INLINE BROTLI_BOOL
IsMatch(
46 DictWord w
, const uint8_t* data
, size_t max_length
) {
47 if (w
.len
> max_length
) {
50 const size_t offset
= kBrotliDictionaryOffsetsByLength
[w
.len
] +
51 (size_t)w
.len
* (size_t)w
.idx
;
52 const uint8_t* dict
= &kBrotliDictionary
[offset
];
53 if (w
.transform
== 0) {
54 /* Match against base dictionary word. */
56 TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict
, data
, w
.len
) == w
.len
);
57 } else if (w
.transform
== 10) {
58 /* Match against uppercase first transform.
59 Note that there are only ASCII uppercase words in the lookup table. */
60 return TO_BROTLI_BOOL(dict
[0] >= 'a' && dict
[0] <= 'z' &&
61 (dict
[0] ^ 32) == data
[0] &&
62 FindMatchLengthWithLimit(&dict
[1], &data
[1], w
.len
- 1u) ==
65 /* Match against uppercase all transform.
66 Note that there are only ASCII uppercase words in the lookup table. */
68 for (i
= 0; i
< w
.len
; ++i
) {
69 if (dict
[i
] >= 'a' && dict
[i
] <= 'z') {
70 if ((dict
[i
] ^ 32) != data
[i
]) return BROTLI_FALSE
;
72 if (dict
[i
] != data
[i
]) return BROTLI_FALSE
;
/* Collects static-dictionary matches for the bytes at `data`.
   The input is hashed at offset 0 and, when `data` starts with one of the
   prefixes " ", ".", "e ", "s ", ", ", "\xc2\xa0", " the " or ".com/" and
   `max_length` is large enough (>= 5, >= 6 or >= 9 respectively), also at
   the offset just past that prefix (1, 2 or 5). Every word in the selected
   kStaticDictionaryWords bucket is compared against the input; for each hit
   AddMatch() is called with distance `id + transform * n`, where
   n = 1 << kBrotliDictionarySizeBitsByLength[l], with the total matched
   length, and with the word length `l` as the length code.
   Returns BROTLI_TRUE if at least one match was recorded, else BROTLI_FALSE.
   NOTE(review): this listing looks truncated -- `matches`, `s`, `minlen`,
   `maxlen` and `len` are used without a visible declaration, and several
   branch conditions, loop headers and closing braces are absent. Restore the
   missing lines before relying on this text. */
80 BROTLI_BOOL
BrotliFindAllStaticDictionaryMatches(
81 const uint8_t* data
, size_t min_length
, size_t max_length
,
83 BROTLI_BOOL has_found_match
= BROTLI_FALSE
;
85 size_t offset
= kStaticDictionaryBuckets
[Hash(data
)];
86 BROTLI_BOOL end
= !offset
;
88 DictWord w
= kStaticDictionaryWords
[offset
++];
89 const size_t l
= w
.len
& 0x7F;
90 const size_t n
= (size_t)1 << kBrotliDictionarySizeBitsByLength
[l
];
91 const size_t id
= w
.idx
;
92 end
= !!(w
.len
& 0x80);
94 if (w
.transform
== 0) {
95 const size_t matchlen
= DictMatchLength(data
, id
, l
, max_length
);
100 /* Transform "" + kIdentity + "" */
102 AddMatch(id
, l
, l
, matches
);
103 has_found_match
= BROTLI_TRUE
;
105 /* Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing " */
106 if (matchlen
>= l
- 1) {
107 AddMatch(id
+ 12 * n
, l
- 1, l
, matches
);
108 if (l
+ 2 < max_length
&&
109 data
[l
- 1] == 'i' && data
[l
] == 'n' && data
[l
+ 1] == 'g' &&
110 data
[l
+ 2] == ' ') {
111 AddMatch(id
+ 49 * n
, l
+ 3, l
, matches
);
113 has_found_match
= BROTLI_TRUE
;
115 /* Transform "" + kOmitLastN + "" (N = 2 .. 9) */
117 if (l
> 9) minlen
= BROTLI_MAX(size_t, minlen
, l
- 9);
118 maxlen
= BROTLI_MIN(size_t, matchlen
, l
- 2);
119 for (len
= minlen
; len
<= maxlen
; ++len
) {
120 AddMatch(id
+ kOmitLastNTransforms
[l
- len
] * n
, len
, l
, matches
);
121 has_found_match
= BROTLI_TRUE
;
123 if (matchlen
< l
|| l
+ 6 >= max_length
) {
127 /* Transforms "" + kIdentity + <suffix> */
129 AddMatch(id
+ n
, l
+ 1, l
, matches
);
132 AddMatch(id
+ 28 * n
, l
+ 3, l
, matches
);
133 } else if (s
[2] == 's') {
134 if (s
[3] == ' ') AddMatch(id
+ 46 * n
, l
+ 4, l
, matches
);
135 } else if (s
[2] == 't') {
136 if (s
[3] == ' ') AddMatch(id
+ 60 * n
, l
+ 4, l
, matches
);
137 } else if (s
[2] == 'n') {
138 if (s
[3] == 'd' && s
[4] == ' ') {
139 AddMatch(id
+ 10 * n
, l
+ 5, l
, matches
);
142 } else if (s
[1] == 'b') {
143 if (s
[2] == 'y' && s
[3] == ' ') {
144 AddMatch(id
+ 38 * n
, l
+ 4, l
, matches
);
146 } else if (s
[1] == 'i') {
148 if (s
[3] == ' ') AddMatch(id
+ 16 * n
, l
+ 4, l
, matches
);
149 } else if (s
[2] == 's') {
150 if (s
[3] == ' ') AddMatch(id
+ 47 * n
, l
+ 4, l
, matches
);
152 } else if (s
[1] == 'f') {
154 if (s
[3] == 'r' && s
[4] == ' ') {
155 AddMatch(id
+ 25 * n
, l
+ 5, l
, matches
);
157 } else if (s
[2] == 'r') {
158 if (s
[3] == 'o' && s
[4] == 'm' && s
[5] == ' ') {
159 AddMatch(id
+ 37 * n
, l
+ 6, l
, matches
);
162 } else if (s
[1] == 'o') {
164 if (s
[3] == ' ') AddMatch(id
+ 8 * n
, l
+ 4, l
, matches
);
165 } else if (s
[2] == 'n') {
166 if (s
[3] == ' ') AddMatch(id
+ 45 * n
, l
+ 4, l
, matches
);
168 } else if (s
[1] == 'n') {
169 if (s
[2] == 'o' && s
[3] == 't' && s
[4] == ' ') {
170 AddMatch(id
+ 80 * n
, l
+ 5, l
, matches
);
172 } else if (s
[1] == 't') {
175 if (s
[4] == ' ') AddMatch(id
+ 5 * n
, l
+ 5, l
, matches
);
176 } else if (s
[3] == 'a') {
177 if (s
[4] == 't' && s
[5] == ' ') {
178 AddMatch(id
+ 29 * n
, l
+ 6, l
, matches
);
181 } else if (s
[2] == 'o') {
182 if (s
[3] == ' ') AddMatch(id
+ 17 * n
, l
+ 4, l
, matches
);
184 } else if (s
[1] == 'w') {
185 if (s
[2] == 'i' && s
[3] == 't' && s
[4] == 'h' && s
[5] == ' ') {
186 AddMatch(id
+ 35 * n
, l
+ 6, l
, matches
);
189 } else if (s
[0] == '"') {
190 AddMatch(id
+ 19 * n
, l
+ 1, l
, matches
);
192 AddMatch(id
+ 21 * n
, l
+ 2, l
, matches
);
194 } else if (s
[0] == '.') {
195 AddMatch(id
+ 20 * n
, l
+ 1, l
, matches
);
197 AddMatch(id
+ 31 * n
, l
+ 2, l
, matches
);
198 if (s
[2] == 'T' && s
[3] == 'h') {
200 if (s
[5] == ' ') AddMatch(id
+ 43 * n
, l
+ 6, l
, matches
);
201 } else if (s
[4] == 'i') {
202 if (s
[5] == 's' && s
[6] == ' ') {
203 AddMatch(id
+ 75 * n
, l
+ 7, l
, matches
);
208 } else if (s
[0] == ',') {
209 AddMatch(id
+ 76 * n
, l
+ 1, l
, matches
);
211 AddMatch(id
+ 14 * n
, l
+ 2, l
, matches
);
213 } else if (s
[0] == '\n') {
214 AddMatch(id
+ 22 * n
, l
+ 1, l
, matches
);
216 AddMatch(id
+ 50 * n
, l
+ 2, l
, matches
);
218 } else if (s
[0] == ']') {
219 AddMatch(id
+ 24 * n
, l
+ 1, l
, matches
);
220 } else if (s
[0] == '\'') {
221 AddMatch(id
+ 36 * n
, l
+ 1, l
, matches
);
222 } else if (s
[0] == ':') {
223 AddMatch(id
+ 51 * n
, l
+ 1, l
, matches
);
224 } else if (s
[0] == '(') {
225 AddMatch(id
+ 57 * n
, l
+ 1, l
, matches
);
226 } else if (s
[0] == '=') {
228 AddMatch(id
+ 70 * n
, l
+ 2, l
, matches
);
229 } else if (s
[1] == '\'') {
230 AddMatch(id
+ 86 * n
, l
+ 2, l
, matches
);
232 } else if (s
[0] == 'a') {
233 if (s
[1] == 'l' && s
[2] == ' ') {
234 AddMatch(id
+ 84 * n
, l
+ 3, l
, matches
);
236 } else if (s
[0] == 'e') {
238 if (s
[2] == ' ') AddMatch(id
+ 53 * n
, l
+ 3, l
, matches
);
239 } else if (s
[1] == 'r') {
240 if (s
[2] == ' ') AddMatch(id
+ 82 * n
, l
+ 3, l
, matches
);
241 } else if (s
[1] == 's') {
242 if (s
[2] == 't' && s
[3] == ' ') {
243 AddMatch(id
+ 95 * n
, l
+ 4, l
, matches
);
246 } else if (s
[0] == 'f') {
247 if (s
[1] == 'u' && s
[2] == 'l' && s
[3] == ' ') {
248 AddMatch(id
+ 90 * n
, l
+ 4, l
, matches
);
250 } else if (s
[0] == 'i') {
252 if (s
[2] == 'e' && s
[3] == ' ') {
253 AddMatch(id
+ 92 * n
, l
+ 4, l
, matches
);
255 } else if (s
[1] == 'z') {
256 if (s
[2] == 'e' && s
[3] == ' ') {
257 AddMatch(id
+ 100 * n
, l
+ 4, l
, matches
);
260 } else if (s
[0] == 'l') {
262 if (s
[2] == 's' && s
[3] == 's' && s
[4] == ' ') {
263 AddMatch(id
+ 93 * n
, l
+ 5, l
, matches
);
265 } else if (s
[1] == 'y') {
266 if (s
[2] == ' ') AddMatch(id
+ 61 * n
, l
+ 3, l
, matches
);
268 } else if (s
[0] == 'o') {
269 if (s
[1] == 'u' && s
[2] == 's' && s
[3] == ' ') {
270 AddMatch(id
+ 106 * n
, l
+ 4, l
, matches
);
274 /* Set is_all_caps=0 for kUppercaseFirst and
275 is_all_caps=1 otherwise (kUppercaseAll) transform. */
276 const BROTLI_BOOL is_all_caps
=
277 TO_BROTLI_BOOL(w
.transform
!= kUppercaseFirst
);
279 if (!IsMatch(w
, data
, max_length
)) {
282 /* Transform "" + kUppercase{First,All} + "" */
283 AddMatch(id
+ (is_all_caps
? 44 : 9) * n
, l
, l
, matches
);
284 has_found_match
= BROTLI_TRUE
;
285 if (l
+ 1 >= max_length
) {
288 /* Transforms "" + kUppercase{First,All} + <suffix> */
291 AddMatch(id
+ (is_all_caps
? 68 : 4) * n
, l
+ 1, l
, matches
);
292 } else if (s
[0] == '"') {
293 AddMatch(id
+ (is_all_caps
? 87 : 66) * n
, l
+ 1, l
, matches
);
295 AddMatch(id
+ (is_all_caps
? 97 : 69) * n
, l
+ 2, l
, matches
);
297 } else if (s
[0] == '.') {
298 AddMatch(id
+ (is_all_caps
? 101 : 79) * n
, l
+ 1, l
, matches
);
300 AddMatch(id
+ (is_all_caps
? 114 : 88) * n
, l
+ 2, l
, matches
);
302 } else if (s
[0] == ',') {
303 AddMatch(id
+ (is_all_caps
? 112 : 99) * n
, l
+ 1, l
, matches
);
305 AddMatch(id
+ (is_all_caps
? 107 : 58) * n
, l
+ 2, l
, matches
);
307 } else if (s
[0] == '\'') {
308 AddMatch(id
+ (is_all_caps
? 94 : 74) * n
, l
+ 1, l
, matches
);
309 } else if (s
[0] == '(') {
310 AddMatch(id
+ (is_all_caps
? 113 : 78) * n
, l
+ 1, l
, matches
);
311 } else if (s
[0] == '=') {
313 AddMatch(id
+ (is_all_caps
? 105 : 104) * n
, l
+ 2, l
, matches
);
314 } else if (s
[1] == '\'') {
315 AddMatch(id
+ (is_all_caps
? 116 : 108) * n
, l
+ 2, l
, matches
);
321 /* Transforms with prefixes " " and "." */
322 if (max_length
>= 5 && (data
[0] == ' ' || data
[0] == '.')) {
323 BROTLI_BOOL is_space
= TO_BROTLI_BOOL(data
[0] == ' ');
324 size_t offset
= kStaticDictionaryBuckets
[Hash(&data
[1])];
325 BROTLI_BOOL end
= !offset
;
327 DictWord w
= kStaticDictionaryWords
[offset
++];
328 const size_t l
= w
.len
& 0x7F;
329 const size_t n
= (size_t)1 << kBrotliDictionarySizeBitsByLength
[l
];
330 const size_t id
= w
.idx
;
331 end
= !!(w
.len
& 0x80);
333 if (w
.transform
== 0) {
335 if (!IsMatch(w
, &data
[1], max_length
- 1)) {
338 /* Transforms " " + kIdentity + "" and "." + kIdentity + "" */
339 AddMatch(id
+ (is_space
? 6 : 32) * n
, l
+ 1, l
, matches
);
340 has_found_match
= BROTLI_TRUE
;
341 if (l
+ 2 >= max_length
) {
344 /* Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>
348 AddMatch(id
+ (is_space
? 2 : 77) * n
, l
+ 2, l
, matches
);
349 } else if (s
[0] == '(') {
350 AddMatch(id
+ (is_space
? 89 : 67) * n
, l
+ 2, l
, matches
);
351 } else if (is_space
) {
353 AddMatch(id
+ 103 * n
, l
+ 2, l
, matches
);
355 AddMatch(id
+ 33 * n
, l
+ 3, l
, matches
);
357 } else if (s
[0] == '.') {
358 AddMatch(id
+ 71 * n
, l
+ 2, l
, matches
);
360 AddMatch(id
+ 52 * n
, l
+ 3, l
, matches
);
362 } else if (s
[0] == '=') {
364 AddMatch(id
+ 81 * n
, l
+ 3, l
, matches
);
365 } else if (s
[1] == '\'') {
366 AddMatch(id
+ 98 * n
, l
+ 3, l
, matches
);
370 } else if (is_space
) {
371 /* Set is_all_caps=0 for kUppercaseFirst and
372 is_all_caps=1 otherwise (kUppercaseAll) transform. */
373 const BROTLI_BOOL is_all_caps
=
374 TO_BROTLI_BOOL(w
.transform
!= kUppercaseFirst
);
376 if (!IsMatch(w
, &data
[1], max_length
- 1)) {
379 /* Transforms " " + kUppercase{First,All} + "" */
380 AddMatch(id
+ (is_all_caps
? 85 : 30) * n
, l
+ 1, l
, matches
);
381 has_found_match
= BROTLI_TRUE
;
382 if (l
+ 2 >= max_length
) {
385 /* Transforms " " + kUppercase{First,All} + <suffix> */
388 AddMatch(id
+ (is_all_caps
? 83 : 15) * n
, l
+ 2, l
, matches
);
389 } else if (s
[0] == ',') {
391 AddMatch(id
+ 109 * n
, l
+ 2, l
, matches
);
394 AddMatch(id
+ (is_all_caps
? 111 : 65) * n
, l
+ 3, l
, matches
);
396 } else if (s
[0] == '.') {
397 AddMatch(id
+ (is_all_caps
? 115 : 96) * n
, l
+ 2, l
, matches
);
399 AddMatch(id
+ (is_all_caps
? 117 : 91) * n
, l
+ 3, l
, matches
);
401 } else if (s
[0] == '=') {
403 AddMatch(id
+ (is_all_caps
? 110 : 118) * n
, l
+ 3, l
, matches
);
404 } else if (s
[1] == '\'') {
405 AddMatch(id
+ (is_all_caps
? 119 : 120) * n
, l
+ 3, l
, matches
);
411 if (max_length
>= 6) {
412 /* Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" */
413 if ((data
[1] == ' ' &&
414 (data
[0] == 'e' || data
[0] == 's' || data
[0] == ',')) ||
415 (data
[0] == 0xc2 && data
[1] == 0xa0)) {
416 size_t offset
= kStaticDictionaryBuckets
[Hash(&data
[2])];
417 BROTLI_BOOL end
= !offset
;
419 DictWord w
= kStaticDictionaryWords
[offset
++];
420 const size_t l
= w
.len
& 0x7F;
421 const size_t n
= (size_t)1 << kBrotliDictionarySizeBitsByLength
[l
];
422 const size_t id
= w
.idx
;
423 end
= !!(w
.len
& 0x80);
425 if (w
.transform
== 0 && IsMatch(w
, &data
[2], max_length
- 2)) {
426 if (data
[0] == 0xc2) {
427 AddMatch(id
+ 102 * n
, l
+ 2, l
, matches
);
428 has_found_match
= BROTLI_TRUE
;
429 } else if (l
+ 2 < max_length
&& data
[l
+ 2] == ' ') {
430 size_t t
= data
[0] == 'e' ? 18 : (data
[0] == 's' ? 7 : 13);
431 AddMatch(id
+ t
* n
, l
+ 3, l
, matches
);
432 has_found_match
= BROTLI_TRUE
;
438 if (max_length
>= 9) {
439 /* Transforms with prefixes " the " and ".com/" */
440 if ((data
[0] == ' ' && data
[1] == 't' && data
[2] == 'h' &&
441 data
[3] == 'e' && data
[4] == ' ') ||
442 (data
[0] == '.' && data
[1] == 'c' && data
[2] == 'o' &&
443 data
[3] == 'm' && data
[4] == '/')) {
444 size_t offset
= kStaticDictionaryBuckets
[Hash(&data
[5])];
445 BROTLI_BOOL end
= !offset
;
447 DictWord w
= kStaticDictionaryWords
[offset
++];
448 const size_t l
= w
.len
& 0x7F;
449 const size_t n
= (size_t)1 << kBrotliDictionarySizeBitsByLength
[l
];
450 const size_t id
= w
.idx
;
451 end
= !!(w
.len
& 0x80);
453 if (w
.transform
== 0 && IsMatch(w
, &data
[5], max_length
- 5)) {
454 AddMatch(id
+ (data
[0] == ' ' ? 41 : 72) * n
, l
+ 5, l
, matches
);
455 has_found_match
= BROTLI_TRUE
;
456 if (l
+ 5 < max_length
) {
457 const uint8_t* s
= &data
[l
+ 5];
458 if (data
[0] == ' ') {
459 if (l
+ 8 < max_length
&&
460 s
[0] == ' ' && s
[1] == 'o' && s
[2] == 'f' && s
[3] == ' ') {
461 AddMatch(id
+ 62 * n
, l
+ 9, l
, matches
);
462 if (l
+ 12 < max_length
&&
463 s
[4] == 't' && s
[5] == 'h' && s
[6] == 'e' && s
[7] == ' ') {
464 AddMatch(id
+ 73 * n
, l
+ 13, l
, matches
);
473 return has_found_match
;
476 #if defined(__cplusplus) || defined(c_plusplus)