]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | /* |
2 | * String hash computation (interning). | |
3 | * | |
4 | * String hashing is performance critical because a string hash is computed | |
5 | * for all new strings which are candidates to be added to the string table. | |
6 | * However, strings actually added to the string table go through a codepoint | |
7 | * length calculation which dominates performance because it goes through | |
8 | * every byte of the input string (but only for strings added). | |
9 | * | |
10 | * The string hash algorithm should be fast, but on the other hand provide | |
11 | * good enough hashes to ensure both string table and object property table | |
12 | * hash tables work reasonably well (i.e., there aren't too many collisions | |
13 | * with real world inputs). Unless the hash is cryptographic, it's always | |
14 | * possible to craft inputs with maximal hash collisions. | |
15 | * | |
16 | * NOTE: The hash algorithms must match src/dukutil.py:duk_heap_hashstring() | |
17 | * for ROM string support! | |
18 | */ | |
19 | ||
20 | #include "duk_internal.h" | |
21 | ||
22 | #if defined(DUK_USE_STRHASH_DENSE) | |
23 | /* Constants for duk_hashstring(). */ | |
24 | #define DUK__STRHASH_SHORTSTRING 4096L | |
25 | #define DUK__STRHASH_MEDIUMSTRING (256L * 1024L) | |
26 | #define DUK__STRHASH_BLOCKSIZE 256L | |
27 | ||
28 | DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) { | |
29 | duk_uint32_t hash; | |
30 | ||
31 | /* Use Murmurhash2 directly for short strings, and use "block skipping" | |
32 | * for long strings: hash an initial part and then sample the rest of | |
33 | * the string with reasonably sized chunks. An initial offset for the | |
34 | * sampling is computed based on a hash of the initial part of the string; | |
35 | * this is done to (usually) avoid the case where all long strings have | |
36 | * certain offset ranges which are never sampled. | |
37 | * | |
38 | * Skip should depend on length and bound the total time to roughly | |
39 | * logarithmic. With current values: | |
40 | * | |
41 | * 1M string => 256 * 241 = 61696 bytes (0.06M) of hashing | |
42 | * 1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing | |
43 | * | |
44 | * XXX: It would be better to compute the skip offset more "smoothly" | |
45 | * instead of having a few boundary values. | |
46 | */ | |
47 | ||
48 | /* note: mixing len into seed improves hashing when skipping */ | |
49 | duk_uint32_t str_seed = heap->hash_seed ^ ((duk_uint32_t) len); | |
50 | ||
51 | if (len <= DUK__STRHASH_SHORTSTRING) { | |
52 | hash = duk_util_hashbytes(str, len, str_seed); | |
53 | } else { | |
54 | duk_size_t off; | |
55 | duk_size_t skip; | |
56 | ||
57 | if (len <= DUK__STRHASH_MEDIUMSTRING) { | |
58 | skip = (duk_size_t) (16 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE); | |
59 | } else { | |
60 | skip = (duk_size_t) (256 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE); | |
61 | } | |
62 | ||
63 | hash = duk_util_hashbytes(str, (duk_size_t) DUK__STRHASH_SHORTSTRING, str_seed); | |
64 | off = DUK__STRHASH_SHORTSTRING + (skip * (hash % 256)) / 256; | |
65 | ||
66 | /* XXX: inefficient loop */ | |
67 | while (off < len) { | |
68 | duk_size_t left = len - off; | |
69 | duk_size_t now = (duk_size_t) (left > DUK__STRHASH_BLOCKSIZE ? DUK__STRHASH_BLOCKSIZE : left); | |
70 | hash ^= duk_util_hashbytes(str + off, now, str_seed); | |
71 | off += skip; | |
72 | } | |
73 | } | |
74 | ||
75 | #if defined(DUK_USE_STRHASH16) | |
76 | /* Truncate to 16 bits here, so that a computed hash can be compared | |
77 | * against a hash stored in a 16-bit field. | |
78 | */ | |
79 | hash &= 0x0000ffffUL; | |
80 | #endif | |
81 | return hash; | |
82 | } | |
83 | ||
84 | #undef DUK__STRHASH_SHORTSTRING | |
85 | #undef DUK__STRHASH_MEDIUMSTRING | |
86 | #undef DUK__STRHASH_BLOCKSIZE | |
87 | #else /* DUK_USE_STRHASH_DENSE */ | |
88 | DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) { | |
89 | duk_uint32_t hash; | |
90 | duk_size_t step; | |
91 | duk_size_t off; | |
92 | ||
93 | /* Slightly modified "Bernstein hash" from: | |
94 | * | |
95 | * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx | |
96 | * | |
97 | * Modifications: string skipping and reverse direction similar to | |
98 | * Lua 5.1.5, and different hash initializer. | |
99 | * | |
100 | * The reverse direction ensures last byte it always included in the | |
101 | * hash which is a good default as changing parts of the string are | |
102 | * more often in the suffix than in the prefix. | |
103 | */ | |
104 | ||
105 | hash = heap->hash_seed ^ ((duk_uint32_t) len); /* Bernstein hash init value is normally 5381 */ | |
106 | step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1; | |
107 | for (off = len; off >= step; off -= step) { | |
108 | DUK_ASSERT(off >= 1); /* off >= step, and step >= 1 */ | |
109 | hash = (hash * 33) + str[off - 1]; | |
110 | } | |
111 | ||
112 | #if defined(DUK_USE_STRHASH16) | |
113 | /* Truncate to 16 bits here, so that a computed hash can be compared | |
114 | * against a hash stored in a 16-bit field. | |
115 | */ | |
116 | hash &= 0x0000ffffUL; | |
117 | #endif | |
118 | return hash; | |
119 | } | |
120 | #endif /* DUK_USE_STRHASH_DENSE */ |