]> git.proxmox.com Git - ceph.git/blob - ceph/src/civetweb/src/third_party/duktape-1.5.2/src-separate/duk_heap_hashstring.c
buildsys: switch source download to quincy
[ceph.git] / ceph / src / civetweb / src / third_party / duktape-1.5.2 / src-separate / duk_heap_hashstring.c
1 /*
2 * String hash computation (interning).
3 *
4 * String hashing is performance critical because a string hash is computed
5 * for all new strings which are candidates to be added to the string table.
6 * However, strings actually added to the string table go through a codepoint
7 * length calculation which dominates performance because it goes through
8 * every byte of the input string (but only for strings added).
9 *
10 * The string hash algorithm should be fast, but on the other hand provide
11 * good enough hashes to ensure both string table and object property table
12 * hash tables work reasonably well (i.e., there aren't too many collisions
13 * with real world inputs). Unless the hash is cryptographic, it's always
14 * possible to craft inputs with maximal hash collisions.
15 *
16 * NOTE: The hash algorithms must match src/dukutil.py:duk_heap_hashstring()
17 * for ROM string support!
18 */
19
20 #include "duk_internal.h"
21
22 #if defined(DUK_USE_STRHASH_DENSE)
23 /* Constants for duk_hashstring(). */
24 #define DUK__STRHASH_SHORTSTRING 4096L
25 #define DUK__STRHASH_MEDIUMSTRING (256L * 1024L)
26 #define DUK__STRHASH_BLOCKSIZE 256L
27
28 DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
29 duk_uint32_t hash;
30
31 /* Use Murmurhash2 directly for short strings, and use "block skipping"
32 * for long strings: hash an initial part and then sample the rest of
33 * the string with reasonably sized chunks. An initial offset for the
34 * sampling is computed based on a hash of the initial part of the string;
35 * this is done to (usually) avoid the case where all long strings have
36 * certain offset ranges which are never sampled.
37 *
38 * Skip should depend on length and bound the total time to roughly
39 * logarithmic. With current values:
40 *
41 * 1M string => 256 * 241 = 61696 bytes (0.06M) of hashing
42 * 1G string => 256 * 16321 = 4178176 bytes (3.98M) of hashing
43 *
44 * XXX: It would be better to compute the skip offset more "smoothly"
45 * instead of having a few boundary values.
46 */
47
48 /* note: mixing len into seed improves hashing when skipping */
49 duk_uint32_t str_seed = heap->hash_seed ^ ((duk_uint32_t) len);
50
51 if (len <= DUK__STRHASH_SHORTSTRING) {
52 hash = duk_util_hashbytes(str, len, str_seed);
53 } else {
54 duk_size_t off;
55 duk_size_t skip;
56
57 if (len <= DUK__STRHASH_MEDIUMSTRING) {
58 skip = (duk_size_t) (16 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE);
59 } else {
60 skip = (duk_size_t) (256 * DUK__STRHASH_BLOCKSIZE + DUK__STRHASH_BLOCKSIZE);
61 }
62
63 hash = duk_util_hashbytes(str, (duk_size_t) DUK__STRHASH_SHORTSTRING, str_seed);
64 off = DUK__STRHASH_SHORTSTRING + (skip * (hash % 256)) / 256;
65
66 /* XXX: inefficient loop */
67 while (off < len) {
68 duk_size_t left = len - off;
69 duk_size_t now = (duk_size_t) (left > DUK__STRHASH_BLOCKSIZE ? DUK__STRHASH_BLOCKSIZE : left);
70 hash ^= duk_util_hashbytes(str + off, now, str_seed);
71 off += skip;
72 }
73 }
74
75 #if defined(DUK_USE_STRHASH16)
76 /* Truncate to 16 bits here, so that a computed hash can be compared
77 * against a hash stored in a 16-bit field.
78 */
79 hash &= 0x0000ffffUL;
80 #endif
81 return hash;
82 }
83
84 #undef DUK__STRHASH_SHORTSTRING
85 #undef DUK__STRHASH_MEDIUMSTRING
86 #undef DUK__STRHASH_BLOCKSIZE
87 #else /* DUK_USE_STRHASH_DENSE */
88 DUK_INTERNAL duk_uint32_t duk_heap_hashstring(duk_heap *heap, const duk_uint8_t *str, duk_size_t len) {
89 duk_uint32_t hash;
90 duk_size_t step;
91 duk_size_t off;
92
93 /* Slightly modified "Bernstein hash" from:
94 *
95 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
96 *
97 * Modifications: string skipping and reverse direction similar to
98 * Lua 5.1.5, and different hash initializer.
99 *
100 * The reverse direction ensures last byte it always included in the
101 * hash which is a good default as changing parts of the string are
102 * more often in the suffix than in the prefix.
103 */
104
105 hash = heap->hash_seed ^ ((duk_uint32_t) len); /* Bernstein hash init value is normally 5381 */
106 step = (len >> DUK_USE_STRHASH_SKIP_SHIFT) + 1;
107 for (off = len; off >= step; off -= step) {
108 DUK_ASSERT(off >= 1); /* off >= step, and step >= 1 */
109 hash = (hash * 33) + str[off - 1];
110 }
111
112 #if defined(DUK_USE_STRHASH16)
113 /* Truncate to 16 bits here, so that a computed hash can be compared
114 * against a hash stored in a 16-bit field.
115 */
116 hash &= 0x0000ffffUL;
117 #endif
118 return hash;
119 }
120 #endif /* DUK_USE_STRHASH_DENSE */