]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /* |
2 | * String cache. | |
3 | * | |
4 | * Provides a cache to optimize indexed string lookups. The cache keeps | |
5 | * track of (byte offset, char offset) states for a fixed number of strings. | |
6 | * Otherwise we'd need to scan from either end of the string, as we store | |
7 | * strings in (extended) UTF-8. | |
8 | */ | |
9 | ||
10 | #include "duk_internal.h" | |
11 | ||
12 | /* | |
13 | * Delete references to given hstring from the heap string cache. | |
14 | * | |
15 | * String cache references are 'weak': they are not counted towards | |
16 | * reference counts, nor serve as roots for mark-and-sweep. When an | |
17 | * object is about to be freed, such references need to be removed. | |
18 | */ | |
19 | ||
20 | DUK_INTERNAL void duk_heap_strcache_string_remove(duk_heap *heap, duk_hstring *h) { | |
21 | duk_small_int_t i; | |
22 | for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { | |
23 | duk_strcache *c = heap->strcache + i; | |
24 | if (c->h == h) { | |
25 | DUK_DD(DUK_DDPRINT("deleting weak strcache reference to hstring %p from heap %p", | |
26 | (void *) h, (void *) heap)); | |
27 | c->h = NULL; | |
28 | ||
29 | /* XXX: the string shouldn't appear twice, but we now loop to the | |
30 | * end anyway; if fixed, add a looping assertion to ensure there | |
31 | * is no duplicate. | |
32 | */ | |
33 | } | |
34 | } | |
35 | } | |
36 | ||
37 | /* | |
38 | * String scanning helpers | |
11fdf7f2 TL |
39 | * |
40 | * All bytes other than UTF-8 continuation bytes ([0x80,0xbf]) are | |
41 | * considered to contribute a character. This must match how string | |
42 | * character length is computed. | |
7c673cae FG |
43 | */ |
44 | ||
11fdf7f2 | 45 | DUK_LOCAL const duk_uint8_t *duk__scan_forwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) { |
7c673cae FG |
46 | while (n > 0) { |
47 | for (;;) { | |
48 | p++; | |
49 | if (p >= q) { | |
50 | return NULL; | |
51 | } | |
52 | if ((*p & 0xc0) != 0x80) { | |
53 | break; | |
54 | } | |
55 | } | |
56 | n--; | |
57 | } | |
58 | return p; | |
59 | } | |
60 | ||
11fdf7f2 | 61 | DUK_LOCAL const duk_uint8_t *duk__scan_backwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) { |
7c673cae FG |
62 | while (n > 0) { |
63 | for (;;) { | |
64 | p--; | |
65 | if (p < q) { | |
66 | return NULL; | |
67 | } | |
68 | if ((*p & 0xc0) != 0x80) { | |
69 | break; | |
70 | } | |
71 | } | |
72 | n--; | |
73 | } | |
74 | return p; | |
75 | } | |
76 | ||
77 | /* | |
78 | * Convert char offset to byte offset | |
79 | * | |
80 | * Avoid using the string cache if possible: for ASCII strings byte and | |
81 | * char offsets are equal and for short strings direct scanning may be | |
82 | * better than using the string cache (which may evict a more important | |
83 | * entry). | |
84 | * | |
85 | * Typing now assumes 32-bit string byte/char offsets (duk_uint_fast32_t). | |
86 | * Better typing might be to use duk_size_t. | |
87 | */ | |
88 | ||
89 | DUK_INTERNAL duk_uint_fast32_t duk_heap_strcache_offset_char2byte(duk_hthread *thr, duk_hstring *h, duk_uint_fast32_t char_offset) { | |
90 | duk_heap *heap; | |
91 | duk_strcache *sce; | |
92 | duk_uint_fast32_t byte_offset; | |
93 | duk_small_int_t i; | |
94 | duk_bool_t use_cache; | |
95 | duk_uint_fast32_t dist_start, dist_end, dist_sce; | |
11fdf7f2 TL |
96 | const duk_uint8_t *p_start; |
97 | const duk_uint8_t *p_end; | |
98 | const duk_uint8_t *p_found; | |
7c673cae FG |
99 | |
100 | if (char_offset > DUK_HSTRING_GET_CHARLEN(h)) { | |
101 | goto error; | |
102 | } | |
103 | ||
104 | /* | |
105 | * For ASCII strings, the answer is simple. | |
106 | */ | |
107 | ||
108 | if (DUK_HSTRING_IS_ASCII(h)) { | |
109 | /* clen == blen -> pure ascii */ | |
110 | return char_offset; | |
111 | } | |
112 | ||
113 | /* | |
114 | * For non-ASCII strings, we need to scan forwards or backwards | |
115 | * from some starting point. The starting point may be the start | |
116 | * or end of the string, or some cached midpoint in the string | |
117 | * cache. | |
118 | * | |
119 | * For "short" strings we simply scan without checking or updating | |
120 | * the cache. For longer strings we check and update the cache as | |
121 | * necessary, inserting a new cache entry if none exists. | |
122 | */ | |
123 | ||
124 | DUK_DDD(DUK_DDDPRINT("non-ascii string %p, char_offset=%ld, clen=%ld, blen=%ld", | |
125 | (void *) h, (long) char_offset, | |
126 | (long) DUK_HSTRING_GET_CHARLEN(h), | |
127 | (long) DUK_HSTRING_GET_BYTELEN(h))); | |
128 | ||
129 | heap = thr->heap; | |
130 | sce = NULL; | |
131 | use_cache = (DUK_HSTRING_GET_CHARLEN(h) > DUK_HEAP_STRINGCACHE_NOCACHE_LIMIT); | |
132 | ||
133 | if (use_cache) { | |
134 | #ifdef DUK_USE_DDDPRINT | |
135 | DUK_DDD(DUK_DDDPRINT("stringcache before char2byte (using cache):")); | |
136 | for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { | |
137 | duk_strcache *c = heap->strcache + i; | |
138 | DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld", | |
139 | (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx)); | |
140 | } | |
141 | #endif | |
142 | ||
143 | for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { | |
144 | duk_strcache *c = heap->strcache + i; | |
145 | ||
146 | if (c->h == h) { | |
147 | sce = c; | |
148 | break; | |
149 | } | |
150 | } | |
151 | } | |
152 | ||
153 | /* | |
154 | * Scan from shortest distance: | |
155 | * - start of string | |
156 | * - end of string | |
157 | * - cache entry (if exists) | |
158 | */ | |
159 | ||
160 | DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h) >= char_offset); | |
161 | dist_start = char_offset; | |
162 | dist_end = DUK_HSTRING_GET_CHARLEN(h) - char_offset; | |
163 | dist_sce = 0; DUK_UNREF(dist_sce); /* initialize for debug prints, needed if sce==NULL */ | |
164 | ||
11fdf7f2 TL |
165 | p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h); |
166 | p_end = (const duk_uint8_t *) (p_start + DUK_HSTRING_GET_BYTELEN(h)); | |
7c673cae FG |
167 | p_found = NULL; |
168 | ||
169 | if (sce) { | |
170 | if (char_offset >= sce->cidx) { | |
171 | dist_sce = char_offset - sce->cidx; | |
172 | if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) { | |
173 | DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " | |
174 | "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " | |
175 | "scan forwards from sce", | |
176 | (long) use_cache, (void *) (sce ? sce->h : NULL), | |
177 | (sce ? (long) sce->cidx : (long) -1), | |
178 | (sce ? (long) sce->bidx : (long) -1), | |
179 | (long) dist_start, (long) dist_end, (long) dist_sce)); | |
180 | ||
181 | p_found = duk__scan_forwards(p_start + sce->bidx, | |
182 | p_end, | |
183 | dist_sce); | |
184 | goto scan_done; | |
185 | } | |
186 | } else { | |
187 | dist_sce = sce->cidx - char_offset; | |
188 | if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) { | |
189 | DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " | |
190 | "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " | |
191 | "scan backwards from sce", | |
192 | (long) use_cache, (void *) (sce ? sce->h : NULL), | |
193 | (sce ? (long) sce->cidx : (long) -1), | |
194 | (sce ? (long) sce->bidx : (long) -1), | |
195 | (long) dist_start, (long) dist_end, (long) dist_sce)); | |
196 | ||
197 | p_found = duk__scan_backwards(p_start + sce->bidx, | |
198 | p_start, | |
199 | dist_sce); | |
200 | goto scan_done; | |
201 | } | |
202 | } | |
203 | } | |
204 | ||
205 | /* no sce, or sce scan not best */ | |
206 | ||
207 | if (dist_start <= dist_end) { | |
208 | DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " | |
209 | "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " | |
210 | "scan forwards from string start", | |
211 | (long) use_cache, (void *) (sce ? sce->h : NULL), | |
212 | (sce ? (long) sce->cidx : (long) -1), | |
213 | (sce ? (long) sce->bidx : (long) -1), | |
214 | (long) dist_start, (long) dist_end, (long) dist_sce)); | |
215 | ||
216 | p_found = duk__scan_forwards(p_start, | |
217 | p_end, | |
218 | dist_start); | |
219 | } else { | |
220 | DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " | |
221 | "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " | |
222 | "scan backwards from string end", | |
223 | (long) use_cache, (void *) (sce ? sce->h : NULL), | |
224 | (sce ? (long) sce->cidx : (long) -1), | |
225 | (sce ? (long) sce->bidx : (long) -1), | |
226 | (long) dist_start, (long) dist_end, (long) dist_sce)); | |
227 | ||
228 | p_found = duk__scan_backwards(p_end, | |
229 | p_start, | |
230 | dist_end); | |
231 | } | |
232 | ||
233 | scan_done: | |
234 | ||
235 | if (!p_found) { | |
236 | /* Scan error: this shouldn't normally happen; it could happen if | |
237 | * string is not valid UTF-8 data, and clen/blen are not consistent | |
238 | * with the scanning algorithm. | |
239 | */ | |
240 | goto error; | |
241 | } | |
242 | ||
243 | DUK_ASSERT(p_found >= p_start); | |
244 | DUK_ASSERT(p_found <= p_end); /* may be equal */ | |
245 | byte_offset = (duk_uint32_t) (p_found - p_start); | |
246 | ||
247 | DUK_DDD(DUK_DDDPRINT("-> string %p, cidx %ld -> bidx %ld", | |
248 | (void *) h, (long) char_offset, (long) byte_offset)); | |
249 | ||
250 | /* | |
251 | * Update cache entry (allocating if necessary), and move the | |
252 | * cache entry to the first place (in an "LRU" policy). | |
253 | */ | |
254 | ||
255 | if (use_cache) { | |
256 | /* update entry, allocating if necessary */ | |
257 | if (!sce) { | |
258 | sce = heap->strcache + DUK_HEAP_STRCACHE_SIZE - 1; /* take last entry */ | |
259 | sce->h = h; | |
260 | } | |
261 | DUK_ASSERT(sce != NULL); | |
262 | sce->bidx = (duk_uint32_t) (p_found - p_start); | |
263 | sce->cidx = (duk_uint32_t) char_offset; | |
264 | ||
265 | /* LRU: move our entry to first */ | |
266 | if (sce > &heap->strcache[0]) { | |
267 | /* | |
268 | * A C | |
269 | * B A | |
270 | * C <- sce ==> B | |
271 | * D D | |
272 | */ | |
273 | duk_strcache tmp; | |
274 | ||
275 | tmp = *sce; | |
276 | DUK_MEMMOVE((void *) (&heap->strcache[1]), | |
11fdf7f2 | 277 | (const void *) (&heap->strcache[0]), |
7c673cae FG |
278 | (size_t) (((char *) sce) - ((char *) &heap->strcache[0]))); |
279 | heap->strcache[0] = tmp; | |
280 | ||
281 | /* 'sce' points to the wrong entry here, but is no longer used */ | |
282 | } | |
283 | #ifdef DUK_USE_DDDPRINT | |
284 | DUK_DDD(DUK_DDDPRINT("stringcache after char2byte (using cache):")); | |
285 | for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { | |
286 | duk_strcache *c = heap->strcache + i; | |
287 | DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld", | |
288 | (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx)); | |
289 | } | |
290 | #endif | |
291 | } | |
292 | ||
293 | return byte_offset; | |
294 | ||
295 | error: | |
11fdf7f2 | 296 | DUK_ERROR_INTERNAL_DEFMSG(thr); |
7c673cae FG |
297 | return 0; |
298 | } |