4 * Provides a cache to optimize indexed string lookups. The cache keeps
5 * track of (byte offset, char offset) states for a fixed number of strings.
6 * Otherwise we'd need to scan from either end of the string, as we store
7 * strings in (extended) UTF-8.
10 #include "duk_internal.h"
13 * Delete references to given hstring from the heap string cache.
15 * String cache references are 'weak': they are not counted towards
16 * reference counts, nor serve as roots for mark-and-sweep. When an
17 * object is about to be freed, such references need to be removed.
20 DUK_INTERNAL
void duk_heap_strcache_string_remove(duk_heap
*heap
, duk_hstring
*h
) {
22 for (i
= 0; i
< DUK_HEAP_STRCACHE_SIZE
; i
++) {
23 duk_strcache
*c
= heap
->strcache
+ i
;
25 DUK_DD(DUK_DDPRINT("deleting weak strcache reference to hstring %p from heap %p",
26 (void *) h
, (void *) heap
));
29 /* XXX: the string shouldn't appear twice, but we now loop to the
30 * end anyway; if fixed, add a looping assertion to ensure there
38 * String scanning helpers
40 * All bytes other than UTF-8 continuation bytes ([0x80,0xbf]) are
41 * considered to contribute a character. This must match how string
42 * character length is computed.
45 DUK_LOCAL
const duk_uint8_t
*duk__scan_forwards(const duk_uint8_t
*p
, const duk_uint8_t
*q
, duk_uint_fast32_t n
) {
52 if ((*p
& 0xc0) != 0x80) {
61 DUK_LOCAL
const duk_uint8_t
*duk__scan_backwards(const duk_uint8_t
*p
, const duk_uint8_t
*q
, duk_uint_fast32_t n
) {
68 if ((*p
& 0xc0) != 0x80) {
78 * Convert char offset to byte offset
80 * Avoid using the string cache if possible: for ASCII strings byte and
81 * char offsets are equal and for short strings direct scanning may be
82 * better than using the string cache (which may evict a more important
85 * Typing now assumes 32-bit string byte/char offsets (duk_uint_fast32_t).
86 * Better typing might be to use duk_size_t.
89 DUK_INTERNAL duk_uint_fast32_t
duk_heap_strcache_offset_char2byte(duk_hthread
*thr
, duk_hstring
*h
, duk_uint_fast32_t char_offset
) {
92 duk_uint_fast32_t byte_offset
;
95 duk_uint_fast32_t dist_start
, dist_end
, dist_sce
;
96 const duk_uint8_t
*p_start
;
97 const duk_uint8_t
*p_end
;
98 const duk_uint8_t
*p_found
;
100 if (char_offset
> DUK_HSTRING_GET_CHARLEN(h
)) {
105 * For ASCII strings, the answer is simple.
108 if (DUK_HSTRING_IS_ASCII(h
)) {
109 /* clen == blen -> pure ascii */
114 * For non-ASCII strings, we need to scan forwards or backwards
115 * from some starting point. The starting point may be the start
116 * or end of the string, or some cached midpoint in the string
119 * For "short" strings we simply scan without checking or updating
120 * the cache. For longer strings we check and update the cache as
121 * necessary, inserting a new cache entry if none exists.
124 DUK_DDD(DUK_DDDPRINT("non-ascii string %p, char_offset=%ld, clen=%ld, blen=%ld",
125 (void *) h
, (long) char_offset
,
126 (long) DUK_HSTRING_GET_CHARLEN(h
),
127 (long) DUK_HSTRING_GET_BYTELEN(h
)));
131 use_cache
= (DUK_HSTRING_GET_CHARLEN(h
) > DUK_HEAP_STRINGCACHE_NOCACHE_LIMIT
);
134 #ifdef DUK_USE_DDDPRINT
135 DUK_DDD(DUK_DDDPRINT("stringcache before char2byte (using cache):"));
136 for (i
= 0; i
< DUK_HEAP_STRCACHE_SIZE
; i
++) {
137 duk_strcache
*c
= heap
->strcache
+ i
;
138 DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld",
139 (long) i
, (void *) c
->h
, (long) c
->cidx
, (long) c
->bidx
));
143 for (i
= 0; i
< DUK_HEAP_STRCACHE_SIZE
; i
++) {
144 duk_strcache
*c
= heap
->strcache
+ i
;
154 * Scan from shortest distance:
157 * - cache entry (if exists)
160 DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h
) >= char_offset
);
161 dist_start
= char_offset
;
162 dist_end
= DUK_HSTRING_GET_CHARLEN(h
) - char_offset
;
163 dist_sce
= 0; DUK_UNREF(dist_sce
); /* initialize for debug prints, needed if sce==NULL */
165 p_start
= (const duk_uint8_t
*) DUK_HSTRING_GET_DATA(h
);
166 p_end
= (const duk_uint8_t
*) (p_start
+ DUK_HSTRING_GET_BYTELEN(h
));
170 if (char_offset
>= sce
->cidx
) {
171 dist_sce
= char_offset
- sce
->cidx
;
172 if ((dist_sce
<= dist_start
) && (dist_sce
<= dist_end
)) {
173 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
174 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
175 "scan forwards from sce",
176 (long) use_cache
, (void *) (sce
? sce
->h
: NULL
),
177 (sce
? (long) sce
->cidx
: (long) -1),
178 (sce
? (long) sce
->bidx
: (long) -1),
179 (long) dist_start
, (long) dist_end
, (long) dist_sce
));
181 p_found
= duk__scan_forwards(p_start
+ sce
->bidx
,
187 dist_sce
= sce
->cidx
- char_offset
;
188 if ((dist_sce
<= dist_start
) && (dist_sce
<= dist_end
)) {
189 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
190 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
191 "scan backwards from sce",
192 (long) use_cache
, (void *) (sce
? sce
->h
: NULL
),
193 (sce
? (long) sce
->cidx
: (long) -1),
194 (sce
? (long) sce
->bidx
: (long) -1),
195 (long) dist_start
, (long) dist_end
, (long) dist_sce
));
197 p_found
= duk__scan_backwards(p_start
+ sce
->bidx
,
205 /* no sce, or sce scan not best */
207 if (dist_start
<= dist_end
) {
208 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
209 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
210 "scan forwards from string start",
211 (long) use_cache
, (void *) (sce
? sce
->h
: NULL
),
212 (sce
? (long) sce
->cidx
: (long) -1),
213 (sce
? (long) sce
->bidx
: (long) -1),
214 (long) dist_start
, (long) dist_end
, (long) dist_sce
));
216 p_found
= duk__scan_forwards(p_start
,
220 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
221 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
222 "scan backwards from string end",
223 (long) use_cache
, (void *) (sce
? sce
->h
: NULL
),
224 (sce
? (long) sce
->cidx
: (long) -1),
225 (sce
? (long) sce
->bidx
: (long) -1),
226 (long) dist_start
, (long) dist_end
, (long) dist_sce
));
228 p_found
= duk__scan_backwards(p_end
,
236 /* Scan error: this shouldn't normally happen; it could happen if
237 * string is not valid UTF-8 data, and clen/blen are not consistent
238 * with the scanning algorithm.
243 DUK_ASSERT(p_found
>= p_start
);
244 DUK_ASSERT(p_found
<= p_end
); /* may be equal */
245 byte_offset
= (duk_uint32_t
) (p_found
- p_start
);
247 DUK_DDD(DUK_DDDPRINT("-> string %p, cidx %ld -> bidx %ld",
248 (void *) h
, (long) char_offset
, (long) byte_offset
));
251 * Update cache entry (allocating if necessary), and move the
252 * cache entry to the first place (in an "LRU" policy).
256 /* update entry, allocating if necessary */
258 sce
= heap
->strcache
+ DUK_HEAP_STRCACHE_SIZE
- 1; /* take last entry */
261 DUK_ASSERT(sce
!= NULL
);
262 sce
->bidx
= (duk_uint32_t
) (p_found
- p_start
);
263 sce
->cidx
= (duk_uint32_t
) char_offset
;
265 /* LRU: move our entry to first */
266 if (sce
> &heap
->strcache
[0]) {
276 DUK_MEMMOVE((void *) (&heap
->strcache
[1]),
277 (const void *) (&heap
->strcache
[0]),
278 (size_t) (((char *) sce
) - ((char *) &heap
->strcache
[0])));
279 heap
->strcache
[0] = tmp
;
281 /* 'sce' points to the wrong entry here, but is no longer used */
283 #ifdef DUK_USE_DDDPRINT
284 DUK_DDD(DUK_DDDPRINT("stringcache after char2byte (using cache):"));
285 for (i
= 0; i
< DUK_HEAP_STRCACHE_SIZE
; i
++) {
286 duk_strcache
*c
= heap
->strcache
+ i
;
287 DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld",
288 (long) i
, (void *) c
->h
, (long) c
->cidx
, (long) c
->bidx
));
296 DUK_ERROR_INTERNAL_DEFMSG(thr
);