]>
git.proxmox.com Git - ceph.git/blob - ceph/src/civetweb/src/third_party/lua-5.3.3/src/lutf8lib.c
9042582d1e4302185ab9b1d5abbfe9bf9101ba35
/*
** $Id: lutf8lib.c,v 1.15 2015/03/28 19:16:55 roberto Exp $
** Standard library for UTF-8 manipulation
** See Copyright Notice in lua.h
*/
/* largest code point this library encodes or accepts when decoding */
#define MAXUNICODE	0x10FFFF

/* true when the byte at 'p' has the form 10xxxxxx (UTF-8 continuation) */
#define iscont(p)	((*(p) & 0xC0) == 0x80)
29 /* translate a relative string position: negative means back from end */
30 static lua_Integer
u_posrelat (lua_Integer pos
, size_t len
) {
31 if (pos
>= 0) return pos
;
32 else if (0u - (size_t)pos
> len
) return 0;
33 else return (lua_Integer
)len
+ pos
+ 1;
#ifndef MAXUNICODE
/* fallback so this block is self-contained; an earlier file-level
   definition, when present, takes precedence */
#define MAXUNICODE	0x10FFFF
#endif

/*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
** On success, returns a pointer to the byte right after the sequence
** and, when 'val' is not NULL, stores the decoded code point in '*val'.
** Lone continuation bytes, overlong encodings, sequences needing more
** than three continuation bytes, and values above MAXUNICODE are
** rejected.
** NOTE(review): the 'res = c', 'else' and 'if (val)' lines were missing
** from the extracted text and were restored from context -- confirm
** against upstream.
*/
static const char *utf8_decode (const char *o, int *val) {
  /* a result decoded with 'count' continuation bytes must be strictly
     greater than limits[count]; entry 0 rejects a lone continuation */
  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
  const unsigned char *s = (const unsigned char *)o;
  unsigned int c = s[0];
  unsigned int res = 0;  /* final result */
  if (c < 0x80)  /* ascii? */
    res = c;
  else {
    int count = 0;  /* to count number of continuation bytes */
    while (c & 0x40) {  /* still have continuation bytes? */
      int cc;
      /* a 4th continuation byte can never be valid; stopping here also
         keeps the shift below at 15 bits at most (a 0xFF lead byte
         would otherwise drive it to 35, which is undefined) */
      if (count == 3)
        return NULL;  /* invalid byte sequence */
      cc = s[++count];  /* read next byte */
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        return NULL;  /* invalid byte sequence */
      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
      c <<= 1;  /* to test next bit */
    }
    res |= ((c & 0x7F) << (count * 5));  /* add first byte */
    if (res > MAXUNICODE || res <= limits[count])
      return NULL;  /* invalid byte sequence */
    s += count;  /* skip continuation bytes read */
  }
  if (val) *val = (int)res;
  return (const char *)s + 1;  /* +1 to include first byte */
}
67 ** utf8len(s [, i [, j]]) --> number of characters that start in the
68 ** range [i,j], or nil + current position if 's' is not well formed in
71 static int utflen (lua_State
*L
) {
74 const char *s
= luaL_checklstring(L
, 1, &len
);
75 lua_Integer posi
= u_posrelat(luaL_optinteger(L
, 2, 1), len
);
76 lua_Integer posj
= u_posrelat(luaL_optinteger(L
, 3, -1), len
);
77 luaL_argcheck(L
, 1 <= posi
&& --posi
<= (lua_Integer
)len
, 2,
78 "initial position out of string");
79 luaL_argcheck(L
, --posj
< (lua_Integer
)len
, 3,
80 "final position out of string");
81 while (posi
<= posj
) {
82 const char *s1
= utf8_decode(s
+ posi
, NULL
);
83 if (s1
== NULL
) { /* conversion error? */
84 lua_pushnil(L
); /* return nil ... */
85 lua_pushinteger(L
, posi
+ 1); /* ... and current position */
91 lua_pushinteger(L
, n
);
97 ** codepoint(s, [i, [j]]) -> returns codepoints for all characters
98 ** that start in the range [i,j]
100 static int codepoint (lua_State
*L
) {
102 const char *s
= luaL_checklstring(L
, 1, &len
);
103 lua_Integer posi
= u_posrelat(luaL_optinteger(L
, 2, 1), len
);
104 lua_Integer pose
= u_posrelat(luaL_optinteger(L
, 3, posi
), len
);
107 luaL_argcheck(L
, posi
>= 1, 2, "out of range");
108 luaL_argcheck(L
, pose
<= (lua_Integer
)len
, 3, "out of range");
109 if (posi
> pose
) return 0; /* empty interval; return no values */
110 if (pose
- posi
>= INT_MAX
) /* (lua_Integer -> int) overflow? */
111 return luaL_error(L
, "string slice too long");
112 n
= (int)(pose
- posi
) + 1;
113 luaL_checkstack(L
, n
, "string slice too long");
116 for (s
+= posi
- 1; s
< se
;) {
118 s
= utf8_decode(s
, &code
);
120 return luaL_error(L
, "invalid UTF-8 code");
121 lua_pushinteger(L
, code
);
128 static void pushutfchar (lua_State
*L
, int arg
) {
129 lua_Integer code
= luaL_checkinteger(L
, arg
);
130 luaL_argcheck(L
, 0 <= code
&& code
<= MAXUNICODE
, arg
, "value out of range");
131 lua_pushfstring(L
, "%U", (long)code
);
136 ** utfchar(n1, n2, ...) -> char(n1)..char(n2)...
138 static int utfchar (lua_State
*L
) {
139 int n
= lua_gettop(L
); /* number of arguments */
140 if (n
== 1) /* optimize common case of single char */
145 luaL_buffinit(L
, &b
);
146 for (i
= 1; i
<= n
; i
++) {
157 ** offset(s, n, [i]) -> index where n-th character counting from
158 ** position 'i' starts; 0 means character at 'i'.
160 static int byteoffset (lua_State
*L
) {
162 const char *s
= luaL_checklstring(L
, 1, &len
);
163 lua_Integer n
= luaL_checkinteger(L
, 2);
164 lua_Integer posi
= (n
>= 0) ? 1 : len
+ 1;
165 posi
= u_posrelat(luaL_optinteger(L
, 3, posi
), len
);
166 luaL_argcheck(L
, 1 <= posi
&& --posi
<= (lua_Integer
)len
, 3,
167 "position out of range");
169 /* find beginning of current byte sequence */
170 while (posi
> 0 && iscont(s
+ posi
)) posi
--;
173 if (iscont(s
+ posi
))
174 luaL_error(L
, "initial position is a continuation byte");
176 while (n
< 0 && posi
> 0) { /* move back */
177 do { /* find beginning of previous character */
179 } while (posi
> 0 && iscont(s
+ posi
));
184 n
--; /* do not move for 1st character */
185 while (n
> 0 && posi
< (lua_Integer
)len
) {
186 do { /* find beginning of next character */
188 } while (iscont(s
+ posi
)); /* (cannot pass final '\0') */
193 if (n
== 0) /* did it find given character? */
194 lua_pushinteger(L
, posi
+ 1);
195 else /* no such character */
201 static int iter_aux (lua_State
*L
) {
203 const char *s
= luaL_checklstring(L
, 1, &len
);
204 lua_Integer n
= lua_tointeger(L
, 2) - 1;
205 if (n
< 0) /* first iteration? */
206 n
= 0; /* start from here */
207 else if (n
< (lua_Integer
)len
) {
208 n
++; /* skip current byte */
209 while (iscont(s
+ n
)) n
++; /* and its continuations */
211 if (n
>= (lua_Integer
)len
)
212 return 0; /* no more codepoints */
215 const char *next
= utf8_decode(s
+ n
, &code
);
216 if (next
== NULL
|| iscont(next
))
217 return luaL_error(L
, "invalid UTF-8 code");
218 lua_pushinteger(L
, n
+ 1);
219 lua_pushinteger(L
, code
);
225 static int iter_codes (lua_State
*L
) {
226 luaL_checkstring(L
, 1);
227 lua_pushcfunction(L
, iter_aux
);
229 lua_pushinteger(L
, 0);
/* pattern to match a single UTF-8 character */
/* (contains an embedded '\0', so its length must come from sizeof) */
#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
238 static const luaL_Reg funcs
[] = {
239 {"offset", byteoffset
},
240 {"codepoint", codepoint
},
243 {"codes", iter_codes
},
245 {"charpattern", NULL
},
250 LUAMOD_API
int luaopen_utf8 (lua_State
*L
) {
251 luaL_newlib(L
, funcs
);
252 lua_pushlstring(L
, UTF8PATT
, sizeof(UTF8PATT
)/sizeof(char) - 1);
253 lua_setfield(L
, -2, "charpattern");